In [32]:
import pandas as pd
import numpy as np
import string as st
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

import matplotlib.pyplot as plt

lemmatizer = WordNetLemmatizer()
sent = SentimentIntensityAnalyzer() 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [34]:
wikidf = pd.read_excel('C:/Users/mani7/Desktop/smu datathon/wikileaks_parsed.xlsx')

In [11]:
wikidf.head(10)

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,1.pdf,Investigative details\n\nIn his/her interviews...
2,10.pdf,"""An interoffice memorandum providing an “outst..."
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ..."
4,10.pdf,"""When asked about this in interview, the Divis..."
5,10.pdf,"""INVESTIGATION DETAILS\n\nThis part of the inv..."
6,10.pdf,"""At paragraph 4 of the Cargo Apron Extension e..."
7,10.pdf,"""Until the end of June 2002, responsibility fo..."
8,10.pdf,"""METHODOLOGY\n\nThis investigation was conduct..."
9,10.pdf,"""Allegation 4:\n\n(Specifically, that the Vend..."


In [12]:
len(wikidf)

143

In [13]:
empty_columnss = wikidf.isnull().all(axis=0)
print(empty_columnss)

PDF Path    False
Text        False
dtype: bool


In [14]:
print("Original text (Text) for the first row:")
print("")
print(wikidf['Text'].iloc[0])  

Original text (Text) for the first row:

Pristina Airport – Possible administrative irregularity regarding tender procedures involving Vendor 1 and Vendor 2

Allegation

Two companies with the same owner took part at least three times in the same Airport tenders.

Background Information

The Kosovo citizen, Vendor 1 and Vendor 2 Representative, is the owner and Director of the Pristina-based Vendor 1 and also a 51% shareholder of the Pristina-Ljubljana-based company Vendor 2. Both companies have their residences at the same address in Pristina.

Both Vendor 1 and Vendor 2 submitted three times in 2003 for the same tenders:

Supply and Mounting of Sonic System in the Fire Station Building. Winner was Vendor 2 with €1,530 followed by Vendor 1 with €1,620. The third company, Vendor 3, did not provide a price offer.

Cabling of Flat Display Information System (FIDS). Winner was Vendor 1 with €15,919 followed by Vendor 2 with €19,248.70. The other two competitors, Vendor 3 and Vendor 4, off

In [12]:
def preprocess_text(text):
    # Remove punctuation except for spaces
    text = "".join([ch for ch in text if ch not in st.punctuation])
    
    # Tokenize and convert to lowercase
    tokens = re.split(r'\s+', text.lower())
    
    # Keep "number + unit" patterns together
    combined_tokens = []
    i = 0
    while i < len(tokens):
        # Check for "number or currency + unit"
        if i + 1 < len(tokens) and re.match(r'^(€|\$|£)?\d+(\.\d+)?$', tokens[i]) and tokens[i + 1] in {"million", "billion", "thousand"}:
            combined_tokens.append(f"{tokens[i]} {tokens[i + 1]}")
            i += 2  # Skip the next token as it is already combined
        else:
            combined_tokens.append(tokens[i])
            i += 1
    
    # Remove stopwords and lemmatize
    custom_stopwords = set(stopwords.words('english')).union({'said', 'also', 'one', 'mr', 'per', 'u', '–', 'm'})
    filtered_tokens = [word for word in combined_tokens if word not in custom_stopwords]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return lemmatized_tokens

In [13]:
wikidf['cleaned text'] = wikidf['Text'].apply(lambda x: preprocess_text(x))
wikidf.head()

Unnamed: 0,PDF Path,Text,cleaned text
0,1.pdf,Pristina Airport – Possible administrative irr...,"[pristina, airport, possible, administrative, ..."
1,1.pdf,Investigative details\n\nIn his/her interviews...,"[investigative, detail, hisher, interview, con..."
2,10.pdf,"""An interoffice memorandum providing an “outst...","[interoffice, memorandum, providing, “outstand..."
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ...","[allegation, 2, 3, specifically, three, person..."
4,10.pdf,"""When asked about this in interview, the Divis...","[asked, interview, divisional, manager, stated..."


In [16]:
print("Text after cleaning for the first row:")
print("")
print(wikidf['cleaned text'].iloc[5]) 

Text after cleaning for the first row:

['investigation', 'detail', 'part', 'investigation', 'relates', 'tender', 'contract', 'extension', 'cargo', 'terminal', 'apron', 'pristina', 'airport', 'value', 'contract', '€77431863', 'allegation', '1', 'specifically', 'tender', 'document', 'cargo', 'apron', 'comply', 'requirement', 'article', '234', 'unmik', 'finance', 'administrative', 'instruction', '19992', 'public', 'procurement', 'using', 'kosovo', 'consolidated', 'budget', 'fund', 'based', 'summary', 'document', 'include', 'clear', 'instruction', 'drawing', 'plan', 'note', 'project', 'engineer', 'evaluation', 'company', 'peap', 'official', 'dated', '31', 'january', '2001', 'indicates', 'evaluation', 'company', 'commissioned', 'produce', 'preliminary', 'design', 'cargo', 'terminal', 'pristina', 'airport', 'design', 'provided', 'construction', 'cargo', 'apron', 'extension']


In [20]:
from collections import Counter
from itertools import chain

# Combine all preprocessed tokens into a single list
all_tokens = list(chain.from_iterable(wikidf['cleaned text']))

# Extract bigrams
bigrams = list(ngrams(all_tokens, 2))
bigram_counts = Counter(bigrams)

# Extract trigrams
trigrams = list(ngrams(all_tokens, 3))
trigram_counts = Counter(trigrams)

# Extract 4 word phrases
fourgrams = list(ngrams(all_tokens, 4))
fourgram_counts = Counter(fourgrams)

# Display top 10 bigrams and trigrams

print("\nTop 10 Bigrams:")
for bigram, count in bigram_counts.most_common(10):
    print(f"{' '.join(bigram)}: {count}")

print("\nTop 10 Trigrams:")
for trigram, count in trigram_counts.most_common(10):
    print(f"{' '.join(trigram)}: {count}")

print("\nTop 10 Four word phrases:")
for trigram, count in fourgram_counts.most_common(10):
    print(f"{' '.join(trigram)}: {count}")

Top 10 Unigrams:
official: 255
airport: 242
1: 227
staff: 150
pristina: 134
2: 132
vendor: 129
member: 126
officer: 117
procurement: 111

Top 10 Bigrams:
staff member: 92
pristina airport: 82
official 1: 60
vendor 1: 46
doti official: 46
atcs official: 39
officer 1: 38
official 2: 37
finance officer: 37
vendor 2: 35

Top 10 Trigrams:
doti official 1: 27
pristina international airport: 27
non staff member: 24
staff member 1: 23
procurement officer 1: 21
doti official 2: 18
end june 2002: 16
2002 responsibility administration: 15
dra logistics officer: 13
unep staff member: 13


# sentimental analysis using vader

### identifying link domains  

In [16]:
wikidf["PDF Path"]. value_counts()

PDF Path
2.pdf      17
10.pdf     10
82.pdf     10
47.pdf      9
16.pdf      8
4.pdf       8
69.pdf      7
49.pdf      6
24.pdf      5
27.pdf      4
13.pdf      4
38.pdf      4
51.pdf      3
44.pdf      3
52.pdf      3
89.pdf      3
9.pdf       3
73.pdf      2
1.pdf       2
35.pdf      2
26.pdf      2
105.pdf     2
15.pdf      2
14.pdf      2
108.pdf     2
11.pdf      2
111.pdf     1
106.pdf     1
107.pdf     1
8.pdf       1
63.pdf      1
60.pdf      1
110.pdf     1
5.pdf       1
31.pdf      1
112.pdf     1
113.pdf     1
45.pdf      1
114.pdf     1
43.pdf      1
39.pdf      1
36.pdf      1
21.pdf      1
91.pdf      1
Name: count, dtype: int64

In [17]:
wikidf.iloc[0][1]

  wikidf.iloc[0][1]


'Pristina Airport – Possible administrative irregularity regarding tender procedures involving Vendor 1 and Vendor 2\n\nAllegation\n\nTwo companies with the same owner took part at least three times in the same Airport tenders.\n\nBackground Information\n\nThe Kosovo citizen, Vendor 1 and Vendor 2 Representative, is the owner and Director of the Pristina-based Vendor 1 and also a 51% shareholder of the Pristina-Ljubljana-based company Vendor 2. Both companies have their residences at the same address in Pristina.\n\nBoth Vendor 1 and Vendor 2 submitted three times in 2003 for the same tenders:\n\nSupply and Mounting of Sonic System in the Fire Station Building. Winner was Vendor 2 with €1,530 followed by Vendor 1 with €1,620. The third company, Vendor 3, did not provide a price offer.\n\nCabling of Flat Display Information System (FIDS). Winner was Vendor 1 with €15,919 followed by Vendor 2 with €19,248.70. The other two competitors, Vendor 3 and Vendor 4, offered prices of Euro 19,702

In [18]:
sent.polarity_scores(wikidf.iloc[0][1])

  sent.polarity_scores(wikidf.iloc[0][1])


{'neg': 0.013, 'neu': 0.91, 'pos': 0.078, 'compound': 0.9042}

In [19]:
senti_rating = sent.polarity_scores(wikidf.iloc[0][1])

  senti_rating = sent.polarity_scores(wikidf.iloc[0][1])


In [20]:
type(senti_rating)

dict

In [21]:
senti_rating['compound']

0.9042

### assigning scores

In [36]:
score_com = []
score_pos = []
score_neg = []
for i in range(0, wikidf.shape[0]):
    score = sent.polarity_scores(wikidf.iloc[i][1])
    score1 = score['compound']
    score_com.append(score1)

    score2 = score['pos']
    score_pos.append(score2)

    score3 = score['neg']
    score_neg.append(score3)

  score = sent.polarity_scores(wikidf.iloc[i][1])


In [37]:
wikidf["Pos Score"] = score_pos
wikidf["Neg Score"] = score_neg
wikidf["Comp Score"] = score_com

In [38]:
wikidf.shape

(143, 5)

In [39]:
wikidf.head()

Unnamed: 0,PDF Path,Text,Pos Score,Neg Score,Comp Score
0,1.pdf,Pristina Airport – Possible administrative irr...,0.078,0.013,0.9042
1,1.pdf,Investigative details\n\nIn his/her interviews...,0.092,0.092,-0.25
2,10.pdf,"""An interoffice memorandum providing an “outst...",0.0,0.0,0.0
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ...",0.051,0.014,0.6249
4,10.pdf,"""When asked about this in interview, the Divis...",0.056,0.053,0.09


In [40]:
wikidf["Comp Score"].mean()

0.27020139860139863

In [41]:
wikidf['Overall Rating'] = wikidf['Comp Score'].apply(lambda x: 'Positive' if x > 0.5 else 'Negative')

wikidf.head(10)

Unnamed: 0,PDF Path,Text,Pos Score,Neg Score,Comp Score,Overall Rating
0,1.pdf,Pristina Airport – Possible administrative irr...,0.078,0.013,0.9042,Positive
1,1.pdf,Investigative details\n\nIn his/her interviews...,0.092,0.092,-0.25,Negative
2,10.pdf,"""An interoffice memorandum providing an “outst...",0.0,0.0,0.0,Negative
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ...",0.051,0.014,0.6249,Positive
4,10.pdf,"""When asked about this in interview, the Divis...",0.056,0.053,0.09,Negative
5,10.pdf,"""INVESTIGATION DETAILS\n\nThis part of the inv...",0.019,0.017,0.0557,Negative
6,10.pdf,"""At paragraph 4 of the Cargo Apron Extension e...",0.012,0.033,-0.4559,Negative
7,10.pdf,"""Until the end of June 2002, responsibility fo...",0.035,0.0,0.7845,Positive
8,10.pdf,"""METHODOLOGY\n\nThis investigation was conduct...",0.067,0.025,0.7506,Positive
9,10.pdf,"""Allegation 4:\n\n(Specifically, that the Vend...",0.02,0.009,0.2263,Negative


In [42]:
wikidf['Overall Rating'].value_counts()

Overall Rating
Negative    79
Positive    64
Name: count, dtype: int64