In [2]:
import pandas as pd
import numpy as np
import string as st
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

import matplotlib.pyplot as plt

lemmatizer = WordNetLemmatizer()
sent = SentimentIntensityAnalyzer() 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mani7\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [42]:
newsdf = pd.read_excel('C:/Users/mani7/Desktop/smu datathon/news_excerpts_parsed.xlsx')
wikidf = pd.read_excel('C:/Users/mani7/Desktop/smu datathon/wikileaks_parsed.xlsx')

In [38]:
newsdf.head(10)

Unnamed: 0,Link,Text
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...
5,https://www.euronews.com/2024/02/23/judge-conv...,"After twelve days of deliberation, a Vienna co..."
6,https://edition.cnn.com/2022/07/21/economy/chi...,China’s cyberspace regulator fined Didi Global...
7,https://www.brusselstimes.com/justice-belgium/...,"On Thursday, the Brussels Criminal Court’s pro..."
8,https://www.expats.cz/czech-news/article/forme...,The Prague 3 District Court this morning sente...
9,https://www.thelocal.dk/20240311/british-trade...,"Sanjay Shah, who was arrested in June 2022 in ..."


In [15]:
wikidf.head(10)

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,1.pdf,Investigative details\n\nIn his/her interviews...
2,10.pdf,"""An interoffice memorandum providing an “outst..."
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ..."
4,10.pdf,"""When asked about this in interview, the Divis..."
5,10.pdf,"""INVESTIGATION DETAILS\n\nThis part of the inv..."
6,10.pdf,"""At paragraph 4 of the Cargo Apron Extension e..."
7,10.pdf,"""Until the end of June 2002, responsibility fo..."
8,10.pdf,"""METHODOLOGY\n\nThis investigation was conduct..."
9,10.pdf,"""Allegation 4:\n\n(Specifically, that the Vend..."


In [20]:
len(newsdf)

1509

In [21]:
len(wikidf)

143

### checking for empty cells in datasets

In [17]:
empty_columns = newsdf.isnull().all(axis=0)
print(empty_columns)

Link    False
Text    False
dtype: bool


In [23]:
empty_columnss = wikidf.isnull().all(axis=0)
print(empty_columnss)

PDF Path    False
Text        False
dtype: bool


### hence, both datasets hv no empty cells in the both columns

# pre-processing text in newsdf dataset

In [39]:
print("Original text (Text) for the first row:")
print("")
print(newsdf['Text'].iloc[0])  

Original text (Text) for the first row:

Starbucks violated federal labor law when it increased wages and offered new perks and benefits only to non-union employees, a National Labor Relations Board judge found Thursday.

The decision is the latest in a series of NLRB rulings finding that Starbucks has violated labor law in its efforts to stop unions from forming in its coffee shops.

“The issue at the heart of this case is whether, under current Board law, [Starbucks] was entitled to explicitly reward employees,” for not participating in union activity, “while falsely telling its workers that the federal labor law forced it to take this action,” wrote administrative law judge Mara-Louise Anzalone. “It was not.”


In [54]:
def preprocess_text(text):
    # Remove punctuation except for spaces
    text = "".join([ch for ch in text if ch not in st.punctuation])
    
    # Tokenize and convert to lowercase
    tokens = re.split(r'\s+', text.lower())
    
    # Keep "number + unit" patterns together
    combined_tokens = []
    i = 0
    while i < len(tokens):
        # Check for "number or currency + unit"
        if i + 1 < len(tokens) and re.match(r'^(€|\$|£)?\d+(\.\d+)?$', tokens[i]) and tokens[i + 1] in {"million", "billion", "thousand"}:
            combined_tokens.append(f"{tokens[i]} {tokens[i + 1]}")
            i += 2  # Skip the next token as it is already combined
        else:
            combined_tokens.append(tokens[i])
            i += 1
    
    # Remove stopwords and lemmatize
    custom_stopwords = set(stopwords.words('english')).union({'said', 'also', 'one', 'mr', 'per', 'u', '–', 'm'})
    filtered_tokens = [word for word in all_tokens if word not in custom_stopwords]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return lemmatized_tokens

In [55]:
newsdf['cleaned text'] = newsdf['Text'].apply(lambda x: preprocess_text(x))
newsdf.head()

Unnamed: 0,Link,Text,cleaned text
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,"[starbucks, violated, federal, labor, law, inc..."
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,"[starbucks, violated, federal, labor, law, inc..."
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,"[starbucks, violated, federal, labor, law, inc..."
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,"[starbucks, violated, federal, labor, law, inc..."
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,"[starbucks, violated, federal, labor, law, inc..."


In [56]:
print("Text after cleaning for the first row:")
print("")
print(newsdf['cleaned text'].iloc[2]) 

Text after cleaning for the first row:



# checking for common words and phrases

In [60]:
from collections import Counter
from itertools import chain

# Combine all preprocessed tokens into a single list
all_tokens = list(chain.from_iterable(newsdf['cleaned text']))

# Count unigrams
unigram_counts = Counter(all_tokens)

# Extract bigrams
bigrams = list(ngrams(all_tokens, 2))
bigram_counts = Counter(bigrams)

# Extract trigrams
trigrams = list(ngrams(all_tokens, 3))
trigram_counts = Counter(trigrams)

# Display top 10 unigrams, bigrams, and trigrams
print("Top 10 Unigrams:")
for word, count in unigram_counts.most_common(10):
    print(f"{word}: {count}")

print("\nTop 10 Bigrams:")
for bigram, count in bigram_counts.most_common(10):
    print(f"{' '.join(bigram)}: {count}")

print("\nTop 10 Trigrams:")
for trigram, count in trigram_counts.most_common(10):
    print(f"{' '.join(trigram)}: {count}")

Top 10 Unigrams:
year: 890310
singapore: 801279
company: 666978
two: 535695
new: 519096
china: 479862
last: 425538
first: 417993
time: 411957
people: 402903

Top 10 Bigrams:
last year: 173535
united state: 141846
prime minister: 117702
social medium: 98085
national university: 81486
hong kong: 78468
last week: 70923
year ago: 67905
north korea: 63378
south korea: 61869

Top 10 Trigrams:
national university singapore: 30180
prime minister lee: 21126
national university hospital: 21126
social medium platform: 19617
president joe biden: 19617
minister lee hsien: 19617
lee hsien loong: 19617
university singapore nu: 19617
people familiar matter: 18108
united arab emirate: 16599


# sentimental analysis using vader

In [38]:
newsdf["Link"].value_counts()

Link
https://cnalifestyle.channelnewsasia.com/dining/ilmiri-korean-fusion-cuisine-singapore-373046                           2
https://edition.cnn.com/2023/09/29/business/starbucks-union-wages/index.html                                            1
https://cnalifestyle.channelnewsasia.com/dining/simbian-chua-century-bakkwa-coffres-singapore-food-277991               1
https://www.bbc.com/news/uk-66592354                                                                                    1
https://www.bbc.com/news/world-asia-china-66636705                                                                      1
                                                                                                                       ..
https://cnalifestyle.channelnewsasia.com/dining/ingen-kyoto-ion-orchard-teahouse-hvala-382296                           1
https://cnalifestyle.channelnewsasia.com/dining/three-little-coconut-cafe-punggol-376741                                1
https://cnalifestyl

In [53]:
from urllib.parse import urlparse

newsdf['domain'] = newsdf['Link'].apply(lambda x: urlparse(x).netloc)

newsdf.head()

Unnamed: 0,Link,Text,Pos Score,Neg Score,Comp Score,Overall Rating,domain
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,0.086,0.099,-0.3612,Negative,edition.cnn.com
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,0.043,0.166,-0.9442,Negative,www.channelnewsasia.com
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,0.063,0.042,0.2263,Negative,edition.cnn.com
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,0.091,0.148,-0.765,Negative,www.channelnewsasia.com
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,0.178,0.178,-0.4404,Negative,edition.cnn.com


In [54]:
newsdf['domain'].unique()

array(['edition.cnn.com', 'www.channelnewsasia.com', 'www.euronews.com',
       'www.brusselstimes.com', 'www.expats.cz', 'www.thelocal.dk',
       'news.postimees.ee', 'yle.fi', 'icelandmonitor.mbl.is',
       'www.ndtv.com', 'www.straitstimes.com', 'www.reuters.com',
       'www.cbsnews.com', 'www.cnbc.com', 'www.nasdaq.com',
       'www.businesstimes.com.sg', 'apnews.com', 'www.ctvnews.ca',
       'www.hrw.org', 'www.bbc.com', 'uniglobalunion.org',
       'www.koreatimes.co.kr', 'in-cyprus.philenews.com',
       'au.news.yahoo.com', 'www.inqld.com.au', 'www.independent.co.ug',
       'web.archive.org', 'reliefweb.int', 'www.theguardian.com',
       'www.newsweek.com', 'www.africanews.com', 'www.dw.com',
       'www.thenation.com', 'www.nytimes.com', 'www.nbcnews.com',
       'www.sundaytimes.lk', 'www.aljazeera.com', 'www.irishtimes.com',
       'www.washingtonpost.com', 'www.independent.co.uk',
       'cnalifestyle.channelnewsasia.com', 'money.cnn.com',
       'straitstimes.com', '

In [58]:
import tldextract

newsdf['main_domain'] = newsdf['Link'].apply(lambda x: tldextract.extract(x).registered_domain)

newsdf.head()

Unnamed: 0,Link,Text,Pos Score,Neg Score,Comp Score,Overall Rating,domain,main_domain
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,0.086,0.099,-0.3612,Negative,edition.cnn.com,cnn.com
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,0.043,0.166,-0.9442,Negative,www.channelnewsasia.com,channelnewsasia.com
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,0.063,0.042,0.2263,Negative,edition.cnn.com,cnn.com
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,0.091,0.148,-0.765,Negative,www.channelnewsasia.com,channelnewsasia.com
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,0.178,0.178,-0.4404,Negative,edition.cnn.com,cnn.com


In [59]:
newsdf['main_domain'].unique()

array(['cnn.com', 'channelnewsasia.com', 'euronews.com',
       'brusselstimes.com', 'expats.cz', 'thelocal.dk', 'postimees.ee',
       'yle.fi', 'mbl.is', 'ndtv.com', 'straitstimes.com', 'reuters.com',
       'cbsnews.com', 'cnbc.com', 'nasdaq.com', 'businesstimes.com.sg',
       'apnews.com', 'ctvnews.ca', 'hrw.org', 'bbc.com',
       'uniglobalunion.org', 'koreatimes.co.kr', 'philenews.com',
       'yahoo.com', 'inqld.com.au', 'independent.co.ug', 'archive.org',
       'reliefweb.int', 'theguardian.com', 'newsweek.com',
       'africanews.com', 'dw.com', 'thenation.com', 'nytimes.com',
       'nbcnews.com', 'sundaytimes.lk', 'aljazeera.com', 'irishtimes.com',
       'washingtonpost.com', 'independent.co.uk', 'singhealth.com.sg',
       'nuhsplus.edu.sg', 'nationalgeographic.com',
       'nationalgeographic.co.uk', 'todayonline.com', 'time.com',
       'scmp.com', 'foxnews.com', 'taipeitimes.com', 'gulfnews.com'],
      dtype=object)

In [56]:
newsdf.iloc[0][1]

  newsdf.iloc[0][1]


'Starbucks violated federal labor law when it increased wages and offered new perks and benefits only to non-union employees, a National Labor Relations Board judge found Thursday.\n\nThe decision is the latest in a series of NLRB rulings finding that Starbucks has violated labor law in its efforts to stop unions from forming in its coffee shops.\n\n“The issue at the heart of this case is whether, under current Board law, [Starbucks] was entitled to explicitly reward employees,” for not participating in union activity, “while falsely telling its workers that the federal labor law forced it to take this action,” wrote administrative law judge Mara-Louise Anzalone. “It was not.”'

In [25]:
sent.polarity_scores(newsdf.iloc[0][1])

  sent.polarity_scores(newsdf.iloc[0][1])


{'neg': 0.099, 'neu': 0.815, 'pos': 0.086, 'compound': -0.3612}

In [26]:
senti_rating = sent.polarity_scores(newsdf.iloc[0][1])

  senti_rating = sent.polarity_scores(newsdf.iloc[0][1])


In [27]:
type(senti_rating)

dict

In [28]:
senti_rating['compound']

-0.3612

In [44]:
score_com = []
score_pos = []
score_neg = []
for i in range(0, newsdf.shape[0]):
    score = sent.polarity_scores(newsdf.iloc[i][1])
    score1 = score['compound']
    score_com.append(score1)

    score2 = score['pos']
    score_pos.append(score2)

    score3 = score['neg']
    score_neg.append(score3)

  score = sent.polarity_scores(newsdf.iloc[i][1])


In [45]:
newsdf["Pos Score"] = score_pos
newsdf["Neg Score"] = score_neg
newsdf["Comp Score"] = score_com

In [46]:
newsdf.shape

(1509, 5)

In [47]:
newsdf.head()

Unnamed: 0,Link,Text,Pos Score,Neg Score,Comp Score
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,0.086,0.099,-0.3612
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,0.043,0.166,-0.9442
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,0.063,0.042,0.2263
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,0.091,0.148,-0.765
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,0.178,0.178,-0.4404


In [48]:
newsdf["Comp Score"].mean()

0.14752001325381045

In [49]:
newsdf['Overall Rating'] = newsdf['Comp Score'].apply(lambda x: 'Positive' if x > 0.5 else 'Negative')

newsdf.head(10)

Unnamed: 0,Link,Text,Pos Score,Neg Score,Comp Score,Overall Rating
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,0.086,0.099,-0.3612,Negative
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,0.043,0.166,-0.9442,Negative
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,0.063,0.042,0.2263,Negative
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,0.091,0.148,-0.765,Negative
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,0.178,0.178,-0.4404,Negative
5,https://www.euronews.com/2024/02/23/judge-conv...,"After twelve days of deliberation, a Vienna co...",0.09,0.083,-0.296,Negative
6,https://edition.cnn.com/2022/07/21/economy/chi...,China’s cyberspace regulator fined Didi Global...,0.069,0.094,-0.5106,Negative
7,https://www.brusselstimes.com/justice-belgium/...,"On Thursday, the Brussels Criminal Court’s pro...",0.083,0.109,-0.7269,Negative
8,https://www.expats.cz/czech-news/article/forme...,The Prague 3 District Court this morning sente...,0.066,0.181,-0.9345,Negative
9,https://www.thelocal.dk/20240311/british-trade...,"Sanjay Shah, who was arrested in June 2022 in ...",0.067,0.127,-0.8809,Negative


In [50]:
newsdf['Overall Rating'].value_counts()

Overall Rating
Negative    817
Positive    692
Name: count, dtype: int64