In [1]:
## Importing relevant libraries

import pandas as pd
import numpy as np

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch NLTK data packages by name; when a package is already present locally
# the call only reports that it is up-to-date.
# 'vader_lexicon' is presumably the lexicon backing SentimentIntensityAnalyzer
# -- confirm against the NLTK docs.
nltk.download('vader_lexicon')
# NOTE(review): no cell visible in this notebook calls a tokenizer, so 'punkt'
# may be unnecessary here -- confirm before removing.
nltk.download('punkt')

## Stop warnings
# Installs a blanket 'ignore' filter, so every warning category is silenced for
# the rest of the session.
# NOTE(review): this also hides deprecation warnings and pandas warnings that
# can point at real problems -- consider narrowing the filter to a category.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\salma\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
## Loading the data

# The path is written with a forward slash so it resolves on every platform:
# Windows accepts '/' as a separator, whereas a backslash is an ordinary
# file-name character on Linux/macOS and would make the whole string a single
# (non-existent) file name there.
df = pd.read_csv('data/articles_preprocessed.csv')

# Show the (rows, columns) count, then preview the first rows as the cell output.
print(df.shape)
df.head()

(142570, 5)


Unnamed: 0,title,content,author,publication,content_lemmatized
0,house republicans fret winning health care sui...,washington congressional republicans new fear ...,Carl Hulse,New York Times,washington congressional republican new fear c...
1,rift officers residents killings persist south...,bullet shells get counted blood dries votive c...,Benjamin Mueller and Al Baker,New York Times,bullet shell get counted blood dry votive cand...
2,tyrus wong bambi artist thwarted racial bias d...,walt disney bambi opened critics praised spare...,Margalit Fox,New York Times,walt disney bambi opened critic praised spare ...
3,among deaths heavy toll pop music new york times,death may great equalizer necessarily evenhand...,William McDonald,New York Times,death may great equalizer necessarily evenhand...
4,kim jong un says north korea preparing test lo...,seoul south korea north korea leader kim said ...,Choe Sang-Hun,New York Times,seoul south korea north korea leader kim said ...


In [3]:
## Counting the missing entries in each column

missing_per_column = df.isnull().sum()
missing_per_column

title                  37
content               108
author                  0
publication             0
content_lemmatized    108
dtype: int64

In [4]:
## Dropping missing values

# Remove every row that has a missing value in any column, then renumber the
# index from 0 without keeping the old labels. Written as one chained
# expression that rebinds `df` rather than mutating the frame in place with
# inplace=True, so the cleaning step reads as a single transformation.
df = df.dropna().reset_index(drop=True)
print(df.shape)

(142426, 5)


In [5]:
## Function to perform sentiment analysis

# Analyzer shared by every call that does not pass its own. It is created on
# first use instead of once per article, because building a
# SentimentIntensityAnalyzer is loop-invariant setup work that would otherwise
# be repeated for each of the articles being scored.
_SHARED_ANALYZER = None


def analyze_sentiment(article, analyzer=None):
    """Classify a text as positive, negative or neutral from its compound score.

    Parameters
    ----------
    article : str
        Text to score.
    analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method returning a mapping
        that contains a ``'compound'`` entry. When omitted, a single
        ``SentimentIntensityAnalyzer`` is created on the first call and reused
        by later calls.

    Returns
    -------
    tuple
        ``(sentiment, compound_score)``. ``sentiment`` is ``'positive'`` when
        the compound score is >= 0.05, ``'negative'`` when it is <= -0.05 and
        ``'neutral'`` otherwise; ``compound_score`` is the raw compound value.
    """
    global _SHARED_ANALYZER

    if analyzer is None:
        if _SHARED_ANALYZER is None:
            _SHARED_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _SHARED_ANALYZER

    compound_score = analyzer.polarity_scores(article)['compound']

    # Both thresholds are inclusive: exactly +/-0.05 is not neutral.
    if compound_score >= 0.05:
        sentiment = 'positive'
    elif compound_score <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    return sentiment, compound_score

In [6]:
## Performing sentiment analysis

# Score the articles in row order and keep the (label, score) pairs in a list.
# Iterating over the column values is positional, so it does not depend on the
# frame having a 0..n-1 index.
scored = []
for i, article in enumerate(df['content_lemmatized']):
    scored.append(analyze_sentiment(article))

    # Progress marker every 10,000 articles.
    if i % 10000 == 0:
        print(i)

# Assign each result column in one step. Element-wise chained assignment
# (df[col][i] = value) is avoided: pandas may apply it to a temporary copy, and
# with Copy-on-Write enabled it never updates the original frame. Building the
# columns from lists also gives 'sentiment' a string column from the start
# instead of a float column pre-filled with NaN.
df['sentiment'] = [label for label, _ in scored]
df['compound_score'] = [score for _, score in scored]

df.head()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000


Unnamed: 0,title,content,author,publication,content_lemmatized,sentiment,compound_score
0,house republicans fret winning health care sui...,washington congressional republicans new fear ...,Carl Hulse,New York Times,washington congressional republican new fear c...,positive,0.6497
1,rift officers residents killings persist south...,bullet shells get counted blood dries votive c...,Benjamin Mueller and Al Baker,New York Times,bullet shell get counted blood dry votive cand...,negative,-0.9999
2,tyrus wong bambi artist thwarted racial bias d...,walt disney bambi opened critics praised spare...,Margalit Fox,New York Times,walt disney bambi opened critic praised spare ...,positive,0.9888
3,among deaths heavy toll pop music new york times,death may great equalizer necessarily evenhand...,William McDonald,New York Times,death may great equalizer necessarily evenhand...,negative,-0.8609
4,kim jong un says north korea preparing test lo...,seoul south korea north korea leader kim said ...,Choe Sang-Hun,New York Times,seoul south korea north korea leader kim said ...,positive,0.9789


In [7]:
## How many articles received each sentiment label

sentiment_labels = df['sentiment']
sentiment_labels.value_counts()

positive    85085
negative    55802
neutral      1539
Name: sentiment, dtype: int64

In [8]:
## saving the data

# Forward slash keeps the output path portable: Windows accepts it, while a
# backslash on Linux/macOS would create a file literally named
# 'data\articles_sentiment.csv' in the working directory.
# index=False leaves the row index out of the written file.
df.to_csv('data/articles_sentiment.csv', index=False)