In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
!pip install NRCLex

In [None]:
#load all required packages
#general data handling and processing
import pandas as pd
import numpy as np
#text data processing and sentiment analysis tools
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# single VADER analyzer instance, reused further down to score every headline
sia = SentimentIntensityAnalyzer()
from nrclex import NRCLex

#visualization
import matplotlib.pyplot as plt



In [None]:
#read the ABC news headlines csv from the Kaggle input directory into a dataframe
abc = pd.read_csv("/kaggle/input/million-headlines/abcnews-date-text.csv")

In [None]:
#overview of the dataset: columns, dtypes and non-null counts
abc.info()
#number of missing values in each column
abc.isna().sum()

In [None]:
#publish_date is stored as an integer (YYYYMMDD), so parse it into a proper datetime column
parsed_dates = pd.to_datetime(abc['publish_date'], format='%Y%m%d')
abc['publish_date'] = parsed_dates

### NLTK VADER

First, I am using the NLTK VADER package to perform a sentiment analysis and plot the results to see how sentiment changes over the years. 
According to [Schumacher (2019)](https://opendatagroup.github.io/data%20science/2019/03/21/preprocessing-text.html#:~:text=The%20general%20rule%20for%20whether,improve%20performance%2C%20do%20not%20lemmatize.&text=For%20example%2C%20a%20popular%20sentiment,not%20be%20stemmed%20or%20lemmatized), "VADER, has different ratings depending on the form of the word and therefore the input should not be stemmed or lemmatized." I decided not to stem or lemmatize the words for this sentiment analysis. 

I will be using threshold values of -0.05 and 0.05, based on the "About the Scoring" section on [this github page](https://github.com/cjhutto/vaderSentiment). If the compound score is 0.05 or higher, the headline will be classified as positive; if it is -0.05 or lower, it will be classified as negative; anything in between is neutral. 

In [None]:
#VADER polarity scores: one dict of scores per headline
abc['senti_score'] = abc['headline_text'].map(sia.polarity_scores)

#pull the compound score out of each dict into its own column
abc['compound'] = abc['senti_score'].map(lambda scores: scores['compound'])

#map a compound score to its sentiment label using the +/-0.05 thresholds
def _to_label(c):
    if c >= 0.05:
        return 'positive'
    if c > -0.05:
        return 'neutral'
    return 'negative'

abc['senti_label'] = abc['compound'].map(_to_label)

#how many headlines fall under each sentiment label
abc['senti_label'].value_counts()

The above result shows that nearly half of the news headlines are neutral. In the other half, more headlines are identified as negative than as positive. 

#### Visualizations of sentiments over the years

In [None]:
#calculate the average compound scores per month and per year respectively
#numeric_only=True restricts the mean to the numeric columns; without it pandas >= 2.0
#raises a TypeError because abc also holds text/dict columns (headline_text, senti_score, senti_label)
yearly_averages = abc.resample('A',on='publish_date').mean(numeric_only=True)
monthly_averages = abc.resample('M',on='publish_date').mean(numeric_only=True)

In [None]:
#first five rows of the monthly averages
monthly_averages.head()

In [None]:
#plot the yearly (dashed) and monthly mean VADER compound scores on one figure
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(
    yearly_averages.index,
    yearly_averages['compound'],
    color='olive',
    linewidth=2,
    linestyle='dashed',
    label='Yearly mean compound scores',
)
ax.plot(
    monthly_averages.index,
    monthly_averages['compound'],
    color='blue',
    linewidth=2,
    label='Monthly mean compound scores',
)
ax.legend()
plt.show()

This graph indicates that there might be a cycle in the sentiment of news headlines, as there are two peaks (around 2003 and 2015) and two troughs (in 2010 and at the end of 2019). 
It's worth looking into the topics in those years to understand the peaks and troughs in sentiment. There could have been major events around those times that made the media change their sentiment. 
 

### [NRC Emotion Lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm#:~:text=The%20NRC%20Emotion%20Lexicon%20is,sentiments%20(negative%20and%20positive).)

Next, I will be using the [NRCLex package](https://pypi.org/project/NRCLex/) to further detect emotions in news headlines. 

In [None]:
#function to retrieve nrc affect frequencies
def emotion_freq(headline):
    """Return the NRC affect frequencies of one headline as a dict.

    The result always contains the ten standard NRC keys (defaulting to 0.0);
    any additional key reported by NRCLex is added after them.
    """
    totals = dict.fromkeys(
        ('anger', 'fear', 'negative', 'positive', 'sadness',
         'trust', 'anticipation', 'joy', 'disgust', 'surprise'),
        0.0,
    )
    for emotion, share in NRCLex(headline).affect_frequencies.items():
        totals[emotion] = totals.get(emotion, 0.0) + share
    return totals

#function to calculate word count in each headline
def word_count(row):
    """Return the number of tokens nltk.word_tokenize finds in the given text."""
    return len(nltk.word_tokenize(row))

In [None]:
#start a separate dataframe for the NRC analysis from the first two columns only (no vader columns)
abc_nrc = abc.iloc[:, :2].copy()

In [None]:
#compute the dict of affect frequencies for every headline
abc_nrc['emo_freq'] = abc_nrc['headline_text'].map(emotion_freq)

In [None]:
#preview the first rows, now including the emo_freq column
abc_nrc.head(5)

In [None]:
#extract out the emotions to new columns for further analysis
#the frame is built once from the list of dicts; apply(pd.Series) would instead create
#one Series per row, which is very slow on a large dataset
emotion_columns = pd.DataFrame(abc_nrc['emo_freq'].tolist(), index=abc_nrc.index)
abc_nrc = pd.concat((abc_nrc.drop(['emo_freq'],axis=1), emotion_columns), axis=1)

In [None]:
#number of tokens in every headline
abc_nrc['word_count'] = abc_nrc['headline_text'].map(word_count)

In [None]:
#preview the first rows with the emotion and word_count columns
abc_nrc.head(5)

In [None]:
#scale every emotion frequency by the word count of its headline (row-wise division)
emotions = ['anger','fear','negative','positive','sadness','trust','anticipation','joy','disgust','surprise']
abc_nrc[emotions] = abc_nrc[emotions].div(abc_nrc['word_count'], axis=0)

In [None]:
#preview of the dataframe after normalization
abc_nrc.head(5)

In [None]:
#average emotion scores per year and per month
#numeric_only=True keeps the mean to the numeric columns; without it pandas >= 2.0
#raises a TypeError because abc_nrc still contains the headline_text column
nrc_yearly_averages = abc_nrc.resample('A',on='publish_date').mean(numeric_only=True)
nrc_monthly_averages = abc_nrc.resample('M',on='publish_date').mean(numeric_only=True)

In [None]:
#one figure per emotion: yearly averages (dashed) drawn together with the monthly averages
for emotion in emotions:
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(
        nrc_yearly_averages.index,
        nrc_yearly_averages[emotion],
        color='orange',
        linewidth=2,
        linestyle='dashed',
        label='Yearly average scores',
    )
    ax.plot(
        nrc_monthly_averages.index,
        nrc_monthly_averages[emotion],
        color='tab:blue',
        linewidth=2,
        label='Monthly average scores',
    )
    ax.set_title('{} Sentiment of ABC News Headlines'.format(emotion.title()), fontsize=15)
    ax.legend()
    plt.show()

The anger, fear and negative scores are all trending downward over the years. 

In [None]:
#stacked area chart of the yearly average score of every emotion
palette = (
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
)
x = nrc_yearly_averages.index
y = [nrc_yearly_averages[emotion].tolist() for emotion in emotions]
fig, ax = plt.subplots(figsize=(20, 10))
ax.stackplot(x, y, colors=palette, labels=emotions)
ax.legend()
plt.show()