<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h1>


In [None]:
import re
import os
import nltk
import string
import random
import warnings
import numpy                               as np
import pandas                              as pd 
import matplotlib.pyplot                   as plt
import seaborn                             as sns
import plotly.express                      as ex
import plotly.graph_objs                   as go
import plotly.offline                      as pyo
import pymc3                               as pm
import theano.tensor                       as T
from plotly.subplots                       import make_subplots
from sklearn.decomposition                 import TruncatedSVD,PCA
from sklearn.feature_extraction.text       import CountVectorizer
nltk.download('vader_lexicon')
from sklearn.cluster                       import KMeans
from nltk.sentiment.vader                  import SentimentIntensityAnalyzer as SIA
from wordcloud                             import WordCloud,STOPWORDS
from pandas.plotting                       import autocorrelation_plot
from statsmodels.graphics.tsaplots         import plot_acf
from statsmodels.graphics.tsaplots         import plot_pacf
from statsmodels.tsa.seasonal              import seasonal_decompose
from statsmodels.tsa.ar_model              import AR,AutoReg
from statsmodels.tsa.statespace.sarimax    import SARIMAX
from nltk.util                             import ngrams
from nltk                                  import word_tokenize
from nltk.stem                             import PorterStemmer
from nltk.stem                             import WordNetLemmatizer
from wordcloud                             import STOPWORDS
from tqdm.notebook                         import tqdm

%matplotlib inline

# Load plotly.js into the notebook once so offline figures render inline.
pyo.init_notebook_mode()
# Register `progress_apply` on pandas objects (used by the sentiment cell).
tqdm.pandas()

# Silence FutureWarnings whose message starts with one of these class paths.
for _deprecated_path in ('statsmodels.tsa.arima_model.ARMA',
                         'statsmodels.tsa.arima_model.ARIMA'):
    warnings.filterwarnings('ignore', _deprecated_path, FutureWarning)

# Global matplotlib / seaborn look and default figure size.
plt.style.use('ggplot')
plt.rc('figure', figsize=(18, 11))
sns.set_context('paper', font_scale=2)

def set_seed(seed=31415):
    """Seed the random number generators used by this notebook.

    Parameters
    ----------
    seed : int, default 31415
        Value handed to NumPy's global generator and to Python's built-in
        ``random`` module, and exported through environment variables.
    """
    np.random.seed(seed)
    # The notebook imports `random`, so seed the stdlib generator as well.
    random.seed(seed)
    # Environment variables are only exported here; setting PYTHONHASHSEED
    # does not change hash randomisation of the interpreter already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed()

<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Loading and Preprocessing</h1>


In [None]:
# Read the headlines, shorten the column names, then collapse the frame to
# one row per publication date whose `text` is all of that day's headlines
# joined with single spaces.
n_data = (
    pd.read_csv('/kaggle/input/million-headlines/abcnews-date-text.csv',
                parse_dates=['publish_date'])
    .rename(columns={'publish_date': 'date', 'headline_text': 'text'})
    .groupby(['date'], as_index=False)
    .agg({'text': ' '.join})
    .drop_duplicates()
)
n_data.head(3)

<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Feature Engineering</h1>


<a id="3"></a>

<h1 style="background-color:x;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Time Series Based Features</h1>


In [None]:
# Calendar features derived from the publication date.
date_parts = n_data['date'].dt
n_data['year'] = date_parts.year
n_data['month'] = date_parts.month
n_data['day'] = date_parts.day
n_data['day_of_year'] = date_parts.dayofyear
n_data['quarter'] = date_parts.quarter
# Maps months to 1..4 in blocks of three, with December grouped with Jan/Feb.
n_data['season'] = n_data['month'] % 12 // 3 + 1

In [None]:
# Pool the daily text per calendar year and per month number
# (the month frame mixes the same month across all years).
yearly, monthly = (
    n_data.groupby([period], as_index=False).agg({'text': ' '.join})
    for period in ('year', 'month')
)


<a id="3"></a>

<h1 style="background-color:x;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Sentiment Analysis via the Vader Lexicon</h1>


In [None]:
# VADER sentiment scoring of each day's text.
sid = SIA()
# Small constant added to every score
# (presumably to keep values strictly positive - TODO confirm).
score_offset = 1 * (10 ** -6)

# Lower-case the text and keep only word tokens before scoring.
n_data['sentiments'] = n_data['text'].progress_apply(
    lambda raw: sid.polarity_scores(' '.join(re.findall(r'\w+', raw.lower())))
)

for column, score_key in (('Positive Sentiment', 'pos'),
                          ('Neutral Sentiment', 'neu'),
                          ('Negative Sentiment', 'neg')):
    n_data[column] = n_data['sentiments'].progress_apply(
        lambda scores, score_key=score_key: scores[score_key] + score_offset
    )

# The intermediate score dictionaries are no longer needed.
n_data = n_data.drop(columns=['sentiments'])


<a id="3"></a>

<h1 style="background-color:x;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Naive Text Features</h1>


In [None]:
# Tokenise on any run of whitespace so repeated spaces do not create empty
# "words" that would inflate the count and drag the mean length down.
word_lists = n_data.text.apply(str.split)

# Number of words in each day's text.
n_data['Number_Of_Words'] = word_lists.apply(len)

# Average word length, rounded to 2 decimals (0.0 when the text has no words).
n_data['Mean_Word_Length'] = word_lists.apply(
    lambda words: np.round(np.mean([len(w) for w in words]), 2) if words else 0.0
)


In [None]:
# Average every numeric feature per year and per month. `numeric_only=True`
# is required because n_data also holds the `text` and `date` columns, which
# cannot be averaged (recent pandas raises instead of dropping them).
yearly_average = n_data.groupby(by='year', as_index=False).mean(numeric_only=True)
monthly_average = n_data.groupby(by='month', as_index=False).mean(numeric_only=True)

# Copy the averaged features onto the pooled-text frames. Assignment aligns
# on the row index; both sides come from a groupby on the same key with
# as_index=False, so row k refers to the same year/month in each frame.
averaged_features = ['Number_Of_Words', 'Mean_Word_Length',
                     'Positive Sentiment', 'Neutral Sentiment', 'Negative Sentiment']
for i in averaged_features:
    yearly[i] = yearly_average[i]
    monthly[i] = monthly_average[i]

<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h1>


In [None]:
# n_data was grouped to one row per publication date in the loading cell, so
# counting rows per year gives the number of days with headlines in that
# year, not the number of individual headlines. Sort by year for a
# chronological bar order.
yearly_d = n_data.year.value_counts().sort_index()

fig = go.Figure()
fig.add_trace(go.Bar(x=yearly_d.index, y=yearly_d.values, name='Days With Headlines'))
fig.update_layout(title='Number of Days With Headlines per Year')
fig.show()

**Observation**: Looking at the yearly counts, we see a close-to-uniform distribution, i.e., each year contributes about the same number of rows. Note that `n_data` was aggregated to one row per publication date when it was loaded, so these counts are days with headlines rather than individual headlines. The even coverage still allows us to compare and analyze quantitative features derived from the text between the years without considering sample size inequality.

In [None]:
# Per-year word frequencies (stop words removed) and the three most common
# (word, count) pairs of every year, split by rank.
yearly_n1_word, yearly_n2_word, yearly_n3_word = [], [], []
yearly_word_freqs = []

for year in tqdm(yearly.year):
    year_text = ' '.join(yearly.query(f'year=={year}').text)
    freq = nltk.FreqDist(tok for tok in year_text.split() if tok not in STOPWORDS)
    yearly_word_freqs.append(freq)

    top_three = freq.most_common(3)
    for rank, rank_list in enumerate((yearly_n1_word, yearly_n2_word, yearly_n3_word)):
        rank_list.append(top_three[rank])
    

In [None]:
# One stacked panel per rank: the count of the rank-k word for each year,
# annotated with the word itself.
fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

ranked_lists = [yearly_n1_word, yearly_n2_word, yearly_n3_word]
for rank, ranked_words in enumerate(ranked_lists, start=1):
    words = [pair[0] for pair in ranked_words]
    counts = [pair[1] for pair in ranked_words]
    fig.add_trace(
        go.Scatter(
            x=yearly.year,
            y=counts,
            text=words,
            mode="lines+markers+text",
            textposition="top center",
            name=f'#{rank} Most Used Word',
        ),
        row=rank,
        col=1,
    )

fig.update_layout(
    height=700,
    width=900,
    title_text="Yearly Top 3 Most Used Word",
    # Pin the legend to the top-left corner of the figure.
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

fig.show()

**Observation**: Looking at the yearly top three words, we can observe the most recurrent "topics". Also, as we go down the word ranks, we get a more "distilled" understanding of some of the main events discussed in a given year.

In [None]:
# Words treated as "disaster" indicators.
disaster_tags = ['earthquake','death','killed','tornado','hurricane','flood','fire','dead','epidemic','wildfire',
                 'drought','terrorism','landslide','flu','virus']

# For every year, add up how often any of the tags occurs in that year's
# word-frequency table (a missing word counts as 0).
yearly_disaster = [
    sum(year_freq[tag] for tag in disaster_tags)
    for year_freq in yearly_word_freqs
]

fig = go.Figure(
    data=[go.Scatter(x=yearly.year, y=yearly_disaster, name='Number of Disaster Tags')]
)
fig.update_layout(title='Yearly Number of Disaster Related Words Used')
fig.show()

**Explanation**: The above data was derived by counting, for each year, the number of "disaster"-related terms appearing in the news headlines; the list of terms used includes the following words: 'earthquake', 'death', 'killed', 'tornado', 'hurricane', 'flood', 'fire', 'dead', 'epidemic', 'wildfire', 'drought', 'terrorism', 'landslide', 'flu', 'virus'.


**Observation**: From the above graph, we learn something very fascinating considering the situation the world is in right now with the coronavirus.
Apparently, there were much worse years in terms of "disasters"; as an insight, it is fascinating and gives us some perspective when we joke about "2020 being the worst year".

<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Time Based Analysis</h1>


In [None]:

# Density (top) and cumulative distribution (bottom) of the three daily
# sentiment scores. The data are news headlines, so the titles say so.
sentiment_columns = ['Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment']
panels = [
    (False, 'Distribution Of Sentiments Across Our Headlines'),
    (True, 'CDF Of Sentiments Across Our Headlines'),
]

for position, (cumulative, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 1, position)
    plt.title(panel_title, fontsize=19, fontweight='bold')
    for column in sentiment_columns:
        sns.kdeplot(n_data[column], cumulative=cumulative, label=column, lw=2.5)
    if cumulative:
        # Only the bottom panel carries the explicit x-axis label.
        plt.xlabel('Sentiment Value', fontsize=19)
    plt.legend()

plt.tight_layout()
plt.show()

**Observation**: Looking at the distributions of sentiments across our data, we see very distinct differences between the means and standard deviations of the three distributions.
We see that most of the headlines have a high neutral sentiment percentage, and it is mostly accompanied by negative sentiment percentages, which makes sense considering that the news rarely tells us positive things; this claim can be strengthened by looking at the tight distribution of our positive sentiment percentage around zero.

In [None]:
# The daily traces put real timestamps on the x axis, so the yearly means
# need timestamps too: bare integers such as 2003 are not read as calendar
# years on a date axis. Each yearly mean is placed at January 1st of its year.
yearly_x = pd.to_datetime(yearly['year'].astype(str), format='%Y')

fig = go.Figure()
for sentiment in ('Positive Sentiment', 'Negative Sentiment'):
    fig.add_trace(go.Scatter(x=n_data.date, y=n_data[sentiment], name=sentiment))
    fig.add_trace(go.Scatter(x=yearly_x, y=yearly[sentiment], name=f'Yearly Mean {sentiment}'))

fig.update_layout(title='Daily Sentiments Throughout Our Time Line')
fig.show()

**Observation**: From a brief look at the average progression of our sentiments through time, we see that overall the sentiments are stationary, excluding a brief period where there is a change in dominance between the negative and positive sentiments - apparently, 2014 is considered a bit more positive in comparison to the following years.

In [None]:
# Split the daily series into three consecutive, non-overlapping partitions
# and summarise positive/negative sentiment in each one.
f_data = n_data

# NOTE(review): the first 44 rows are excluded from partition 1; the reason
# is not documented in this notebook - confirm.
FIRST_ROW = 44

# Integer boundaries shared by neighbouring partitions, so no row can fall
# between two partitions and no float is used as an index bound. Up to two
# trailing rows (len % 3) are left out.
# Positional slicing assumes the default 0..n-1 row index.
third = len(f_data) // 3
partition_bounds = [(FIRST_ROW, third), (third, 2 * third), (2 * third, 3 * third)]
partitions = [f_data.iloc[start:stop, :] for start, stop in partition_bounds]

neg_part_means = [part['Negative Sentiment'].mean() for part in partitions]
neg_part_std   = [part['Negative Sentiment'].std() for part in partitions]
pos_part_means = [part['Positive Sentiment'].mean() for part in partitions]
pos_part_std   = [part['Positive Sentiment'].std() for part in partitions]

# One row per partition, one column per summary statistic.
res_df = pd.DataFrame(
    {
        'Positive Sentiment Mean': pos_part_means,
        'Negative Sentiment Mean': neg_part_means,
        'Positive Sentiment SD': pos_part_std,
        'Negative Sentiment SD': neg_part_std,
    },
    index=[f'Partition_{i}' for i in range(1, 4)],
)


def highlight_greater(x):
    """Return a frame of CSS strings comparing Partition_1 with Partition_2.

    For both the ``Mean`` and the ``SD`` column pair, values are rounded to
    whole numbers and each row is coloured in both columns of the pair:

    * ``mediumseagreen`` - the rounded values are equal,
    * ``gold``           - they differ by less than 3,
    * ``salmon``         - otherwise.

    Every other column (e.g. the Partition_3 ones) keeps the empty
    ``'background-color: '`` style. Intended for
    ``DataFrame.style.apply(..., axis=None)``.

    NOTE(review): because of the whole-number rounding, inputs that differ
    only in their fractional part compare as equal - confirm this is the
    intended behaviour for sentiment scores.
    """
    rounded = x.copy().round(0).astype(int)
    styles = pd.DataFrame('background-color: ', index=x.index, columns=x.columns)

    for stat in ('Mean', 'SD'):
        first_col = 'Partition_1_{}'.format(stat)
        second_col = 'Partition_2_{}'.format(stat)

        gap = rounded[first_col] - rounded[second_col]
        is_equal = (gap == 0).to_numpy()
        is_close = ((gap < 3) & (gap > -3)).to_numpy()

        # Equality wins over closeness, closeness wins over "different".
        colours = np.where(is_equal, 'mediumseagreen',
                           np.where(is_close, 'gold', 'salmon'))
        css = ['background-color: {}'.format(colour) for colour in colours]

        styles[first_col] = css
        styles[second_col] = css

    return styles



# Reshape the summary to one row per sentiment with the three partition
# means followed by the three partition SDs, then colour it.
# After transposing, the rows are (positive mean, negative mean,
# positive SD, negative SD) and the columns are the three partitions.
stat_rows = res_df.T
sentiment_labels = ['Positive Sentiment', 'Negative Sentiment']

mean_block = stat_rows.iloc[:2, :].copy()
sd_block = stat_rows.iloc[2:, :].copy()
# Give both halves the same row labels so they line up side by side.
mean_block.index = sentiment_labels
sd_block.index = sentiment_labels

res_df = pd.concat([mean_block, sd_block], axis=1)
res_df.columns = [f'Partition_{part}_{stat}' for stat in ('Mean', 'SD') for part in (1, 2, 3)]
res_df.style.apply(highlight_greater, axis=None)

In [None]:
# One row per partition: positive sentiment on the left, negative on the right.
fig = make_subplots(rows=3, cols=2)

for row, prt in enumerate(partitions, start=1):
    for col, polarity in enumerate(('Positive', 'Negative'), start=1):
        fig.add_trace(
            go.Scatter(
                x=prt['date'],
                y=prt[f'{polarity} Sentiment'],
                name=f'{polarity} Part {row}',
            ),
            row=row,
            col=col,
        )

fig.update_layout(height=600, width=900, title_text="Distibution Of Daily Sentiments Over Our Time Line For Each Partition")
fig.show()

In [None]:
# Additive decomposition of the yearly mean sentiments: one column per
# sentiment, one row per component (observed, trend, seasonal, residual).
fig = make_subplots(rows=4, cols=2, subplot_titles=('Observed Pos', 'Observed Neg', 'Trend Pos','Trend Neg','Seasonal Pos','Seasonal Neg','Residual Pos','Residual Neg'))

# `scope` and `b_date_mean` are reused by later cells.
scope = 'year'
b_date_mean = yearly

lbl = ['Positive','Negative']

for idx, column in enumerate(['Positive Sentiment','Negative Sentiment']):
    # NOTE(review): period=5 assumes a 5-step cycle in the yearly series - confirm.
    res = seasonal_decompose(b_date_mean[column], period=5, model='additive', extrapolate_trend='freq')

    components = [
        ('Observed', res.observed),
        ('Trend', res.trend),
        ('Seasonal', res.seasonal),
        ('Residual', res.resid),
    ]
    for row, (component_name, series) in enumerate(components, start=1):
        fig.add_trace(
            go.Scatter(
                x=b_date_mean[scope],
                y=series,
                name='{} {}'.format(lbl[idx], component_name),
            ),
            row=row,
            col=idx + 1,
        )

fig.update_layout(height=600, width=900, title_text="Decomposition Of Our Yearly Average Sentiments into Trend,Level,Seasonality and Residuals")
fig.show()

**Observation**: Decomposing our yearly sentiments into trend, seasonality, and residual components, we see that there is a weak trend in the positive sentiment, which indicates a decrease in overall positivity as we progress in time. That is a bit sad as an insight, but it may not be as bad as we think: at the same time there is a drastic decrease in negative sentiment, and because of the properties of sentiment percentages we know that all those proportions in turn go to the neutral sentiment, which in my opinion is the default for news headlines.

In [None]:
# Autocorrelation of the yearly mean sentiments, one panel per polarity.
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 10))

panel_specs = [('Positive', {}), ('Negative', {'color': 'tab:red'})]
for axis, (polarity, line_kwargs) in zip(ax, panel_specs):
    axis.set_title(f'{polarity} Autocorrelation Analysis ', fontsize=18, fontweight='bold')
    autocorrelation_plot(b_date_mean[f'{polarity} Sentiment'], ax=axis, lw=3, **line_kwargs)

plt.tight_layout()
plt.show()

In [None]:
# Partial autocorrelation (up to lag 5) of the yearly mean sentiments:
# negative sentiment on top, positive below.
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 10))
for axis in ax:
    # Same y-range on both panels; correlations lie within [-1, 1].
    axis.set_ylim(-1.1, 1.1)

plot_pacf(
    b_date_mean['Negative Sentiment'],
    lags=5,
    ax=ax[0],
    title='Partial Autocorrelation Negative Sentiment',
)
plot_pacf(
    b_date_mean['Positive Sentiment'],
    lags=5,
    ax=ax[1],
    color='tab:blue',
    title='Partial Autocorrelation Positive Sentiment',
)
plt.show()

**Observation**: Interestingly, we see a significant correlation between a given year's sentiment and the sentiment at lag one (the previous year's); such an insight sparks a question: can we use a relatively simple model to capture this connection? Well, we will have to investigate!

<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Modeling Future Positive Sentiment</h1>



In [None]:
# Fit a SARIMAX model with order (1, 0, 1) to the yearly mean positive sentiment.
# NOTE(review): the variable name `arma_5` does not match the (1, 0, 1) order.
positive_sentiment_model = SARIMAX(
    endog=b_date_mean['Positive Sentiment'],
    order=(1, 0, 1),
)
arma_5 = positive_sentiment_model.fit()


In [None]:
# Draw the fitted model's diagnostic panels on a 16x9 canvas, using 2 lags.
diagnostics_canvas = plt.figure(figsize=(16, 9))
fig = arma_5.plot_diagnostics(fig=diagnostics_canvas, lags=2)

In [None]:
# Model predictions against the observed yearly positive sentiment.
# predict(1) is called with 1 as its first argument, presumably the start
# position - in that case the first observation has no prediction and shows
# up as NaN once both series are aligned in `output`.
predicted_AR_1 = arma_5.predict(1)

output = pd.DataFrame({'Prediction':predicted_AR_1,'Actual':b_date_mean['Positive Sentiment']})

residuals = output["Prediction"] - output["Actual"]
mean_residual = residuals.mean()
n_points = len(output["Prediction"])

# Layout: a table spanning the whole left column, three stacked scatter
# panels (actual, predictions, residuals) on the right.
fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=('','Actual','Predictions','Residuals'),
    vertical_spacing=0.09,
    specs=[
        [{"type": "table", "rowspan": 3}, {"type": "scatter"}],
        [None, {"type": "scatter"}],
        [None, {"type": "scatter"}],
    ],
)

right_panels = [
    (b_date_mean[scope], output["Actual"]),
    (b_date_mean[scope], output["Prediction"]),
    # Residuals are plotted against the row position, not the year.
    (np.arange(0, n_points), residuals),
]
for row, (x_values, y_values) in enumerate(right_panels, start=1):
    fig.add_trace(
        go.Scatter(x=x_values, y=y_values, mode="lines+markers"),
        row=row, col=2,
    )

fig.add_trace(
    go.Table(
        header=dict(values=['Prediction','Actual'], font=dict(size=10), align="left"),
        cells=dict(values=[output[k].tolist() for k in output.columns], align="left"),
    ),
    row=1, col=1,
)

# Horizontal dash-dot line at the mean residual, drawn on the third
# scatter panel (axes x3 / y3).
fig.add_shape(
    type="line",
    x0=0, y0=mean_residual,
    x1=n_points, y1=mean_residual,
    line=dict(color="Red", width=2, dash="dashdot"),
    name='Mean',
    xref='x3',
    yref='y3',
)

fig.update_layout(height=800, showlegend=False, title_text="Prediction Evaluation")

fig.show()

**Observation**: We see that using a simple ARMA(1,1) model - one autoregressive term and one moving-average term based on the first lag of our data, with no differencing (the integration order is 0) - we get fairly good results!


<a id="3"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Text Decomposition and Analysis</h1>



In [None]:
# Word, bigram and trigram counts over the whole corpus.
# All yearly text is joined into one string, then stop words and tokens
# that are not purely alphabetic are dropped.
june_text = ' '.join(yearly.text)
l_t = ' '.join(tok for tok in june_text.split(' ') if tok.isalpha() and tok not in STOPWORDS)

# Unigram counts ordered from most to least frequent (ties keep first-seen order).
w1_dict = dict(
    sorted(nltk.FreqDist(l_t.split()).items(), key=lambda pair: pair[1], reverse=True)
)

w2_dict = dict()

top_10_w1 = list(w1_dict)[:10]

# Tokenise once; the bigrams and trigrams are built from the same tokens.
token = nltk.word_tokenize(l_t)

# Keep only trigrams whose first word is one of the ten most frequent words.
trigram = [gram for gram in ngrams(token, 3) if gram[0] in top_10_w1]

bigram = ngrams(token, 2)
bigram_dict = dict(nltk.FreqDist(bigram))
trigram_dict = dict(nltk.FreqDist(trigram))

# The first 15 distinct trigrams in order of first appearance (not the 15 most frequent).
tri_gram = pd.DataFrame(list(trigram_dict)[:15], columns=['One Of Top 10 Words','Second Word','Third Word'])

def get_prob(sir):
    """Relative frequency of a trigram's third word given its first two words.

    `sir` is one row of `tri_gram`. The result is the count of the full
    trigram divided by the count of its leading bigram, both read from the
    module-level `trigram_dict` and `bigram_dict`.
    """
    first, second, third = (
        sir[column] for column in ('One Of Top 10 Words', 'Second Word', 'Third Word')
    )
    return trigram_dict[(first, second, third)] / bigram_dict[(first, second)]


tri_gram['Probabilty Of Sentence'] = tri_gram.apply(get_prob, axis=1)

tri_gram.style.background_gradient(cmap='coolwarm')

In [None]:
# Bag-of-words representation of the daily text, reduced with truncated SVD.
NUMBER_OF_COMPONENTS=400

CV = CountVectorizer()
svd = TruncatedSVD(NUMBER_OF_COMPONENTS)

c_matrix = CV.fit_transform(n_data.text)

dec_matrix = svd.fit_transform(c_matrix)
dec_df=pd.DataFrame(dec_matrix,columns=['PC_{}'.format(i) for i in range(1,NUMBER_OF_COMPONENTS+1)])

ex_var = svd.explained_variance_ratio_
variance_cum = np.cumsum(ex_var)

# Number the components from 1 so they match the PC_1..PC_n column names and
# the cumulative curve reads "variance explained by the first k components".
component_numbers = np.arange(1, len(variance_cum) + 1)

data = [go.Scatter(x=component_numbers,y=variance_cum,name='Cumulative Explained Variance',mode='lines+markers'),
        go.Scatter(x=component_numbers,y=ex_var,name='Explained Variance',mode='lines+markers')]
# The quantity being varied is the number of SVD components, not words.
layout = dict(title='Explained Variance Ratio Using {} Components'.format(NUMBER_OF_COMPONENTS),
             xaxis_title='# Components',yaxis_title='Explained Variance',height=650,width=900)
fig = go.Figure(data=data,layout=layout)
fig.update_layout(template='seaborn')
fig.show()

**Observation**: After representing our data in a bag-of-words representation, we see that we need a hefty number of components in order to explain a mere 70% of the variance in the original domain; such an indicator teaches us how extreme the variation in the text is in the original domain.


In [None]:
# First two SVD components plus the per-day features used to colour the
# scatter plots; the feature columns are matched to the rows by index.
dec_DF = dec_df[['PC_1','PC_2']].copy()
for feature in ('year', 'Number_Of_Words', 'Positive Sentiment', 'Negative Sentiment'):
    dec_DF[feature] = n_data[feature]

ex.scatter(
    dec_DF,
    x='PC_1',
    y='PC_2',
    color='year',
    title='R^2 Representation of Headlines Colored by Year',
)

**Observation**: In the above plot, we see that there are distinct clusters in the original domain of our data which are clearly visible when projected to R^2, and what unites those clusters is the year of the headline publication.

In [None]:
# Same projection as the previous scatter, coloured by each row's word count.
ex.scatter(
    dec_DF,
    x='PC_1',
    y='PC_2',
    color='Number_Of_Words',
    title='R^2 Representation of Headlines Colored by Number of Words',
)

**Observation**: The distinct clustering we observed in the previous plot may be partially due to the average yearly headline length, which, as can be seen in the current plot, also shows a distinct pattern.