# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import math
import calendar

from scipy.stats import boxcox, yeojohnson

In this section, I looked for trends, patterns and possible predictors of article popularity. I also split my data into a train and test dataset. My train dataset contains articles from January 1 to September 30, and my test dataset contains articles from October 1 to December 31. I did this EDA with the goal of informing my modelling efforts to predict whether an article is popular (has more than 90 comments).

In [None]:
# Load the train dataset; the `keywords` column is stored as text, so a converter
# turns each cell back into a Python list while reading.
# NOTE(review): `eval` executes whatever expression the CSV cell contains -- acceptable
# only because this file is trusted; `ast.literal_eval` would be the safer converter.
train = pd.read_csv('/kaggle/input/new-york-times-articles-comments-2020/train.csv', converters={'keywords': eval})

### Initial Analysis

In [None]:
# Our train dataset has 12,792 articles
train.shape

In [None]:
# Our train dataset has 12 features: a mix of ordinal, continuous and text-based columns.
train.columns

In [None]:
train.head(3)

In [None]:
# Cardinality of the categorical columns: 62 unique newsdesks, 41 sections, and 61 subsections
tuple(train[column].nunique() for column in ('newsdesk', 'section', 'subsection'))

#### Checking for Null Values
Almost 2/3 of our articles don't have subsections. It's a pretty important predictor for article popularity, so we won't drop it. There are three articles that don't have abstracts -- we'll fill these in with an empty string.

In [None]:
# Count nulls per column once, then show only the columns that actually contain any
null_counts = train.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Replace missing abstracts with an empty string so the column holds only strings
train['abstract'] = train['abstract'].fillna('')

#### Checking for Duplicates
There are a couple of duplicated headlines in our data. These are likely recurring weekly or monthly articles.

In [None]:
# Checking for duplicated values
train['headline'].duplicated().sum()

In [None]:
train[train['headline'].duplicated()]['headline'].value_counts().head(5)

In [None]:
# Articles with duplicate headlines are generally pretty unpopular.
# `duplicated()` already yields a boolean mask, so it is used directly (no `== True`).
train.loc[train['headline'].duplicated(), 'is_popular'].value_counts()

## Popularity vs Number of Comments
There's a large group of articles that have fewer than 90 comments -- this is where I chose to split the data. We can see that `n_comments` has a heavy positive skew, with the number of articles falling as the number of comments rises. You can change this into a binary classification problem by using the following code: `train['is_popular'] = train['n_comments'].apply(lambda x: 1 if x > 90 else 0)`. 

Of course, if you want to do this, you'll need to drop `n_comments` at some point.

It's important to note here that not all NYT articles are open for comments. The NYT moderation team chooses articles to open for public commentary. Our data only reflects articles that were opened for commentary AND received at least one comment.

In [None]:
# Average number of comments
train['n_comments'].mean()

In [None]:
# Histogram of comment counts. Articles with more than 3,000 comments are left out so
# the long tail doesn't flatten the plot; the red line marks the 90-comment split.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
# Boolean selection replaces the drop-by-index round trip, and the histogram is drawn
# on the axes created above instead of relying on the implicit "current axes".
sns.histplot(train.loc[train['n_comments'] <= 3000, 'n_comments'], bins=35, ax=ax)
ax.axvline(90, ls='-', c='red', label='Split', lw=4)
ax.legend(fontsize=12, loc=1)
ax.set_xlabel('Number of Comments')
ax.set_ylabel('Number of Articles')
ax.set_title('Number of Comments', fontsize=18);

### Checking Class Balance

In [None]:
# Overall, our classes are pretty much evenly balanced
plt.figure(figsize=(10, 6))
# Passing the column by keyword removes the need for the blanket warning suppression
# that was hiding seaborn's "pass variables as keyword args" warning.
g = sns.countplot(data=train, x='is_popular')
# `is_popular` is 1 only for n_comments > 90, so the unpopular class is "<= 90"
g.set_xticklabels(['Unpopular (<= 90 comments)', 'Popular (> 90 comments)'])
plt.xlabel('')
plt.ylabel('Number of Articles')
plt.title('Class Balance', fontsize=16);

In [None]:
train['is_popular'].value_counts(normalize=True)

In [None]:
# There are a few extreme outliers in our data
plt.figure(figsize=(16,4))
sns.boxplot(data=train['n_comments'], orient='h')
plt.yticks([])  # single box, so the category tick carries no information
# (the earlier plt.xlabel('n_comments') was dead code -- it was overwritten right away)
plt.xlabel('Number of Comments')
plt.title('Number of Comments', fontsize=18);

In [None]:
# Top 3 outliers: the most-commented articles among those with over 4,000 comments
outlier_cols = ['headline', 'abstract', 'n_comments', 'pub_date']
heavy_hitters = train.loc[train['n_comments'] > 4000, outlier_cols]
heavy_hitters.sort_values(by='n_comments', ascending=False).head(3)

## Word Count

Dealing with word count is slightly tricky. We can see that the feature is roughly bell-shaped but has a heavy positive skew. We can also see that there are a large number of articles that have a word count of 0. These are interactive features that don't have 'words' in the conventional sense. Because this is not technically 'missing' data, I'm not going to impute it.

In [None]:
# Word-count histogram with the mean marked: close to normal, with a positive skew
plt.figure(figsize=(16, 8))
word_counts = train['word_count']
mean = word_counts.mean()
plt.axvline(mean, color='black', ls='--')
sns.histplot(word_counts)
plt.xlabel('Word Count')
plt.title(f'Word Count (Mean: {mean:.0f} words)', fontsize=18);

In general, many machine learning algorithms tend to perform better when the distribution of a variable is close to normal -- in other words, performance tends to improve for variables that follow a normal (Gaussian) distribution rather than a heavily skewed one.

#### Feature Transformation

Here, we can see the effectiveness of various transformation methods. The Boxcox transformation seems to work best here -- our data is still slightly skewed but much closer to a normal distribution.

In [None]:
def transform_var(col, df, zero_replacement=0.001):
    """Add log1p, Box-Cox and square-root versions of a column and report their skew.

    Three columns are written to ``df`` in place: ``{col}_log``, ``{col}_box``
    and ``{col}_sqrt``.

    Args:
        col: Name of the column in ``df`` to transform.
        df: DataFrame holding ``col``; it is mutated in place.
        zero_replacement: Positive value substituted for exact zeros before the
            Box-Cox transform, which is only defined for strictly positive data.

    Returns:
        dict mapping 'Original', 'Log1p', 'Boxcox' and 'Square Root' to the
        skew of the corresponding column.
    """
    original = df[col]
    df[f'{col}_log'] = np.log1p(original)
    # Box-Cox can't transform zeros, so nudge them to a small positive value first;
    # index [0] keeps the transformed data and discards the fitted lambda.
    df[f'{col}_box'] = boxcox(original.replace(0, zero_replacement))[0]
    df[f'{col}_sqrt'] = np.sqrt(original)

    return {
        'Original': original.skew(),
        'Log1p': df[f'{col}_log'].skew(),
        'Boxcox': df[f'{col}_box'].skew(),
        'Square Root': df[f'{col}_sqrt'].skew(),
    }

In [None]:
def plot_transform(col, df, skews=None, title='Transformed Word Count'):
    """Plot a column and its log1p / Box-Cox / square-root transforms in a 2x2 grid.

    ``df`` must already contain ``{col}_log``, ``{col}_box`` and ``{col}_sqrt``
    alongside ``col``.

    Args:
        col: Name of the original column.
        df: DataFrame holding the original and transformed columns.
        skews: Optional dict keyed 'Original', 'Log1p', 'Boxcox' and
            'Square Root' giving the skew to show in each panel title. When
            omitted, each skew is computed from the plotted column, so the
            function no longer depends on a global ``skew_dict``.
        title: Figure-level title.
    """
    panels = [
        ('Original', col),
        ('Log1p', f'{col}_log'),
        ('Boxcox', f'{col}_box'),
        ('Square Root', f'{col}_sqrt'),
    ]
    fig, axes = plt.subplots(2, 2, figsize=(13, 9), sharey=True)
    for panel_ax, (label, column) in zip(axes.ravel(), panels):
        skew = skews[label] if skews is not None else df[column].skew()
        sns.histplot(df[column], ax=panel_ax)
        panel_ax.set_title(f"{label} (Skew: {skew:.3f})", fontsize=14)
        # Per-panel axis labels are blanked; the titles identify each panel
        panel_ax.set_xlabel('')
        panel_ax.set_ylabel('')
    plt.suptitle(title, fontsize=18)
    plt.tight_layout()

In [None]:
# Add the transformed word-count columns to `train` and keep their skew values
# (plot_transform reads the global `skew_dict`), then plot all four distributions
skew_dict = transform_var('word_count', train)
plot_transform('word_count', train)

In [None]:
# There are a few extreme outliers in our data
plt.figure(figsize=(16,4))
sns.boxplot(data=train['word_count'], orient='h')
plt.yticks([])
plt.xlabel('Word Count')
plt.title('Word Count', fontsize=18);

### Interactive Features

In [None]:
# Stories with a word count of 0 seem to be interactive features
zero_word_rows = train['word_count'] == 0
train.loc[zero_word_rows, ['headline', 'newsdesk', 'section', 'material', 'word_count']].head()

In [None]:
# All articles with a word count of 0 are interactive features
train[train['word_count'] == 0]['material'].value_counts()

In [None]:
# They mostly come from these particular newsdesks
train[train['word_count'] == 0]['newsdesk'].value_counts().head(5)

In [None]:
# 'The Upshot' and 'Opinion' appear to be newsdesks specifically created for interactive features.
# The counts are computed once, and the dots in 'U.S.' are escaped so they match literally
# instead of acting as regex wildcards.
newsdesk_counts = train['newsdesk'].value_counts()
newsdesk_counts[newsdesk_counts.index.str.contains(r'Upshot|Magazine|U\.S\.|Op|Clim')]

In [None]:
# They generally seem to be more popular than the average article
train[train['word_count'] == 0][['is_popular']].mean()

### Number of Comments versus Word Count
Does word count, or the length of an article affect the number of comments on each article? In the plot below, we can see that there's generally a positive relationship between these two variables, except for OpEd articles where it seems that word count doesn't affect number of comments at all. OpEd articles have an average of around 1100 words.

In [None]:
# Combining different names for the same newsdesk in a single exact-match pass.
# (The stray plt.figure() call was removed: this cell plots nothing and it only
# produced an empty figure.)
train['newsdesk'] = train['newsdesk'].replace({
    'Upshot': 'The Upshot',
    'Opinion': 'OpEd',
    'At Home': 'AtHome',
})

In [None]:
# OpEd length doesn't affect number of comments -- but other desks tend to have more comments as word count increases
top_news_df = train['newsdesk'].value_counts().head(5).index

g = sns.lmplot(data=train.loc[train['newsdesk'].isin(top_news_df)], x='word_count', y='n_comments',
               hue='newsdesk', palette='tab10', height=8, aspect=1.10, scatter_kws={'alpha':0.3, 's':15}, legend_out=False)

# Restyle the legend seaborn already drew. Calling plt.legend() afterwards would build
# a brand-new legend and throw away the title and marker tweaks made here.
legend = g._legend
# `legendHandles` was renamed to `legend_handles` in newer Matplotlib; support both.
handles = getattr(legend, 'legend_handles', None)
if handles is None:
    handles = legend.legendHandles
for handle in handles:
    handle.set_alpha(1)  # opaque legend markers, unlike the translucent scatter points
    if hasattr(handle, 'set_sizes'):
        handle.set_sizes([20])

legend.set_title('News Desk')
plt.setp(legend.get_texts(), fontsize=14)

plt.ylabel('Number of Comments', fontsize=11)
plt.xlabel('Word Count', fontsize=11)
plt.title('Number of Comments versus Word Count', fontsize=18);

In [None]:
# There's a moderate positive correlation between these two variables.
# Correlating just the two columns avoids building a full correlation matrix, which
# fails on the text columns in recent pandas versions.
train['is_popular'].corr(train['word_count'])

From this point, I'll be shifting to looking at popularity (as denoted by `is_popular`) instead of number of comments.

In [None]:
# We won't be using number of comments moving forward
train = train.drop(columns=['n_comments'])

## Headline / Abstract Length

In [None]:
# Character lengths of the headline, the abstract, and both combined.
# The vectorised `.str.len()` replaces the per-row `lambda x: len(x)`.
train['headline_len'] = train['headline'].str.len()
train['abstract_len'] = train['abstract'].str.len()
train['head_abs_len'] = train['headline_len'] + train['abstract_len']

### Headline Length

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=train, x='headline_len');

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(data=train, x='headline_len', y='is_popular')
# Select `is_popular` before aggregating: averaging every column is wasteful and
# fails on the text columns in recent pandas versions.
mean = train.groupby('headline_len')['is_popular'].mean().mean()
# Alternative reference line at the mean of the per-length averages (disabled):
#plt.axhline(mean, color='black', ls='--', label=f'Avg. Popularity: {mean:.2f}')
#plt.legend(fontsize=12)
plt.axhline(0.5, color='black', ls='--')
plt.xlabel('Headline Length')
plt.title('Average Popularity vs Headline Length');

### Abstract Length

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=train, x='abstract_len');

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(data=train, x='abstract_len', y='is_popular')
# Select `is_popular` before aggregating: averaging every column is wasteful and
# fails on the text columns in recent pandas versions.
mean = train.groupby('abstract_len')['is_popular'].mean().mean()
plt.axhline(0.5, color='black', ls='--')
# Alternative reference line at the mean of the per-length averages (disabled):
#plt.axhline(mean, color='black', ls='--', label=f'Avg. Popularity: {mean:.2f}')
#plt.legend(fontsize=12);
plt.xlabel('Abstract Length')
plt.xlim(15, 250)
plt.title('Average Popularity vs Abstract Length');

### Headline & Abstract Length

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=train, x='head_abs_len');

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(data=train, x='head_abs_len', y='is_popular')
# Select `is_popular` before aggregating: averaging every column is wasteful and
# fails on the text columns in recent pandas versions.
mean = train.groupby('head_abs_len')['is_popular'].mean().mean()
plt.axhline(0.5, color='black', ls='--')
# Alternative reference line at the mean of the per-length averages (disabled):
#plt.axhline(mean, color='black', ls='--', label=f'Avg. Popularity: {mean:.2f}')
#plt.legend(fontsize=12);
plt.xlabel('Headline + Abstract Length')
plt.title('Average Popularity vs Headline & Abstract Length');

## News Desk

These are the newsdesks with the most comments. Unsurprisingly, OpEd articles are at the top, followed by Foreign and Business. In 2017, the NYT implemented a [new commenting system](https://www.nytimes.com/2017/06/13/insider/have-a-comment-leave-a-comment.html) that opened up OpEd articles and other selected news articles for 24 hours. This is likely part of the reason why OpEd articles seem to draw a higher frequency of comments.

In [None]:
# Article counts per newsdesk (largest first), joined with each desk's share of popular
# articles. `temp` keeps the 10 largest desks; the last line totals their articles.
df = train['newsdesk'].value_counts(ascending=False).reset_index()
df.columns=['newsdesk', 'n_articles']
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
temp = pd.merge(df, train.groupby('newsdesk')['is_popular'].mean().reset_index()).head(10)
temp['n_articles'].sum()

In [None]:
# Average popularity of the 20 largest newsdesks, most popular first
g_index = df['newsdesk'].head(20).values
g_df = train[train['newsdesk'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
g_data = g_df.groupby('newsdesk')['is_popular'].mean().sort_values(ascending=False)
g_data = g_data.to_frame().reset_index()
g_data.head()

In [None]:
# Horizontal bars: average popularity of the top 20 newsdesks
plt.figure(figsize=(12, 12))
sns.barplot(data=g_data, x='is_popular', y='newsdesk', orient='h', palette='coolwarm_r')
plt.title('Newsdesk Avg. Popularity', fontsize=18)
plt.xlabel('Average Popularity')
plt.ylabel('Newsdesk')
plt.yticks(fontsize=12)
plt.xticks(np.arange(0.0, 1.1, 0.1), fontsize=12);

Some newsdesks have many more popular than unpopular articles such as OpEd, Politics, Games, and Washington. Other newsdesks have many more unpopular than popular articles, like Culture and Podcasts.

In [None]:
# Viewing top newsdesks by proportion of popularity
g_index = df['newsdesk'].head(20).values
g_df = train[train['newsdesk'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
order = g_df.groupby('newsdesk')['is_popular'].mean().sort_values(ascending=False).index
plt.figure(figsize=(12,12))
sns.countplot(data=g_df, y='newsdesk', hue='is_popular', order=order)
plt.legend(labels=['Not Popular', 'Popular'], fontsize=12, loc='best')
plt.title('Largest News Desks Sorted By Avg. Popularity', fontsize=18);

## Section

In [None]:
df = train['section'].value_counts(ascending=False).reset_index()
df.columns=['section', 'n_articles']

In [None]:
# Average popularity of the 20 largest sections, most popular first
g_index = df['section'].head(20).values
g_df = train[train['section'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
g_data = g_df.groupby('section')['is_popular'].mean().sort_values(ascending=False)
g_data = g_data.to_frame().reset_index()

In [None]:
# Horizontal bars: average popularity of the top 20 sections
plt.figure(figsize=(12, 10))
sns.barplot(data=g_data, x='is_popular', y='section', orient='h', palette='coolwarm_r')
plt.title('Section Avg. Popularity', fontsize=18)
plt.xlabel('Average Popularity')
plt.ylabel('Section')
plt.xticks(np.arange(0.0, 1.1, 0.1), fontsize=12)
plt.yticks(fontsize=12);

Opinion is the most popular section, followed by Crossword & Games and U.S. Most OpEd newsdesk articles fall into the Opinion section, except for a couple that fall under Sunday Review.

In [None]:
train[(train['newsdesk'] == 'OpEd') & (train['section'] != 'Opinion')][['newsdesk', 'section', 'subsection', 
                                                                        'material', 'headline']]

In [None]:
# Viewing top sections by proportion of popularity
g_index = df['section'].head(20).values
g_df = train[train['section'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
order = g_df.groupby('section')['is_popular'].mean().sort_values(ascending=False).index
plt.figure(figsize=(12,12))
sns.countplot(data=g_df, y='section', hue='is_popular', order=order)
plt.legend(labels=['Not Popular', 'Popular'], fontsize=12, loc='best')
plt.title('Largest Sections Sorted By Avg. Popularity', fontsize=18);

## Subsection

In [None]:
# Give articles without a subsection an explicit 'N/A' label in place of NaN
train['subsection'] = train['subsection'].fillna('N/A')

In [None]:
df = train['subsection'].value_counts(ascending=False).reset_index()
df.columns=['subsection', 'n_articles']

In [None]:
# Average popularity of the 20 largest subsections, most popular first
g_index = df['subsection'].head(20).values
g_df = train[train['subsection'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
g_data = g_df.groupby('subsection')['is_popular'].mean().sort_values(ascending=False)
g_data = g_data.to_frame().reset_index()

In [None]:
# Horizontal bars: average popularity of the top 20 subsections
plt.figure(figsize=(12, 12))
sns.barplot(data=g_data, x='is_popular', y='subsection', orient='h', palette='coolwarm_r')
plt.title('Subsection Avg. Popularity', fontsize=18)
plt.xlabel('Average Popularity')
plt.ylabel('Subsection')
plt.xticks(np.arange(0.0, 1.1, 0.1), fontsize=12)
plt.yticks(fontsize=12);

In [None]:
# Viewing top subsections by proportion of popularity
g_index = df['subsection'].head(20).values
g_df = train[train['subsection'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
order = g_df.groupby('subsection')['is_popular'].mean().sort_values(ascending=False).index
plt.figure(figsize=(12,16))
sns.countplot(data=g_df, y='subsection', hue='is_popular', order=order)
plt.legend(labels=['Not Popular', 'Popular'], fontsize=12, loc='best')
plt.title('Largest Subsections Sorted By Avg. Popularity', fontsize=18);

## Material

In [None]:
df = train['material'].value_counts(ascending=False).reset_index()
df.columns=['material', 'n_articles']

In [None]:
# Average popularity of the (up to 20) most common material types, most popular first
g_index = df['material'].head(20).values
g_df = train[train['material'].isin(g_index)]
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
g_data = g_df.groupby('material')['is_popular'].mean().sort_values(ascending=False)
g_data = g_data.to_frame().reset_index()

In [None]:
# Horizontal bars: average popularity per material type, with a 50% reference line
plt.figure(figsize=(12, 6))
sns.barplot(data=g_data, x='is_popular', y='material', orient='h', palette='coolwarm_r')
plt.title('Material Avg. Popularity', fontsize=18)
plt.xlabel('Average Popularity')
plt.ylabel('Material')
plt.xticks(np.arange(0.0, 1.1, 0.1), fontsize=12)
plt.yticks(fontsize=12)
plt.axvline(0.5, color='black', ls='--')

## Keywords

In [None]:
train['keywords']

In [None]:
# Spread each article's keyword list across columns, then stack into one row per
# (article, position, keyword) so keyword frequencies can be counted
keyword_lists = train['keywords'].tolist()
keyword_df = pd.DataFrame(keyword_lists, index=train.index)
keyword_df = keyword_df.stack().reset_index()
keyword_df

In [None]:
keyword_df[0].value_counts().head()

In [None]:
# Number of keywords attached to each article, plotted with the mean marked
train['n_keywords'] = train['keywords'].apply(len)
mean = train['n_keywords'].mean()

plt.figure(figsize=(16, 8))
plt.axvline(mean, color='black', ls='--')
sns.histplot(train['n_keywords'], bins=60)
plt.xlim(0, 41)
plt.xlabel('Number of Keywords')
plt.title(f'Number of Keywords (Mean: {mean:.1f} keywords)', fontsize=18);

In [None]:
plt.figure(figsize=(16,4))
sns.boxplot(data=train['n_keywords'], orient='h')
plt.title('Number of Keywords Box Plot', fontsize=18);

In [None]:
# Looking at articles with more than 40 keywords, largest keyword count first
keyword_heavy_cols = ['newsdesk', 'section', 'material', 'headline', 'n_keywords', 'is_popular']
keyword_heavy = train.loc[train['n_keywords'] > 40, keyword_heavy_cols]
keyword_heavy.sort_values(by='n_keywords', ascending=False)

In [None]:
# There are only 2210 articles with more than 10 keywords
# (summing the boolean mask counts them directly, with no value_counts round trip)
(train['n_keywords'] > 10).sum()

In [None]:
# There are only 370 articles with more than 15 keywords
# (summing the boolean mask counts them directly, with no value_counts round trip)
(train['n_keywords'] > 15).sum()

In [None]:
# Per-keyword-count averages of the numeric columns. `numeric_only=True` keeps the
# aggregation from failing on the text/list columns in recent pandas versions.
g4_df = train.groupby('n_keywords').mean(numeric_only=True).reset_index().drop(columns='word_count')

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(data=g4_df, x='n_keywords', y='is_popular', color='orange')
# Reference line: mean of the per-keyword-count averages. Only `is_popular` is
# aggregated, since a frame-wide mean fails on text columns in recent pandas.
plt.axhline(train.groupby('n_keywords')['is_popular'].mean().mean(), color='gray', ls='--')
plt.title('Avg Popularity vs Number of Keywords', fontsize=18);

In [None]:
# Keyword-count distribution split by popularity, limited to articles with fewer
# than 30 keywords and using one bin per integer count
plt.figure(figsize=(10,6))
sns.histplot(data=train[train['n_keywords'] < 30], x='n_keywords', hue='is_popular', bins=np.arange(0,30))
# NOTE(review): these labels are matched to the plotted artists by position --
# confirm that 'Popular' / 'Unpopular' line up with the hue colours.
plt.legend(labels=['Popular', 'Unpopular'], fontsize=12)
plt.title('Popularity by Number of Keywords', fontsize=18);

In [None]:
# There's a faint positive correlation between number of keywords and popularity.
# Correlating just the two columns avoids building a full correlation matrix, which
# fails on the text columns in recent pandas versions.
train['is_popular'].corr(train['n_keywords'])

## Time Variables

In this section, we're going to look at how time affects both the frequency and popularity of articles.

In [None]:
# Parse the publication timestamp and derive calendar features from it.
# Vectorised `.dt` accessors replace the per-row lambdas.
train['pub_date'] = pd.to_datetime(train['pub_date'])
pub_date = train['pub_date']
train['day_of_month'] = pub_date.dt.day
train['month'] = pub_date.dt.month
train['day_of_week'] = pub_date.dt.dayofweek  # Monday=0 ... Sunday=6
train['hour'] = pub_date.dt.hour
# Calendar date at midnight with any timezone dropped -- the same result as the old
# str(x)[:10] round trip, without going through strings
train['ymd'] = pd.to_datetime(pub_date.dt.date)

In [None]:
def _has_any_keyword(keywords, targets):
    """Return 1 if any keyword in `targets` appears in the `keywords` list, else 0."""
    return int(not targets.isdisjoint(keywords))


# Topic flag column -> keywords that mark an article as belonging to that topic
TOPIC_KEYWORDS = {
    'is_trump': {'Trump, Donald J'},
    'is_covid': {'Coronavirus (2019-nCoV)', 'Coronavirus Risks and Safety Concerns'},
    'is_racial': {'Black People', 'Race and Ethnicity', 'Discrimination', 'Black Lives Matter Movement'},
    'is_re': {'Real Estate and Housing (Residential)'},
}

# One 0/1 column per topic, replacing the chained `1 if ... else 1 if ...` expressions
for flag_column, targets in TOPIC_KEYWORDS.items():
    train[flag_column] = train['keywords'].apply(_has_any_keyword, targets=targets)

We can see that the news cycle here is somewhat cyclical -- within a single month, there are multiple peaks and valleys where the amount of news rapidly increases before falling. This pattern is consistent throughout the year.

In [None]:
plt.figure(figsize=(16,6))
sns.lineplot(data=train['ymd'].value_counts())
plt.title('Daily Article Frequency', fontsize=18);

### Month
Articles that were published in January, July, August and September have a slightly higher average popularity.

In [None]:
# Articles per month (left axis) against average popularity per month (right axis)
plt.figure(figsize=(10,6))
ax = sns.lineplot(data=train['month'].value_counts(), label='Number of Articles')
ax.set_ylabel('Number of Articles')
ax.legend(loc=2)
ax2 = ax.twinx()
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
sns.lineplot(data=train.groupby('month')['is_popular'].mean(), color='orange', ax=ax2, label='Average Popularity')
ax2.set_ylabel('Average Popularity')
ax2.legend(loc=1)
plt.title('Avg Popularity vs Frequency (Monthly)', fontsize=18);

### Day of Month

There doesn't appear to be a particular increase or decrease in popularity according to the day of the month.

In [None]:
# Day-of-month distribution split by popularity
plt.figure(figsize=(10,6))
# NOTE(review): bins are chosen automatically here, so a bin may cover more than one
# day -- consider integer-aligned bins as in the day-of-week plot.
sns.histplot(data=train, x='day_of_month', hue='is_popular')
# NOTE(review): these labels are matched to the plotted artists by position --
# confirm that 'Popular' / 'Unpopular' line up with the hue colours.
plt.legend(labels=['Popular', 'Unpopular'], fontsize=12)
plt.title('Popularity by Day of Month', fontsize=18);

### Day of Week
There seems to be an inverse relationship between average popularity and the number of articles opened for commentary. Average article popularity tends to be lower during the week, and higher on the weekend.

In [None]:
# Articles per weekday (left axis) against average popularity per weekday (right axis)
plt.figure(figsize=(10,6))
ax = sns.lineplot(data=train['day_of_week'].value_counts(), label='Number of Articles')
ax.set_ylabel('Number of Articles')
ax.legend(loc=2)
ax2 = ax.twinx()
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
sns.lineplot(data=train.groupby('day_of_week')['is_popular'].mean(), color='orange', ax=ax2, label='Average Popularity')
ax2.set_ylabel('Average Popularity')
ax2.legend(loc=1)
plt.title('Avg Popularity vs Frequency (Day of Week)', fontsize=18);

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=train, x='day_of_week', hue='is_popular', bins = np.arange(0, 8))
plt.legend(labels=['Popular', 'Unpopular'], fontsize=12)
plt.title('Popularity by Day of Week', fontsize=18);

### Hour
Articles published between 10pm and 2am have a much higher average popularity.

In [None]:
# Articles per publication hour (left axis) against average popularity per hour (right axis)
plt.figure(figsize=(10,6))
ax = sns.lineplot(data=train['hour'].value_counts(), label='Number of Articles')
ax.set_ylabel('Number of Articles')
ax.legend(loc=2)
ax2 = ax.twinx()
# Aggregate only `is_popular`: a frame-wide mean fails on text columns in recent pandas
sns.lineplot(data=train.groupby('hour')['is_popular'].mean(), color='orange', ax=ax2, label='Average Popularity')
ax2.set_ylabel('Average Popularity')
ax2.legend(loc=1)
plt.title('Avg Popularity vs Frequency (Hour)', fontsize=18);

In [None]:
# Publication-hour distribution split by popularity
plt.figure(figsize=(10,6))
# Explicit integer bin edges (0-1, 1-2, ..., 23-24) give one bin per hour. `bins=24`
# spread 24 bins over 0..23, so the edges drifted away from the hour ticks.
sns.histplot(data=train, x='hour', hue='is_popular', bins=np.arange(0, 25))
plt.xticks(np.arange(0, 24, 1))
plt.legend(labels=['Popular', 'Unpopular'], fontsize=12)
plt.title('Popularity by Hour', fontsize=18);

I created the two graphs below to look at changes in the popularity of certain keywords (a proxy for topics) over time. 80% of the articles mentioning Donald Trump are popular, while only 25% of articles mentioning real estate are popular. We can also see the number of popular COVID articles spiked in April.

In [None]:
# Monthly averages for the articles tagged with each topic. `numeric_only=True` keeps
# the aggregation from failing on text/list/datetime columns in recent pandas versions.
trump_data = train[train['is_trump'] > 0].groupby('month').mean(numeric_only=True)
covid_data = train[train['is_covid'] > 0].groupby('month').mean(numeric_only=True)
racial_data = train[train['is_racial'] > 0].groupby('month').mean(numeric_only=True)
re_data = train[train['is_re'] > 0].groupby('month').mean(numeric_only=True)

g1_data = train.groupby('month').mean(numeric_only=True).reset_index()
plt.figure(figsize=(14,8))
# 'month' is the index of each grouped frame; it is referenced by name as the x variable
sns.lineplot(data=trump_data, x='month', y='is_popular', label='Trump')
sns.lineplot(data=covid_data, x='month', y='is_popular', label='COVID-19')
sns.lineplot(data=racial_data, x='month', y='is_popular', label='Race & Ethnicity')
sns.lineplot(data=re_data, x='month', y='is_popular', label='Real Estate')
plt.legend(fontsize=14, loc=1)
plt.ylabel('Average Popularity', fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.title('Average Article Popularity by Month by Topic', fontsize=22);
plt.xticks(np.arange(1,10,1), fontsize=12);
plt.axhline(0.5, ls='--', color='black')  # 50% reference line
plt.ylim(0, 1.0);

In [None]:
# Monthly totals over the popular articles of each topic; because every kept row has
# is_popular == 1, the summed `is_popular` column is the number of popular articles.
# `numeric_only=True` keeps the aggregation from touching text/list/datetime columns.
trump_data = train[(train['is_trump'] > 0) & (train['is_popular'] > 0)].groupby('month').sum(numeric_only=True)
covid_data = train[(train['is_covid'] > 0) & (train['is_popular'] > 0)].groupby('month').sum(numeric_only=True)
racial_data = train[(train['is_racial'] > 0) & (train['is_popular'] > 0)].groupby('month').sum(numeric_only=True)
re_data = train[(train['is_re'] > 0) & (train['is_popular'] > 0)].groupby('month').sum(numeric_only=True)

plt.figure(figsize=(14, 8))
# 'month' is the index of each grouped frame; it is referenced by name as the x variable
sns.lineplot(data=trump_data, x='month', y='is_popular', label='Trump')
sns.lineplot(data=covid_data, x='month', y='is_popular', label='COVID-19')
sns.lineplot(data=racial_data, x='month', y='is_popular', label='Race & Ethnicity')
sns.lineplot(data=re_data, x='month', y='is_popular', label='Real Estate')
plt.legend(fontsize=14)
plt.ylabel('Number of Popular Articles', fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.title('Topic Popularity by Month', fontsize=22);
plt.xticks(np.arange(1,10,1), fontsize=12);