# Feature Engineering & Preprocessing

In [None]:
import pickle
import re
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from scipy.stats import boxcox, yeojohnson
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN

# Connect tqdm to pandas
tqdm.pandas()

### Summary of Variables Created
In this notebook, I created a significant number of additional features including:

- Box-cox transformation of word count
- Time variables based on publication date
        - Day of week, day of month, hour, weekend
        - Number of articles posted per day
- Keywords
        - Donald Trump
        - Republican / Democrat news
        - COVID-19
- Headline / abstract length
- Whether headline / abstract contains a question mark 
- Categorical features transformed into ordinal features based on average popularity (newsdesk, section, subsection, material)
- Sentiment based on headline + abstract
        - Positive, neutral, negative and compound sentiment

In [None]:
# Load the training set; `pub_date` is parsed as a datetime.
# NOTE(review): `eval` is used as the `keywords` converter, so every cell of that column is
# executed as Python code. Only safe for a trusted file -- consider `ast.literal_eval`.
train = pd.read_csv('/kaggle/input/new-york-times-articles-comments-2020/train.csv', converters={'keywords': eval}, parse_dates=['pub_date'])

In [None]:
# Load the test set; `pub_date` is parsed as a datetime.
# NOTE(review): `eval` is used as the `keywords` converter, so every cell of that column is
# executed as Python code. Only safe for a trusted file -- consider `ast.literal_eval`.
test = pd.read_csv('/kaggle/input/new-york-times-articles-comments-2020/test.csv', converters={'keywords': eval}, parse_dates=['pub_date'])

In [None]:
# Test-set columns that contain at least one missing value, with their counts
test_null_counts = test.isnull().sum()
test_null_counts[test_null_counts > 0]

In [None]:
# Missing keyword entries become the string 'None' (not a list); the keyword cells that
# follow check `type(x) is list` on the test set and treat such rows as having no keywords.
test['keywords'] = test['keywords'].fillna('None')

In [None]:
# Training-set columns that contain at least one missing value, with their counts
train_null_counts = train.isnull().sum()
train_null_counts[train_null_counts > 0]

In [None]:
# Missing abstracts become empty strings so that the string-based features built later in
# this notebook (question-mark flag, combined text, abstract length) can treat every row as text.
# The test set gets the same treatment because those features are computed on it as well.
train['abstract'] = train['abstract'].fillna('')
test['abstract'] = test['abstract'].fillna('')

## Word Count
We saw previously that the Box-Cox transformation seems to work best, so we'll use that going forward. We'll also keep the original word count variable, as removing it led to a drop in model accuracy.

In [None]:
# Average word count per section.
# Select the column before aggregating: averaging every column fails on text/list columns
# in recent pandas versions and does unnecessary work in older ones.
section_avg = train.groupby('section')['word_count'].mean()

In [None]:
# Box-Cox requires strictly positive input, so zero word counts are nudged to 0.001 first.
train_word_count_pos = train['word_count'].apply(lambda count: count if count != 0 else 0.001)
train['boxcox_word'] = train_word_count_pos
# `boxcox` returns (transformed values, fitted lambda); only the values are kept.
train['boxcox_word'] = boxcox(train['boxcox_word'])[0]

In [None]:
# Box-Cox requires strictly positive input, so zero word counts are nudged to 0.001 first.
train_word_count_pos = train['word_count'].astype(float).replace(0.0, 0.001)
test_word_count_pos = test['word_count'].astype(float).replace(0.0, 0.001)

# Estimate lambda on the training data only and reuse it for the test set.
# Fitting a second lambda on the test set would put `boxcox_word` on a different
# scale in train and test, so a model trained on one could not be applied to the other.
boxcox_lambda = boxcox(train_word_count_pos)[1]
test['boxcox_word'] = boxcox(test_word_count_pos, lmbda=boxcox_lambda)

## Time Variables

Time affects the frequency of published articles, which correspondingly affects the popularity of articles. Articles published at a time when fewer articles are published are more likely to be popular -- there are fewer places for commenters to go.

In [None]:
# Calendar parts of the publication timestamp: (new column, timestamp attribute)
for feature_name, ts_attr in [('day_of_month', 'day'), ('day_of_week', 'dayofweek'), ('hour', 'hour')]:
    train[feature_name] = train['pub_date'].apply(lambda ts, ts_attr=ts_attr: getattr(ts, ts_attr))

# dayofweek 5 and 6 are flagged as weekend
train['is_weekend'] = train['day_of_week'].isin([5, 6]).astype(int)

In [None]:
# Calendar parts of the publication timestamp: (new column, timestamp attribute)
for feature_name, ts_attr in [('day_of_month', 'day'), ('day_of_week', 'dayofweek'), ('hour', 'hour')]:
    test[feature_name] = test['pub_date'].apply(lambda ts, ts_attr=ts_attr: getattr(ts, ts_attr))

# dayofweek 5 and 6 are flagged as weekend
test['is_weekend'] = test['day_of_week'].isin([5, 6]).astype(int)

I also created an additional variable that flags the hour in which an article was published. Articles published late at night (from 11PM up to, but not including, 4AM -- the window used in the code below) seem to have a much higher average popularity.

In [None]:
# 1 when the publication hour is 23 or 0-3, otherwise 0
train_hour = train['hour']
train['is_primehour'] = ((train_hour > 22) | (train_hour < 4)).astype(int)

In [None]:
# Correlation between the prime-hour flag and popularity.
# Computed directly between the two columns: building the full correlation matrix of
# `train` is wasteful and raises in recent pandas because the frame holds text columns.
train['is_popular'].corr(train['is_primehour'])

In [None]:
# 1 when the publication hour is 23 or 0-3, otherwise 0
test_hour = test['hour']
test['is_primehour'] = ((test_hour > 22) | (test_hour < 4)).astype(int)

### Articles Per Day

Similar to our time variables, I created a variable that tracks the number of articles posted in a day. The idea is that the fewer articles there are, the higher the popularity, and vice versa.

In [None]:
# First 10 characters of the stringified timestamp, i.e. the calendar date
train['group_date'] = train['pub_date'].astype(str).str[:10]
# Number of training articles sharing each date, looked up for every row
group_dates = train['group_date'].value_counts()
train['posts_per_day'] = train['group_date'].map(group_dates)

In [None]:
# More posts in a day correlated with lower popularity.
# `corrwith` on the selected column avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['posts_per_day']].corrwith(train['is_popular'])

In [None]:
# First 10 characters of the stringified timestamp, i.e. the calendar date
test['group_date'] = test['pub_date'].astype(str).str[:10]
# Number of test articles sharing each date, looked up for every row
group_dates = test['group_date'].value_counts()
test['posts_per_day'] = test['group_date'].map(group_dates)

## Keywords

Having a certain number of keywords seems important -- the ideal number of keywords seems to be between 12 and 15 (the flag below also marks articles with exactly one keyword). This could suggest that people are more interested in articles that cover a range of topics, people and organizations.

In [None]:
# Number of keywords per article. Rows whose `keywords` value is not a list count as 0.
# The same guarded rule is applied to both frames so train and test cannot diverge.
for frame in (train, test):
    frame['n_keywords'] = frame['keywords'].apply(lambda kws: len(kws) if isinstance(kws, list) else 0)

In [None]:
# 1 when an article has exactly one keyword or more than 11 and fewer than 16, otherwise 0
for frame in (train, test):
    keyword_count = frame['n_keywords']
    frame['ideal_n_keywords'] = ((keyword_count == 1) | ((keyword_count > 11) & (keyword_count < 16))).astype(int)

In [None]:
# Correlation of the keyword-count features with popularity.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['n_keywords', 'ideal_n_keywords']].corrwith(train['is_popular'])

### Trump / Republican / Democrat

We saw that Donald Trump and Republican/Democrat keywords are among the most frequent keywords, so we'll create a variable here to keep track of that. Both these features have a significant correlation with popularity.

In [None]:
# 1 when 'Trump, Donald J' is one of the article's keywords, otherwise 0.
# Rows whose `keywords` value is not a list are scored 0; the same guarded rule is
# applied to both frames so train and test cannot diverge.
for frame in (train, test):
    frame['is_trump'] = frame['keywords'].apply(
        lambda kws: 1 if isinstance(kws, list) and 'Trump, Donald J' in kws else 0)

In [None]:
# 1 when either party appears among the article's keywords; non-list values score 0
party_terms = ('Democratic Party', 'Republican Party')

for frame in (train, test):
    frame['is_party'] = frame['keywords'].apply(
        lambda kws: int(type(kws) is list and any(term in kws for term in party_terms)))

In [None]:
# Correlation of the political keyword flags with popularity.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['is_trump', 'is_party']].corrwith(train['is_popular'])

### Race & Ethnicity

Race and ethnicity have always been hot topics in the US, and especially so this year with the death of George Floyd. This feature has a slight correlation with article popularity.

In [None]:
# 1 when any race-related term appears among the article's keywords; non-list values score 0
racial_terms = ('Black People', 'Race and Ethnicity', 'Discrimination', 'Black Lives Matter Movement')

for frame in (train, test):
    frame['is_racial'] = frame['keywords'].apply(
        lambda kws: int(type(kws) is list and any(term in kws for term in racial_terms)))

In [None]:
# Correlation of the race/ethnicity keyword flag with popularity.
# `corrwith` on the selected column avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['is_racial']].corrwith(train['is_popular'])

### COVID-19

COVID-19 has drastically changed the world as we know it -- it was also the most frequent keyword in our entire dataset. All three features below have a faint correlation with popularity.

In [None]:
# 1 when either coronavirus term appears among the article's keywords; non-list values score 0
covid_terms = ('Coronavirus (2019-nCoV)', 'Coronavirus Risks and Safety Concerns')

for frame in (train, test):
    frame['is_covid'] = frame['keywords'].apply(
        lambda kws: int(type(kws) is list and any(term in kws for term in covid_terms)))

In [None]:
# 1 when 'Epidemics' is one of the article's keywords; non-list values score 0
for frame in (train, test):
    frame['is_epidemic'] = frame['keywords'].apply(
        lambda kws: int(type(kws) is list and 'Epidemics' in kws))

In [None]:
# 1 when 'Deaths (Fatalities)' is one of the article's keywords, otherwise 0.
# Rows whose `keywords` value is not a list are scored 0; the same guarded rule is
# applied to both frames so train and test cannot diverge.
for frame in (train, test):
    frame['is_death'] = frame['keywords'].apply(
        lambda kws: 1 if isinstance(kws, list) and 'Deaths (Fatalities)' in kws else 0)

In [None]:
# Correlation of the pandemic-related keyword flags with popularity.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['is_covid', 'is_epidemic', 'is_death']].corrwith(train['is_popular'])

## Question

If the headline or abstract contains a question mark, there's a good chance that the article has been written in a way to invite commentary. Alternatively, people might view the question mark as a friendly invitation to comment.

In [None]:
# 1 when the headline contains a literal question mark, otherwise 0
for frame in (train, test):
    frame['headline_question'] = frame['headline'].str.contains('?', regex=False).astype(int)

In [None]:
# Popularity split among articles whose headline contains a question mark
has_headline_question = train['headline_question'] == 1
train.loc[has_headline_question, 'is_popular'].value_counts()

In [None]:
# 1 when the abstract contains a literal question mark, otherwise 0
for frame in (train, test):
    frame['abs_question'] = frame['abstract'].str.contains('?', regex=False).astype(int)

In [None]:
# Popularity split among articles whose abstract contains a question mark
has_abstract_question = train['abs_question'] == 1
train.loc[has_abstract_question, 'is_popular'].value_counts()

In [None]:
# Correlation of the question-mark flags with popularity.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['headline_question', 'abs_question']].corrwith(train['is_popular'])

## Newsdesk / Section / Subsection / Material

An article's newsdesk, section, subsection are likely the most powerful predictors of popularity. Opinion Editorials (OpEds) are much more likely to draw comments because they're likely written in a way to attract attention or controversy. These OpEds tackle recent events and issues, and attempt to formulate viewpoints based on an objective analysis of happenings and conflicting/contrary opinions. NYT Opinion pieces are also always open for comments, which would naturally increase the likelihood of having a popular article.

It was pretty difficult to decide on how to map / encode these features. There are 60 newsdesks, 42 sections, 67 subsections, and 10 types of material. Performing a one hot encoding on each variable would leave me with over 180 features in total.

There are several approaches that I tried:
- One hot encode Newsdesk, Section and Subsection (this returned poor results)
- Combine Newsdesk, Section and Subsection into a single `NewsType` variable 
    - For example: Foreign newsdesk, World section, Australia Subsection --> #Foreign#World#Australia.
    - Group similar articles together e.g. #Foreign#World#Australia and #Foreign#World#Asia Pacific. This naturally presents some difficulty though -- how do we decide which sections and subsections to group together? The Australia subsection has more popular articles but only has 46 articles compared to the 327 articles in the Asia Pacific subsection.
- Create an ordinal interaction feature (number of popular articles * total number of articles) that gives a higher weight to features that have many popular articles and a large number of total articles.
- Use DBSCAN or other clustering methods to group variables together.

Ultimately, I found that <b>taking a simpler approach returned better results</b>. Below, I created a function that groups features according to their average popularity. In short, I created an ordinal feature that places more weight on newsdesks/sections/subsections/materials that have an average popularity of 0.6 and above. Conversely, I placed a low weight on variables that have an average popularity of 0.4 and below.

To catch newsdesks/sections/subsections/materials that appear in the test dataset but not in the train dataset, I added a final `else` branch that maps these unseen categories to 0. This should cause our model to treat them in a neutral way.

In [None]:
# Combining newsdesks -- the different names reflect interactive articles that will be accounted for later
newsdesk_aliases = {'Upshot': 'The Upshot', 'Opinion': 'OpEd', 'At Home': 'AtHome'}

# Every other newsdesk name is left exactly as it is
for frame in (train, test):
    frame['newsdesk'] = frame['newsdesk'].apply(lambda desk: newsdesk_aliases.get(desk, desk))

In [None]:
# We have to fill the null values in our subsection.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the selected
# column: that form mutates a temporary object and leaves the DataFrame unchanged when
# pandas copy-on-write is active.
train['subsection'] = train['subsection'].fillna('N/A')
test['subsection'] = test['subsection'].fillna('N/A')

In [None]:
def map_popularity(col):
    """Add an ordinal `{col}_pop` column to the global `train` and `test` frames.

    Each category of `col` is scored by its mean `is_popular` value in `train`:

        >= 0.7      ->  5
        [0.6, 0.7)  ->  4
        [0.5, 0.6)  ->  3
        [0.4, 0.5)  ->  2
        [0.3, 0.4)  ->  1
        <  0.3      -> -1

    Categories that do not occur in `train` (for example ones only seen in `test`)
    are scored 0.

    Parameters
    ----------
    col : str
        Name of a categorical column present in both `train` and `test`.

    Returns
    -------
    None
        `train` and `test` are modified in place.
    """
    # Select the target column before aggregating: averaging every column of `train`
    # fails on text/list columns in recent pandas and is wasted work in older versions.
    avg_popularity = train.groupby(col)['is_popular'].mean()

    def popularity_level(avg):
        if avg >= 0.7:
            return 5
        if avg >= 0.6:
            return 4
        if avg >= 0.5:
            return 3
        if avg >= 0.4:
            return 2
        if avg >= 0.3:
            return 1
        if avg < 0.3:
            return -1
        # Only reached when the average is NaN; treated like an unseen category
        return 0

    # One lookup table instead of scanning six arrays for every row
    level_by_category = {category: popularity_level(avg) for category, avg in avg_popularity.items()}

    for frame in (train, test):
        # Categories missing from the table (not in train) fall back to the neutral 0
        frame[f'{col}_pop'] = frame[col].apply(lambda category: level_by_category.get(category, 0))

In [None]:
# Adds the ordinal `newsdesk_pop` column to both train and test
map_popularity('newsdesk')

In [None]:
# Adds the ordinal `section_pop` column to both train and test
map_popularity('section')

In [None]:
# Adds the ordinal `subsection_pop` column to both train and test
map_popularity('subsection')

In [None]:
# Adds the ordinal `material_pop` column to both train and test
map_popularity('material')

In [None]:
# Show each categorical column next to its popularity score for the row labelled 0
preview_cols = ['headline', 'newsdesk', 'newsdesk_pop', 'section', 'section_pop',
                'subsection', 'subsection_pop', 'material', 'material_pop']
first_row = train.loc[0]
first_row[preview_cols].to_frame().T

In [None]:
# Show each categorical column next to its popularity score for the row labelled 101
preview_cols = ['headline', 'newsdesk', 'newsdesk_pop', 'section', 'section_pop',
                'subsection', 'subsection_pop', 'material', 'material_pop']
sample_row = train.loc[101]
sample_row[preview_cols].to_frame().T

## Other Features

In [None]:
# Runs of '!', '?' or '.' that are followed by one or more '.' collapse to a single '.'
# (remove extra punctuation in headline)
extra_punct_pattern = r'[\!?.]+[\.]+'

for frame in (train, test):
    # Headline and abstract joined into one text, separated by '. '
    frame['combi_text'] = frame['headline'] + '. ' + frame['abstract']
    frame['combi_text2'] = frame['combi_text'].str.replace(extra_punct_pattern, '.', regex=True)

### Sentiment

Sentiment plays a notable role in determining popularity. People are more likely to comment on articles with headlines that have negative sentiment, and less likely to comment on articles with headlines that have neutral sentiment. Previous [research](https://jonahberger.com/wp-content/uploads/2013/02/ViralityB.pdf) has shown that content that evokes high-arousal positive (awe) or negative (anger or anxiety) emotions tends to be more viral.

In [None]:
# Display the combined headline + abstract text of the first training row
train['combi_text'][0]

In [None]:
# Create the VADER sentiment intensity analyzer and try it on the first training text
sia = SIA()
sample_text = train['combi_text'][0]
sia.polarity_scores(sample_text)

In [None]:
def get_sentiment(row):
    """Score `row['combi_text']` with the global `sia` analyzer and store the result on the row.

    The 'pos', 'neu', 'neg' and 'compound' scores are written to `sentiment_pos`,
    `sentiment_neu`, `sentiment_neg` and `sentiment_compound`. The same row object
    is returned.
    """
    scores = sia.polarity_scores(row['combi_text'])
    for score_name in ('pos', 'neu', 'neg', 'compound'):
        row[f'sentiment_{score_name}'] = scores[score_name]
    return row

In [None]:
# Row-wise apply: `get_sentiment` adds the four `sentiment_*` values to every row.
# `progress_apply` (enabled by `tqdm.pandas()` at the top) shows a progress bar while it runs.
train = train.progress_apply(get_sentiment, axis=1)

In [None]:
# Looks like negative articles tend to be more popular.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns.
train[['sentiment_compound', 'sentiment_pos', 'sentiment_neu', 'sentiment_neg']].corrwith(train['is_popular'])

In [None]:
# Row-wise apply: `get_sentiment` adds the four `sentiment_*` values to every row.
# `progress_apply` (enabled by `tqdm.pandas()` at the top) shows a progress bar while it runs.
test = test.progress_apply(get_sentiment, axis=1)

### Headline / Abstract Length

The idea here is that longer headlines and abstracts will lead to fewer comments -- the easier the headline / abstract is to understand, the more comments the article will attract. This seems to be a factor for the abstract, while headline length doesn't seem to have much of an impact.

In [None]:
# Character length of the headline and of the abstract
for text_col in ('headline', 'abstract'):
    train[f'{text_col}_len'] = train[text_col].map(len)

In [None]:
# Character length of the headline and of the abstract
for text_col in ('headline', 'abstract'):
    test[f'{text_col}_len'] = test[text_col].map(len)

In [None]:
# Combined character length of headline and abstract
for frame in (train, test):
    frame['head_abs_len'] = frame['headline_len'].add(frame['abstract_len'])

In [None]:
# Shorter abstracts seem to do better.
# `corrwith` on the selected columns avoids building the full correlation matrix of
# `train`, which raises in recent pandas because the frame holds text columns. The earlier
# sort is dropped because selecting by label returned the two values in this fixed order anyway.
train[['abstract_len', 'headline_len']].corrwith(train['is_popular'])

### Interactive Features

There are only a few interactive articles (material type `Interactive Feature`), but generally I found that including this feature increased my model accuracy.

In [None]:
# 1 when the material type is exactly 'Interactive Feature', otherwise 0
for frame in (train, test):
    frame['is_interactive'] = frame['material'].eq('Interactive Feature').astype(int)

### Clustering

I also tried out various clustering methods on my data to see if there was a more efficient way of clustering by newsdesk or section, or by headline. Generally, I found that clustering didn't help my model accuracy much. The headlines also seemed clustered pretty close together, which made it difficult for DBSCAN to effectively separate them. Using K-means clustering seemed to work slightly better, but the topics ended up looking pretty similar. I excluded clustering from my final model, but it's worth noting that some clusters had a moderate correlation with popularity.

Credit to [Brandon Rose](http://brandonrose.org/clustering) for this part.

In [None]:
# Stack train on top of test for the clustering experiments.
# Row labels from both frames are kept as they are (no `ignore_index`).
full_df = pd.concat([train, test])

In [None]:
# Number of distinct newsdesks, sections, subsections and material types across train + test
tuple(full_df[category_col].nunique() for category_col in ('newsdesk', 'section', 'subsection', 'material'))

In [None]:
# Additional stop words: English contractions and clitics, followed by truncated word forms
# (e.g. "abov", "becaus") -- presumably stemmer output, so that they match tokens after
# stemming; verify against the stemmer.
extra_stopwords = ["ain't", "amn't", "aren't", "can't", "could've", "couldn't",
                    "daresn't", "didn't", "doesn't", "don't", "gonna", "gotta", 
                    "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd",
                    "how'll", "how's", "I'd", "I'll", "I'm", "I've", "isn't", "it'd",
                    "it'll", "it's", "let's", "mayn't", "may've", "mightn't", 
                    "might've", "mustn't", "must've", "needn't", "o'clock", "ol'",
                    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
                    "shouldn't", "somebody's", "someone's", "something's", "that'll",
                    "that're", "that's", "that'd", "there'd", "there're", "there's", 
                    "these're", "they'd", "they'll", "they're", "they've", "this's",
                    "those're", "tis", "twas", "twasn't", "wasn't", "we'd", "we'd've",
                    "we'll", "we're", "we've", "weren't", "what'd", "what'll", 
                    "what're", "what's", "what've", "when's", "where'd", "where're",
                    "where's", "where've", "which's", "who'd", "who'd've", "who'll",
                    "who're", "who's", "who've", "why'd", "why're", "why's", "won't",
                    "would've", "wouldn't", "y'all", "you'd", "you'll", "you're", 
                    "you've", "'s", "'d", "'m", "abov", "afterward", "ai", "alon", "alreadi", "alway", "ani", 
                     "anoth", "anyon", "anyth", "anywher", "becam", "becaus", "becom", "befor", 
                     "besid", "ca", "cri", "dare", "describ", "did", "doe", "dure", "els", 
                     "elsewher", "empti", "everi", "everyon", "everyth", "everywher", "fifti", 
                     "forti", "gon", "got", "henc", "hereaft", "herebi", "howev", "hundr", "inde", 
                     "let", "ll", "mani", "meanwhil", "moreov", "n't", "na", "need", "nobodi", "noon", 
                     "noth", "nowher", "ol", "onc", "onli", "otherwis", "ought", "ourselv", "perhap", 
                     "pleas", "sever", "sha", "sinc", "sincer", "sixti", "somebodi", "someon", "someth", 
                     "sometim", "somewher", "ta", "themselv", "thenc", "thereaft", "therebi", "therefor", 
                     "togeth", "twelv", "twenti", "ve", "veri", "whatev", "whenc", "whenev", 
                    "wherea", "whereaft", "wherebi", "wherev", "whi", "wo", "anywh", "el", "elsewh", "everywh", 
                    "ind", "otherwi", "plea", "somewh", "yourselv"]

# scikit-learn's built-in English stop-word list combined with the extras above
custom_stopwords = text.ENGLISH_STOP_WORDS.union(extra_stopwords)

In [None]:
# English Snowball stemmer; `tokenize_and_stem` calls `stemmer.stem` on each token
stemmer = SnowballStemmer("english")

In [None]:
def tokenize_and_stem(text, do_stem=True):
    """Split `text` into lower-cased word tokens and optionally stem them.

    Parameters
    ----------
    text : str
        Raw text to tokenize. (Inside this function the name shadows the
        `sklearn.feature_extraction.text` module imported at the top of the notebook.)
    do_stem : bool, default True
        When True, return the stem of every kept token (via the global `stemmer`);
        when False, return the kept tokens themselves.

    Returns
    -------
    list of str
        Stems or tokens, in order of appearance. Tokens without any ASCII letter
        (numbers, bare punctuation) are dropped, so both modes return lists of equal length.
    """
    # Tokenize by sentence first, then by word, so that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # Keep only tokens containing at least one letter (drops numeric tokens and raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]

    # Stemming is skipped entirely when the caller only wants the plain tokens
    if not do_stem:
        return filtered_tokens
    return [stemmer.stem(token) for token in filtered_tokens]

In [None]:
# Build two flat, parallel vocabularies over every document:
# one with the stems and one with the corresponding unstemmed tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for document in tqdm(full_df['combi_text2']):
    totalvocab_stemmed += tokenize_and_stem(document)
    totalvocab_tokenized += tokenize_and_stem(document, False)

In [None]:
# Lookup table from stem (index) to an original token (column 'words')
vocab_words = pd.Series(totalvocab_tokenized, index=totalvocab_stemmed, name='words')
vocab_frame = vocab_words.to_frame()
vocab_frame.head()

In [None]:
# Define vectorizer parameters.
# `custom_stopwords` (scikit-learn's English list plus the extras) is what gets filtered;
# passing only `extra_stopwords` left common words such as "the" in the vocabulary.
# It is converted to a sorted list because `stop_words` expects a list.
tfidf_vectorizer = TfidfVectorizer(max_features=20_000,
                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,2),
                                   min_df=0.01, stop_words=sorted(custom_stopwords))

# Fit the vectorizer to the combined headline + abstract text
tfidf_matrix = tfidf_vectorizer.fit_transform(full_df['combi_text2'])

print(tfidf_matrix.shape)

# `get_feature_names` was removed from recent scikit-learn releases in favour of
# `get_feature_names_out`; use whichever one the installed version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# K-means with 7 clusters; the fixed random_state keeps the clustering repeatable
num_clusters = 7
cluster_model = KMeans(n_clusters=num_clusters, random_state=42)

In [None]:
# Fit K-means on the TF-IDF matrix and attach each document's cluster label
cluster_model.fit(tfidf_matrix)
clusters = cluster_model.labels_.tolist()
full_df['cluster'] = clusters
full_df[['headline', 'cluster']].head()

In [None]:
# Number of articles assigned to each cluster
full_df['cluster'].value_counts()

In [None]:
print("Top terms per cluster:")
print()

# For every cluster, order the term indices by descending weight in the cluster centre
order_centroids = cluster_model.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        # The term is split into its stems, the stems are looked up in `vocab_frame`,
        # and the word of the first matching row is printed
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print('\n')

    print("Cluster %d titles:" % i, end='')
    print()
    # First 8 headlines assigned to this cluster
    for title in full_df[full_df['cluster'] == i]['headline'].values.tolist()[:8]:
        print(' - %s' % title)
    print('\n')

In [None]:
# One-hot encode the cluster label into separate indicator columns
cluster_df = pd.get_dummies(data=full_df, columns=['cluster'])

Only Cluster 0 has a moderate correlation with popularity -- however we're already accounting for political news with our previous features.

In [None]:
# Correlation of each cluster indicator with popularity, strongest first.
# Only numeric/boolean columns are correlated: calling `corr()` on the whole frame
# raises in recent pandas because the frame holds text columns.
cluster_numeric = cluster_df.select_dtypes(include=['number', 'bool'])
cluster_numeric.corr()['is_popular'].sort_values(ascending=False).filter(like='cluster')

## Process Test

In [None]:
# Columns present in train but missing from test
[column for column in train.columns if column not in test.columns]

In [None]:
# Columns present in test but missing from train
[column for column in test.columns if column not in train.columns]

In [None]:
# Training-set columns that still contain missing values after feature engineering
train_null_counts = train.isnull().sum()
train_null_counts[train_null_counts > 0]

In [None]:
# Test-set columns that still contain missing values after feature engineering
test_null_counts = test.isnull().sum()
test_null_counts[test_null_counts > 0]

In [None]:
# Correlation heatmap of all numeric features.
# Only numeric/boolean columns are correlated: calling `corr()` on the whole frame
# raises in recent pandas because the frame holds text columns.
train_numeric_corr = train.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(13,13))
sns.heatmap(train_numeric_corr, cmap='coolwarm', annot=False, square=True, fmt='.2f', cbar=True)

In [None]:
# Correlation of every numeric feature with the target, strongest positive first.
# Only numeric/boolean columns are correlated: calling `corr()` on the whole frame
# raises in recent pandas because the frame holds text columns.
target_corr = train.select_dtypes(include=['number', 'bool']).corr()[['is_popular']]

plt.figure(figsize=(10,12))
sns.heatmap(target_corr.sort_values(ascending=False, by='is_popular'),
            cmap='coolwarm', annot=True, vmax=0.8)

In [None]:
# Write the engineered training set to the Kaggle working directory, without the row index
train.to_csv('/kaggle/working/train_processed.csv', index=False)

In [None]:
# Write the engineered test set to the Kaggle working directory, without the row index
test.to_csv('/kaggle/working/test_processed.csv', index=False)