In [1]:
import pandas as pd
from features import average_length, count_words, sensational_words, unique_ratio, upper_count
from preprocess import preprocess

In [2]:
# load the cleaned baseline and news headline datasets
baseline = pd.read_csv('../../data/baseline_clean.csv')
news = pd.read_csv('../../data/news_clean.csv')

# never truncate cell text so full headlines are visible;
# None is the documented "unlimited" value for this option (0 only worked by accident)
pd.set_option('display.max_colwidth', None)

In [3]:
# headline_length: number of words in each raw headline (via count_words)
for frame in (baseline, news):
    frame['headline_length'] = frame['headline'].apply(count_words)

In [4]:
# headline_processed: each headline run through preprocess(), which lower-cases it, strips punctuation,
# tokenizes, drops stop words, lemmatizes and stems the tokens, then re-joins them into one string
for frame in (baseline, news):
    processed_headlines = frame['headline'].apply(preprocess)
    frame['headline_processed'] = processed_headlines

In [5]:
# unique_words: ratio of unique to total words, computed on the processed headline
for frame in (baseline, news):
    frame['unique_words'] = frame['headline_processed'].apply(unique_ratio)

In [6]:
# the 20 most frequent processed words across the Buzzfeed and Entertainment Weekly headlines
tabloid_sources = ['Buzzfeed', 'Entertainment Weekly']
tabloids = baseline[baseline['source'].isin(tabloid_sources)]
tabloid_tokens = ' '.join(tabloids['headline_processed']).split()
pd.Series(tabloid_tokens).value_counts().head(20)

product    38
gift       21
want       15
make       14
say        12
peopl      12
home       12
thatll     10
star       9 
new        9 
season     9 
realli     9 
perri      8 
get        8 
matthew    8 
first      7 
look       7 
share      7 
thing      7 
kid        6 
Name: count, dtype: int64

In [None]:
# buzzword_frequency: sensational words per total words in a headline.
# `sensational_words` is a list of common tabloid words plus the top 20 base words from
# Buzzfeed / Entertainment Weekly (see the frequency table above).

# Build the lookup once. Using a set means a word that appears more than once in
# `sensational_words` is still counted once per occurrence in the headline (the previous
# pairwise comparison counted it once per duplicate, which could push the ratio above 1),
# and each token is checked in constant time instead of rescanning the whole list.
buzzwords = frozenset(sensational_words)


def buzzword_count(processed_headline):
    """Return the number of tokens in a processed headline that are buzzwords.

    processed_headline: whitespace-separated string of processed words.
    """
    return sum(token in buzzwords for token in processed_headline.split())


# Same feature for both datasets: buzzword hits in the processed headline divided by the
# word count of the raw headline (headline_length).
# NOTE(review): a headline_length of 0 would yield NaN/inf here — confirm no such rows exist.
for frame in (baseline, news):
    frame['buzzword_frequency'] = frame['headline_processed'].apply(buzzword_count) / frame['headline_length']

In [None]:
# average word length per headline, computed for both the raw and the processed text
length_features = {
    'word_length': 'headline',
    'word_length_processed': 'headline_processed',
}
for frame in (baseline, news):
    for feature_name, text_column in length_features.items():
        frame[feature_name] = frame[text_column].apply(average_length)

In [None]:
# upper_words: share of words in the raw headline that start with an upper-case letter
# (upper_count is applied to the unprocessed text, since preprocessing lower-cases it)
for frame in (baseline, news):
    frame['upper_words'] = frame['headline'].apply(upper_count)

In [None]:
# put both frames into the same column order: identifiers first, then the engineered features
feature_columns = [
    'source',
    'headline',
    'headline_processed',
    'headline_length',
    'unique_words',
    'buzzword_frequency',
    'word_length',
    'word_length_processed',
    'upper_words',
]
baseline = baseline[feature_columns]
news = news[feature_columns]

In [None]:
# label each baseline headline by its source:
#   class 0 (non-sensational control): Associated Press, Reuters
#   class 1 (sensational): Buzzfeed, Entertainment Weekly
label_by_source = {
    'Associated Press': 0,
    'Reuters': 0,
    'Buzzfeed': 1,
    'Entertainment Weekly': 1,
}

# map() builds the label column directly instead of copying the source strings and overwriting
# them one source at a time, so the column never mixes strings and ints. The nullable Int64 dtype
# keeps the labels as integers, and a source missing from the mapping becomes <NA> rather than
# leaking its name into the label column. assign() returns a new frame, which avoids writing
# into the column-selected frame from the reorder step.
baseline = baseline.assign(sensational=baseline['source'].map(label_by_source).astype('Int64'))

In [None]:
# write both feature tables to CSV (relative paths, without the index column)
outputs = (
    ('baseline_features.csv', baseline),
    ('news_features.csv', news),
)
for filename, frame in outputs:
    frame.to_csv(filename, index=False)