In [18]:
import pandas as pd
from features import count_words, sensational_words, unique_ratio
from preprocess import preprocess

In [19]:
# Load the cleaned headline datasets (paths are relative to this notebook).
baseline = pd.read_csv('../../data/baseline_clean.csv')
news = pd.read_csv('../../data/news_clean.csv')

# Show full headlines instead of truncating them with "...".
# None is the documented "no limit" value for display.max_colwidth;
# 0 is not documented as meaning "unlimited".
pd.set_option('display.max_colwidth', None)

In [20]:
# 'length' = count_words(headline), i.e. the number of words in each raw headline;
# computed the same way for both datasets
for frame in (baseline, news):
    frame['length'] = frame['headline'].apply(count_words)

In [21]:
# 'unique' = unique_ratio(headline), i.e. the ratio of unique words to total words
# in each raw headline; computed the same way for both datasets
for frame in (baseline, news):
    frame['unique'] = frame['headline'].apply(unique_ratio)

In [22]:
# 'headline_processed' = preprocess(headline) for both datasets.
# preprocess (imported from the preprocess module) is expected to lower-case the text,
# strip punctuation, tokenize, drop stop words, lemmatize and stem, and then
# re-join the tokens into a single space-separated string.
for frame in (baseline, news):
    frame['headline_processed'] = frame['headline'].apply(preprocess)

In [23]:
# the 20 most frequent processed tokens across every non-Reuters source in the baseline set
tabloids = baseline.loc[baseline['source'].ne('Reuters')]
(
    pd.Series(' '.join(tabloids['headline_processed']).split())
    .value_counts()
    .head(20)
)

product    37
gift       17
thing      15
make       15
thatll     14
season     13
new        13
realli     12
home       12
day        12
star       11
want       11
youll      10
peopl      10
say        9 
perri      9 
matthew    9 
movi       7 
one        7 
get        7 
Name: count, dtype: int64

In [24]:
def sensational_frequency(frame, words):
    """Return, per headline, the share of words that are sensational.

    For every row the processed headline is split on whitespace once, the
    occurrences of each entry of ``words`` among those tokens are summed, and
    the total is divided by the row's ``length`` value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'headline_processed' (space-separated string)
        and 'length'.
    words : iterable of str
        Tokens to look for; an entry listed twice is counted twice.

    Returns
    -------
    pandas.Series
        Sensational-word count divided by 'length', aligned with ``frame``.
        A 'length' of 0 yields NaN/inf (pandas division), not an exception.
    """
    def count_hits(text):
        # split once per headline instead of once per sensational word
        tokens = text.split()
        return sum(tokens.count(word) for word in words)

    return frame['headline_processed'].apply(count_hits) / frame['length']


# 'frequency' = (# sensational words in the processed headline) / (headline length)
baseline['frequency'] = sensational_frequency(baseline, sensational_words)
news['frequency'] = sensational_frequency(news, sensational_words)

In [29]:
# keep only these columns, in this order, for both datasets
column_order = ['source', 'headline', 'headline_processed', 'length', 'unique', 'frequency']
baseline = baseline[column_order]
news = news[column_order]

In [30]:
# preview the first five rows of the baseline set with the engineered features
baseline.head(n=5)

Unnamed: 0,source,headline,headline_processed,length,unique,frequency
0,Reuters,"Germany's Kuehne examines offer for Signa's Hamburg skyscraper, Handelsblatt reports",germani kuehn examin offer signa hamburg skyscrap handelsblatt report,10,1.0,0.0
1,Reuters,Shoppers click 'buy' as retailers slash prices ahead of Cyber Monday,shopper click buy retail slash price ahead cyber monday,11,1.0,0.0
2,Reuters,US Black Friday sales rise 2.5% -Mastercard Spendingpulse,u black friday sale rise mastercard spendingpuls,8,1.0,0.0
3,Reuters,X may lose up to $75M by year-end on advertiser exodus,x may lose yearend advertis exodu,11,1.0,0.0
4,Reuters,Sri Lanka to OK Sinopec's $4.5 bln refinery proposal on Monday -minister,sri lanka ok sinopec bln refineri propos monday minist,12,1.0,0.0
