In [50]:
import pandas as pd

In [51]:
# extend col width to see full headline
pd.set_option('display.max_colwidth', 0)

baseline = pd.read_csv('../data/baseline.csv')
baseline.head()

Unnamed: 0,headline,source,sensational
0,"Germany's Kuehne examines offer for Signa's Hamburg skyscraper, Handelsblatt reports",reuters,0
1,Shoppers click 'buy' as retailers slash prices ahead of Cyber Monday,reuters,0
2,US Black Friday sales rise 2.5% -Mastercard Spendingpulse,reuters,0
3,X may lose up to $75M by year-end on advertiser exodus,reuters,0
4,Sri Lanka to OK Sinopec's $4.5 bln refinery proposal on Monday -minister,reuters,0


In [52]:
# creating feature for number of words in each headline
def count_words(text):
    return len(text.split())

baseline['headline_length'] = baseline['headline'].apply(count_words)

In [53]:
# creating feature for a ratio of unique words to total words in a headline
def unique_ratio(text):
    words = text.split()
    unique_words = len(set(words)) / len(words)
    return unique_words

baseline['unique_ratio'] = baseline['headline'].apply(unique_ratio)

In [54]:
# recall function to preprocess text for NLP
from preprocess_text import preprocess

baseline['headline_processed'] = baseline['headline'].apply(preprocess)

In [55]:
# get dataframe w/ just tabloids
tabloids = baseline[baseline['source'] != 'reuters']
tabloids.head()

Unnamed: 0,headline,source,sensational,headline_length,unique_ratio,headline_processed
98,Fans Are Defending Blue Ivy After Beyoncé Revealed She Saw The Online Criticism About Her First Renaissance Tour,buzzfeed,1,18,1.0,fan defend blue ivi beyoncé reveal saw onlin critic first renaiss tour
99,34 Living Room Products That’ll Make Your TV-Watching Spot Even Comfier,buzzfeed,1,11,1.0,live room product thatll make tvwatch spot even comfier
100,Indulge In A Self-Care Day With These 25 Personal Care Products,buzzfeed,1,11,1.0,indulg selfcar day person care product
101,Here The Funniest Weekend Tweets,buzzfeed,1,5,1.0,funniest weekend tweet
102,Check Out These 30 Budget-Friendly Alternatives To Fancy Name Brand Beauty Products,buzzfeed,1,12,1.0,check budgetfriendli altern fanci name brand beauti product


In [56]:
# recall top 20 most frequent base words in tabloids
pd.Series(' '.join(tabloids['headline_processed']).split()).value_counts()[:20]

product    38
gift       17
thing      15
make       15
thatll     14
season     13
new        13
home       12
day        12
realli     12
youll      11
want       11
say        11
star       11
peopl      10
matthew    9 
perri      9 
get        8 
one        7 
first      7 
Name: count, dtype: int64

In [57]:
# commonly found words in tabloids joined w/ top 20 base words from buzzfeed, ew
sensational_words = ['Shocking','Exclusive','Scandal','Secrets','Revealed','Explosive',
'Bizarre','Sensational','Outrageous','Forbidden','Terrifying','Stunning','Jaw-Dropping',
'Unbelievable','Extraordinary','Suddenly','Shockingly','Secretly','Allegedly',
'Dramatically','Mysteriously','Surprisingly','Eerily','Incredibly','Unexpectedly']

sensational_words = ' '.join(sensational_words)
sensational_words = preprocess(sensational_words)
sensational_words = sensational_words + 'product gift thing make thatll season new home day realli youll want say star peopl get one first'
sensational_words = list(sensational_words.split(' '))
sensational_words

['shock',
 'exclus',
 'scandal',
 'secret',
 'reveal',
 'explos',
 'bizarr',
 'sensat',
 'outrag',
 'forbidden',
 'terrifi',
 'stun',
 'jawdrop',
 'unbeliev',
 'extraordinari',
 'suddenli',
 'shockingli',
 'secretli',
 'allegedli',
 'dramat',
 'mysteri',
 'surprisingli',
 'eerili',
 'incred',
 'unexpectedliproduct',
 'gift',
 'thing',
 'make',
 'thatll',
 'season',
 'new',
 'home',
 'day',
 'realli',
 'youll',
 'want',
 'say',
 'star',
 'peopl',
 'get',
 'one',
 'first']

In [58]:
# get frequency, frequency ratio of commonly used sensational words
baseline['frequency'] = baseline['headline_processed'].apply(lambda row: sum([row.split().count(word) for word in sensational_words]))
baseline['frequency_ratio'] = baseline['frequency'] / baseline['headline_length']

In [62]:
baseline.head()

Unnamed: 0,headline,source,sensational,headline_length,unique_ratio,headline_processed,frequency,frequency_ratio
0,"Germany's Kuehne examines offer for Signa's Hamburg skyscraper, Handelsblatt reports",reuters,0,10,1.0,germani kuehn examin offer signa hamburg skyscrap handelsblatt report,0,0.0
1,Shoppers click 'buy' as retailers slash prices ahead of Cyber Monday,reuters,0,11,1.0,shopper click buy retail slash price ahead cyber monday,0,0.0
2,US Black Friday sales rise 2.5% -Mastercard Spendingpulse,reuters,0,8,1.0,u black friday sale rise mastercard spendingpuls,0,0.0
3,X may lose up to $75M by year-end on advertiser exodus,reuters,0,11,1.0,x may lose yearend advertis exodu,0,0.0
4,Sri Lanka to OK Sinopec's $4.5 bln refinery proposal on Monday -minister,reuters,0,12,1.0,sri lanka ok sinopec bln refineri propos monday minist,0,0.0
