In [1]:
import pandas as pd
import numpy as np

In [21]:
df = pd.read_csv("./bigram_frequencies.csv", index_col=0)
df.index = df.index.set_names(['bigram_text'])
df = df.reset_index()
df

Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq
0,"('the', 'first')",36498,59794
1,"('first', 'official')",135,245
2,"('official', 'trailer')",25,2
3,"('trailer', 'for')",216,52
4,"('mockingjay', 'part')",30,14
...,...,...,...
2437335,"('paso', 'had')",1,18
2437336,"('tour', 'some')",1,4
2437337,"('tremendously', 'i')",1,3
2437338,"('damage', 'here')",1,5


In [25]:
import re

def str2tuple(x):
    match = re.match(r"\(\'(.+)\'.+\'(.+)\'", x)
    if match:
        return (match.group(1), match.group(2))

In [26]:
df['bigram'] = df['bigram_text'].map(convert)
df

Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq,bigram
0,"('the', 'first')",36498,59794,"(the, first)"
1,"('first', 'official')",135,245,"(first, official)"
2,"('official', 'trailer')",25,2,"(official, trailer)"
3,"('trailer', 'for')",216,52,"(trailer, for)"
4,"('mockingjay', 'part')",30,14,"(mockingjay, part)"
...,...,...,...,...
2437335,"('paso', 'had')",1,18,"(paso, had)"
2437336,"('tour', 'some')",1,4,"(tour, some)"
2437337,"('tremendously', 'i')",1,3,"(tremendously, i)"
2437338,"('damage', 'here')",1,5,"(damage, here)"


In [4]:
articles = pd.read_csv("./content.csv")
N_hyperpartasian = articles["hyperpartasian"].sum()
N_non_hyperpartasian = len(articles) - N_hyperpartasian

In [35]:
with open("stopwords.txt", 'r') as f:
    stopwords = set(f.read().split())

contains_stopword = df["bigram"].map(lambda x: (x[0] in stopwords) or (x[1] in stopwords))
df = df[~contains_stopword]

df

Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq,bigram,freq
1,"('first', 'official')",135,245,"(first, official)",380
22,"('first', 'thing')",584,679,"(first, thing)",1263
40,"('first', 'two')",766,1237,"(first, two)",2003
41,"('two', 'movies')",34,26,"(two, movies)",60
70,"('destroy', 'us')",87,46,"(destroy, us)",133
...,...,...,...,...,...
2003867,"('de', 'le')",38,145,"(de, le)",183
2047220,"('girls', 'revolt')",35,20,"(girls, revolt)",55
2092404,"('perez', 'molina')",36,27,"(perez, molina)",63
2120951,"('transportation', 'commission')",44,126,"(transportation, commission)",170


In [36]:
df["freq"] = df["non_hyperpartisan_freq"] + df["hyperpartisan_freq"]
df

Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq,bigram,freq
1,"('first', 'official')",135,245,"(first, official)",380
22,"('first', 'thing')",584,679,"(first, thing)",1263
40,"('first', 'two')",766,1237,"(first, two)",2003
41,"('two', 'movies')",34,26,"(two, movies)",60
70,"('destroy', 'us')",87,46,"(destroy, us)",133
...,...,...,...,...,...
2003867,"('de', 'le')",38,145,"(de, le)",183
2047220,"('girls', 'revolt')",35,20,"(girls, revolt)",55
2092404,"('perez', 'molina')",36,27,"(perez, molina)",63
2120951,"('transportation', 'commission')",44,126,"(transportation, commission)",170


In [37]:
frequent = (df[["hyperpartisan_freq", "non_hyperpartisan_freq"]] >= 20).all(axis=1)
# frequent = (df["freq"] >= 20)
df = df[frequent]
df

Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq,bigram,freq
1,"('first', 'official')",135,245,"(first, official)",380
22,"('first', 'thing')",584,679,"(first, thing)",1263
40,"('first', 'two')",766,1237,"(first, two)",2003
41,"('two', 'movies')",34,26,"(two, movies)",60
70,"('destroy', 'us')",87,46,"(destroy, us)",133
...,...,...,...,...,...
2003867,"('de', 'le')",38,145,"(de, le)",183
2047220,"('girls', 'revolt')",35,20,"(girls, revolt)",55
2092404,"('perez', 'molina')",36,27,"(perez, molina)",63
2120951,"('transportation', 'commission')",44,126,"(transportation, commission)",170


In [38]:
df["p_hyperpartisan"] = df["hyperpartisan_freq"] / N_hyperpartasian
df["p_non_hyperpartisan"] = df["non_hyperpartisan_freq"] / N_non_hyperpartasian

df["o_hyperpartisan"] = df["p_hyperpartisan"] / (1 - df["p_hyperpartisan"])
df["o_non_hyperpartisan"] = df["p_non_hyperpartisan"] / (1 - df["p_non_hyperpartisan"])

df["r"] = np.log(df["o_hyperpartisan"] / df["o_non_hyperpartisan"])

df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,bigram_text,hyperpartisan_freq,non_hyperpartisan_freq,bigram,freq,p_hyperpartisan,p_non_hyperpartisan,o_hyperpartisan,o_non_hyperpartisan,r
1,"('first', 'official')",135,245,"(first, official)",380,0.001800,0.003267,0.001803,0.003277,-0.597454
22,"('first', 'thing')",584,679,"(first, thing)",1263,0.007787,0.009053,0.007848,0.009136,-0.151998
40,"('first', 'two')",766,1237,"(first, two)",2003,0.010213,0.016493,0.010319,0.016770,-0.485627
41,"('two', 'movies')",34,26,"(two, movies)",60,0.000453,0.000347,0.000454,0.000347,0.268371
70,"('destroy', 'us')",87,46,"(destroy, us)",133,0.001160,0.000613,0.001161,0.000614,0.637814
...,...,...,...,...,...,...,...,...,...,...
2003867,"('de', 'le')",38,145,"(de, le)",183,0.000507,0.001933,0.000507,0.001937,-1.340576
2047220,"('girls', 'revolt')",35,20,"(girls, revolt)",55,0.000467,0.000267,0.000467,0.000267,0.559816
2092404,"('perez', 'molina')",36,27,"(perez, molina)",63,0.000480,0.000360,0.000480,0.000360,0.287802
2120951,"('transportation', 'commission')",44,126,"(transportation, commission)",170,0.000587,0.001680,0.000587,0.001683,-1.053187


In [41]:
non_hyper = df.sort_values("r").head(50).reset_index(drop=True)
hyper = df.sort_values("r", ascending=False).head(50).reset_index(drop=True)

In [43]:
pd.concat([hyper, non_hyper]).to_csv("bigrams_highest.csv", index=False)