In [62]:
import pandas as pd
import numpy as np

In [63]:
df = pd.read_csv("./frequencies.csv", index_col=0)
df.index = df.index.set_names(['word'])
df = df.reset_index()
df

Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq
0,the,4202586,6898286
1,first,77581,126242
2,official,14619,27086
3,trailer,997,607
4,for,704952,1063722
...,...,...,...
103157,essentialize,1,5
103158,tasia,1,5
103159,hamami,1,5
103160,ziti,1,1


In [64]:
articles = pd.read_csv("./content.csv")
N_hyperpartasian = articles["hyperpartasian"].sum()
N_non_hyperpartasian = len(articles) - N_hyperpartasian

In [65]:
with open("stopwords.txt", 'r') as f:
    stopwords = set(f.read().split())

df = df[~df["word"].isin(stopwords)]
df

Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq
1,first,77581,126242
2,official,14619,27086
3,trailer,997,607
5,mockingjay,52,14
6,part,41270,74901
...,...,...,...
103157,essentialize,1,5
103158,tasia,1,5
103159,hamami,1,5
103160,ziti,1,1


In [66]:
df["freq"] = df["non_hyperpartisan_freq"] + df["hyperpartisan_freq"]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["freq"] = df["non_hyperpartisan_freq"] + df["hyperpartisan_freq"]


Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq,freq
1,first,77581,126242,203823
2,official,14619,27086,41705
3,trailer,997,607,1604
5,mockingjay,52,14,66
6,part,41270,74901,116171
...,...,...,...,...
103157,essentialize,1,5,6
103158,tasia,1,5,6
103159,hamami,1,5,6
103160,ziti,1,1,2


In [75]:
# frequent = (df[["hyperpartisan_freq", "non_hyperpartisan_freq"]] >= 20).all(axis=1)
frequent = (df["freq"] >= 20)
df = df[frequent]
df

Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq,freq,p_hyperpartisan,p_non_hyperpartisan,o_hyperpartisan,o_non_hyperpartisan,r
1,first,77581,126242,203823,1.034413,1.683227,-30.058504,-2.463643,2.501504
2,official,14619,27086,41705,0.194920,0.361147,0.242113,0.565305,-0.847962
3,trailer,997,607,1604,0.013293,0.008093,0.013472,0.008159,0.501478
5,mockingjay,52,14,66,0.000693,0.000187,0.000694,0.000187,1.312693
6,part,41270,74901,116171,0.550267,0.998680,1.223540,756.575758,-6.427054
...,...,...,...,...,...,...,...,...,...
103062,valeriano,1,54,55,0.000013,0.000720,0.000013,0.000721,-3.989691
103064,trichet,1,128,129,0.000013,0.001707,0.000013,0.001710,-4.853725
103070,glendening,1,19,20,0.000013,0.000253,0.000013,0.000253,-2.944679
103089,conseil,1,23,24,0.000013,0.000307,0.000013,0.000307,-3.135788


In [76]:
df["p_hyperpartisan"] = df["hyperpartisan_freq"] / N_hyperpartasian
df["p_non_hyperpartisan"] = df["non_hyperpartisan_freq"] / N_non_hyperpartasian

df["o_hyperpartisan"] = df["p_hyperpartisan"] / (1 - df["p_hyperpartisan"])
df["o_non_hyperpartisan"] = df["p_non_hyperpartisan"] / (1 - df["p_non_hyperpartisan"])

df["r"] = np.log(df["o_hyperpartisan"] / df["o_non_hyperpartisan"])

df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq,freq,p_hyperpartisan,p_non_hyperpartisan,o_hyperpartisan,o_non_hyperpartisan,r
1,first,77581,126242,203823,1.034413,1.683227,-30.058504,-2.463643,2.501504
2,official,14619,27086,41705,0.194920,0.361147,0.242113,0.565305,-0.847962
3,trailer,997,607,1604,0.013293,0.008093,0.013472,0.008159,0.501478
5,mockingjay,52,14,66,0.000693,0.000187,0.000694,0.000187,1.312693
6,part,41270,74901,116171,0.550267,0.998680,1.223540,756.575758,-6.427054
...,...,...,...,...,...,...,...,...,...
103062,valeriano,1,54,55,0.000013,0.000720,0.000013,0.000721,-3.989691
103064,trichet,1,128,129,0.000013,0.001707,0.000013,0.001710,-4.853725
103070,glendening,1,19,20,0.000013,0.000253,0.000013,0.000253,-2.944679
103089,conseil,1,23,24,0.000013,0.000307,0.000013,0.000307,-3.135788


In [82]:
non_hyper = df.sort_values("r").head(50).reset_index(drop=True)
hyper = df.sort_values("r", ascending=False).head(50).reset_index(drop=True)

pd.concat([hyper, non_hyper]).to_csv("words_highest.csv", index=False)

In [81]:
df

Unnamed: 0,word,hyperpartisan_freq,non_hyperpartisan_freq,freq,p_hyperpartisan,p_non_hyperpartisan,o_hyperpartisan,o_non_hyperpartisan,r
1,first,77581,126242,203823,1.034413,1.683227,-30.058504,-2.463643,2.501504
2,official,14619,27086,41705,0.194920,0.361147,0.242113,0.565305,-0.847962
3,trailer,997,607,1604,0.013293,0.008093,0.013472,0.008159,0.501478
5,mockingjay,52,14,66,0.000693,0.000187,0.000694,0.000187,1.312693
6,part,41270,74901,116171,0.550267,0.998680,1.223540,756.575758,-6.427054
...,...,...,...,...,...,...,...,...,...
103062,valeriano,1,54,55,0.000013,0.000720,0.000013,0.000721,-3.989691
103064,trichet,1,128,129,0.000013,0.001707,0.000013,0.001710,-4.853725
103070,glendening,1,19,20,0.000013,0.000253,0.000013,0.000253,-2.944679
103089,conseil,1,23,24,0.000013,0.000307,0.000013,0.000307,-3.135788
