In [1]:
from pathlib import Path

import pandas as pd
from corpus_toolkit import corpus_tools as ct

from helpers import load_data, load_stop_word_list, is_all_stop_words, get_groups

  import pkg_resources


In [2]:
# Load the full dataset through the project helper and preview the first rows.
# The rest of the notebook relies on its "text", "year" and "text_type" columns.
dataset_df = load_data()
dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017


In [3]:
# Stop-word list read (as a global) by get_keyness_df_for_year to drop tokens
# for which is_all_stop_words(...) is true.
stopword_list = load_stop_word_list()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def get_keyness_df_for_year(df: pd.DataFrame, year: int):
    """Rank tokens by keyness for one year against every other year in ``df``.

    The rows of ``df`` whose ``"year"`` equals ``year`` form the target corpus;
    all remaining rows form the reference corpus. Both are tokenized and
    counted with ``corpus_toolkit`` and compared with ``ct.keyness``.

    Args:
        df: Frame with at least a ``"year"`` and a ``"text"`` column.
        year: Value of ``"year"`` selecting the target corpus.

    Returns:
        A DataFrame with columns ``token``, ``measure`` (the score returned by
        ``ct.keyness``) and ``year``, sorted by ``measure`` in descending
        order, with every token for which
        ``is_all_stop_words(token, stopword_list)`` holds removed.

    Note:
        Reads the notebook-level ``stopword_list``; that cell must have run.
    """
    # Target corpus: texts from `year`; reference corpus: every other row.
    in_year_texts = df.loc[df["year"] == year, "text"]
    other_texts = df.loc[df["year"] != year, "text"]

    # Materialise the tokenizer output before counting frequencies.
    in_year_tokens = list(ct.tokenize(in_year_texts))
    other_tokens = list(ct.tokenize(other_texts))

    scores = ct.keyness(ct.frequency(in_year_tokens), ct.frequency(other_tokens))

    ranked = pd.DataFrame({"token": scores.keys(), "measure": scores.values()})
    ranked = ranked.sort_values("measure", ascending=False)
    ranked["year"] = year

    # Drop tokens the helper classifies as consisting only of stop words.
    all_stop = ranked["token"].apply(is_all_stop_words, args=(stopword_list,))
    return ranked[~all_stop]

In [5]:
# Distinct years present in the dataset, sorted in place in ascending order.
years = dataset_df["year"].unique()
years.sort()
years

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
       2020, 2021, 2022, 2023, 2024], dtype=int64)

In [6]:
# Number of top-ranked keyness rows previewed below and written to each Excel sheet.
TOP_N = 20

In [7]:
# One keyness table per year, in the same order as `years`;
# preview the top rows for the first year.
keyness_years_dfs = [get_keyness_df_for_year(dataset_df, year) for year in years]
keyness_years_dfs[0].head(TOP_N)

Unnamed: 0,token,measure,year
565,end-of-life,35.436108,2009
1532,mossadegh,35.213715,2009
2335,medigap,34.950681,2009
307,abortion-on-demand,34.628753,2009
53,geithner,34.213715,2009
1373,cairo,34.213715,2009
1651,recoverygov,34.213715,2009
377,orszag,34.213715,2009
2127,euro-pols,34.213715,2009
1242,interpersonal,33.628753,2009


## Writing DataFrames to an Excel spreadsheet

Save the top-ranked keywords for every year to a single Excel file, with one sheet (tab) per year.

In [8]:
def save_years(writer: pd.ExcelWriter, dfs: list[pd.DataFrame], years: list[int]):
    """Write the first ``TOP_N`` rows of each frame to its own worksheet.

    ``years`` and ``dfs`` are paired positionally; each sheet is named after
    ``str(year)`` and written without the DataFrame index.

    Args:
        writer: Open Excel writer that receives the sheets. It is not closed here.
        dfs: One DataFrame per year.
        years: Year labels, in the same order as ``dfs``.
    """
    for sheet_year, year_df in zip(years, dfs):
        top_rows = year_df.head(TOP_N)
        top_rows.to_excel(writer, sheet_name=str(sheet_year), index=False)

In [9]:
# Make sure the output folder exists so the cell also works on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)

# The context manager closes the workbook even if writing a sheet raises,
# so the file handle is never left open.
with pd.ExcelWriter("./output/keyness.xlsx", engine="xlsxwriter") as writer:
    save_years(writer, keyness_years_dfs, years)

### Now repeat the analysis for each text type, writing one workbook per type

In [10]:
# Split the dataset on the "text_type" column. The loop below pairs `types`
# with `types_dfs` positionally, so get_groups presumably returns the group
# labels and a parallel list of per-group DataFrames (verify in helpers).
types, types_dfs = get_groups(dataset_df, "text_type")
types_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017
6,http://www.politifact.com/california/statement...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017
7,http://www.politifact.com/california/statement...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017


In [13]:
# Repeat the per-year keyness export once per text type, producing one workbook
# per type at ./output/<type>/keyness_<type>.xlsx.
#
# Loop-local names (`text_type`, `type_years`, `type_keyness_dfs`) are used on
# purpose: they avoid shadowing the builtin `type` and avoid overwriting the
# dataset-wide `years` / `keyness_years_dfs` computed earlier, so earlier cells
# can be re-run afterwards and still export the same data.
for text_type, type_df in zip(types, types_dfs):
    # Only years that actually occur for this text type get a sheet.
    type_years = type_df["year"].unique()
    type_years.sort()

    # Within a text type, each year is compared against the other years of that
    # same type (the reference corpus is taken from `type_df`).
    type_keyness_dfs = [get_keyness_df_for_year(type_df, year) for year in type_years]

    # File-system friendly label, e.g. "News and blog" -> "news_and_blog".
    type_str = str(text_type).lower().replace(" ", "_")

    # Create the per-type folder so a fresh checkout can run this cell.
    (Path("./output") / type_str).mkdir(parents=True, exist_ok=True)

    # The context manager closes the workbook even if writing a sheet raises.
    with pd.ExcelWriter(f"./output/{type_str}/keyness_{type_str}.xlsx", engine="xlsxwriter") as writer:
        save_years(writer, type_keyness_dfs, type_years)