In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import Axes

In [None]:
def _iter_workbook_year_dfs(workbook_path: str):
    """Yield one DataFrame per sheet of the Excel workbook at `workbook_path`.

    Each yielded frame gets two extra columns:

    * ``year`` - the sheet name, exactly as stored in the workbook (a string).
    * ``proportion`` - ``ngram_count`` divided by the sum of ``ngram_count``
      over that sheet.
    """
    # The context manager closes the workbook handle once the last sheet has
    # been read (or the generator is closed early), instead of leaking it.
    with pd.ExcelFile(workbook_path) as workbook:
        for year in workbook.sheet_names:
            df = pd.read_excel(workbook, sheet_name=year)

            df["year"] = year
            df["proportion"] = df["ngram_count"] / df["ngram_count"].sum()

            yield df


def get_years_dfs(misinfotext_path: str, fakespeak_path: str):
    """Yield the per-sheet n-gram frames of both workbooks, MisInfoText first.

    Parameters
    ----------
    misinfotext_path : str
        Path to the first Excel workbook; its sheets are yielded first.
    fakespeak_path : str
        Path to the second Excel workbook; it is only opened after every
        sheet of the first workbook has been consumed.

    Yields
    ------
    pd.DataFrame
        One frame per sheet, with ``year`` (the sheet name) and ``proportion``
        (share of the sheet's total ``ngram_count``) columns added.
    """
    for workbook_path in (misinfotext_path, fakespeak_path):
        yield from _iter_workbook_year_dfs(workbook_path)

def get_full_df(misinfotext_path: str, fakespeak_path: str):
    """Stack the per-sheet frames of both workbooks into a single DataFrame.

    The frames come from `get_years_dfs` and are concatenated in the order
    they are yielded; the row index of each sheet is kept as read.
    """
    yearly_frames = get_years_dfs(misinfotext_path, fakespeak_path)
    return pd.concat(yearly_frames)

In [None]:
def get_ranked_proportion_df(df: pd.DataFrame):
    """Total each n-gram's proportion and rank the n-grams within every `n`.

    Rows of `df` are grouped by (``ngram_text``, ``n``) and their
    ``proportion`` values summed. The result has the columns ``ngram_text``,
    ``n`` and ``proportion``, ordered by ascending ``n`` and, within the same
    ``n``, by descending summed proportion.
    """
    summed = df.groupby(["ngram_text", "n"])["proportion"].sum()
    totals = summed.reset_index()

    return totals.sort_values(by=["n", "proportion"], ascending=[True, False])

In [None]:
def get_ngram_proportion_in_year(df: pd.DataFrame, year: str, ngram: str):
    """Look up the proportion recorded for `ngram` in `year`.

    Returns the ``proportion`` of the first row whose ``year`` and
    ``ngram_text`` both match, or ``0`` when no row matches.
    """
    in_year = df["year"] == year
    is_ngram = df["ngram_text"] == ngram
    hits = df.loc[in_year & is_ngram]

    if len(hits) > 0:
        # Only the first matching row is used, even if several exist.
        return hits["proportion"].iloc[0]

    return 0

In [None]:
def save_ngrams_over_years_plot(df: pd.DataFrame, ranked_prop_df: pd.DataFrame, num_ngrams: int, title_template: str, save_path: str):
    """Plot the yearly proportion of the top n-grams for n = 1..5 and save it.

    The figure has one panel per n-gram size. In each panel, the first
    `num_ngrams` rows of `ranked_prop_df` with that ``n`` are drawn as one
    line each, using the proportion looked up in `df` for every year from
    2007 to 2024 (0 where the n-gram has no row for that year).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``year``, ``ngram_text`` and ``proportion`` columns.
    ranked_prop_df : pd.DataFrame
        Frame with ``n`` and ``ngram_text`` columns, already ordered so that
        the n-grams to plot come first within each ``n``.
    num_ngrams : int
        Number of n-grams drawn per panel.
    title_template : str
        Figure title; every ``NUM_NGRAMS`` occurrence is replaced by
        `num_ngrams`.
    save_path : str
        Where the figure is written before it is shown.
    """
    ngram_sizes = range(1, 6)
    years = range(2007, 2025)

    # Computed once: the same ticks and lookup keys are used in every panel.
    year_ticks = list(years)
    # `year` values are looked up as strings, so the labels double as keys.
    year_labels = [str(year) for year in years]

    fig, axes = plt.subplots(
        nrows=len(ngram_sizes), ncols=1,
        figsize=(20, 40),
        squeeze=False)

    fig.suptitle(title_template.replace("NUM_NGRAMS", str(num_ngrams)), fontsize=16)
    fig.subplots_adjust(top=0.95)

    for ax, n in zip(axes.flatten(), ngram_sizes):
        # Plain list of strings, so it can serve directly as legend labels.
        top_ngrams = (
            ranked_prop_df.loc[ranked_prop_df["n"] == n, "ngram_text"]
            .head(num_ngrams)
            .tolist()
        )

        ax.set_title(f"n = {n}")

        ax.set_xlabel("year")
        ax.set_ylabel("proportion of total n-gram counts")

        ax.grid(True)
        ax.tick_params(axis="x", rotation=45)

        for ngram in top_ngrams:
            data = [
                get_ngram_proportion_in_year(df, year_label, ngram)
                for year_label in year_labels
            ]

            ax.plot(year_ticks, data, marker="o")

        # Set after plotting so every panel shows each year, even an empty one.
        ax.set_xticks(year_ticks)
        ax.set_xticklabels(year_labels)

        # Skip the legend when nothing was drawn to avoid an empty legend box.
        if top_ngrams:
            ax.legend(top_ngrams)

    # Save through the figure itself rather than pyplot's "current figure".
    fig.savefig(save_path)

    plt.show()

In [None]:
# Build the long-format n-gram frame from the two top-level workbooks.
full_df = get_full_df(
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_ngrams.xlsx",
    misinfotext_path="./data/MisInfoText/Analysis_output/MisInfoText_ngrams.xlsx",
)
full_df

In [None]:
# Rank n-grams within each n by their proportion summed over all sheets.
full_ranked_proportion_df = get_ranked_proportion_df(df=full_df)
full_ranked_proportion_df

In [None]:
# Draw and save the five-panel figure for the top-level workbooks.
save_ngrams_over_years_plot(
    full_df,
    full_ranked_proportion_df,
    num_ngrams=5,
    title_template="Top NUM_NGRAMS per year, for each n = 1 to 5",
    save_path="./visualization/ngrams/plots_over_time/full.png",
)

In [None]:
# Build the long-format n-gram frame from the news_and_blog workbooks.
news_and_blog_df = get_full_df(
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/news_and_blog/Fakespeak_news_and_blog_ngrams.xlsx",
    misinfotext_path="./data/MisInfoText/Analysis_output/news_and_blog/MisInfoText_news_and_blog_ngrams.xlsx",
)
news_and_blog_df

In [None]:
# Rank news_and_blog n-grams within each n by their summed proportion.
news_and_blog_ranked_proportion_df = get_ranked_proportion_df(df=news_and_blog_df)
news_and_blog_ranked_proportion_df

In [None]:
# Draw and save the five-panel figure for the news_and_blog workbooks.
# The title names news and blog so it matches the data and the output file.
save_ngrams_over_years_plot(
    df=news_and_blog_df,
    ranked_prop_df=news_and_blog_ranked_proportion_df,
    num_ngrams=5,
    title_template="Top NUM_NGRAMS per year, for each n = 1 to 5, for news and blog",
    save_path="./visualization/ngrams/plots_over_time/news_and_blog.png"
)

In [None]:
# Build the long-format n-gram frame from the press_release workbooks.
press_release_df = get_full_df(
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/press_release/Fakespeak_press_release_ngrams.xlsx",
    misinfotext_path="./data/MisInfoText/Analysis_output/press_release/MisInfoText_press_release_ngrams.xlsx",
)
press_release_df

In [None]:
# Rank press_release n-grams within each n by their summed proportion.
press_release_ranked_proportion_df = get_ranked_proportion_df(df=press_release_df)
press_release_ranked_proportion_df

In [None]:
# Draw and save the five-panel figure for the press_release workbooks.
save_ngrams_over_years_plot(
    press_release_df,
    press_release_ranked_proportion_df,
    num_ngrams=5,
    title_template="Top NUM_NGRAMS per year, for each n = 1 to 5, for press release",
    save_path="./visualization/ngrams/plots_over_time/press_release.png",
)

In [None]:
# Build the long-format n-gram frame from the social_media workbooks.
social_media_df = get_full_df(
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/social_media/Fakespeak_social_media_ngrams.xlsx",
    misinfotext_path="./data/MisInfoText/Analysis_output/social_media/MisInfoText_social_media_ngrams.xlsx",
)
social_media_df

In [None]:
# Rank social_media n-grams within each n by their summed proportion.
social_media_ranked_proportion_df = get_ranked_proportion_df(df=social_media_df)
social_media_ranked_proportion_df

In [None]:
# Draw and save the five-panel figure for the social_media workbooks.
# The title names social media so it matches the data and the output file.
save_ngrams_over_years_plot(
    df=social_media_df,
    ranked_prop_df=social_media_ranked_proportion_df,
    num_ngrams=5,
    title_template="Top NUM_NGRAMS per year, for each n = 1 to 5, for social media",
    save_path="./visualization/ngrams/plots_over_time/social_media.png"
)