In [1]:
import pandas as pd
from scipy.stats import chisquare

In [2]:
def get_years_dfs(misinfotext_path: str, fakespeak_path: str):
    misinfotext_file = pd.ExcelFile(misinfotext_path)
    
    for sheet in misinfotext_file.sheet_names:
        df = pd.read_excel(misinfotext_file, sheet_name=sheet)
        df["year"] = sheet
        yield df
    
    fakespeak_file = pd.ExcelFile(fakespeak_path)
    
    for sheet in fakespeak_file.sheet_names:
        df = pd.read_excel(fakespeak_file, sheet_name=sheet)
        df["year"] = sheet
        yield df

In [3]:
def build_chisquare_table(df: pd.DataFrame, entities: list[str]):
    # Aggregate duplicate Entity labels (sum their counts) before reindexing
    counts = df.groupby("Entity", sort=False)["Count"].sum().reindex(entities, fill_value=0)

    table = pd.DataFrame({
        "Entity": entities,
        "Count": counts.values
    })

    table["year"] = df["year"].iloc[0]

    return table


In [4]:
def run_stats_test(misinfotext_path: str, fakespeak_path: str, min_significant_pvalue=0.05):
    years_dfs = list(get_years_dfs(misinfotext_path, fakespeak_path))

    for df in years_dfs:
        df.sort_values("Entity", inplace=True)

    all_entities = sorted(set([
        str(entity) 
        for df in years_dfs
        for entity in df["Entity"] 
    ]))

    tables = [build_chisquare_table(df, all_entities) for df in years_dfs]

    chisquare_result = chisquare([df["Count"] for df in tables])

    num_insignificant_entities = sum(
        1 for pvalue in chisquare_result.pvalue 
        if pvalue > min_significant_pvalue
    )
    num_significant_entities = len(all_entities) - num_insignificant_entities

    print("There are", num_insignificant_entities, "entities that did not change significantly, and", 
          num_significant_entities, "entities that changed significantly")
    print("Total", len(all_entities), "entities")

    return pd.DataFrame({
        "Entity": all_entities,
        "statistic": chisquare_result.statistic,
        "pvalue": chisquare_result.pvalue
    })

#### Body text

In [7]:
results_all_types = run_stats_test(    
    misinfotext_path="./data/MisInfoText/Analysis_output/MisInfoText_named_entities_frequency.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_named_entities_frequency.xlsx"
)
results_all_types

There are 8387 entities that did not change significantly, and 4080 entities that changed significantly
Total 12467 entities


  terms = (f_obs - f_exp)**2 / f_exp


Unnamed: 0,Entity,statistic,pvalue
0,(−)-nicotine,15.000000,4.514172e-01
1,.com,105.000000,1.460740e-15
2,04,15.000000,4.514172e-01
3,1,137.506329,7.138018e-22
4,1000000,60.800000,1.836534e-07
...,...,...,...
12462,İçerde,30.000000,1.192150e-02
12463,Α,52.000000,5.648497e-06
12464,Ξ,15.000000,4.514172e-01
12465,Ο,84.000000,1.286887e-11


In [8]:
results_news_and_blog = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/news_and_blog/MisInfoText_news_and_blog_named_entities_frequency.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/news_and_blog/Fakespeak_news_and_blog_named_entities_frequency.xlsx"
)
results_news_and_blog

There are 7647 entities that did not change significantly, and 3716 entities that changed significantly
Total 11363 entities


  terms = (f_obs - f_exp)**2 / f_exp


Unnamed: 0,Entity,statistic,pvalue
0,(−)-nicotine,15.000000,4.514172e-01
1,04,15.000000,4.514172e-01
2,1,132.270270,7.633389e-21
3,1000000,79.400000,8.990378e-11
4,1000000000,41.714286,2.486630e-04
...,...,...,...
11358,İçerde,30.000000,1.192150e-02
11359,Α,71.000000,2.962319e-09
11360,Ξ,15.000000,4.514172e-01
11361,Ο,84.000000,1.286887e-11


In [9]:
results_press_release = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/press_release/MisInfoText_press_release_named_entities_frequency.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/press_release/Fakespeak_press_release_named_entities_frequency.xlsx"
)
results_press_release

There are 2294 entities that did not change significantly, and 972 entities that changed significantly
Total 3266 entities


  terms = (f_obs - f_exp)**2 / f_exp


Unnamed: 0,Entity,statistic,pvalue
0,.com,105.000000,1.460740e-15
1,1,17.400000,2.955199e-01
2,1000000,21.720000,1.153588e-01
3,1000000000,17.400000,2.955199e-01
4,10,12.000000,6.790291e-01
...,...,...,...
3261,yield,57.000000,8.206620e-07
3262,young adult,20.000000,1.719327e-01
3263,youth,36.600000,1.447266e-03
3264,½,15.333333,4.276838e-01


In [10]:
results_social_media = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/social_media/MisInfoText_social_media_named_entities_frequency.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/social_media/Fakespeak_social_media_named_entities_frequency.xlsx"
)
results_social_media

There are 56 entities that did not change significantly, and 0 entities that changed significantly
Total 56 entities


Unnamed: 0,Entity,statistic,pvalue
0,1000,1.0,0.317311
1,2012 VP113,1.0,0.317311
2,Americans,1.0,0.317311
3,Chinese,1.0,0.317311
4,Chinese space program,2.0,0.157299
5,Daily Express,1.0,0.317311
6,Democratic Party,1.0,0.317311
7,Early 1970,1.0,0.317311
8,Evidence,1.0,0.317311
9,Federal Government of the United States of Ame...,1.0,0.317311


#### Headline text

In [11]:
results_all_types_headlines = run_stats_test(    
    misinfotext_path="./data/MisInfoText/Analysis_output/MisInfoText_named_entities_frequency_headlines.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_named_entities_frequency_headlines.xlsx"
)
results_all_types_headlines

There are 1474 entities that did not change significantly, and 119 entities that changed significantly
Total 1593 entities


Unnamed: 0,Entity,statistic,pvalue
0,...Hits,15.0,4.514172e-01
1,1000000,13.0,6.022979e-01
2,1000,15.0,4.514172e-01
3,2012 VP113,82.0,3.003547e-11
4,2020,15.0,4.514172e-01
...,...,...,...
1588,world,15.0,4.514172e-01
1589,world government,30.0,1.192150e-02
1590,wrap,15.0,4.514172e-01
1591,year,18.8,2.229404e-01


In [12]:
results_news_and_blog_headlines = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/news_and_blog/MisInfoText_news_and_blog_named_entities_frequency_headlines.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/news_and_blog/Fakespeak_news_and_blog_named_entities_frequency_headlines.xlsx"
)
results_news_and_blog_headlines

There are 1253 entities that did not change significantly, and 95 entities that changed significantly
Total 1348 entities


Unnamed: 0,Entity,statistic,pvalue
0,...Hits,15.000000,4.514172e-01
1,1000000,14.000000,5.255291e-01
2,1000,15.000000,4.514172e-01
3,2012 VP113,98.666667,2.333374e-14
4,2020,15.000000,4.514172e-01
...,...,...,...
1343,world,15.000000,4.514172e-01
1344,world government,30.000000,1.192150e-02
1345,wrap,15.000000,4.514172e-01
1346,year,19.545455,1.900714e-01


In [13]:
results_press_release_headlines = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/press_release/MisInfoText_press_release_named_entities_frequency_headlines.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/press_release/Fakespeak_press_release_named_entities_frequency_headlines.xlsx"
)
results_press_release_headlines

IndexError: single positional indexer is out-of-bounds

In [14]:
results_social_media_headlines = run_stats_test(
    misinfotext_path="./data/MisInfoText/Analysis_output/social_media/MisInfoText_social_media_named_entities_frequency_headlines.xlsx",
    fakespeak_path="./data/Fakespeak-ENG/Analysis_output/social_media/Fakespeak_social_media_named_entities_frequency_headlines.xlsx"
)
results_social_media_headlines

There are 4 entities that did not change significantly, and 0 entities that changed significantly
Total 4 entities


Unnamed: 0,Entity,statistic,pvalue
0,Evidence,1.0,0.317311
1,Lunar Roving Vehicle,1.0,0.317311
2,Moon Landing,1.0,0.317311
3,sodium,3.0,0.083265
