In [1]:
import pandas as pd
from scipy.stats import chisquare
from helpers import get_years_dfs

In [2]:
def build_chisquare_table(df: pd.DataFrame, entities: list[str]):
    # Aggregate duplicate entity labels (sum their counts) before reindexing
    counts = df.groupby("entity", sort=False)["count"].sum().reindex(entities, fill_value=0)

    table = pd.DataFrame({
        "entity": entities,
        "count": counts.values
    })

    table["year"] = df["year"].iloc[0]

    return table

In [3]:
def run_stats_test(input_path: str, min_significant_pvalue=0.05):
    years_dfs = get_years_dfs(input_path)

    for df in years_dfs:
        df.sort_values("entity", inplace=True)

    all_entities = sorted(set([
        str(entity) 
        for df in years_dfs
        for entity in df["entity"] 
    ]))

    tables = [build_chisquare_table(df, all_entities) for df in years_dfs]

    chisquare_result = chisquare([df["count"] for df in tables])

    num_insignificant_entities = sum(
        1 for pvalue in chisquare_result.pvalue 
        if pvalue > min_significant_pvalue
    )
    num_significant_entities = len(all_entities) - num_insignificant_entities

    print("There are", num_insignificant_entities, "entities that did not change significantly, and", 
          num_significant_entities, "entities that changed significantly")
    print("Total", len(all_entities), "entities")

    return pd.DataFrame({
        "entity": all_entities,
        "statistic": chisquare_result.statistic,
        "pvalue": chisquare_result.pvalue
    })

### Body text

#### All text types

In [4]:
results_all_types = run_stats_test("./output/ner.xlsx")
results_all_types

There are 4364 entities that did not change significantly, and 1925 entities that changed significantly
Total 6289 entities


Unnamed: 0,entity,statistic,pvalue
0,.com,45.0,7.657266e-05
1,125th Street,15.0,4.514172e-01
2,14th,15.0,4.514172e-01
3,160th Special Operations Aviation Regiment (Ai...,15.0,4.514172e-01
4,1918-1920 flu pandemic,15.0,4.514172e-01
...,...,...,...
6284,year,15.0,4.514172e-01
6285,İlhan,15.0,4.514172e-01
6286,Α,48.0,2.543044e-05
6287,Ξ,15.0,4.514172e-01


#### News and blog

In [5]:
results_news_and_blog = run_stats_test("./output/news_and_blog/ner_news_and_blog.xlsx")
results_news_and_blog

There are 10279 entities that did not change significantly, and 4862 entities that changed significantly
Total 15141 entities


  terms = (f_obs - f_exp)**2 / f_exp


Unnamed: 0,entity,statistic,pvalue
0,(−)-nicotine,15.000000,4.514172e-01
1,04,15.000000,4.514172e-01
2,1,191.354286,1.205795e-32
3,1000000,79.400000,8.990378e-11
4,1000000000,46.666667,4.162963e-05
...,...,...,...
15136,Δ,45.000000,7.657266e-05
15137,Ξ,15.000000,4.514172e-01
15138,Ο,84.000000,1.286887e-11
15139,Ρ,30.000000,1.192150e-02


#### Press release

In [6]:
results_press_release = run_stats_test("./output/press_release/ner_press_release.xlsx")
results_press_release

There are 2726 entities that did not change significantly, and 1069 entities that changed significantly
Total 3795 entities


  terms = (f_obs - f_exp)**2 / f_exp


Unnamed: 0,entity,statistic,pvalue
0,.com,105.000000,1.460740e-15
1,1,25.333333,4.564201e-02
2,1000000,21.720000,1.153588e-01
3,1000000000,17.400000,2.955199e-01
4,10,12.000000,6.790291e-01
...,...,...,...
3790,yield,45.733333,5.861732e-05
3791,young adult,15.333333,4.276838e-01
3792,youth,36.600000,1.447266e-03
3793,½,15.333333,4.276838e-01


#### Social media

In [7]:
results_social_media = run_stats_test("./output/social_media/ner_social_media.xlsx")
results_social_media

There are 73 entities that did not change significantly, and 0 entities that changed significantly
Total 73 entities


Unnamed: 0,entity,statistic,pvalue
0,1000,1.0,0.317311
1,2012 VP113,1.0,0.317311
2,American Capitalism,1.0,0.317311
3,Americans,1.0,0.317311
4,Chinese,1.0,0.317311
...,...,...,...
68,scandal,1.0,0.317311
69,speculation,1.0,0.317311
70,trace,1.0,0.317311
71,truth,1.0,0.317311


### Headline text

#### All text types

In [8]:
results_all_types_headlines = run_stats_test("./output/ner_headlines.xlsx")
results_all_types_headlines

There are 818 entities that did not change significantly, and 62 entities that changed significantly
Total 880 entities


Unnamed: 0,entity,statistic,pvalue
0,2012 VP113,80.800000,4.985809e-11
1,28th Infantry Division,23.666667,7.097461e-02
2,AAA,15.000000,4.514172e-01
3,AARP,30.000000,1.192150e-02
4,ABC News,15.000000,4.514172e-01
...,...,...,...
875,workforce,15.000000,4.514172e-01
876,world,14.000000,5.255291e-01
877,year,15.000000,4.514172e-01
878,Α,13.000000,6.022979e-01


#### News and blog

In [9]:
results_news_and_blog_headlines = run_stats_test("./output/news_and_blog/ner_news_and_blog_headlines.xlsx")
results_news_and_blog_headlines

There are 1896 entities that did not change significantly, and 165 entities that changed significantly
Total 2061 entities


Unnamed: 0,entity,statistic,pvalue
0,...Hits,15.000000,4.514172e-01
1,1,15.000000,4.514172e-01
2,1000000,14.000000,5.255291e-01
3,1000,15.000000,4.514172e-01
4,2012 VP113,106.200000,8.618150e-16
...,...,...,...
2056,wrap,15.000000,4.514172e-01
2057,year,19.545455,1.900714e-01
2058,Élite,15.000000,4.514172e-01
2059,Α,13.571429,5.582493e-01


#### Press release

In [10]:
results_press_release_headlines = run_stats_test("./output/press_release/ner_press_release_headlines.xlsx")
results_press_release_headlines

There are 414 entities that did not change significantly, and 24 entities that changed significantly
Total 438 entities


Unnamed: 0,entity,statistic,pvalue
0,1000000,15.000000,0.451417
1,2012 VP113,30.000000,0.011921
2,28th Infantry Division,23.666667,0.070975
3,AARP,30.000000,0.011921
4,Adult,15.000000,0.451417
...,...,...,...
433,waste,15.000000,0.451417
434,web page,15.000000,0.451417
435,work,15.000000,0.451417
436,world,15.000000,0.451417


#### Social media

In [11]:
results_social_media_headlines = run_stats_test("./output/social_media/ner_social_media_headlines.xlsx")
results_social_media_headlines

There are 4 entities that did not change significantly, and 0 entities that changed significantly
Total 4 entities


Unnamed: 0,entity,statistic,pvalue
0,Evidence,1.0,0.317311
1,Lunar Roving Vehicle,1.0,0.317311
2,Moon Landing,1.0,0.317311
3,sodium,3.0,0.083265
