In [1]:
import pandas as pd
from scipy.stats import chisquare
from helpers import get_years_dfs

### Chi-square tests are used because the POS data is a frequency table (counts per POS tag for each year)

In [2]:
# Part-of-speech tags, in the fixed order used for aligning frequency tables
# and for labelling result rows.
pos = [
    "NOUN", "PUNCT", "VERB", "ADP", "DET", "ADJ",
    "PROPN", "PRON", "AUX", "ADV", "PART", "CCONJ",
    "SCONJ", "NUM", "SPACE", "SYM", "INTJ", "X",
]

In [3]:
def run_stats_test(input_path: str, adjust_for_year_totals: bool = False):
    """Run one chi-square test per POS tag across the tables for ``input_path``.

    Every table returned by ``get_years_dfs(input_path)`` must provide ``"POS"``
    and ``"count"`` columns (presumably one table per year -- confirm against
    ``helpers``). Each table is aligned to the module-level ``pos`` list; a tag
    that is absent from a table is counted as 0.

    Args:
        input_path: Path passed unchanged to ``get_years_dfs``.
        adjust_for_year_totals: When False (the default, and the original
            behaviour) each tag's counts are tested against equal expected
            counts in every table, so a difference in overall table size is
            itself reported as a change. When True the expected counts are
            proportional to each table's total count, so only a shift in a
            tag's share of the table contributes to the statistic.

    Returns:
        A DataFrame with one row per tag in ``pos`` and the columns ``POS``,
        ``statistic`` and ``pvalue``.
    """
    # Selecting "count" and reindexing is enough: reindex imposes the `pos`
    # order itself, so no prior sort is needed.
    yearly_counts = [
        df.set_index("POS")["count"].reindex(pos, fill_value=0)
        for df in get_years_dfs(input_path)
    ]

    # Rows are tables, columns are tags in `pos` order.
    observed = pd.DataFrame(yearly_counts, columns=pos).to_numpy(dtype=float)

    expected = None
    if adjust_for_year_totals:
        table_totals = observed.sum(axis=1, keepdims=True)
        tag_totals = observed.sum(axis=0, keepdims=True)
        # Independence expectation; each column sums to the observed tag
        # total, which chisquare requires of f_exp.
        expected = table_totals * tag_totals / observed.sum()

    # Default axis=0: categories run down the rows, giving one test per tag.
    result = chisquare(observed, f_exp=expected)

    return pd.DataFrame({
        "POS": pos,
        "statistic": result.statistic,
        "pvalue": result.pvalue,
    })

#### All text types

In [4]:
# Combined corpus (all text types); the returned table is shown as the cell output.
run_stats_test("./output/pos.xlsx")

Unnamed: 0,POS,statistic,pvalue
0,NOUN,99588.216566,0.0
1,PUNCT,68245.153165,0.0
2,VERB,62051.484109,0.0
3,ADP,52739.183547,0.0
4,DET,42523.151224,0.0
5,ADJ,32517.743658,0.0
6,PROPN,61309.298067,0.0
7,PRON,33598.619337,0.0
8,AUX,29044.954253,0.0
9,ADV,19501.120789,0.0


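The counts of all parts of speech shown above changed significantly across the years (every p-value is effectively 0). Note that the test compares raw counts, so differences in the amount of text per year also contribute to these statistics.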

#### News and blog

In [5]:
# News and blog subset; the returned table is shown as the cell output.
run_stats_test("./output/news_and_blog/pos_news_and_blog.xlsx")

Unnamed: 0,POS,statistic,pvalue
0,NOUN,72294.005058,0.0
1,PUNCT,49533.254028,0.0
2,VERB,43053.62692,0.0
3,ADP,39479.795769,0.0
4,DET,32308.781932,0.0
5,ADJ,23735.827406,0.0
6,PROPN,42613.533266,0.0
7,PRON,20433.075466,0.0
8,AUX,18645.284148,0.0
9,ADV,13230.964796,0.0


For news and blog texts, the counts of all parts of speech shown above changed significantly across the years (every p-value is effectively 0).

#### Press release

In [6]:
# Press release subset; the returned table is shown as the cell output.
run_stats_test("./output/press_release/pos_press_release.xlsx")

Unnamed: 0,POS,statistic,pvalue
0,NOUN,6653.847534,0.0
1,PUNCT,3400.120124,0.0
2,VERB,3861.789341,0.0
3,ADP,3608.332369,0.0
4,DET,3027.454272,0.0
5,ADJ,1994.448161,0.0
6,PROPN,4652.36115,0.0
7,PRON,2075.669528,0.0
8,AUX,1559.70331,0.0
9,ADV,1260.023658,2.0771589999999997e-259


For press releases, the counts of all parts of speech shown above changed significantly across the years (every p-value is effectively 0).

#### Social media

In [7]:
# Social media subset; the returned table is shown as the cell output.
run_stats_test("./output/social_media/pos_social_media.xlsx")

Unnamed: 0,POS,statistic,pvalue
0,NOUN,49534.343092,0.0
1,PUNCT,34772.792165,0.0
2,VERB,34995.11723,0.0
3,ADP,26465.880624,0.0
4,DET,19823.59839,0.0
5,ADJ,16167.078313,0.0
6,PROPN,31824.805096,0.0
7,PRON,24869.414101,0.0
8,AUX,17814.11323,0.0
9,ADV,10565.920551,0.0


For social media texts, the counts of all parts of speech shown above changed significantly across the years (every p-value is effectively 0).