In [1]:
import pandas as pd
from helpers import run_stats_test

In [2]:
def get_years_dfs(file_path: str) -> list:
    """Load every non-summary sheet of an Excel workbook as a DataFrame.

    Each sheet except the one named "Summary" is read into its own
    DataFrame, and a "year" column holding the sheet name (verbatim, as a
    string) is added to it.  The sheets are presumably named after the year
    they cover -- confirm against the workbooks in ./output.

    Args:
        file_path: Path to the Excel workbook.

    Returns:
        A list of DataFrames, one per non-summary sheet, in workbook order.
    """
    years_dfs = []

    # Use the workbook as a context manager so its file handle is closed as
    # soon as all sheets are read, even if reading one of them raises.
    with pd.ExcelFile(file_path) as excel_file:
        for sheet in excel_file.sheet_names:
            # The "Summary" sheet is not per-year data, so skip it.
            if sheet == "Summary":
                continue

            df = pd.read_excel(excel_file, sheet_name=sheet)

            # Tag every row with the sheet it came from.
            df["year"] = sheet

            years_dfs.append(df)

    return years_dfs

### First, run the statistical analysis for all text types

In [3]:
# Load one DataFrame per year sheet from the workbook covering all text types,
# then run the statistical test across those per-year DataFrames.
# NOTE(review): the second argument presumably names the column to test --
# confirm against helpers.run_stats_test.
years_dfs = get_years_dfs("./output/lexical_density.xlsx")
run_stats_test(years_dfs, "lexical_density")

Running test for non-normal distributions


KruskalResult(statistic=73.93307300508717, pvalue=8.819211344509398e-10)

There is a statistically significant difference in lexical density between the years (Kruskal–Wallis H ≈ 73.93, p < 0.001).

### Now repeat for specific text types

#### News and blog

In [5]:
# Repeat the test using only the news-and-blog workbook.
# Rebinds the notebook-level `years_dfs` to the per-year DataFrames of this text type.
years_dfs = get_years_dfs("./output/news_and_blog/lexical_density_news_and_blog.xlsx")
run_stats_test(years_dfs, "lexical_density")

Running test for non-normal distributions


KruskalResult(statistic=30.797479177648064, pvalue=0.009349434511875452)

There is a statistically significant difference in lexical density between the years for news and blog (Kruskal–Wallis H ≈ 30.80, p ≈ 0.009).

#### Press release

In [6]:
# Repeat the test using only the press-release workbook.
# Rebinds the notebook-level `years_dfs` to the per-year DataFrames of this text type.
years_dfs = get_years_dfs("./output/press_release/lexical_density_press_release.xlsx")
run_stats_test(years_dfs, "lexical_density")

Running test for non-normal distributions


KruskalResult(statistic=13.369140299858152, pvalue=0.5738089258069632)

There is no statistically significant difference in lexical density between the years for press release (Kruskal–Wallis H ≈ 13.37, p ≈ 0.57).

#### Social media

In [7]:
# Repeat the test using only the social-media workbook.
# Rebinds the notebook-level `years_dfs` to the per-year DataFrames of this text type.
years_dfs = get_years_dfs("./output/social_media/lexical_density_social_media.xlsx")
run_stats_test(years_dfs, "lexical_density")

Running test for non-normal distributions


KruskalResult(statistic=71.30514992661972, pvalue=4.6120428422337525e-10)

There is a statistically significant difference in lexical density between the years for social media (Kruskal–Wallis H ≈ 71.31, p < 0.001).