In [None]:
import spacy
from spacy.tokens.doc import Doc
import pandas as pd
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

In [2]:
# Extend each base dataset config with `save_cols`: the columns written to the
# output spreadsheets, i.e. the dataset's ID column plus the computed
# "lexical_diversity" score.
fakespeak_config = BASE_FAKESPEAK_CONFIG | {
    "save_cols": [BASE_FAKESPEAK_CONFIG["id_col"], "lexical_diversity"]
}

# Same extension for the MisInfoText dataset.
misinfotext_config = BASE_MISINFOTEXT_CONFIG | {
    "save_cols": [BASE_MISINFOTEXT_CONFIG["id_col"], "lexical_diversity"]
}

In [3]:
# Select the dataset configuration every later cell reads from; assign
# `misinfotext_config` here instead to process the other dataset.
using_dataset = fakespeak_config

In [None]:
# Read the configured worksheet, restricted to the configured columns, from
# the dataset's input spreadsheet.
dataset_df = pd.read_excel(
    using_dataset["input_path"], 
    sheet_name=using_dataset["sheet_name"], 
    usecols=using_dataset["usecols"]
)
# Preview the first rows to confirm the load worked.
dataset_df.head()

## Calculating lexical diversity using spaCy

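Lexical diversity is calculated as the number of types (unique lemmas) divided by the number of tokens (total words). Only alphabetic tokens are counted, and a text with no alphabetic tokens gets a score of 0.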

In [6]:
# Load the spaCy pipeline used below to tokenize and lemmatize each text.
# The 'en_core_web_md' model package must be installed in the environment.
nlp = spacy.load('en_core_web_md')

  import pkg_resources


In [7]:
# Run every text through the spaCy pipeline and keep the resulting parsed
# documents in a "doc" column next to their source rows.
# NOTE(review): assumes every cell of the text column is a string; an empty
# spreadsheet cell read as NaN would presumably make nlp.pipe fail — confirm.
dataset_df["doc"] = list(nlp.pipe(dataset_df[using_dataset["text_col"]]))

In [8]:
def calculate_lexical_diversity(doc: Doc):
    """Return the type-token ratio of a parsed document.

    Only tokens whose ``is_alpha`` flag is set are considered. The number of
    distinct lemmas among them (types) is divided by how many such tokens
    there are (tokens).

    Args:
        doc: The parsed document whose tokens are inspected.

    Returns:
        The ratio of unique lemmas to alphabetic tokens, or ``0`` when the
        document contains no alphabetic tokens.
    """
    # One lemma per alphabetic token, so the list length is the token count.
    alpha_lemmas = [tok.lemma_ for tok in doc if tok.is_alpha]

    # Guard against dividing by zero for texts without any alphabetic token.
    if not alpha_lemmas:
        return 0

    unique_lemma_count = len(set(alpha_lemmas))
    return unique_lemma_count / len(alpha_lemmas)

In [9]:
# Score every parsed document and store the result in a new
# "lexical_diversity" column on the same frame.
dataset_df["lexical_diversity"] = dataset_df["doc"].apply(calculate_lexical_diversity)
# Display the frame to inspect the computed scores.
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,lexical_diversity
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,"(Mexico, is, paying, for, the, Wall, through, ...",0.795918
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"(Chuck, Schumer, :, "", why, should, American, ...",0.650000
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,"(Billions, of, dollars, are, sent, to, the, St...",0.851064
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,"(If, 50, Billion, $, $, were, set, aside, to, ...",0.743590
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,"(Huge@#CD, 9, news, ., \n, @ncsbe, \n , sent, ...",0.828571
...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023,"(A, great, lesson, in, Optics, 101, :, The, Mo...",0.433148
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023,"(“, One, of, these, Joe, ’s, is, not, like, th...",0.687500
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"(Autopsies, Prove, that, COVID-19, is, a, Diss...",0.444109
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021,"(She, collapsed, when, she, saw, jfk, jr, ., a...",0.776316


In [10]:
# Split the scored rows on the configured year column. `get_groups` returns
# two values: `years` and `years_dfs`, which are later zipped together as
# (year, DataFrame) pairs.
# NOTE(review): presumably the two sequences are aligned one-to-one — confirm
# against helpers.get_groups.
years, years_dfs = get_groups(dataset_df, using_dataset["year_col"])
# Preview the first group's rows.
years_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,lexical_diversity
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,"(Mexico, is, paying, for, the, Wall, through, ...",0.795918
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"(Chuck, Schumer, :, "", why, should, American, ...",0.65
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,"(Billions, of, dollars, are, sent, to, the, St...",0.851064
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,"(If, 50, Billion, $, $, were, set, aside, to, ...",0.74359
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,"(Huge@#CD, 9, news, ., \n, @ncsbe, \n , sent, ...",0.828571


In [11]:
# Split the scored rows on the configured text-type column, giving `types`
# and `types_dfs`, which are later zipped together as (type, DataFrame) pairs.
# NOTE(review): presumably aligned one-to-one, as with the year groups —
# confirm against helpers.get_groups.
types, types_dfs = get_groups(dataset_df, using_dataset["type_col"])
# Preview the first group's rows.
types_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,doc,lexical_diversity
16,Politifact_FALSE_News and blog_73653,False,News and blog,Joe Biden has a message for the public on his ...,2019,"(Joe, Biden, has, a, message, for, the, public...",0.793651
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,"(Hollywood, legend, Tom, Selleck, has, praised...",0.517857
21,Politifact_FALSE_News and blog_868147,False,News and blog,"Hundreds of Congolese migrants, with who knows...",2019,"(Hundreds, of, Congolese, migrants, ,, with, w...",0.513575
25,Politifact_FALSE_News and blog_944705,False,News and blog,David Steinberg released his latest report on ...,2019,"(David, Steinberg, released, his, latest, repo...",0.606796
40,Politifact_FALSE_News and blog_691427,False,News and blog,Nancy Pelosi is neck deep in Ukraine politics....,2019,"(Nancy, Pelosi, is, neck, deep, in, Ukraine, p...",0.474104


In [12]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of descriptive statistics, one row per year.

    Args:
        dfs: DataFrames that each contain a "lexical_diversity" column.
        years: Labels for the rows, given in the same order as ``dfs``.

    Returns:
        A DataFrame whose index is named "year" and whose columns are the
        statistics produced by ``Series.describe()`` for each input frame.
    """
    year_index = pd.Index(data=years, name="year")

    # Collect one describe() Series per frame; each becomes a row below.
    stats_rows = []
    for frame in dfs:
        stats_rows.append(frame["lexical_diversity"].describe())

    return pd.DataFrame(stats_rows, index=year_index)

## Writing DataFrames to Excel spreadsheets

In [13]:
def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one worksheet per year plus a "Summary" worksheet.

    Args:
        writer: Open Excel writer that receives the sheets. It is not closed
            here; the caller is responsible for that.
        years: Year labels; each one, converted to a string, names a sheet.
        dfs: DataFrames paired positionally with ``years``.

    Only the columns listed under ``using_dataset["save_cols"]`` are written
    to the per-year sheets, and the row index is omitted.
    """
    for sheet_year, year_frame in zip(years, dfs):
        year_frame.to_excel(
            writer,
            sheet_name=str(sheet_year),
            index=False,
            columns=using_dataset["save_cols"],
        )

    # The summary keeps its index so the year labels appear in the sheet.
    get_summary_df(dfs, years).to_excel(writer, sheet_name="Summary")

In [14]:
# Export the per-year sheets and the summary for the whole dataset.
output_path = make_output_path(using_dataset, "lexical_diversity")

# The context manager closes the workbook even if writing a sheet raises,
# so a failed run cannot leave the output file open.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

In [15]:
# Repeat the export once per text type, each into its own workbook.
# Loop-specific names are used on purpose: they avoid shadowing the builtin
# `type`, and they leave the dataset-wide `years`, `years_dfs`, `output_path`
# and `writer` untouched, so re-running the whole-dataset export after this
# cell still writes the whole-dataset results.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "lexical_diversity")

    # Close each workbook even if writing one of its sheets raises.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as type_writer:
        save_years(type_writer, type_years, type_years_dfs)