In [1]:
import spacy
from spacy.tokens.doc import Doc
import pandas as pd
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type, is_word

In [2]:
# Alias the base configs of both supported datasets; the next cell picks which one to run on.
fakespeak_config, misinfotext_config = BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG

In [3]:
# Dataset selector: every later cell reads its paths and column names from `using_dataset`.
# Point this at `fakespeak_config` to run the same analysis on the other dataset.
using_dataset = misinfotext_config

In [4]:
dataset_df = pd.read_excel(
    using_dataset["input_path"],
    sheet_name=using_dataset["sheet_name"],
    usecols=using_dataset["usecols"],
)

# Drop 2007 and 2008: those years contain too little data to analyse.
dataset_df = dataset_df[~dataset_df[using_dataset["year_col"]].isin([2007, 2008])]

dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


## Calculating lexical density using spacy

Lexical density is the number of lexical words (proper nouns, nouns, verbs, adjectives, and adverbs) divided by the total number of words, excluding punctuation.

In [5]:
# Load the `en_core_web_md` spaCy pipeline; the part-of-speech tags it assigns
# (`token.pos_`) are what the lexical-density calculation below relies on.
nlp = spacy.load('en_core_web_md')



In [6]:
# Parse every text with `nlp.pipe` and keep the resulting spaCy docs in a new "doc" column,
# one per row. NOTE(review): assumes every value in the text column is a string — confirm
# there are no missing texts in the input sheet.
dataset_df["doc"] = list(nlp.pipe(dataset_df[using_dataset["text_col"]]))

We treat proper nouns, nouns, verbs, adjectives, and adverbs as lexical ("content") words.

We then calculate lexical density by dividing the number of lexical words by the total number of words. The total excludes non-word tokens such as punctuation.

In [None]:
# Coarse part-of-speech tags (compared against `token.pos_`) that count as lexical words.
lexical_pos = ["PROPN", "NOUN", "VERB", "ADJ", "ADV"]

def calculate_lexical_density(doc: Doc):
    """Return the share of word tokens in `doc` that are lexical (content) words.

    A token counts as a word when `is_word(token)` is truthy, and as lexical
    when its part-of-speech tag (`token.pos_`) is listed in `lexical_pos`.

    Args:
        doc: A parsed spaCy document.

    Returns:
        The number of lexical words divided by the number of word tokens, or
        NaN when `doc` contains no word tokens (for example an empty or
        punctuation-only text). NaN is returned instead of 0 so that such rows
        are treated as missing rather than as texts with no content words.
    """
    word_tokens = [token for token in doc if is_word(token)]

    # Without this guard an empty document raises ZeroDivisionError and aborts
    # the whole `.apply` over the dataset.
    if not word_tokens:
        return float("nan")

    num_lexical_items = sum(1 for token in word_tokens if token.pos_ in lexical_pos)

    return num_lexical_items / len(word_tokens)

In [8]:
# Score every parsed document and store the result next to its source row.
dataset_df["lexical_density"] = dataset_df["doc"].map(calculate_lexical_density)
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"(Residents, of, multiple, states, will, be, as...",0.548057
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,"(Sacramento, ,, CA, -, United, States, Senator...",0.638298
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"(We, should, anticipate, black, and, gray, mar...",0.579096
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"(As, a, ballot, initiative, calling, for, repe...",0.607623
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,"(WASHINGTON, ,, DC, , The, House, of, Represe...",0.577875
...,...,...,...,...,...,...,...,...,...
686,https://www.politifact.com/factchecks/2015/jul...,https://x.com/DineshDSouza/status/618437306769...,Look closely at this Hillary photo; isn't that...,,Social media,2015-07-07,2015,"(Look, closely, at, this, Hillary, photo, ;, i...",0.411765
687,https://www.politifact.com/factchecks/2015/jun...,https://x.com/Gavin_McInnes/status/61353424690...,Leave the confederate flag up because the Civi...,,Social media,2015-06-24,2015,"(Leave, the, confederate, flag, up, because, t...",0.458333
688,https://www.politifact.com/factchecks/2015/mar...,http://www.wisdems.org/news/blog/view/2015-03-...,The nonpartisan Legislative Fiscal Bureau yest...,More Bad News For Working Families in Scott Wa...,News and blog,2015-03-16,2015,"(The, nonpartisan, Legislative, Fiscal, Bureau...",0.556174
689,https://www.politifact.com/factchecks/2015/feb...,https://urbanintellectuals.com/sheriffs-call-f...,President Obama is a lightening rod for critic...,Sheriffs Call For Obama’s Lynching While Leade...,News and blog,2015-01-15,2015,"(President, Obama, is, a, lightening, rod, for...",0.540000


Get separate dataframes for each year

In [9]:
# Split the scored rows by the dataset's year column. The helper's result is unpacked as
# (group labels, per-group dataframes) — presumably in matching order; confirm in helpers.get_groups.
years, years_dfs = get_groups(dataset_df, using_dataset["year_col"])
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
433,http://www.politifact.com/truth-o-meter/statem...,https://bachmann.house.gov/News/DocumentSingle...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009-03-25,2009,"(Washington, ,, D.C., ,, Mar, 25, -, In, respo...",0.556863
434,http://www.politifact.com/truth-o-meter/statem...,https://healthcare.nationalreview.com/post/?q=...,When most Americans talk about the need for he...,Taxpayer-Funded Abortion Is Not Health-Care Re...,News and blog,2009-07-23,2009,"(When, most, Americans, talk, about, the, need...",0.580347
435,http://www.politifact.com/truth-o-meter/statem...,http://krugman.blogs.nytimes.com/2009/08/05/on...,A number of people in the news analysis busine...,One of these things is not like the other,News and blog,2009-08-05,2009,"(A, number, of, people, in, the, news, analysi...",0.515021
436,http://www.politifact.com/truth-o-meter/statem...,https://www.facebook.com/notes/1020383705144285/,Yesterday President Obama responded to my stat...,,Social media,2009-08-13,2009,"(Yesterday, President, Obama, responded, to, m...",0.5421
437,http://www.politifact.com/truth-o-meter/statem...,https://jumpinginpools.blogspot.com/2009/01/mi...,Secretary of Defense Robert Gates is extremely...,"Military to Pledge Oath To Obama, Not Constitu...",News and blog,2009-01-28,2009,"(Secretary, of, Defense, Robert, Gates, is, ex...",0.531746


Get separate dataframes for each text type

In [10]:
# Split the scored rows by the dataset's text-type column. The helper's result is unpacked as
# (group labels, per-group dataframes) — presumably in matching order; confirm in helpers.get_groups.
types, types_dfs = get_groups(dataset_df, using_dataset["type_col"])
types_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"(Residents, of, multiple, states, will, be, as...",0.548057
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"(We, should, anticipate, black, and, gray, mar...",0.579096
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"(As, a, ballot, initiative, calling, for, repe...",0.607623
6,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/opinion/openforum/a...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017-10-19,2017,"(Recently, ,, a, group, of, special, interests...",0.573099
7,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/politics/article/Th...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017-09-14,2017,"(COSTA, MESA, ,, Orange, County, , It, was, a...",0.562782


We use pandas's built-in `.describe()` method on the `lexical_density` column to get summary statistics for each year.

In [11]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build a table of `lexical_density` summary statistics, one row per dataframe.

    Args:
        dfs: Dataframes that each contain a "lexical_density" column.
        years: Row labels for the result, one per entry of `dfs`, in the same order.

    Returns:
        A dataframe indexed by `years` (index name "year") whose columns are the
        statistics produced by `Series.describe()` for each dataframe.
    """
    year_index = pd.Index(data=years, name="year")

    per_year_stats = []
    for frame in dfs:
        per_year_stats.append(frame["lexical_density"].describe())

    return pd.DataFrame(per_year_stats, index=year_index)

## Writing dataframes to excel spreadsheet

Save to a single Excel file with tabs for each year, and a final tab for the summary

In [12]:
def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one sheet per year plus a final "Summary" sheet to `writer`.

    Each year sheet is named after the year and holds only the dataset's id
    column (read from the notebook-level `using_dataset` config) and
    "lexical_density", without the dataframe index. The "Summary" sheet holds
    the table built by `get_summary_df`.

    Args:
        writer: An open Excel writer; it is not closed here.
        years: Labels for the sheets, one per entry of `dfs`, in the same order.
        dfs: The per-year dataframes to export.
    """
    for sheet_year, year_df in zip(years, dfs):
        exported_columns = [using_dataset["id_col"], "lexical_density"]
        year_df.to_excel(
            writer,
            sheet_name=str(sheet_year),
            index=False,
            columns=exported_columns,
        )

    get_summary_df(dfs, years).to_excel(writer, sheet_name="Summary")

In [13]:
output_path = make_output_path(using_dataset, "lexical_density")

# The context manager saves and closes the workbook even if writing a sheet raises,
# so a failed run does not leave the output file open.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

For each text type, we separate further into dataframes per year, and save the Excel files as above

In [14]:
# `text_type` avoids shadowing the built-in `type` for the rest of the kernel session.
# The `type_`-prefixed names keep this loop from rebinding the dataset-wide `years`,
# `years_dfs`, `output_path` and `writer` used by the cell above, so that cell can be
# re-run after this one without writing a single text type's data to the overall file.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, using_dataset["year_col"])

    type_output_path = make_output_path_for_type(using_dataset, text_type, "lexical_density")

    # The context manager saves and closes each workbook even if writing a sheet raises.
    with pd.ExcelWriter(type_output_path, engine="xlsxwriter") as type_writer:
        save_years(type_writer, type_years, type_years_dfs)