In [1]:
import spacy
from spacy.tokens.doc import Doc
import pandas as pd
from helpers import load_data, get_groups, make_output_path, make_output_path_for_type, is_word

In [2]:
# Load the dataset through the project helper and preview its first rows.
dataset_df = load_data()
dataset_df.head()

Unnamed: 0,id,text,headline,text_type,year
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017


## Calculating lexical density using spacy

Lexical density is the number of lexical words (proper nouns, nouns, verbs, adjectives, and adverbs) divided by the total number of words, excluding punctuation.

In [3]:
# Load the spaCy pipeline; its part-of-speech tags (token.pos_) drive the
# lexical-density calculation further down.
nlp = spacy.load('en_core_web_md')

  import pkg_resources


In [4]:
# Run every text through the spaCy pipeline and keep the parsed result in a
# new "doc" column, one entry per row of dataset_df.
dataset_df["doc"] = list(nlp.pipe(dataset_df["text"]))

We count proper nouns, nouns, verbs, adjectives, and adverbs as lexical ("content") words.

We then calculate lexical density by dividing the number of lexical words by total number of words. This excludes non-word tokens like punctuation.

In [5]:
# Part-of-speech tags that count as lexical ("content") words.
lexical_pos = ["PROPN", "NOUN", "VERB", "ADJ", "ADV"]

def calculate_lexical_density(doc: Doc):
    """Return the fraction of word tokens in ``doc`` that are lexical words.

    A token counts as a word when ``is_word(token)`` is truthy, and as a
    lexical word when its ``pos_`` tag is additionally listed in
    ``lexical_pos``.

    Args:
        doc: A parsed spaCy document.

    Returns:
        Number of lexical words divided by number of word tokens, as a float.
        If the document contains no word tokens at all (e.g. empty text or
        punctuation only) the density is undefined and ``nan`` is returned.
    """
    word_tokens = [token for token in doc if is_word(token)]

    # Without this guard a document with zero word tokens raises
    # ZeroDivisionError and aborts the whole DataFrame ``.apply``. NaN marks
    # the value as undefined and is skipped by pandas summary statistics.
    if not word_tokens:
        return float("nan")

    num_lexical_items = sum(1 for token in word_tokens if token.pos_ in lexical_pos)

    return num_lexical_items / len(word_tokens)

In [6]:
# Score every parsed document and store the result next to it.
dataset_df["lexical_density"] = dataset_df["doc"].map(calculate_lexical_density)
dataset_df

Unnamed: 0,id,text,headline,text_type,year,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"(Residents, of, multiple, states, will, be, as...",0.548057
1,http://www.politifact.com/california/statement...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016,"(Sacramento, ,, CA, -, United, States, Senator...",0.638298
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017,"(We, should, anticipate, black, and, gray, mar...",0.579096
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017,"(As, a, ballot, initiative, calling, for, repe...",0.607623
4,http://www.politifact.com/california/statement...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017,"(WASHINGTON, ,, DC, , The, House, of, Represe...",0.577875
...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,A great lesson in Optics 101: The Monroe Doctr...,,Social media,2023,"(A, great, lesson, in, Optics, 101, :, The, Mo...",0.520295
2957,Politifact_Pants on Fire_Social media_231170,“One of these Joe’s is not like the other… one...,,Social media,2023,"(“, One, of, these, Joe, ’s, is, not, like, th...",0.360000
2958,Politifact_Pants on Fire_Social media_874359,Autopsies Prove that COVID-19 is a Disseminate...,,Social media,2020,"(Autopsies, Prove, that, COVID-19, is, a, Diss...",0.538576
2959,Politifact_Pants on Fire_Social media_635418,She collapsed when she saw jfk jr. as she was ...,,Social media,2021,"(She, collapsed, when, she, saw, jfk, jr, ., a...",0.480519


Get separate dataframes for each year

In [7]:
# Split the dataset on the "year" column. get_groups is a project helper;
# presumably it returns the group keys and the matching sub-dataframes as two
# parallel sequences (they are zipped together when exporting) -- TODO confirm.
years, years_dfs = get_groups(dataset_df, "year")
years_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,doc,lexical_density
433,http://www.politifact.com/truth-o-meter/statem...,"Washington, D.C., Mar 25 - In response to sugg...",Bachmann Demands Truth: Will Obama Administrat...,Press release,2009,"(Washington, ,, D.C., ,, Mar, 25, -, In, respo...",0.556863
434,http://www.politifact.com/truth-o-meter/statem...,When most Americans talk about the need for he...,Taxpayer-Funded Abortion Is Not Health-Care Re...,News and blog,2009,"(When, most, Americans, talk, about, the, need...",0.580347
435,http://www.politifact.com/truth-o-meter/statem...,A number of people in the news analysis busine...,One of these things is not like the other,News and blog,2009,"(A, number, of, people, in, the, news, analysi...",0.515021
436,http://www.politifact.com/truth-o-meter/statem...,Yesterday President Obama responded to my stat...,,Social media,2009,"(Yesterday, President, Obama, responded, to, m...",0.5421
437,http://www.politifact.com/truth-o-meter/statem...,Secretary of Defense Robert Gates is extremely...,"Military to Pledge Oath To Obama, Not Constitu...",News and blog,2009,"(Secretary, of, Defense, Robert, Gates, is, ex...",0.531746


Get separate dataframes for each text type

In [8]:
# Split the dataset on the "text_type" column. get_groups is a project helper;
# presumably it returns the group keys and the matching sub-dataframes as two
# parallel sequences (they are zipped together when exporting) -- TODO confirm.
types, types_dfs = get_groups(dataset_df, "text_type")
types_dfs[0].head()

Unnamed: 0,id,text,headline,text_type,year,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016,"(Residents, of, multiple, states, will, be, as...",0.548057
2,http://www.politifact.com/california/statement...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017,"(We, should, anticipate, black, and, gray, mar...",0.579096
3,http://www.politifact.com/california/statement...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017,"(As, a, ballot, initiative, calling, for, repe...",0.607623
6,http://www.politifact.com/california/statement...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017,"(Recently, ,, a, group, of, special, interests...",0.573099
7,http://www.politifact.com/california/statement...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017,"(COSTA, MESA, ,, Orange, County, , It, was, a...",0.562782


We use pandas's built-in `.describe()` method to get summary statistics on lexical density for each year.

In [9]:
def get_summary_df(dfs: list[pd.DataFrame], years: list[int]):
    """Build one table of lexical-density summary statistics, one row per year.

    Args:
        dfs: Dataframes that each contain a "lexical_density" column.
        years: Row labels, given in the same order as ``dfs``.

    Returns:
        A dataframe whose rows are the ``.describe()`` output of each input
        dataframe's "lexical_density" column, indexed by ``years`` under the
        index name "year".
    """
    year_index = pd.Index(data=years, name="year")

    stats_rows = []
    for frame in dfs:
        stats_rows.append(frame["lexical_density"].describe())

    return pd.DataFrame(stats_rows, index=year_index)

## Writing dataframes to excel spreadsheet

Save to a single Excel file with tabs for each year, and a final tab for the summary

In [10]:
def save_years(writer: pd.ExcelWriter, years: list[int], dfs: list[pd.DataFrame]):
    """Write one sheet per year plus a final "Summary" sheet to ``writer``.

    Args:
        writer: Open Excel writer that receives the sheets. It is not closed
            here; that is left to the caller.
        years: Labels used as sheet names (converted with ``str``).
        dfs: Dataframes paired with ``years`` by position. Only their "id"
            and "lexical_density" columns are exported, without the index.
    """
    exported_columns = ["id", "lexical_density"]

    for year, year_df in zip(years, dfs):
        sheet = str(year)
        year_df.to_excel(writer, sheet_name=sheet, index=False, columns=exported_columns)

    # The summary keeps its index so the year labels show up in the sheet.
    get_summary_df(dfs, years).to_excel(writer, sheet_name="Summary")

In [11]:
# Export the whole dataset: one sheet per year plus the summary sheet.
# Using the writer as a context manager closes the workbook even when
# save_years raises, so the file handle is never left open.
with pd.ExcelWriter("./output/lexical_density.xlsx", engine="xlsxwriter") as writer:
    save_years(writer, years, years_dfs)

For each text type, we separate further into dataframes per year, and save the Excel files as above

In [13]:
# Export one workbook per text type, each with per-year sheets and a summary.
# Loop-local names are deliberately distinct from the notebook-level `years` /
# `years_dfs` (whole-dataset groups) and from the builtin `type`, so running
# this cell does not change what earlier cells see on a re-run.
for text_type, type_df in zip(types, types_dfs):
    type_years, type_years_dfs = get_groups(type_df, "year")

    # Folder/file-friendly form of the label, e.g. "News and blog" -> "news_and_blog".
    type_str = str(text_type).lower().replace(" ", "_")

    # NOTE(review): the ./output/<type_str>/ directory is assumed to exist already.
    # The context manager closes the workbook even if save_years raises.
    with pd.ExcelWriter(f"./output/{type_str}/lexical_density_{type_str}.xlsx", engine="xlsxwriter") as writer:
        save_years(writer, type_years, type_years_dfs)