In [1]:
import os
from typing import Optional

import pandas as pd
import spacy
from spacy.tokens.doc import Doc

In [2]:
class DatasetConfig:
    """Settings describing where one dataset is read from and written to.

    Attributes:
        input_path: Path of the Excel workbook to read.
        output_path: Path of the Excel workbook the results are written to.
        sheet_name: Name of the worksheet to load from the input workbook.
        usecols: Columns to load from the worksheet, or None to load all of them.
        cols_to_save: Columns written to each per-year sheet of the output workbook.
    """

    input_path: str
    output_path: str
    sheet_name: str
    # None is a legitimate value here (see misinfotext_config), hence Optional.
    usecols: Optional[list[str]]
    cols_to_save: list[str]

    def __init__(
        self,
        input_path: str,
        output_path: str,
        sheet_name: str,
        usecols: Optional[list[str]],
        cols_to_save: list[str],
    ) -> None:
        """Store the given settings unchanged on the instance."""
        self.input_path = input_path
        self.output_path = output_path
        self.sheet_name = sheet_name
        self.usecols = usecols
        self.cols_to_save = cols_to_save

In [None]:
# Fakespeak-ENG: only the listed columns are loaded from the "Working" sheet.
# Alternative (Colab) location of the input workbook, kept for reference:
#   /content/drive/My Drive/fake_news_over_time/Fakespeak_ENG_modified.xlsx
fakespeak_config = DatasetConfig(
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_lexical_density.xlsx",
    sheet_name="Working",
    usecols=[
        "ID",
        "combinedLabel",
        "originalTextType",
        "originalBodyText",
        "originalDateYear",
    ],
    cols_to_save=[
        "ID",
        "combinedLabel",
        "originalTextType",
        "originalBodyText",
        "lexical_density",
    ],
)

# MisInfoText (PolitiFact): usecols=None, so no column restriction is applied on load.
misinfotext_config = DatasetConfig(
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_lexical_density.xlsx",
    sheet_name="Working",
    usecols=None,
    cols_to_save=[
        "factcheckURL",
        "originalURL",
        "originalTextType",
        "originalBodyText",
        "lexical_density",
    ],
)

In [4]:
# Choose which dataset configuration (fakespeak_config or misinfotext_config)
# the remaining cells read from and write to.
using_dataset = misinfotext_config

In [5]:
# Load the configured worksheet of the selected dataset into a DataFrame.
dataset_df = pd.read_excel(
    io=using_dataset.input_path,
    sheet_name=using_dataset.sheet_name,
    usecols=using_dataset.usecols,
)

In [6]:
# Show the first rows to check which columns were loaded.
dataset_df.head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017


## Calculating lexical density using spacy

Lexical density is the number of lexical words (nouns, proper nouns, verbs, adjectives, and adverbs) divided by the total number of tokens in the text.

In [7]:
# Load the spaCy pipeline whose part-of-speech tags are used to count lexical words.
nlp = spacy.load('en_core_web_md')

  import pkg_resources


In [8]:
# Run every body text through the spaCy pipeline and keep one parsed Doc per row.
# NOTE(review): assumes every "originalBodyText" cell is a string — confirm there are no missing values.
dataset_df["doc"] = list(nlp.pipe(dataset_df["originalBodyText"]))

In [9]:
# Coarse part-of-speech tags that count as lexical (content) words.
lexical_pos = ["PROPN", "NOUN", "VERB", "ADJ", "ADV"]

def calculate_lexical_density(doc: "Doc") -> float:
    """Return the share of tokens in ``doc`` whose POS tag is lexical.

    The numerator counts tokens whose ``pos_`` is listed in ``lexical_pos``.
    The denominator is ``len(doc)``, i.e. every token in the document, not
    only the word tokens.

    Args:
        doc: A parsed spaCy document (any sized iterable of objects with a
            ``pos_`` attribute works).

    Returns:
        The lexical density as a float, or NaN when ``doc`` has no tokens,
        because the ratio is undefined for an empty document.
    """
    total_tokens = len(doc)
    if total_tokens == 0:
        # An empty text would otherwise raise ZeroDivisionError.
        return float("nan")

    num_lexical_items = sum(1 for token in doc if token.pos_ in lexical_pos)

    return num_lexical_items / total_tokens

In [10]:
# Score every parsed document and store the result in a new "lexical_density" column.
dataset_df["lexical_density"] = dataset_df["doc"].apply(calculate_lexical_density)
# Trailing expression so the notebook displays the updated frame.
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"(Residents, of, multiple, states, will, be, as...",0.474427
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,"(Sacramento, ,, CA, -, United, States, Senator...",0.562212
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"(We, should, anticipate, black, and, gray, mar...",0.496368
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"(As, a, ballot, initiative, calling, for, repe...",0.538767
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,"(WASHINGTON, ,, DC, , The, House, of, Represe...",0.516796
...,...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,"(Road, projects, across, the, state, are, stay...",0.500000
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,"(The, last, thing, we, need, is, more, Madison...",0.469388
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,"(When, \n, @ScottWalker, \n , told, firefighte...",0.433333
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,"(Now, that, its, 2018, ,, an, election, year, ...",0.498442


In [11]:
# Split the scored rows into one DataFrame per value of "originalDateYear".
grouped_by_year = dataset_df.groupby(by="originalDateYear")
years = grouped_by_year.groups  # keyed by year; iterating it yields the years
years_dfs = list(map(grouped_by_year.get_group, years))
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,"("", On, Monday, ,, Senator, Clinton, told, an,...",0.472973


In [12]:
# One row of descriptive statistics of lexical density per year.
yearly_stats = [year_df["lexical_density"].describe() for year_df in years_dfs]
year_index = pd.Index(data=years, name="year")
summary_df = pd.DataFrame(yearly_stats, index=year_index)
summary_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007,1.0,0.472973,,0.472973,0.472973,0.472973,0.472973,0.472973
2008,5.0,0.484004,0.0136,0.468451,0.473002,0.484794,0.492174,0.501597
2009,17.0,0.475107,0.021859,0.438023,0.458882,0.473684,0.489045,0.514874
2010,23.0,0.511652,0.046264,0.460345,0.479231,0.496795,0.530775,0.666667
2011,44.0,0.501621,0.029207,0.432432,0.478107,0.504715,0.524787,0.565574
2012,28.0,0.500466,0.033157,0.427907,0.477742,0.505387,0.518935,0.574186
2013,61.0,0.493766,0.047196,0.385263,0.461615,0.490119,0.521242,0.666667
2014,34.0,0.486343,0.040193,0.411765,0.462724,0.485118,0.506516,0.590909
2015,6.0,0.491551,0.044302,0.448324,0.462211,0.479101,0.50829,0.568182
2016,91.0,0.497753,0.053402,0.357143,0.464286,0.491585,0.520891,0.705882


In [13]:
# Keep only the rows whose text type is "News and blog" or "Social media".
only_news_blog_and_social_media_df = dataset_df[
    dataset_df["originalTextType"].isin(["News and blog", "Social media"])
]
only_news_blog_and_social_media_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"(Residents, of, multiple, states, will, be, as...",0.474427
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,"(We, should, anticipate, black, and, gray, mar...",0.496368
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,"(As, a, ballot, initiative, calling, for, repe...",0.538767
6,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/opinion/openforum/a...,"Recently, a group of special interests threate...","Repeal Californias gas tax increase, says GOP ...",News and blog,2017-10-19,2017,"(Recently, ,, a, group, of, special, interests...",0.503425
7,http://www.politifact.com/california/statement...,http://www.sfchronicle.com/politics/article/Th...,"COSTA MESA, Orange County It was a surreal vi...","The pro-Russia, pro-weed, pro-Assange GOP cong...",News and blog,2017-09-14,2017,"(COSTA, MESA, ,, Orange, County, , It, was, a...",0.498021
...,...,...,...,...,...,...,...,...,...
649,http://www.politifact.com/wisconsin/statements...,http://www.jsonline.com/story/opinion/contribu...,CLOSE President Donald Trump outlined his tax ...,Trump: We must fix our self-destructive tax code,News and blog,2017-09-03,2017,"(CLOSE, President, Donald, Trump, outlined, hi...",0.514136
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,"(Road, projects, across, the, state, are, stay...",0.500000
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,"(The, last, thing, we, need, is, more, Madison...",0.469388
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,"(When, \n, @ScottWalker, \n , told, firefighte...",0.433333


In [14]:
# Same per-year split as before, applied to the filtered rows only.
grouped_by_year_news_blog_social_media = only_news_blog_and_social_media_df.groupby(by="originalDateYear")
years_news_blog_social_media = grouped_by_year_news_blog_social_media.groups  # keyed by year
years_news_blog_social_media_dfs = list(
    map(grouped_by_year_news_blog_social_media.get_group, years_news_blog_social_media)
)
years_news_blog_social_media_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,doc,lexical_density
429,http://www.politifact.com/truth-o-meter/statem...,http://www.nypost.com/seven/03212008/postopini...,NOW that Hillary Clintons schedule as first la...,HEAVY HITTER? NOT HILLARY,News and blog,2008-03-21,2008,"(NOW, that, Hillary, Clintons, schedule, as, f...",0.492174
430,http://www.politifact.com/truth-o-meter/statem...,https://www.citizenlink.org/focusaction/update...,What does it take to be the most liberal membe...,March 2008 Action Update,News and blog,2008-03-26,2008,"(What, does, it, take, to, be, the, most, libe...",0.468451
432,http://www.politifact.com/truth-o-meter/statem...,http://www.nysun.com/opinion/palin-on-ahmadine...,"Governor Palin, the Republican nominee for vic...",Palin on Ahmadinejad: 'He Must Be Stopped',News and blog,2008-09-22,2008,"(Governor, Palin, ,, the, Republican, nominee,...",0.473002


In [15]:
# Per-year descriptive statistics of lexical density for the filtered rows.
yearly_stats_news_blog_social_media = [
    year_df["lexical_density"].describe()
    for year_df in years_news_blog_social_media_dfs
]
year_index_news_blog_social_media = pd.Index(data=years_news_blog_social_media, name="year")
summary_news_blog_social_media_df = pd.DataFrame(
    yearly_stats_news_blog_social_media,
    index=year_index_news_blog_social_media,
)
summary_news_blog_social_media_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008,3.0,0.477876,0.01259,0.468451,0.470727,0.473002,0.482588,0.492174
2009,12.0,0.470435,0.021403,0.438023,0.45823,0.463128,0.4865,0.508197
2010,15.0,0.502764,0.050888,0.460345,0.475806,0.484565,0.519292,0.666667
2011,27.0,0.497439,0.029225,0.432432,0.473459,0.500688,0.525731,0.541667
2012,16.0,0.50336,0.029439,0.460177,0.482604,0.502397,0.516716,0.574186
2013,44.0,0.484701,0.046656,0.385263,0.457279,0.482005,0.502332,0.666667
2014,23.0,0.479879,0.043714,0.411765,0.445374,0.480769,0.505307,0.590909
2015,6.0,0.491551,0.044302,0.448324,0.462211,0.479101,0.50829,0.568182
2016,77.0,0.494087,0.055518,0.357143,0.455674,0.49061,0.518519,0.705882
2017,205.0,0.490706,0.057397,0.315789,0.456731,0.485333,0.515707,0.722222


## Writing dataframes to excel spreadsheet

In [16]:
# Create the destination folder first; the workbook cannot be saved into a missing directory.
output_dir = os.path.dirname(using_dataset.output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# The context manager saves and closes the workbook even if writing a sheet raises.
with pd.ExcelWriter(using_dataset.output_path, engine="xlsxwriter") as writer:
    # One sheet per year, limited to the columns configured for export.
    for df, year in zip(years_dfs, years):
        df.to_excel(writer, sheet_name=str(year), index=False, columns=using_dataset.cols_to_save)

    # The per-year statistics go on their own sheet; the year index is kept.
    summary_df.to_excel(writer, sheet_name="Summary")

In [17]:
# Put the filtered results in a "news_blog_and_social_media" subfolder next to
# the main output file, keeping the same file name. os.path is used instead of
# splitting on "/" so the directory and file name are separated correctly on any platform.
output_path = using_dataset.output_path
output_path_news_blog_social_media = os.path.join(
    os.path.dirname(output_path),
    "news_blog_and_social_media",
    os.path.basename(output_path),
)
output_path_news_blog_social_media

'./data/MisInfoText/Analysis_output/news_blog_and_social_media/MisInfoText_lexical_density.xlsx'

In [18]:
# The "news_blog_and_social_media" subfolder may not exist yet; create it before writing.
news_blog_social_media_dir = os.path.dirname(output_path_news_blog_social_media)
if news_blog_social_media_dir:
    os.makedirs(news_blog_social_media_dir, exist_ok=True)

# The context manager saves and closes the workbook even if writing a sheet raises.
with pd.ExcelWriter(output_path_news_blog_social_media, engine="xlsxwriter") as writer:
    # One sheet per year, limited to the columns configured for export.
    for df, year in zip(years_news_blog_social_media_dfs, years_news_blog_social_media):
        df.to_excel(writer, sheet_name=str(year), index=False, columns=using_dataset.cols_to_save)

    # The per-year statistics go on their own sheet; the year index is kept.
    summary_news_blog_social_media_df.to_excel(writer, sheet_name="Summary")