In [2]:
from pathlib import Path

import pandas as pd

from quote_extractor import QuoteExtractor
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG

In [3]:
# Start from the base settings imported from dataset_config and add the
# location of the quote workbook that this notebook writes for each dataset.
fakespeak_config = BASE_FAKESPEAK_CONFIG | dict(
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_quotes.xlsx",
)

misinfotext_config = BASE_MISINFOTEXT_CONFIG | dict(
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_quotes.xlsx",
)

In [4]:
# Configuration read by the loading, extraction and export cells below.
# misinfotext_config (defined above) is the alternative choice.
using_dataset = fakespeak_config

In [6]:
# Load the configured sheet of the input workbook, restricted to the
# configured columns.
dataset_df = pd.read_excel(
    io=using_dataset["input_path"],
    usecols=using_dataset["usecols"],
    sheet_name=using_dataset["sheet_name"],
)
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019
...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021


In [9]:
# Build the extractor once; the next cell runs it over every document.
# NOTE(review): the arguments look like a spaCy model name and the path to a
# list of quote-introducing verbs — confirm against QuoteExtractor's signature.
quote_extractor = QuoteExtractor("en_core_web_lg", "./config/quote_verb_list.txt")

  import pkg_resources


In [10]:
# Run quote extraction over the whole dataset: the configured ID column is
# passed first, the configured text column second.
document_ids = dataset_df[using_dataset["id_col"]]
document_texts = dataset_df[using_dataset["text_col"]]
quote_annotations = quote_extractor.run_multiple(document_ids, document_texts)

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


In [11]:
# quote_annotations supplies one list of annotations per row of dataset_df.
# Keep that per-row nesting so each row gets a list of quote texts and a
# matching list of token counts.
dataset_df["quotes"] = [
    [annotation["quote"] for annotation in row_annotations]
    for row_annotations in quote_annotations
]
dataset_df["quote_lengths"] = [
    [annotation["quote_token_count"] for annotation in row_annotations]
    for row_annotations in quote_annotations
]
dataset_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quotes,quote_lengths
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019,[],[]
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"[""why should American citizens be responsible ...",[18]
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019,[],[]
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019,[],[]
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019,[],[]
...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,Pants on Fire,Social media,A great lesson in Optics 101: The Monroe Doctr...,2023,[that the United States would not tolerate fur...,"[12, 13]"
2957,Politifact_Pants on Fire_Social media_231170,Pants on Fire,Social media,“One of these Joe’s is not like the other… one...,2023,"[""One of these Joe’s is not like the other… on...",[21]
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"[were never needed.\n , they were cured with, ...","[5, 4, 18, 12, 13, 15, 51, 42, 14]"
2959,Politifact_Pants on Fire_Social media_635418,Pants on Fire,Social media,She collapsed when she saw jfk jr. as she was ...,2021,[],[]


In [12]:
# One row per extracted quote: explode the two parallel list columns together,
# give them singular names, and drop the rows whose exploded quote is missing
# (documents in which no quote was found).
all_quotes_df = (
    dataset_df
    .explode(["quotes", "quote_lengths"])
    .rename(columns={"quotes": "quote", "quote_lengths": "quote_length"})
    .dropna(subset=["quote"])
)
all_quotes_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11
...,...,...,...,...,...,...,...
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", treatment in ICUs is useless if thromboembol...",13
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", inflammation induces thrombosis through a co...",15
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""Thanks to 50 autopsies performed on patients ...",51
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""If we ventilate a lung where blood does not c...",42


In [29]:
# Split the quotes by originalDateYear.
# NOTE: `years` is the groupby's `.groups` mapping (year -> row labels), not a
# plain list; iterating over it yields the year keys.
grouped_by_year = all_quotes_df.groupby(by="originalDateYear")
years = grouped_by_year.groups
years_dfs = list(map(grouped_by_year.get_group, years))
years_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11


In [30]:
# Number of non-missing quotes in each year group.
num_quotes_per_year = grouped_by_year["quote"].agg("count")
num_quotes_per_year

originalDateYear
2019     192
2020     970
2021    1200
2022     648
2023    1196
2024     239
Name: quote, dtype: int64

In [31]:
# Descriptive statistics of quote length, one row per year.
# convert_dtypes() is applied first because the exploded column is not stored
# with a numeric dtype.
yearly_length_stats = [
    year_df["quote_length"].convert_dtypes().describe() for year_df in years_dfs
]
year_index = pd.Index(data=years, name="year")
quote_length_summary_df = pd.DataFrame(yearly_length_stats, index=year_index)
quote_length_summary_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019,192.0,20.567708,16.172357,4.0,10.0,16.0,25.0,98.0
2020,970.0,21.423711,16.149548,4.0,10.0,17.0,27.0,104.0
2021,1200.0,21.8125,15.994391,4.0,10.75,17.0,28.0,110.0
2022,648.0,21.762346,16.050702,4.0,11.0,18.0,27.0,137.0
2023,1196.0,21.866221,14.671938,4.0,12.0,18.0,28.0,120.0
2024,239.0,24.238494,17.143169,4.0,12.0,20.0,31.0,96.0


In [32]:
# Restrict the quotes to the "News and blog" and "Social media" text types.
only_news_blog_social_media_df = all_quotes_df[
    all_quotes_df["originalTextType"].isin(["News and blog", "Social media"])
]
only_news_blog_social_media_df

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11
...,...,...,...,...,...,...,...
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", treatment in ICUs is useless if thromboembol...",13
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,", inflammation induces thrombosis through a co...",15
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""Thanks to 50 autopsies performed on patients ...",51
2958,Politifact_Pants on Fire_Social media_874359,Pants on Fire,Social media,Autopsies Prove that COVID-19 is a Disseminate...,2020,"""If we ventilate a lung where blood does not c...",42


In [33]:
# Same per-year split as for the full data, applied to the filtered subset.
# NOTE: `years_news_blog_social_media` is the `.groups` mapping (year -> row
# labels); iterating over it yields the year keys.
grouped_by_year_news_blog_social_media = only_news_blog_social_media_df.groupby(
    by="originalDateYear"
)
years_news_blog_social_media = grouped_by_year_news_blog_social_media.groups
years_news_blog_social_media_dfs = list(
    map(grouped_by_year_news_blog_social_media.get_group, years_news_blog_social_media)
)
years_news_blog_social_media_dfs[0].head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019,"""why should American citizens be responsible t...",18
13,Politifact_FALSE_Social media_169258,False,Social media,According to the 2016 Annual Survey of School ...,2019,", U.S. Census Bureau, West Virginia spends mor...",26
14,Politifact_FALSE_Social media_19711,False,Social media,.@realDonaldTrump on people asking for asylum ...,2019,"""These aren't people. These are animals.""",11
18,Politifact_FALSE_Social media_60177,False,Social media,Thank you North Carolina. You showed up in mas...,2019,that the state fund our public school legislat...,9
19,Politifact_FALSE_News and blog_605527,False,News and blog,Hollywood legend Tom Selleck has praised Donal...,2019,I’m completely sure that he is the best so far,11


In [34]:
# Number of non-missing quotes in each year group of the filtered subset.
num_quotes_per_year_news_blog_social_media = (
    grouped_by_year_news_blog_social_media["quote"].agg("count")
)
num_quotes_per_year_news_blog_social_media

originalDateYear
2019     163
2020     934
2021    1186
2022     619
2023    1192
2024     235
Name: quote, dtype: int64

In [35]:
# Descriptive statistics of quote length per year for the filtered subset.
# convert_dtypes() is applied first because the exploded column is not stored
# with a numeric dtype.
subset_yearly_length_stats = [
    year_df["quote_length"].convert_dtypes().describe()
    for year_df in years_news_blog_social_media_dfs
]
subset_year_index = pd.Index(data=years_news_blog_social_media, name="year")
quote_length_summary_news_blog_social_media_df = pd.DataFrame(
    subset_yearly_length_stats, index=subset_year_index
)
quote_length_summary_news_blog_social_media_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019,163.0,20.06135,15.88022,4.0,10.0,15.0,24.0,98.0
2020,934.0,21.12955,16.110195,4.0,10.0,17.0,26.0,104.0
2021,1186.0,21.620573,15.612265,4.0,10.0,17.0,28.0,110.0
2022,619.0,20.915994,14.668264,4.0,10.0,17.0,26.0,137.0
2023,1192.0,21.833893,14.632903,4.0,12.0,18.0,28.0,120.0
2024,235.0,24.089362,17.010193,4.0,12.0,20.0,30.5,96.0


In [36]:
# Export the full-dataset results to a single workbook: one sheet per year
# plus two summary sheets.
# The config is read with subscript access, matching every other cell in this
# notebook; attribute access fails on a plain dict.
quotes_output_path = using_dataset["output_path"]

# Writing fails if the target directory is missing, so create it up front.
Path(quotes_output_path).parent.mkdir(parents=True, exist_ok=True)

# The context manager saves and closes the workbook even if a sheet fails to
# write, so no half-open file handle is left behind in the kernel.
with pd.ExcelWriter(quotes_output_path, engine="xlsxwriter") as writer:
    # One sheet per year, shortest quotes first.
    for df, year in zip(years_dfs, years):
        df.sort_values(by="quote_length").to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["quote", "quote_length", "originalDateYear"],
        )

    num_quotes_per_year.to_excel(writer, sheet_name="Number of quotes")
    quote_length_summary_df.to_excel(writer, sheet_name="Quote length summary")

In [37]:
# Derive the workbook path for the news/blog + social-media subset by placing
# the file in a "news_blog_and_social_media" folder next to the main output.
# The config is read with subscript access, matching every other cell in this
# notebook; attribute access fails on a plain dict.
output_path = using_dataset["output_path"]
output_path_split = output_path.split("/")
# insert(-1, ...) puts the new folder just before the final component (the file name).
output_path_split.insert(-1, "news_blog_and_social_media")
output_path_news_blog_social_media = "/".join(output_path_split)
output_path_news_blog_social_media

'./data/Fakespeak-ENG/Analysis_output/news_blog_and_social_media/Fakespeak_quotes.xlsx'

In [38]:
# Export the news/blog + social-media results to their own workbook: one sheet
# per year plus two summary sheets.

# Nothing else in this notebook creates the sub-folder this path points into,
# and writing fails if it is missing, so create it up front.
Path(output_path_news_blog_social_media).parent.mkdir(parents=True, exist_ok=True)

# The context manager saves and closes the workbook even if a sheet fails to
# write, so no half-open file handle is left behind in the kernel.
with pd.ExcelWriter(output_path_news_blog_social_media, engine="xlsxwriter") as writer:
    # One sheet per year, shortest quotes first.
    for df, year in zip(years_news_blog_social_media_dfs, years_news_blog_social_media):
        df.sort_values(by="quote_length").to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["quote", "quote_length", "originalDateYear"],
        )

    num_quotes_per_year_news_blog_social_media.to_excel(
        writer, sheet_name="Number of quotes"
    )
    quote_length_summary_news_blog_social_media_df.to_excel(
        writer, sheet_name="Quote length summary"
    )