In [1]:
from pathlib import Path
from typing import Optional

import pandas as pd
from quote_extractor import QuoteExtractor

In [2]:
class DatasetConfig:
    """Paths and spreadsheet-layout settings describing one dataset.

    Attributes:
        input_path: Excel workbook that the notebook loads with ``pd.read_excel``.
        output_path: Excel workbook that the per-year quote sheets are written to.
        quote_annotations_path: Path for the quote annotations JSON file.
            NOTE(review): not read by the cells visible here -- confirm its consumer.
        sheet_name: Worksheet of ``input_path`` to load.
        id_col: Column whose values identify each document.
        usecols: Columns to load from the worksheet, or ``None`` to apply no
            column restriction.
    """
    input_path: str
    output_path: str
    quote_annotations_path: str
    sheet_name: str
    id_col: str
    usecols: Optional[list[str]]

    def __init__(
        self,
        input_path: str,
        output_path: str,
        quote_annotations_path: str,
        sheet_name: str,
        id_col: str,
        usecols: Optional[list[str]] = None,
    ):
        """Store the settings unchanged; nothing is validated or opened here."""
        self.input_path = input_path
        self.output_path = output_path
        self.quote_annotations_path = quote_annotations_path
        self.sheet_name = sheet_name
        self.id_col = id_col
        # ``None`` is a legitimate value (one of the configs passes it explicitly),
        # so it is both allowed by the annotation and used as the default.
        self.usecols = usecols

    def __repr__(self) -> str:
        """Show every stored setting, so displaying a config in a cell is informative."""
        settings = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({settings})"

In [3]:
# Fakespeak-ENG: loading is restricted to the columns named in `usecols`.
# Commented-out alternative input location, kept for reference:
#   "/content/drive/My Drive/fake_news_over_time/Fakespeak_ENG_modified.xlsx"
fakespeak_config = DatasetConfig(
    sheet_name="Working",
    id_col="ID",
    usecols=[
        "ID",
        "combinedLabel",
        "originalTextType",
        "originalBodyText",
        "originalDateYear",
    ],
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_quotes.xlsx",
    quote_annotations_path="./data/Fakespeak-ENG/Analysis_output/quote_annotations.json",
)

# MisInfoText: documents are identified by their fact-check URL and no
# column restriction is applied (`usecols=None`).
misinfotext_config = DatasetConfig(
    sheet_name="Working",
    id_col="factcheckURL",
    usecols=None,
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_quotes.xlsx",
    quote_annotations_path="./data/MisInfoText/Analysis_output/quote_annotations.json",
)

In [4]:
# Select the dataset configuration that the later cells read their paths,
# sheet name, id column and column list from.
using_dataset = misinfotext_config

In [5]:
# Read the configured worksheet of the selected dataset's workbook,
# passing the configured column list through to pandas.
dataset_df = pd.read_excel(
    io=using_dataset.input_path,
    usecols=using_dataset.usecols,
    sheet_name=using_dataset.sheet_name,
)
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017
...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018


In [6]:
# Build the project's quote extractor.
# NOTE(review): the arguments look like a spaCy model name and the path to a
# list of quoting verbs -- confirm against QuoteExtractor's signature.
quote_extractor = QuoteExtractor("en_core_web_lg", "./quote_verb_list.txt")

  import pkg_resources


In [7]:
# Run quote extraction over the whole dataset: ids come from the configured
# id column, texts from "originalBodyText".
# NOTE(review): later cells iterate the result as one collection of quote
# records per document and assign it back row by row, which assumes the
# result is in the same order as dataset_df -- confirm in run_multiple.
quote_annotations = quote_extractor.run_multiple(dataset_df[using_dataset.id_col], dataset_df["originalBodyText"])

Preprocessing texts...
Creating spacy docs...
Extracting quotes...
Done extracting quotes


In [8]:
# One list per document: the text of every quote found in it ...
dataset_df["quotes"] = [
    [annotation["quote"] for annotation in doc_annotations]
    for doc_annotations in quote_annotations
]
# ... and, in the same order, the token count of each of those quotes.
dataset_df["quote_lengths"] = [
    [annotation["quote_token_count"] for annotation in doc_annotations]
    for doc_annotations in quote_annotations
]
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quotes,quote_lengths
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[, abandoned pet rates have sky-rocketed in Te...","[24, 24, 19, 50, 11]"
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,[that deaths in marijuana-related car crashes ...,"[15, 63]"
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,[Organized crime gangs are buying hundreds or ...,"[22, 4, 32, 8, 24, 18, 23]"
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,[hes going to raise their gas taxes to the hig...,"[23, 16, 15, 16, 30, 19, 47]"
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,[],[]
...,...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,[],[]
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,[],[]
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,[],[]
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,[],[]


In [9]:
# Reshape to one row per quote: explode the two parallel list columns together,
# give them singular names, then drop the rows whose quote is missing.
all_quotes_df = (
    dataset_df
    .explode(["quotes", "quote_lengths"])
    .rename(columns={"quotes": "quote", "quote_lengths": "quote_length"})
    .dropna(subset=["quote"])
)
all_quotes_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,quote_length
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", abandoned pet rates have sky-rocketed in Tex...",24
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", pet owners within Texas, Arizona, and Missou...",24
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"a state funded, mandatory ‘pet registration’ p...",19
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", an incentive program may be implemented to e...",50
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"""domestic animal the size of a cat or larger""",11
...,...,...,...,...,...,...,...,...,...
642,http://www.politifact.com/wisconsin/statements...,https://x.com/SheriffClarke/status/83266463840...,Michelle Obama said she was never proud of her...,,Social media,2017-02-17,2017,she was never proud of her country til they el...,13
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,"this was an ""extraordinary unusual"" letter and...",21
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,"""We will NOT Cease and Desist!""",9
646,http://www.politifact.com/wisconsin/statements...,http://resistancereport.com/politics/senator-c...,Senator Ron Johnson (R-Wisconsin) apparently d...,Republican Senator Threatens Citizens with Arr...,News and blog,2017-03-02,2017,the citizens who pay his salary should get to ...,12


In [10]:
# Split the quotes by publication year.
grouped_by_year = all_quotes_df.groupby("originalDateYear")
# `years` is the groupby's groups mapping; iterating it yields the year keys.
years = grouped_by_year.groups
# One DataFrame per year, in the same order as the keys of `years`.
years_dfs = list(map(grouped_by_year.get_group, years))
years_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,quote_length
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,that the surge of troops in Iraq was 'working,10
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,the surge 'has failed' and that we should 'beg...,18
428,http://www.politifact.com/truth-o-meter/statem...,https://www.johnmccain.com/Informing/News/Pres...,"""On Monday, Senator Clinton told an audience a...",Statement By John McCain on Hillary Clinton,Press release,2007-08-23,2007,"""The fact that the New York senator can revers...",81


In [11]:
# Count the non-missing "quote" values in each year group.
num_quotes_per_year = grouped_by_year["quote"].count()
num_quotes_per_year

originalDateYear
2007      3
2008     49
2009    105
2010     97
2011    174
2012    160
2013    203
2014    133
2015     17
2016    296
2017    649
2018    426
Name: quote, dtype: int64

In [12]:
# One row of descriptive statistics per year. `convert_dtypes()` is applied to
# each year's quote lengths before `describe()` is called on them.
quote_length_summary_df = pd.DataFrame(
    data=[
        year_df["quote_length"].convert_dtypes().describe()
        for year_df in years_dfs
    ],
    index=pd.Index(years, name="year"),
)
quote_length_summary_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007,3.0,36.333333,38.88873,10.0,14.0,18.0,49.5,81.0
2008,49.0,22.183673,21.419105,5.0,9.0,12.0,27.0,95.0
2009,105.0,21.47619,13.661495,6.0,11.0,18.0,26.0,74.0
2010,97.0,23.134021,15.727251,4.0,13.0,19.0,29.0,92.0
2011,174.0,22.66092,14.625774,4.0,13.0,20.0,28.75,105.0
2012,160.0,20.70625,14.072408,4.0,10.0,18.0,27.25,71.0
2013,203.0,19.955665,13.542489,4.0,11.0,17.0,25.0,90.0
2014,133.0,20.180451,14.105573,4.0,10.0,18.0,23.0,89.0
2015,17.0,20.0,30.667572,4.0,8.0,12.0,20.0,136.0
2016,296.0,22.185811,15.839532,4.0,11.0,18.0,27.0,93.0


In [13]:
# Keep only the quotes whose source text type is "News and blog" or "Social media".
only_news_blog_social_media_df = all_quotes_df[
    all_quotes_df["originalTextType"].isin(["News and blog", "Social media"])
]
only_news_blog_social_media_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,quote_length
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", abandoned pet rates have sky-rocketed in Tex...",24
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", pet owners within Texas, Arizona, and Missou...",24
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"a state funded, mandatory ‘pet registration’ p...",19
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,", an incentive program may be implemented to e...",50
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"""domestic animal the size of a cat or larger""",11
...,...,...,...,...,...,...,...,...,...
631,http://www.politifact.com/wisconsin/statements...,http://freepatriot.org/2014/07/04/obama-admini...,This past week the Obama administration filed ...,Obama Administration Suing Wisconsin Company f...,News and blog,2014-07-04,2014,"""When speaking English fluently is not, in fac...",33
642,http://www.politifact.com/wisconsin/statements...,https://x.com/SheriffClarke/status/83266463840...,Michelle Obama said she was never proud of her...,,Social media,2017-02-17,2017,she was never proud of her country til they el...,13
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,"this was an ""extraordinary unusual"" letter and...",21
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,"""We will NOT Cease and Desist!""",9


In [14]:
# Same per-year split as for the full data, applied to the filtered subset.
grouped_by_year_news_blog_social_media = only_news_blog_social_media_df.groupby("originalDateYear")
# The groups mapping; iterating it yields the year keys.
years_news_blog_social_media = grouped_by_year_news_blog_social_media.groups
# One DataFrame per year, in the same order as those keys.
years_news_blog_social_media_dfs = list(
    map(grouped_by_year_news_blog_social_media.get_group, years_news_blog_social_media)
)
years_news_blog_social_media_dfs[0].head()

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,quote_length
429,http://www.politifact.com/truth-o-meter/statem...,http://www.nypost.com/seven/03212008/postopini...,NOW that Hillary Clintons schedule as first la...,HEAVY HITTER? NOT HILLARY,News and blog,2008-03-21,2008,to be was nowhere evident,5
430,http://www.politifact.com/truth-o-meter/statem...,https://www.citizenlink.org/focusaction/update...,What does it take to be the most liberal membe...,March 2008 Action Update,News and blog,2008-03-26,2008,"he is ""opposed"" to same-sex ""marriage",11
430,http://www.politifact.com/truth-o-meter/statem...,https://www.citizenlink.org/focusaction/update...,What does it take to be the most liberal membe...,March 2008 Action Update,News and blog,2008-03-26,2008,this link was accessed in 2007,6
432,http://www.politifact.com/truth-o-meter/statem...,http://www.nysun.com/opinion/palin-on-ahmadine...,"Governor Palin, the Republican nominee for vic...",Palin on Ahmadinejad: 'He Must Be Stopped',News and blog,2008-09-22,2008,"that Iran is running at least 3,800 centrifuge...",17
432,http://www.politifact.com/truth-o-meter/statem...,http://www.nysun.com/opinion/palin-on-ahmadine...,"Governor Palin, the Republican nominee for vic...",Palin on Ahmadinejad: 'He Must Be Stopped',News and blog,2008-09-22,2008,", U.S. intelligence agencies believe the Irani...",20


In [15]:
# Count the non-missing "quote" values in each year group of the
# news/blog/social-media subset.
num_quotes_per_year_news_blog_social_media = grouped_by_year_news_blog_social_media["quote"].count()
num_quotes_per_year_news_blog_social_media

originalDateYear
2008     11
2009     78
2010     82
2011    107
2012     98
2013    161
2014     81
2015     17
2016    255
2017    610
2018    409
Name: quote, dtype: int64

In [16]:
# One row of descriptive statistics per year for the filtered subset.
# `convert_dtypes()` is applied to each year's quote lengths before `describe()`.
quote_length_summary_news_blog_social_media_df = pd.DataFrame(
    data=[
        year_df["quote_length"].convert_dtypes().describe()
        for year_df in years_news_blog_social_media_dfs
    ],
    index=pd.Index(years_news_blog_social_media, name="year"),
)
quote_length_summary_news_blog_social_media_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008,11.0,12.454545,6.21874,5.0,7.5,11.0,18.5,22.0
2009,78.0,20.512821,13.587373,6.0,10.0,17.0,25.75,74.0
2010,82.0,20.841463,11.527176,4.0,12.0,17.5,27.75,54.0
2011,107.0,19.411215,11.827337,4.0,12.0,17.0,25.0,91.0
2012,98.0,19.265306,12.845274,4.0,10.0,17.0,23.75,71.0
2013,161.0,18.149068,11.883398,4.0,10.0,16.0,22.0,90.0
2014,81.0,19.333333,12.522979,4.0,10.0,18.0,23.0,89.0
2015,17.0,20.0,30.667572,4.0,8.0,12.0,20.0,136.0
2016,255.0,21.082353,15.368604,4.0,11.0,17.0,26.0,93.0
2017,610.0,20.852459,14.422474,4.0,11.0,17.0,26.0,122.0


In [17]:
# Make sure the destination folder exists before opening the workbook;
# the writer does not create missing directories.
Path(using_dataset.output_path).parent.mkdir(parents=True, exist_ok=True)

# Using the writer as a context manager guarantees it is closed even if one of
# the writes below raises, instead of leaking the open file handle.
with pd.ExcelWriter(using_dataset.output_path, engine="xlsxwriter") as writer:
    # One sheet per year, named after the year, listing quotes from shortest to longest.
    for df, year in zip(years_dfs, years):
        df\
            .sort_values(by="quote_length")\
            .to_excel(writer, sheet_name=str(year), index=False, columns=["quote", "quote_length", "originalDateYear"])

    # Two summary sheets: quote counts per year and quote-length statistics per year.
    num_quotes_per_year.to_excel(writer, sheet_name="Number of quotes")
    quote_length_summary_df.to_excel(writer, sheet_name="Quote length summary")

In [18]:
# Derive the output location for the filtered subset: same file name, placed in
# a "news_blog_and_social_media" folder inside the original output folder.
output_path = using_dataset.output_path
output_path_split = output_path.split("/")
# Slice-assign just before the last component (the file name) to add the folder.
output_path_split[-1:-1] = ["news_blog_and_social_media"]
output_path_news_blog_social_media = "/".join(output_path_split)
output_path_news_blog_social_media

'./data/MisInfoText/Analysis_output/news_blog_and_social_media/MisInfoText_quotes.xlsx'

In [19]:
# The "news_blog_and_social_media" folder is only part of a derived path string,
# so create it (and any missing parents) before opening the workbook.
Path(output_path_news_blog_social_media).parent.mkdir(parents=True, exist_ok=True)

# Using the writer as a context manager guarantees it is closed even if one of
# the writes below raises, instead of leaking the open file handle.
with pd.ExcelWriter(output_path_news_blog_social_media, engine="xlsxwriter") as writer:
    # One sheet per year, named after the year, listing quotes from shortest to longest.
    for df, year in zip(years_news_blog_social_media_dfs, years_news_blog_social_media):
        df\
            .sort_values(by="quote_length")\
            .to_excel(writer, sheet_name=str(year), index=False, columns=["quote", "quote_length", "originalDateYear"])

    # Two summary sheets: quote counts per year and quote-length statistics per year.
    num_quotes_per_year_news_blog_social_media.to_excel(writer, sheet_name="Number of quotes")
    quote_length_summary_news_blog_social_media_df.to_excel(writer, sheet_name="Quote length summary")