In [1]:
import os
import socket, contextlib
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
import json
from stanza.server import CoreNLPClient
import stanza
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DatasetConfig():
    input_path: str
    output_path: str
    quote_annotations_path: str
    sheet_name: str
    usecols: list[str]

    def __init__(self, input_path: str, output_path: str, quote_annotations_path: str, sheet_name: str, usecols: list[str]):
        self.input_path = input_path
        self.output_path = output_path
        self.quote_annotations_path = quote_annotations_path
        self.sheet_name = sheet_name
        self.usecols = usecols

In [3]:
fakespeak_config = DatasetConfig(
    # file_path="/content/drive/My Drive/fake_news_over_time/Fakespeak_ENG_modified.xlsx",
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_quotes.xlsx",
    quote_annotations_path="./data/Fakespeak-ENG/Analysis_output/quote_annotations.json",
    sheet_name="Working",
    usecols=['ID', 'combinedLabel', 'originalTextType', 'originalBodyText', 'originalDateYear'],
)

misinfotext_config = DatasetConfig(
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_quotes.xlsx",
    quote_annotations_path="./data/MisInfoText/Analysis_output/quote_annotations.json",
    sheet_name="Working",
    usecols=None,
)

In [4]:
using_dataset = fakespeak_config

In [5]:
dataset_df = pd.read_excel(using_dataset.input_path, sheet_name=using_dataset.sheet_name)
dataset_df

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,factcheckClaim,factcheckHeadline,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear
0,Politifact_FALSE_Social media_687276,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Miriam Valverde,2019-01-04,"['Immigration', 'National', 'Corrections and U...",False,False,Donald Trump,"stated on January 2, 2019 in a tweet:","""Mexico is paying for the Wall through the new...","No, Mexico isn't paying for border wall throug...",https://twitter.com/realDonaldTrump/status/108...,Social media,Twitter/X,Mexico is paying for the Wall through the new ...,,Donald J. Trump,2019-01-02 00:00:00,2019
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",American citizens pay $155 billion annually to...,There's no evidence Americans pay $155 billion...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019
2,Politifact_FALSE_Social media_735424,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Chris Nichols,2019-01-09,"['Environment', 'Fires', 'Government Regulatio...",False,False,Donald Trump,"stated on January 9, 2019 in a tweet:","California's deadly wildfires ""would never hap...","Trump repeats overly simplistic, False claim o...",https://twitter.com/realDonaldTrump/status/108...,Social media,Twitter/X,Billions of dollars are sent to the State of C...,,Donald J. Trump,2019-01-09 00:00:00,2019
3,Politifact_FALSE_Social media_594307,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Amy Sherman,2019-01-22,"['Immigration', 'Fake news', 'Federal Budget',...",False,False,Facebook posts,"stated on January 11, 2019 in tweets and Faceb...",Says Congress set aside $50 billion for the Se...,"No, Congress did not approve $50 billion for b...",https://twitter.com/DiamondandSilk/status/1085...,Social media,Twitter/X,If 50 Billion $$ were set aside to go towards ...,,Diamond and Silk®,2019-01-15 00:00:00,2019
4,Politifact_FALSE_Social media_839325,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Paul Specht,2019-01-30,"['Elections', 'North Carolina']",False,False,Dallas Woodhouse,"stated on January 17, 2019 in a tweet:","Says the NC elections board received ""no calls...",GOP official falsely says Bladen voters didn't...,https://twitter.com/DallasWoodhouse/status/108...,Social media,Twitter/X,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,,Dallas Woodhouse,2019-01-17 00:00:00,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,PolitiFact,https://www.politifact.com/factchecks/2023/jun...,Gabrielle Settles,2023-06-13,"['National', 'Legal Issues', 'Facebook Fact-ch...",Pants on Fire,Pants on Fire,Facebook posts,"stated on June 8, 2023 in a post:","Trump ""hasn't been 'indicted.' It's all part o...","Trump was indicted, despite social media consp...",https://www.facebook.com/The1776Nation/posts/2...,Social media,Facebook,A great lesson in Optics 101: The Monroe Doctr...,,1776 Nation,9 June 2023,2023
2957,Politifact_Pants on Fire_Social media_231170,PolitiFact,https://www.politifact.com/factchecks/2023/mar...,Ciara O'Rourke,2023-03-09,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Facebook posts,"stated on March 8, 2023 in a Facebook post:",President Joe Biden is â€œin Gitmo.â€,"No, President Joe Biden isnâ€™t in the Guantan...",https://www.facebook.com/The1776Nation/posts/p...,Social media,Facebook,“One of these Joe’s is not like the other… one...,,1776 Nation,9 March 2023,2023
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",Autopsies prove that COVID-19 is a blood clot...,"No, COVID-19 won't respond to antibiotics, des...",https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020
2959,Politifact_Pants on Fire_Social media_635418,PolitiFact,https://www.politifact.com/factchecks/2023/jan...,Ciara O'Rourke,2023-01-12,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Viral image,"stated on November 9, 2021 in a Facebook post:",Video shows Hillary Clinton being arrested.,"No, this isnâ€™t a video of Hillary Clinton ge...",https://www.facebook.com/michael.day.1694/post...,Social media,Facebook,She collapsed when she saw jfk jr. as she was ...,,Michael Day,9 November 2021,2021


In [6]:
stanza.install_corenlp()



In [7]:
def get_free_port(host="127.0.0.1"):
    with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind((host, 0))                    # 0 asks OS for an ephemeral free port
        return s.getsockname()[1]

In [8]:
should_load_docs_from_json = True

if should_load_docs_from_json and os.path.exists(using_dataset.quote_annotations_path):
    with open(using_dataset.quote_annotations_path, "r+") as file:
        json_quotes = file.read()
    
    quote_annotations = json.loads(json_quotes)
else:
    # Adjust these based on your computer specs
    num_threads = 4
    memory_gb = 20

    # This takes quite a while, give it some time.
    # With the current settings, this only extracts quotes
    # without extracting the speaker (to save memory).
    # If you want speaker extraction as well, need to 
    # include annotators and properties listed on
    # https://stanfordnlp.github.io/CoreNLP/quote.html#sample-command-line.
    with CoreNLPClient(
        properties="./corenlp_server.props",
        endpoint=f"http://localhost:{get_free_port()}",
        timeout=300000,
        memory=f"{memory_gb}G", 
        threads=num_threads,
        ) as client:

        def annotate_one(row: pd.Series):
            text: str = row["originalBodyText"]

            try:
                return client.annotate(text, properties={"outputFormat": "json"})
            except Exception as e:
                print("Error annotating row", row["originalURL"], "with text length", len(text))
                print(e)

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            annotated_docs = list(executor.map(annotate_one, [row for _, row in dataset_df.iterrows()]))

    if len(annotated_docs) != dataset_df.shape[0]:
        print("ERROR! Number of annotated docs", len(annotated_docs), "not equal to dataset size", dataset_df.shape[0])

    quote_annotations = [doc["quotes"] for doc in annotated_docs]
    
    json_quotes = json.dumps(quote_annotations, indent=2)
    with open(using_dataset.quote_annotations_path, "w+", encoding="utf8") as file:
        file.write(json_quotes)

    # Explicitly stop just in case
    client.stop()

2025-10-17 17:55:54 INFO: Starting server with command: java -Xmx20G -cp C:\Users\Adam\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 62714 -timeout 300000 -threads 4 -maxCharLength 100000 -quiet False -serverProperties ./corenlp_server.props -preload -outputFormat serialized


In [9]:
quotation_marks = set("\'\"“”‘’«»‹›「」『』„”‚’")

def get_quotes_text_and_length(doc: dict):
    def get_clean_text(quote: dict):
        raw_text: str = quote["text"]
        clean_text = raw_text

        for mark in quotation_marks:
            clean_text = clean_text.replace(mark, '')
        
        return clean_text

    def get_clean_len(quote: dict):
        raw_text: str = quote["text"]
        raw_length: int = quote["endToken"] - quote["beginToken"] + 1
        
        num_quotation_marks = sum(raw_text.count(mark) for mark in quotation_marks)

        clean_length = raw_length - num_quotation_marks
        return clean_length

    return {
        "quotes": [get_clean_text(quote) for quote in doc],
        "quote_lengths": [get_clean_len(quote) for quote in doc],
    }

In [10]:
quote_text_and_length_df = pd.DataFrame([get_quotes_text_and_length(doc) for doc in quote_annotations])

dataset_df["quotes"] = quote_text_and_length_df["quotes"]
dataset_df["quote_lengths"] = quote_text_and_length_df["quote_lengths"]
dataset_df

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,...,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear,quotes,quote_lengths
0,Politifact_FALSE_Social media_687276,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Miriam Valverde,2019-01-04,"['Immigration', 'National', 'Corrections and U...",False,False,Donald Trump,"stated on January 2, 2019 in a tweet:",...,https://twitter.com/realDonaldTrump/status/108...,Social media,Twitter/X,Mexico is paying for the Wall through the new ...,,Donald J. Trump,2019-01-02 00:00:00,2019,[],[]
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019,[why should American citizens be responsible t...,[14]
2,Politifact_FALSE_Social media_735424,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Chris Nichols,2019-01-09,"['Environment', 'Fires', 'Government Regulatio...",False,False,Donald Trump,"stated on January 9, 2019 in a tweet:",...,https://twitter.com/realDonaldTrump/status/108...,Social media,Twitter/X,Billions of dollars are sent to the State of C...,,Donald J. Trump,2019-01-09 00:00:00,2019,[],[]
3,Politifact_FALSE_Social media_594307,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Amy Sherman,2019-01-22,"['Immigration', 'Fake news', 'Federal Budget',...",False,False,Facebook posts,"stated on January 11, 2019 in tweets and Faceb...",...,https://twitter.com/DiamondandSilk/status/1085...,Social media,Twitter/X,If 50 Billion $$ were set aside to go towards ...,,Diamond and Silk®,2019-01-15 00:00:00,2019,[],[]
4,Politifact_FALSE_Social media_839325,PolitiFact,https://www.politifact.com/factchecks/2019/jan...,Paul Specht,2019-01-30,"['Elections', 'North Carolina']",False,False,Dallas Woodhouse,"stated on January 17, 2019 in a tweet:",...,https://twitter.com/DallasWoodhouse/status/108...,Social media,Twitter/X,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,,Dallas Woodhouse,2019-01-17 00:00:00,2019,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,PolitiFact,https://www.politifact.com/factchecks/2023/jun...,Gabrielle Settles,2023-06-13,"['National', 'Legal Issues', 'Facebook Fact-ch...",Pants on Fire,Pants on Fire,Facebook posts,"stated on June 8, 2023 in a post:",...,https://www.facebook.com/The1776Nation/posts/2...,Social media,Facebook,A great lesson in Optics 101: The Monroe Doctr...,,1776 Nation,9 June 2023,2023,"[indicted., The Monroe Doctrine is the best kn...","[2, 49, 1, 11, 1]"
2957,Politifact_Pants on Fire_Social media_231170,PolitiFact,https://www.politifact.com/factchecks/2023/mar...,Ciara O'Rourke,2023-03-09,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Facebook posts,"stated on March 8, 2023 in a Facebook post:",...,https://www.facebook.com/The1776Nation/posts/p...,Social media,Facebook,“One of these Joe’s is not like the other… one...,,1776 Nation,9 March 2023,2023,[One of these Joes is not like the other… one ...,[17]
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",...,https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020,[Thanks to 50 autopsies performed on patients ...,"[51, 40, 12]"
2959,Politifact_Pants on Fire_Social media_635418,PolitiFact,https://www.politifact.com/factchecks/2023/jan...,Ciara O'Rourke,2023-01-12,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Viral image,"stated on November 9, 2021 in a Facebook post:",...,https://www.facebook.com/michael.day.1694/post...,Social media,Facebook,She collapsed when she saw jfk jr. as she was ...,,Michael Day,9 November 2021,2021,[],[]


In [11]:
all_quotes_df = dataset_df.explode(["quotes", "quote_lengths"])\
    .rename(columns={"quotes": "quote"})\
    .rename(columns={"quote_lengths": "quote_length"})
all_quotes_df = all_quotes_df[all_quotes_df["quote"].notna()]
all_quotes_df

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,...,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019,why should American citizens be responsible to...,14
14,Politifact_FALSE_Social media_19711,PolitiFact,https://www.politifact.com/factchecks/2019/apr...,Miriam Valverde,2019-04-08,"['Immigration', 'National', 'Homeland Security']",False,False,Tweets,"stated on April 5, 2019 in a tweet:",...,https://twitter.com/markmobility/status/111428...,Social media,Twitter/X,.@realDonaldTrump on people asking for asylum ...,,Mark Elliott,2019-04-05 00:00:00,2019,These arent people. These are animals.,8
16,Politifact_FALSE_News and blog_73653,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Emily Venezky,2020-05-28,['Candidate Biography'],False,False,Social Media,"stated on May 26, 2020 in Tweet:",...,https://dailycaller.com/2019/04/26/joe-biden-s...,News and blog,The Daily Caller,Joe Biden has a message for the public on his ...,Joe Biden Can’t Keep His Thoughts Straight,\nDAILY CALLER NEWS FOUNDATION PRODUCTIONS,2019-04-26 00:00:00,2019,The View,2
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Im completely sure that he is the best so far....,18
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Hes an answer to our problems. We need to get ...,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,PolitiFact,https://www.politifact.com/factchecks/2023/jun...,Gabrielle Settles,2023-06-13,"['National', 'Legal Issues', 'Facebook Fact-ch...",Pants on Fire,Pants on Fire,Facebook posts,"stated on June 8, 2023 in a post:",...,https://www.facebook.com/The1776Nation/posts/2...,Social media,Facebook,A great lesson in Optics 101: The Monroe Doctr...,,1776 Nation,9 June 2023,2023,Biden,1
2957,Politifact_Pants on Fire_Social media_231170,PolitiFact,https://www.politifact.com/factchecks/2023/mar...,Ciara O'Rourke,2023-03-09,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Facebook posts,"stated on March 8, 2023 in a Facebook post:",...,https://www.facebook.com/The1776Nation/posts/p...,Social media,Facebook,“One of these Joe’s is not like the other… one...,,1776 Nation,9 March 2023,2023,One of these Joes is not like the other… one o...,17
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",...,https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020,Thanks to 50 autopsies performed on patients w...,51
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",...,https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020,If we ventilate a lung where blood does not ci...,40


In [12]:
grouped_by_year = all_quotes_df.groupby(by="originalDateYear")
years = grouped_by_year.groups
years_dfs = [grouped_by_year.get_group(year) for year in years]
years_dfs[0].head()

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,...,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019,why should American citizens be responsible to...,14
14,Politifact_FALSE_Social media_19711,PolitiFact,https://www.politifact.com/factchecks/2019/apr...,Miriam Valverde,2019-04-08,"['Immigration', 'National', 'Homeland Security']",False,False,Tweets,"stated on April 5, 2019 in a tweet:",...,https://twitter.com/markmobility/status/111428...,Social media,Twitter/X,.@realDonaldTrump on people asking for asylum ...,,Mark Elliott,2019-04-05 00:00:00,2019,These arent people. These are animals.,8
16,Politifact_FALSE_News and blog_73653,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Emily Venezky,2020-05-28,['Candidate Biography'],False,False,Social Media,"stated on May 26, 2020 in Tweet:",...,https://dailycaller.com/2019/04/26/joe-biden-s...,News and blog,The Daily Caller,Joe Biden has a message for the public on his ...,Joe Biden Can’t Keep His Thoughts Straight,\nDAILY CALLER NEWS FOUNDATION PRODUCTIONS,2019-04-26 00:00:00,2019,The View,2
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Im completely sure that he is the best so far....,18
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Hes an answer to our problems. We need to get ...,52


In [13]:
num_quotes_per_year = grouped_by_year["quote"].count()
num_quotes_per_year

originalDateYear
2019     184
2020     938
2021    1190
2022     617
2023    1116
2024     287
Name: quote, dtype: int64

In [14]:
quote_length_summary_df = pd.DataFrame(
    [df["quote_length"].convert_dtypes().describe() for df in years_dfs],
    index=pd.Index(data=years, name="year")
)
quote_length_summary_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019,184.0,17.793478,26.19433,1.0,2.0,9.0,21.25,194.0
2020,938.0,16.269723,31.972253,1.0,2.0,5.0,19.0,586.0
2021,1190.0,18.609244,32.518291,1.0,2.0,6.0,24.0,629.0
2022,617.0,20.468395,62.783017,1.0,2.0,6.0,21.0,1373.0
2023,1116.0,15.657706,31.636745,1.0,2.0,7.0,21.0,847.0
2024,287.0,20.090592,31.431446,1.0,2.0,8.0,26.0,323.0


In [15]:
only_news_blog_social_media_df = all_quotes_df[(all_quotes_df["originalTextType"] == "News and blog") | (all_quotes_df["originalTextType"] == "Social media")]
only_news_blog_social_media_df

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,...,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019,why should American citizens be responsible to...,14
14,Politifact_FALSE_Social media_19711,PolitiFact,https://www.politifact.com/factchecks/2019/apr...,Miriam Valverde,2019-04-08,"['Immigration', 'National', 'Homeland Security']",False,False,Tweets,"stated on April 5, 2019 in a tweet:",...,https://twitter.com/markmobility/status/111428...,Social media,Twitter/X,.@realDonaldTrump on people asking for asylum ...,,Mark Elliott,2019-04-05 00:00:00,2019,These arent people. These are animals.,8
16,Politifact_FALSE_News and blog_73653,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Emily Venezky,2020-05-28,['Candidate Biography'],False,False,Social Media,"stated on May 26, 2020 in Tweet:",...,https://dailycaller.com/2019/04/26/joe-biden-s...,News and blog,The Daily Caller,Joe Biden has a message for the public on his ...,Joe Biden Can’t Keep His Thoughts Straight,\nDAILY CALLER NEWS FOUNDATION PRODUCTIONS,2019-04-26 00:00:00,2019,The View,2
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Im completely sure that he is the best so far....,18
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Hes an answer to our problems. We need to get ...,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,Politifact_Pants on Fire_Social media_876628,PolitiFact,https://www.politifact.com/factchecks/2023/jun...,Gabrielle Settles,2023-06-13,"['National', 'Legal Issues', 'Facebook Fact-ch...",Pants on Fire,Pants on Fire,Facebook posts,"stated on June 8, 2023 in a post:",...,https://www.facebook.com/The1776Nation/posts/2...,Social media,Facebook,A great lesson in Optics 101: The Monroe Doctr...,,1776 Nation,9 June 2023,2023,Biden,1
2957,Politifact_Pants on Fire_Social media_231170,PolitiFact,https://www.politifact.com/factchecks/2023/mar...,Ciara O'Rourke,2023-03-09,['Facebook Fact-checks'],Pants on Fire,Pants on Fire,Facebook posts,"stated on March 8, 2023 in a Facebook post:",...,https://www.facebook.com/The1776Nation/posts/p...,Social media,Facebook,“One of these Joe’s is not like the other… one...,,1776 Nation,9 March 2023,2023,One of these Joes is not like the other… one o...,17
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",...,https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020,Thanks to 50 autopsies performed on patients w...,51
2958,Politifact_Pants on Fire_Social media_874359,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Tom Kertscher,2020-05-19,"['Public Health', 'Facebook Fact-checks', 'Cor...",Pants on Fire,Pants on Fire,Facebook posts,"stated on May 11, 2020 in a Facebook post:",...,https://www.facebook.com/susan.hazzard.16/post...,Social media,Facebook,Autopsies Prove that COVID-19 is a Disseminate...,,Susan Hazzard,9 May 2020,2020,If we ventilate a lung where blood does not ci...,40


In [16]:
grouped_by_year_news_blog_social_media = only_news_blog_social_media_df.groupby(by="originalDateYear")
years_news_blog_social_media = grouped_by_year_news_blog_social_media.groups
years_news_blog_social_media_dfs = [grouped_by_year_news_blog_social_media.get_group(year) for year in years_news_blog_social_media]
years_news_blog_social_media_dfs[0].head()

Unnamed: 0,ID,factcheckService,factcheckURL,factcheckAuthor,factcheckDate,factcheckCategories,factcheckLabel,combinedLabel,politifactSource,politifactSourceDetails,...,originalURL,originalTextType,originalSource,originalBodyText,originalHeadline,originalPoster,originalDate,originalDateYear,quote,quote_length
1,Politifact_FALSE_Social media_25111,PolitiFact,https://www.politifact.com/factchecks/2022/jan...,Madison Czopek,2022-01-26,['Immigration'],False,False,Facebook posts,"stated on January 7, 2019 in a Facebook post:",...,https://archive.vn/pACz2,Social media,Twitter/X,"Chuck Schumer: ""why should American citizens b...",,Lori Hendry,2019-01-07 00:00:00,2019,why should American citizens be responsible to...,14
14,Politifact_FALSE_Social media_19711,PolitiFact,https://www.politifact.com/factchecks/2019/apr...,Miriam Valverde,2019-04-08,"['Immigration', 'National', 'Homeland Security']",False,False,Tweets,"stated on April 5, 2019 in a tweet:",...,https://twitter.com/markmobility/status/111428...,Social media,Twitter/X,.@realDonaldTrump on people asking for asylum ...,,Mark Elliott,2019-04-05 00:00:00,2019,These arent people. These are animals.,8
16,Politifact_FALSE_News and blog_73653,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Emily Venezky,2020-05-28,['Candidate Biography'],False,False,Social Media,"stated on May 26, 2020 in Tweet:",...,https://dailycaller.com/2019/04/26/joe-biden-s...,News and blog,The Daily Caller,Joe Biden has a message for the public on his ...,Joe Biden Can’t Keep His Thoughts Straight,\nDAILY CALLER NEWS FOUNDATION PRODUCTIONS,2019-04-26 00:00:00,2019,The View,2
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Im completely sure that he is the best so far....,18
19,Politifact_FALSE_News and blog_605527,PolitiFact,https://www.politifact.com/factchecks/2020/may...,Samantha Putterman,2020-05-12,"['Fake news', 'Facebook Fact-checks']",False,False,Bloggers,"stated on May 31, 2019 in a blog post:",...,https://archive.fo/83rr5,News and blog,Education Blog,Hollywood legend Tom Selleck has praised Donal...,Actor Tom Selleck: ‘I Would Say “F*ck You” To ...,,2019-05-31 00:00:00,2019,Hes an answer to our problems. We need to get ...,52


In [17]:
num_quotes_per_year_news_blog_social_media = grouped_by_year_news_blog_social_media["quote"].count()
num_quotes_per_year_news_blog_social_media

originalDateYear
2019     159
2020     894
2021    1167
2022     591
2023    1113
2024     280
Name: quote, dtype: int64

In [18]:
quote_length_summary_news_blog_social_media_df = pd.DataFrame(
    [df["quote_length"].convert_dtypes().describe() for df in years_news_blog_social_media_dfs],
    index=pd.Index(data=years_news_blog_social_media, name="year")
)
quote_length_summary_news_blog_social_media_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019,159.0,14.144654,18.218031,1.0,2.0,7.0,17.5,95.0
2020,894.0,15.233781,31.436068,1.0,2.0,4.0,17.0,586.0
2021,1167.0,17.921165,31.124725,1.0,2.0,6.0,24.0,629.0
2022,591.0,19.089679,63.385052,1.0,2.0,6.0,19.0,1373.0
2023,1113.0,15.608266,31.644229,1.0,2.0,7.0,21.0,847.0
2024,280.0,20.028571,31.636702,1.0,2.0,8.0,25.25,323.0


In [19]:
writer = pd.ExcelWriter(using_dataset.output_path, engine="xlsxwriter")

for df, year in zip(years_dfs, years):
    df\
        .sort_values(by="quote_length")\
        .to_excel(writer, sheet_name=str(year), index=False, columns=["quote", "quote_length", "originalDateYear"])

num_quotes_per_year.to_excel(writer, sheet_name="Number of quotes")
quote_length_summary_df.to_excel(writer, sheet_name="Quote length summary")

writer.close()

In [20]:
output_path = using_dataset.output_path
output_path_split = output_path.split("/")
output_path_split.insert(len(output_path_split) - 1, "news_blog_and_social_media")
output_path_news_blog_social_media = "/".join(output_path_split)
output_path_news_blog_social_media

'./data/Fakespeak-ENG/Analysis_output/news_blog_and_social_media/Fakespeak_quotes.xlsx'

In [21]:
writer = pd.ExcelWriter(output_path_news_blog_social_media, engine="xlsxwriter")

for df, year in zip(years_news_blog_social_media_dfs, years_news_blog_social_media):
    df\
        .sort_values(by="quote_length")\
        .to_excel(writer, sheet_name=str(year), index=False, columns=["quote", "quote_length", "originalDateYear"])

num_quotes_per_year_news_blog_social_media.to_excel(writer, sheet_name="Number of quotes")
quote_length_summary_news_blog_social_media_df.to_excel(writer, sheet_name="Quote length summary")

writer.close()