In [26]:
import json
import os
import socket, contextlib
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
from typing import Optional

import pandas as pd
import stanza
from stanza.server import CoreNLPClient

In [2]:
class DatasetConfig():
    """Paths and spreadsheet settings describing one dataset.

    The constructor only stores the given values; nothing is opened or
    validated here.
    """
    # Spreadsheet that the notebook loads with pandas.
    input_path: str
    # Destination for the analysis output (presumably an .xlsx file; it is
    # not consumed by this class).
    output_path: str
    # JSON file used as a cache for the quote annotations.
    quote_annotations_path: str
    # Name of the worksheet to read from `input_path`.
    sheet_name: str
    # Column names of interest, or None when no column subset is configured.
    usecols: Optional[list[str]]

    def __init__(self, input_path: str, output_path: str, quote_annotations_path: str, sheet_name: str, usecols: Optional[list[str]] = None):
        """Store the dataset settings.

        Args:
            input_path: Path of the input spreadsheet.
            output_path: Path where results should be written.
            quote_annotations_path: Path of the quote-annotation JSON cache.
            sheet_name: Worksheet name inside the input spreadsheet.
            usecols: Optional list of column names; None (the default) means
                no subset was configured.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.quote_annotations_path = quote_annotations_path
        self.sheet_name = sheet_name
        self.usecols = usecols

In [3]:
# Settings for the Fakespeak-ENG dataset.
fakespeak_config = DatasetConfig(
    sheet_name="Working",
    usecols=['ID', 'combinedLabel', 'originalTextType', 'originalBodyText', 'originalDateYear'],
    input_path="./data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx",
    quote_annotations_path="./data/Fakespeak-ENG/Analysis_output/quote_annotations.json",
    output_path="./data/Fakespeak-ENG/Analysis_output/Fakespeak_quotes.xlsx",
)

# Settings for the MisInfoText (PolitiFact) dataset; no column subset is configured.
misinfotext_config = DatasetConfig(
    sheet_name="Working",
    usecols=None,
    input_path="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    quote_annotations_path="./data/MisInfoText/Analysis_output/quote_annotations.json",
    output_path="./data/MisInfoText/Analysis_output/MisInfoText_quotes.xlsx",
)

In [4]:
# Select which dataset configuration the following cells read their paths
# and sheet name from.
using_dataset = misinfotext_config

In [5]:
# Load the selected dataset's worksheet into a DataFrame.
# NOTE(review): `using_dataset.usecols` is not passed here, so every column
# of the sheet is loaded - confirm whether that is intended.
dataset_df = pd.read_excel(
    io=using_dataset.input_path,
    sheet_name=using_dataset.sheet_name,
)
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017
...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018


In [6]:
# Run stanza's CoreNLP installer; a CoreNLPClient is started further down
# in this notebook.
# NOTE(review): presumably a no-op download when CoreNLP is already present - confirm.
stanza.install_corenlp()



In [7]:
def get_free_port(host="127.0.0.1"):
    """Return a TCP port number that the OS reports as free on ``host``.

    A temporary socket is bound to port 0, which lets the operating system
    pick an ephemeral port; the chosen number is read back and the socket is
    closed before returning. Because the socket is closed, nothing reserves
    the port afterwards.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind((host, 0))
        address = probe.getsockname()
        return address[1]
    finally:
        # Always release the probe socket, even if bind() fails.
        probe.close()

In [8]:
# Set to False to force re-annotation even when a cached JSON file exists.
should_load_docs_from_json = True

if should_load_docs_from_json and os.path.exists(using_dataset.quote_annotations_path):
    # Reuse the quote annotations cached by a previous run.
    with open(using_dataset.quote_annotations_path, "r", encoding="utf8") as file:
        json_quotes = file.read()

    quote_annotations = json.loads(json_quotes)
else:
    # This takes quite a while, give it some time
    num_threads = 8

    with CoreNLPClient(
        properties="./corenlp_server.props",
        endpoint=f"http://localhost:{get_free_port()}",
        annotators=["tokenize", "ssplit", "quote", "ner", "parse"],
        timeout=300000,
        memory="20G", # Adjust this based on your computer specs
        threads=num_threads,  # keep the server in step with the worker pool below
        ) as client:

        def annotate_one(row: pd.Series):
            """Annotate one dataset row's body text with CoreNLP.

            Returns the JSON annotation, or None when the text is missing or
            the request fails; failures are printed rather than raised so one
            bad row does not abort the whole run.
            """
            text = row["originalBodyText"]

            # Empty spreadsheet cells come through as NaN (a float), which
            # cannot be annotated.
            if not isinstance(text, str):
                print("Skipping row", row["originalURL"], "because its body text is not a string:", repr(text))
                return None

            try:
                return client.annotate(text, properties={'outputFormat': 'json'})
            except Exception as e:
                print("Error annotating row", row["originalURL"], "with text length", len(text))
                print(e)
                return None

        # executor.map preserves input order, so results line up with dataset rows.
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            annotated_docs = list(executor.map(annotate_one, [row for _, row in dataset_df.iterrows()]))

    if len(annotated_docs) != dataset_df.shape[0]:
        print("ERROR! Number of annotated docs", len(annotated_docs), "not equal to dataset size", dataset_df.shape[0])

    num_failed = sum(doc is None for doc in annotated_docs)
    if num_failed:
        print("WARNING!", num_failed, "documents could not be annotated; they are stored with no quotes")

    # Keep one entry per dataset row (an empty list for failed rows) so that
    # positions stay aligned with dataset_df.
    quote_annotations = [doc["quotes"] if doc is not None else [] for doc in annotated_docs]

    json_quotes = json.dumps(quote_annotations, indent=2)

    # Write the cache to the same path the loading branch reads from, so the
    # next run actually finds it.
    os.makedirs(os.path.dirname(using_dataset.quote_annotations_path) or ".", exist_ok=True)
    with open(using_dataset.quote_annotations_path, "w", encoding="utf8") as file:
        file.write(json_quotes)

    # Explicitly stop just in case (the `with` block above should already
    # have shut the server down).
    client.stop()

In [None]:
quotation_marks = set("\'\"“”‘’«»‹›「」『』„”‚’")
# Marks that also act as apostrophes inside words (contractions, possessives).
apostrophe_marks = {"'", "’"}

def get_quotes_text_and_length(doc: list[dict]):
    """Strip quotation marks from one document's quotes and measure them.

    Args:
        doc: The quote annotations of a single document: a list of dicts,
            each with "text", "beginToken" and "endToken" keys. None or an
            empty list yields empty results.

    Returns:
        A dict with two parallel lists:
        "quotes" holds each quote's text with quotation marks removed, and
        "quote_lengths" holds each quote's token span
        (endToken - beginToken + 1) minus the number of removed marks.

    An apostrophe that sits between two alphanumeric characters (as in
    "don't") is treated as part of the word: it is kept in the text and not
    subtracted from the length, since it is not a standalone quotation mark.
    """
    def get_mark_positions(text: str) -> set:
        # Indices of the characters in `text` that are quotation marks.
        positions = set()
        last_index = len(text) - 1

        for index, char in enumerate(text):
            if char not in quotation_marks:
                continue

            inside_word = (
                char in apostrophe_marks
                and 0 < index < last_index
                and text[index - 1].isalnum()
                and text[index + 1].isalnum()
            )
            if not inside_word:
                positions.add(index)

        return positions

    def get_clean_text(quote: dict):
        raw_text: str = quote["text"]
        mark_positions = get_mark_positions(raw_text)

        return "".join(char for index, char in enumerate(raw_text) if index not in mark_positions)

    def get_clean_len(quote: dict):
        raw_text: str = quote["text"]
        # The +1 treats endToken as inclusive.
        raw_length: int = quote["endToken"] - quote["beginToken"] + 1

        return raw_length - len(get_mark_positions(raw_text))

    quotes = doc or []

    return {
        "quotes": [get_clean_text(quote) for quote in quotes],
        "quote_lengths": [get_clean_len(quote) for quote in quotes],
    }

In [None]:
# One row per document, holding that document's cleaned quotes and their lengths.
# The columns are fixed up front so that an empty annotation list still yields
# the expected (empty) columns.
quote_text_and_length_df = pd.DataFrame(
    [get_quotes_text_and_length(doc) for doc in quote_annotations],
    columns=["quotes", "quote_lengths"],
)

# Column assignment aligns on the index and would silently fill NaN on a
# mismatch (e.g. a cached annotation file from a different dataset version),
# so fail loudly instead.
if len(quote_text_and_length_df) != len(dataset_df):
    raise ValueError(
        f"Got {len(quote_text_and_length_df)} quote annotations for {len(dataset_df)} dataset rows; "
        "the cached annotations may be stale"
    )

# Annotations are positional, so give them the dataset's own index before assigning.
quote_text_and_length_df.index = dataset_df.index

dataset_df["quotes"] = quote_text_and_length_df["quotes"]
dataset_df["quote_lengths"] = quote_text_and_length_df["quote_lengths"]
dataset_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quotes,quote_lengths
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[third pet, 2-pet maximum, pet, domestic anima...","[2, 4, 1, 9, 2, 3]"
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,[Proposition 64 is substantially different fro...,[117]
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,[Organized crime gangs are buying hundreds or ...,[23]
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,[],[]
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,[age tax],[2]
...,...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,[],[]
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,[],[]
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,[],[]
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,[],[]


In [53]:
# Turn the per-document quote lists into one row per quote; documents with
# no quotes explode to a missing value and are dropped.
all_quotes_df = (
    dataset_df
    .explode(["quotes", "quote_lengths"])
    .rename(columns={"quotes": "quote", "quote_lengths": "quote_length"})
    .dropna(subset=["quote"])
)
all_quotes_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quote,quote_length
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,third pet,2
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,2-pet maximum,4
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,pet,1
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,domestic animal the size of a cat or larger,9
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,pet registration,2
...,...,...,...,...,...,...,...,...,...
641,http://www.politifact.com/wisconsin/statements...,https://eu.jsonline.com/story/opinion/contribu...,"Transformational. The addition of 13,000 high-...","Walker: Foxconn is a transformational, once-in...",News and blog,2017-07-19,2017,Wisconn Valley.,3
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,cease and desist,3
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,extraordinary unusual,2
645,http://www.politifact.com/wisconsin/statements...,http://www.citizenactionwi.org/johnson_cease_d...,"Dissent is patriotic, that's what we believe. ...",Citizen Action of Wisconsin,News and blog,2017-02-28,2017,We will NOT Cease and Desist!,7
