In [1]:
import os
import socket, contextlib
from concurrent.futures import ThreadPoolExecutor
import json
from stanza.server import CoreNLPClient
import stanza
import spacy
from spacy.tokens.token import Token
import textacy
from textacy.extract.triples import direct_quotations
from textacy.extract.triples import DQTriple
import coreferee
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [2]:
# Install the Stanford CoreNLP package through stanza; the CoreNLPClient used
# later in this notebook needs it to start a local annotation server.
stanza.install_corenlp()



In [3]:
# Load the "Working" sheet of the MisInfoText PolitiFact workbook; each row is
# one fact-checked source document (body text, headline, URLs, dates).
misinfotext_df = pd.read_excel(
    io="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    sheet_name="Working",
)
misinfotext_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017
...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018


In [4]:
def get_free_port(host="127.0.0.1"):
    """Return a TCP port number that is currently free on ``host``.

    A throwaway socket is bound to port 0, which asks the OS to pick an
    ephemeral free port. The socket is closed before returning, so the port is
    only known to have been free at the moment of the call.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind((host, 0))
        _, port = probe.getsockname()
        return port
    finally:
        probe.close()

In [None]:
# TODO: run on entire dataset
# When True, reuse the quotes cached in ./misinfotext_quotes.json by a previous
# run instead of starting CoreNLP and re-annotating every document.
should_load_docs_from_json = False

if should_load_docs_from_json:
    # The cache holds one list of CoreNLP quote dicts per dataset row, in row
    # order, exactly as written by the else-branch below.
    with open("./misinfotext_quotes.json", "r", encoding="utf8") as file:
        quote_annotations = json.load(file)

    if len(quote_annotations) != misinfotext_df.shape[0]:
        print("ERROR! Number of cached quote annotations", len(quote_annotations), "not equal to dataset size", misinfotext_df.shape[0])
else:
    # This takes quite a while, give it some time
    num_threads = 8

    with CoreNLPClient(
        properties="./corenlp_server.props",
        endpoint=f"http://localhost:{get_free_port()}",
        annotators=["tokenize", "ssplit", "quote", "ner", "parse"],
        timeout=300000,
        memory="20G", # Adjust this based on your computer
        threads=num_threads,  # keep server threads in step with the worker pool below
        ) as client:

        def annotate_one(row: pd.Series):
            """Annotate one dataset row's body text with CoreNLP.

            Returns the JSON-format annotation result, or None when the
            request fails (the error is printed so the run can continue).
            """
            text = row["originalBodyText"]

            try:
                return client.annotate(text, properties={'outputFormat': 'json'})
            except Exception as e:
                # A missing cell (e.g. NaN) is not a str; do not let len() raise
                # inside the handler and abort the whole run.
                text_length = len(text) if isinstance(text, str) else "unknown"
                print("Error annotating row", row["originalURL"], "with text length", text_length)
                print(e)
                return None

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # executor.map preserves input order, so annotated_docs[i] belongs to row i.
            annotated_docs = list(executor.map(annotate_one, [row for _, row in misinfotext_df.iterrows()]))

    if len(annotated_docs) != misinfotext_df.shape[0]:
        print("ERROR! Number of annotated docs", len(annotated_docs), "not equal to dataset size", misinfotext_df.shape[0])

    num_failed = sum(doc is None for doc in annotated_docs)
    if num_failed:
        print("ERROR!", num_failed, "documents could not be annotated; their quotes are stored as empty lists")

    # Failed documents (None) get an empty quote list so the result stays
    # aligned one-to-one with the dataset rows.
    quote_annotations = [doc["quotes"] if doc is not None else [] for doc in annotated_docs]

    json_quotes = json.dumps(quote_annotations, indent=2)
    with open("./misinfotext_quotes.json", "w", encoding="utf8") as file:
        file.write(json_quotes)

# No explicit client.stop() is needed: the `with CoreNLPClient(...)` block stops
# the server on exit, and `client` does not exist when the cache branch is taken.

2025-10-17 07:42:03 INFO: Starting server with command: java -Xmx20G -cp C:\Users\Adam\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 56378 -timeout 300000 -threads 8 -maxCharLength 100000 -quiet False -serverProperties ./corenlp_server.props -annotators tokenize,ssplit,quote,ner,parse -preload -outputFormat serialized


In [18]:
def get_quotes_text(quote_annotation: list) -> list:
    """Return the ``"text"`` value of every quote found in one document.

    Args:
        quote_annotation: The quote entries for a single document, given as a
            sequence of mappings that each have a ``"text"`` key.

    Returns:
        A list with one quote text per entry, in the same order; empty when
        the document has no quotes.

    Raises:
        KeyError: If an entry has no ``"text"`` key.
    """
    return [quote["text"] for quote in quote_annotation]

In [19]:
# Attach a "quotes" column: for each row, the list of quote texts taken from
# that row's entry in quote_annotations (same order as the dataframe rows).
misinfotext_df["quotes"] = list(map(get_quotes_text, quote_annotations))
misinfotext_df

Unnamed: 0,factcheckURL,originalURL,originalBodyText,originalHeadline,originalTextType,originalDate,originalDateYear,quotes
0,http://www.politifact.com/arizona/statements/2...,https://associatedmediacoverage.com/three-stat...,Residents of multiple states will be asked to ...,Multiple States Have Agreed To Implement A ‘Tw...,News and blog,2016-05-06,2016,"[‘third pet’, ‘2-pet maximum’, ‘pet’, “domesti..."
1,http://www.politifact.com/california/statement...,https://users.focalbeam.com/fs/distribution:wl...,"Sacramento, CA - United States Senator Dianne ...",U.S. Senator Dianne Feinstein Opposes Prop. 64...,Press release,2016-07-12,2016,"[""Proposition 64 is substantially different fr..."
2,http://www.politifact.com/california/statement...,http://www.sacbee.com/opinion/op-ed/soapbox/ar...,We should anticipate black and gray markets in...,Why you should buy a locking gasoline cap,News and blog,2017-08-04,2017,[“Organized crime gangs are buying hundreds or...
3,http://www.politifact.com/california/statement...,https://nocagastax.com/california-gas-tax-hike...,As a ballot initiative calling for repeal of a...,California Gas-Tax-Hike Repeal Campaign Heats Up,News and blog,2017-06-15,2017,[]
4,http://www.politifact.com/california/statement...,https://chu.house.gov/media-center/press-relea...,"WASHINGTON, DC The House of Representatives t...","Rep. Chu Decries ""Heartless"" ACA Repeal Vote",Press release,2017-05-04,2017,"[""age tax""]"
...,...,...,...,...,...,...,...,...
650,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9428776407421...,Road projects across the state are staying on ...,,Social media,2017-12-18,2017,[]
651,http://www.politifact.com/wisconsin/statements...,https://x.com/ScottWalker/status/9511017961011...,The last thing we need is more Madison in our ...,,Social media,2018-01-10,2018,[]
652,http://www.politifact.com/wisconsin/statements...,https://x.com/MahlonMitchell/status/9538161542...,When \n@ScottWalker\n told firefighters we did...,,Social media,2018-01-18,2018,[]
653,http://www.politifact.com/wisconsin/statements...,http://dailycaller.com/2018/01/25/hey-look-sen...,"Now that its 2018, an election year, I would l...",HEY LOOK! Senator Tammy Baldwin Is Back In Wis...,News and blog,2018-01-25,2018,[]
