In [None]:
import os
import socket, contextlib
from concurrent.futures import ThreadPoolExecutor
import json
from stanza.server import CoreNLPClient
import stanza
import spacy
from spacy.tokens.token import Token
import textacy
from textacy.extract.triples import direct_quotations
from textacy.extract.triples import DQTriple
import coreferee
import pandas as pd

In [None]:
# Install the Stanford CoreNLP distribution through stanza so that the
# CoreNLPClient used later in this notebook has a server to launch.
# NOTE(review): presumably a no-op/quick when CoreNLP is already installed — confirm
# against the stanza docs before relying on this in Restart & Run All.
stanza.install_corenlp()

In [None]:
# Read the "Working" sheet of the PolitiFact workbook into a DataFrame.
misinfotext_df = pd.read_excel(
    io="./data/MisInfoText/PolitiFact_original_modified.xlsx",
    sheet_name="Working",
)

# Bare last expression so the notebook renders the loaded frame.
misinfotext_df

In [None]:
def get_free_port(host="127.0.0.1"):
    """Ask the operating system for a currently unused TCP port on `host`.

    A throwaway IPv4 stream socket is bound to port 0, the port number the OS
    assigned is read back, and the socket is closed before returning.

    Args:
        host: Address to bind the probe socket to.

    Returns:
        The port number the OS picked for the probe socket.

    Note:
        The probe socket is closed before this function returns, so the port
        is only known to have been free at the time of the call.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Port 0 tells the OS to choose an ephemeral free port for us.
        probe.bind((host, 0))
        _, port = probe.getsockname()
        return port
    finally:
        probe.close()

In [None]:
# TODO: run on entire dataset

# On-disk cache of the CoreNLP annotations: a JSON list with one entry per
# row of `misinfotext_df` (entries are null for rows that failed to annotate).
annotated_docs_path = "./misinfotext_docs.json"

# True  -> reuse the annotations cached at `annotated_docs_path`.
# False -> annotate every row with CoreNLP and (re)write the cache.
should_load_docs_from_json = True

if should_load_docs_from_json:
    # Read back exactly what the annotation branch below wrote. Raises
    # FileNotFoundError if the cache has never been generated; set the flag
    # above to False once to create it.
    with open(annotated_docs_path, "r", encoding="utf8") as file:
        annotated_docs = json.load(file)
else:
    # Used both for the CoreNLP server threads and for the client-side pool,
    # so that every worker thread can have a request in flight.
    num_threads = 8

    with CoreNLPClient(
        endpoint=f"http://localhost:{get_free_port()}",
        annotators=["tokenize", "ssplit", "quote", "ner", "parse"],
        timeout=300000,
        threads=num_threads,
        ) as client:

        def annotate_one(row: pd.Series):
            """Annotate one dataframe row's body text with CoreNLP.

            Returns the JSON annotation for the row, or None when the row has
            no usable text or the request fails (best effort: one bad row must
            not abort the whole run).
            """
            text = row["originalBodyText"]

            # Empty spreadsheet cells are not strings (pandas yields NaN), and
            # the error handler below calls len(text), which would itself raise
            # for such values and abort the whole executor.map.
            if not isinstance(text, str):
                print("Skipping row", row["originalURL"], "with non-text body", repr(text))
                return None

            try:
                return client.annotate(text, properties={'outputFormat': 'json'})
            except Exception as e:
                print("Error annotating row", row["originalURL"], "with text length", len(text))
                print(e)
                return None

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # executor.map preserves input order, so annotated_docs[i]
            # corresponds to the i-th row of misinfotext_df.
            annotated_docs = list(executor.map(annotate_one, (row for _, row in misinfotext_df.iterrows())))

    # Persist the annotations so later runs can take the load branch above.
    json_docs = json.dumps(annotated_docs, indent=2)
    with open(annotated_docs_path, "w", encoding="utf8") as file:
        file.write(json_docs)

In [None]:
def get_quotes_text(doc: "dict | None") -> list:
    """Return the text of every quote found in one annotated document.

    Args:
        doc: An annotation dict with a "quotes" list whose items each have a
            "text" entry, or None for a document whose annotation failed.

    Returns:
        The "text" value of each quote, in order; an empty list when `doc`
        is None.

    Raises:
        KeyError: If `doc` is a dict without a "quotes" key, or a quote has
            no "text" key.
    """
    # Failed annotations are represented as None (null in the JSON cache);
    # treat them as "no quotes" instead of raising TypeError on indexing.
    if doc is None:
        return []
    return [quote["text"] for quote in doc["quotes"]]

In [None]:
# Attach each document's quote texts as a new "quotes" column, one list per row.
misinfotext_df["quotes"] = list(map(get_quotes_text, annotated_docs))

# Bare last expression so the notebook renders the updated frame.
misinfotext_df