In [1]:
import json
from json_repair import repair_json
import pandas as pd

# Load the raw responses. Each record carries a "key" (reused below as the
# "uri" of every quote it yields) and a nested "response" whose first
# candidate part holds the extracted quotes as JSON text.
# The encoding is pinned so the read does not depend on the platform default.
with open("data/quotes.json", "r", encoding="utf-8") as f:
    raw_responses = json.load(f)

all_quotes = []
for quote in raw_responses:
    idx = quote["key"]
    quote_string = quote["response"]["candidates"][0]["content"]["parts"][0][
        "text"
    ].strip()

    # Remove the Markdown code fence when the text is wrapped in one.
    if quote_string.startswith("```json"):
        quote_string = quote_string.replace("```json", "").replace("```", "")

    try:
        quote_list = json.loads(quote_string)
    except json.JSONDecodeError:
        # Only malformed JSON goes through the repair path; anything else
        # (including a keyboard interrupt) propagates instead of being hidden.
        quote_string_repaired = repair_json(quote_string)
        quote_list = json.loads(quote_string_repaired)

    # Flatten: one output record per quote, tagged with its source key.
    for q in quote_list:
        d = {"uri": idx, **q}
        all_quotes.append(d)

# Final table: one row per extracted quote.
quotes = pd.json_normalize(all_quotes)

In [2]:
# Show the normalised frame: one row per extracted quote, tagged with its source uri.
quotes

Unnamed: 0,uri,name,organisation,role,nationality,quote,message
0,8884722321,Winston Peters,Government of New Zealand,Foreign Affairs Minister,NZL,"With a war raging, Hamas remaining the de fact...","With the war raging, Hamas remaining the de fa..."
1,2025-09-846864852,Benjamin Netanyahu,Government of Israel,Prime Minister,ISR,disgraceful decision,The recognition of Palestinian statehood by Br...
2,2025-09-846864852,Donald Trump,,,USA,We're getting a very good response because Bib...,The framework deal to end the war in the Pales...
3,2025-09-846864852,Donald Trump,,,USA,"It's called peace in the Middle East, more tha...",The framework deal is focused on broader peace...
4,2025-09-846864852,Senior Israeli official,Government of Israel,Senior official,ISR,it's too early to tell.,It is too early to tell whether there is an ag...
...,...,...,...,...,...,...,...
5789,8835605895,Kerrie Aust,Australian Medical Association,General Practitioner,AUS,We wants kids and young people playing sport a...,We want children and young people to be able t...
5790,8835605895,Kate Chaney,Parliament of Australia,Independent Member of Parliament,AUS,"listening to the money from gambling, sport an...",The federal government is currently prioritizi...
5791,2025-08-817061288,White House Official,White House,Senior official,USA,simply a policy meeting,"The session, which involved Donald Trump, Tony..."
5792,2025-08-817061288,Donald Trump,,President of the United States,USA,Riviera of the Middle East.,"The plan proposed by Donald Trump, which invol..."


In [None]:
import hashlib


# Trim surrounding whitespace in every column of qf (in place), then swap
# pandas' NA marker for None.
# NOTE(review): uses the .str accessor on every column, so this presumably
# expects all columns to hold text - confirm against where qf is built.
for column_name in qf.columns:
    trimmed = qf[column_name].str.strip()
    qf[column_name] = trimmed.replace(pd.NA, None)


# Fields whose values are combined into the hash produced by hash_row.
SPOKE_FIELDS = ["name", "organisation", "role", "nationality"]


def hash_row(row):
    """Return the MD5 hex digest of the row's SPOKE_FIELDS values.

    Each value is converted with ``str()`` and the pieces are joined with "-"
    before hashing, so a missing value contributes its string form (for
    example "None") rather than being skipped.

    Args:
        row: Any mapping-like object indexable by the SPOKE_FIELDS names.

    Returns:
        The 32-character hexadecimal digest as a string.
    """
    pieces = []
    for field in SPOKE_FIELDS:
        pieces.append(str(row[field]))
    digest = hashlib.md5("-".join(pieces).encode("utf-8"))
    return digest.hexdigest()


# One digest per row, computed from that row's SPOKE_FIELDS values.
spoke_ids = qf.apply(hash_row, axis=1)
qf["spoke_id"] = spoke_ids

# Display the rows ordered by spoke_id; the sorted frame is not assigned.
qf.sort_values(by="spoke_id")

In [None]:
# Keep only the source records whose "uri" appears in df. The list is built
# before the output file is opened, so a failure here cannot leave behind a
# truncated file, and the uris go into a set so each lookup is constant time.
kept_uris = set(df["uri"])
filtered_data = [i for i in source_data if i["uri"] in kept_uris]

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform default.
with open("data/september_eng_filtered.json", "w", encoding="utf-8") as f:
    json.dump(filtered_data, f, indent=4, ensure_ascii=False)

In [None]:
# One row per distinct (spoke_id, name, organisation, role, nationality)
# combination together with how often it occurs; dropna=False keeps
# combinations that have missing fields.
spokes = (
    qf[["spoke_id", *SPOKE_FIELDS]]
    .value_counts(dropna=False)
    .reset_index()
    .assign(
        # Fraction of SPOKE_FIELDS that are actually filled in. A field only
        # counts when it is neither missing (None/NaN) nor an empty string;
        # comparing against "" alone would count missing values as filled,
        # because `None != ""` evaluates to True.
        complete=lambda x: (
            x[SPOKE_FIELDS].notna() & x[SPOKE_FIELDS].ne("")
        ).mean(axis=1)
    )
)

In [None]:
from tqdm import tqdm

from annoy import AnnoyIndex

# Cell-level parameters; the commented-out signature records the intended
# function form of this cell.
# def get_neighbours(df, col, n_trees=10, index_type="angular", thresh=0.5):

n_trees = 10
index_type = "angular"
thresh = 0.65
col = "name"

# Pull the column once: the same list feeds the encoder and the
# row-position -> text lookup used to label matches below.
col_values = qf[col].tolist()
lookup_text = dict(enumerate(col_values))

# Embed text
# NOTE(review): `embedding_model` is not defined in the cells visible here -
# confirm it is created before this cell runs.
print("Embedding text")
embeddings = embedding_model.encode(col_values)

print("Building index")
# Build index: one item per row, keyed by row position
f = embeddings.shape[1]
t = AnnoyIndex(f, index_type)
for i, v in enumerate(embeddings):
    t.add_item(i, v)

t.build(n_trees=n_trees)

print("Finding neighbours")

out = []
for idx in tqdm(range(len(embeddings))):
    # Request as many neighbours as there are items, so that `thresh` alone
    # decides which pairs are kept.
    neighbour_ids, distances = t.get_nns_by_item(
        idx, len(embeddings), include_distances=True
    )

    for neighbour_idx, distance in zip(neighbour_ids, distances):
        # NOTE(review): per Annoy's documentation the "angular" distance is
        # sqrt(2 * (1 - cos)), so this score decreases with distance but is
        # not the cosine similarity itself; `thresh` applies to this score.
        similarity = 1 - distance
        # Skip the item's match with itself.
        if idx != neighbour_idx and similarity >= thresh:
            out.append(
                {
                    "source_idx": idx,
                    "target_idx": neighbour_idx,
                    "similarity": similarity,
                    "source_name": lookup_text[idx],
                    "target_name": lookup_text[neighbour_idx],
                }
            )

In [None]:
# Candidate pairs ordered from weakest to strongest score. Pairs whose two
# names are identical are dropped (the helper "equal" column is kept), empty
# strings become None, and any row with a missing value is removed.
names_matches = (
    pd.DataFrame(out)
    .sort_values(by="similarity")
    .assign(equal=lambda pairs: pairs["source_name"] == pairs["target_name"])
    .loc[lambda pairs: ~pairs["equal"]]
    .replace({"": None})
    .dropna()
)
names_matches

In [None]:
# Count how many rows share each (source_name, target_name, similarity) triple.
pair_keys = ["source_name", "target_name", "similarity"]
pair_sizes = names_matches.groupby(pair_keys).size()
pair_sizes.reset_index()

In [None]:
import networkx as nx

# Undirected graph over names: one edge per distinct (source_name, target_name)
# pair, weighted by that pair's similarity score.
unique_pairs = names_matches.drop_duplicates(subset=["source_name", "target_name"])

G = nx.Graph()
for pair in unique_pairs.itertuples(index=False):
    G.add_edge(pair.source_name, pair.target_name, weight=pair.similarity)

# Get connected components: each one is a set of names linked by matches
components = list(nx.connected_components(G))
len(components)

In [None]:
# One record per connected component: its size and its member names,
# displayed with the largest components first.
out = [
    {"component_size": len(members), "members": list(members)}
    for members in components
]
pd.DataFrame(out).sort_values(by="component_size", ascending=False)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model and encode every value of the "name" column.
# NOTE(review): an earlier cell calls `embedding_model.encode`; confirm whether
# that name is meant to refer to this model.
model = SentenceTransformer("all-MiniLM-L6-v2")
name_list = qf["name"].tolist()
name_embeddings = model.encode(name_list, show_progress_bar=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def compare_names(name1, name2):
    """Return the cosine similarity between the embeddings of two names.

    Each name is encoded on its own with the module-level ``model`` and the
    two resulting embeddings are compared with ``cosine_similarity``.

    Args:
        name1: First text to embed.
        name2: Second text to embed.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    first_vec, second_vec = (model.encode([text]) for text in (name1, name2))
    similarity_matrix = cosine_similarity(first_vec, second_vec)
    return similarity_matrix[0][0]


# Spot-check: score a ministry-style name against the bare country name.
compare_names("Israel's foreign ministry", "Israel")

In [None]:
# Plain-text dump of the quotes frame followed by its row count.
print(quotes, len(quotes), sep="\n")