In [1]:
import os
import pandas as pd
import glob 


DATA_DIR = os.path.expanduser("~/Desktop/CBS code/data")
PARENTS_FILE = os.path.join(DATA_DIR, "all_parents.csv")
MATCH_FILE = "trainset_reconstructed.csv"

In [2]:
matches = pd.read_csv(MATCH_FILE)

# rename this column so we don't confuse it with model scores
matches = matches.rename(columns={"%": "legacy_confidence"})

matches["parent_id"] = matches["parent_id"].astype(int)
matches["child_id"]  = matches["child_id"].astype(int)
matches["match"]     = matches["match"].astype(int)
matches["legacy_confidence"] = pd.to_numeric(matches["legacy_confidence"], errors="coerce")

HIGH_CONF = 0.88
positives = matches[(matches["match"] == 1) & (matches["legacy_confidence"] >= HIGH_CONF)]

row = positives.iloc[0]
parent_id = int(row["parent_id"])
true_child_id = int(row["child_id"])

print("Parent ID:", parent_id)
print("True child ID (from legacy high-conf):", true_child_id)

positives.head()

Parent ID: 674230
True child ID (from legacy high-conf): 674464


Unnamed: 0,child_id,parent_id,match,title_sim_dutch,content_sim_dutch,title_sim_legacy,content_sim_legacy,days_diff,date_binary,legacy_confidence
0,674464,674230,1,0.0,0.891758,0.0,0.085439,0.165046,1,0.8943
2,674464,674045,1,0.0,0.891758,0.0,0.085439,0.402882,1,0.8943
18,1429753,1428246,1,0.54783,0.882689,0.0,0.076449,1.019444,1,0.9804
28,1047901,1047816,1,0.164229,0.739795,0.0,0.207792,0.002778,1,0.9907
54,1258730,1258434,1,0.273474,0.972838,0.117647,0.346847,0.385417,1,0.9853


In [3]:
parents = pd.read_csv(PARENTS_FILE)
parents = parents.drop(columns=["Unnamed: 0"], errors="ignore")

print("Parent columns:", list(parents.columns))

Parent columns: ['BT_TT', 'Gebruik_UF', '_version_', 'authors', 'authors_string', 'circulation', 'content', 'content_no_numbers', 'content_without_stopwords', 'copyright', 'datasource_key', 'datasource_theme', 'datasource_theme_string', 'datasource_title', 'datasource_type_key', 'datasource_type_title', 'departments', 'departments_string', 'document_key', 'duration', 'edition', 'edition_string', 'embargo', 'external_id', 'external_parent_id', 'first_paragraph_without_stopwords', 'found_synonyms', 'gatekeeper_key', 'graphic', 'id', 'import_id', 'insert_date', 'insert_date_date', 'kamerstuk_reference_number', 'kamerstuk_reference_number_string', 'link', 'media_value', 'medium', 'medium_category', 'medium_category_string', 'medium_string', 'medium_subcategory', 'medium_subcategory_string', 'mig_page', 'mig_title', 'mom_impact_score', 'page', 'parent_numbers', 'phenomenon', 'press_conference', 'press_conference_string', 'program', 'program_string', 'publication', 'publication_calendar', 'p

In [4]:
import glob
import os

all_files = glob.glob(os.path.join(DATA_DIR, "c_*.csv"))

child_files = [
    f for f in all_files
    if "_output" not in os.path.basename(f)
]

child_files[:10]

['/Users/souadlaaziz/Desktop/CBS code/data/c_1298147.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_689286.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_932753.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1280708.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1347257.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1205716.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1301202.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1400435.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1024774.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1377394.csv']

In [5]:
# column name that contains the parent ID
PARENT_ID_COL = "id"

# select the correct parent row
parent_row = parents[parents[PARENT_ID_COL] == parent_id]

print("Found parent rows:", len(parent_row))

# sanity check: show the row (optional)
parent_row.iloc[0]

# choose ONLY meaningful text columns for retrieval
TEXT_COLS = [
    "title",
    "content",
    "first_paragraph_without_stopwords",
    "tags_string",
    "themes_string",
    "subject_string",
]

# build the parent query text
parent_text = " ".join(
    str(parent_row.iloc[0][col])
    for col in TEXT_COLS
    if col in parent_row.columns and pd.notna(parent_row.iloc[0][col])
).lower()

# preview to make sure it looks sane
print(parent_text[:500])

Found parent rows: 1
bronneninventarisatie in de wereld van de internationale statistiek bestaan er vele verschillende leveranciers van allerlei soorten data. het vinden van de juiste bron voor het beantwoorden van een bepaalde vraag kan daarbij uitdagend zijn, zeker wanneer diverse bronnen verschillende cijfers lijken te rapporteren over hetzelfde onderwerp.

een voorbeeld betreft data over innovatie. zowel de world economic forum (wef) als de world intellectual property organization (wipo) rangschikken landen jaar


In [6]:
all_files = glob.glob(os.path.join(DATA_DIR, "c_*.csv"))

child_files = [
    f for f in all_files
    if "_output" not in os.path.basename(f)
]

print("Num child files:", len(child_files))
child_files[:10]

Num child files: 174190


['/Users/souadlaaziz/Desktop/CBS code/data/c_1298147.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_689286.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_932753.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1280708.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1347257.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1205716.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1301202.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1400435.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1024774.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1377394.csv']

In [21]:
def child_id_from_path(fp):
    return int(os.path.basename(fp).replace("c_", "").replace(".csv", ""))

def load_child_text(fp):
    df = pd.read_csv(fp)

    title = str(df.loc[0, "title"]) if "title" in df.columns else ""
    content = str(df.loc[0, "content"]) if "content" in df.columns else ""

    return (title + "\n" + content).strip()

LIMIT = 2000  # increase later
corpus_ids = []
corpus_texts = []

for fp in child_files[:LIMIT]:
    text = load_child_text(fp)

    if len(text) < 50:   # skip empty rows
        continue

    corpus_ids.append(str(child_id_from_path(fp)))
    corpus_texts.append(text)

print("Corpus size:", len(corpus_ids))

Corpus size: 2000


In [22]:
# %pip install rank-bm25 sentence-transformers

In [23]:
from rank_bm25 import BM25Okapi
import re

def tokenize(text):
    return re.findall(r"\w+", (text or "").lower())

# tokenize corpus
tokenized_corpus = [tokenize(text) for text in corpus_texts]

bm25 = BM25Okapi(tokenized_corpus)

query_tokens = tokenize(parent_text)
bm25_scores = bm25.get_scores(query_tokens)

# take top 50 candidates
top_idx = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:50]
bm25_candidates = [corpus_ids[i] for i in top_idx]

bm25_candidates[:10]

['1419170',
 '1218784',
 '1343555',
 '1250273',
 '1438169',
 '1352464',
 '1430086',
 '771717',
 '706004',
 '739545']

In [24]:
print("Num child docs in corpus:", len(corpus_ids))
print("Num texts in corpus:", len(corpus_texts))
print("BM25 candidates before cut:", len(bm25_candidates))

Num child docs in corpus: 2000
Num texts in corpus: 2000
BM25 candidates before cut: 50


In [25]:
from sentence_transformers import CrossEncoder

In [None]:
# # first version using only 2000 max chars

# RERANK_K = 20
# bm25_candidates = bm25_candidates[:RERANK_K]

# MAX_CHARS = 2000
# def cut(t):
#     return str(t)[:MAX_CHARS]

# reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# id2text = {doc_id: text for doc_id, text in zip(corpus_ids, corpus_texts)}

# pairs = [(cut(parent_text), cut(id2text[doc_id])) for doc_id in bm25_candidates]

# ce_scores = reranker.predict(pairs, batch_size=2, show_progress_bar=True)

# reranked = sorted(zip(bm25_candidates, ce_scores), key=lambda x: x[1], reverse=True)

# # reranked[:10]

Batches: 100%|██████████| 10/10 [00:00<00:00, 40.86it/s]


In [31]:
from sentence_transformers import CrossEncoder

# how many candidates to rerank
RERANK_K = 20
bm25_candidates = bm25_candidates[:RERANK_K]

# chunk settings (whole article)
CHUNK_SIZE = 1200
STRIDE = 900
BATCH_SIZE = 8

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

id2text = {doc_id: text for doc_id, text in zip(corpus_ids, corpus_texts)}

def chunk_text(text, chunk_size=CHUNK_SIZE, stride=STRIDE):
    t = str(text)
    if len(t) <= chunk_size:
        return [t]
    chunks = []
    for i in range(0, len(t), stride):
        chunks.append(t[i:i+chunk_size])
        if i + chunk_size >= len(t):
            break
    return chunks

# build (parent, chunk) pairs
pairs = []
pair_doc_ids = []

for doc_id in bm25_candidates:
    child_text = id2text[doc_id]
    for ch in chunk_text(child_text):
        pairs.append((str(parent_text), ch))
        pair_doc_ids.append(doc_id)

print("Total pairs (chunks):", len(pairs))

# score all chunks
chunk_scores = reranker.predict(
    pairs,
    batch_size=BATCH_SIZE,
    show_progress_bar=True
)

# keep BEST chunk score per article
best_score = {}
for doc_id, s in zip(pair_doc_ids, chunk_scores):
    s = float(s)
    if (doc_id not in best_score) or (s > best_score[doc_id]):
        best_score[doc_id] = s

# final reranked list (article-level)
reranked = sorted(best_score.items(), key=lambda x: x[1], reverse=True)

Total pairs (chunks): 439


Batches: 100%|██████████| 55/55 [00:04<00:00, 13.48it/s]


In [34]:
AUTO_MARGIN = 2.0
TOP_K_SHOW = 10

topk = reranked[:TOP_K_SHOW]

if len(topk) >= 2:
    margin = float(topk[0][1] - topk[1][1])
    decision = "AUTO_POSITIVE" if margin >= AUTO_MARGIN else "REVIEW"
else:
    margin = None
    decision = "REVIEW"

rows = []
for rank, (child_id, score) in enumerate(topk, start=1):
    rows.append({
        "parent_id": int(parent_id),
        "child_id": int(child_id),
        "rank": rank,
        "score": float(score),
        "margin_top1_top2": margin if rank == 1 else "",
        "decision": decision if rank == 1 else "IGNORE"
    })

model_c_df = pd.DataFrame(rows)

In [35]:
out_file = f"model_c_parent_{parent_id}_margin_{AUTO_MARGIN}.csv"
model_c_df.to_csv(out_file, index=False)

print("Saved", out_file)

Saved model_c_parent_674230_margin_2.0.csv


In [30]:
# for doc_id in bm25_candidates[:3]:
#     txt = id2text[str(doc_id)]
#     print("----", doc_id, "len=", len(txt))
#     print(txt[:300])

# df = pd.read_csv(child_files[0])
# df.columns