In [10]:
import os
import pandas as pd
import glob 


DATA_DIR = os.path.expanduser("~/Desktop/CBS code/data")
PARENTS_FILE = os.path.join(DATA_DIR, "all_parents.csv")
MATCH_FILE = "dataset_modified_cleaned.csv"

In [11]:
matches = pd.read_csv(MATCH_FILE)

# make sure IDs are clean ints
matches["parent_id"] = matches["parent_id"].astype(int)
matches["child_id"]  = matches["child_id"].astype(int)

row = matches.iloc[0]
parent_id = int(row["parent_id"])
child_id  = int(row["child_id"])

print("Parent ID:", parent_id)
print("Child ID:", child_id)

matches.head()

Parent ID: 793210
Child ID: 650359


Unnamed: 0,child_id,parent_id,title_similarity,content_similarity,match,date_binary,jac_total,sleutelwoorden_lenmatches,BT_TT_lenmatches,title_no_stop_lenmatches,1st_paragraph_no_stop_lenmatches,numbers_lenmatches
0,650359,793210,0.440837,0.922662,1,0.0,0.0714,0,0,0,2,0
1,650361,877835,0.414613,0.946601,1,0.0,0.7816,0,4,0,5,1
2,650358,827070,0.493777,0.94559,1,0.0,0.1675,0,0,0,2,3
3,650405,725836,0.632032,0.883841,1,0.0,0.4964,1,0,0,7,4
4,650439,778272,0.330049,0.940994,1,0.0,0.744,0,0,1,12,5


In [12]:
parents = pd.read_csv(PARENTS_FILE)
parents = parents.drop(columns=["Unnamed: 0"], errors="ignore")

print("Parent columns:", list(parents.columns))

Parent columns: ['BT_TT', 'Gebruik_UF', '_version_', 'authors', 'authors_string', 'circulation', 'content', 'content_no_numbers', 'content_without_stopwords', 'copyright', 'datasource_key', 'datasource_theme', 'datasource_theme_string', 'datasource_title', 'datasource_type_key', 'datasource_type_title', 'departments', 'departments_string', 'document_key', 'duration', 'edition', 'edition_string', 'embargo', 'external_id', 'external_parent_id', 'first_paragraph_without_stopwords', 'found_synonyms', 'gatekeeper_key', 'graphic', 'id', 'import_id', 'insert_date', 'insert_date_date', 'kamerstuk_reference_number', 'kamerstuk_reference_number_string', 'link', 'media_value', 'medium', 'medium_category', 'medium_category_string', 'medium_string', 'medium_subcategory', 'medium_subcategory_string', 'mig_page', 'mig_title', 'mom_impact_score', 'page', 'parent_numbers', 'phenomenon', 'press_conference', 'press_conference_string', 'program', 'program_string', 'publication', 'publication_calendar', 'p

In [13]:
import glob
import os

all_files = glob.glob(os.path.join(DATA_DIR, "c_*.csv"))

child_files = [
    f for f in all_files
    if "_output" not in os.path.basename(f)
]

child_files[:10]

['/Users/souadlaaziz/Desktop/CBS code/data/c_1298147.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_689286.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_932753.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1280708.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1347257.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1205716.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1301202.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1400435.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1024774.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1377394.csv']

In [14]:
# column name that contains the parent ID
PARENT_ID_COL = "id"

# select the correct parent row
parent_row = parents[parents[PARENT_ID_COL] == parent_id]

print("Found parent rows:", len(parent_row))

# sanity check: show the row (optional)
parent_row.iloc[0]

# choose ONLY meaningful text columns for retrieval
TEXT_COLS = [
    "title",
    "content",
    "first_paragraph_without_stopwords",
    "tags_string",
    "themes_string",
    "subject_string",
]

# build the parent query text
parent_text = " ".join(
    str(parent_row.iloc[0][col])
    for col in TEXT_COLS
    if col in parent_row.columns and pd.notna(parent_row.iloc[0][col])
).lower()

# preview to make sure it looks sane
print(parent_text[:500])

Found parent rows: 1
meeste bedrijven hadden geen extra extern vermogen nodig vanwege corona verreweg de meeste bedrijven hadden geen behoefte om vanwege de coronacrisis extern vermogen aan te trekken. horecabedrijven trokken het vaakst vermogen aan om aan de betalingsverplichtingen te kunnen voldoen. bedrijven in verhuur en handel van onroerend goed trokken tijdens de coronacrisis het vaakst extern vermogen aan voor investeringen. dit meldt het cbs op basis van nieuwe cijfers uit de conjunctuurenquête nederland.
de


In [15]:
all_files = glob.glob(os.path.join(DATA_DIR, "c_*.csv"))

child_files = [
    f for f in all_files
    if "_output" not in os.path.basename(f)
]

print("Num child files:", len(child_files))
child_files[:10]

Num child files: 174190


['/Users/souadlaaziz/Desktop/CBS code/data/c_1298147.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_689286.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_932753.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1280708.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1347257.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1205716.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1301202.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1400435.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1024774.csv',
 '/Users/souadlaaziz/Desktop/CBS code/data/c_1377394.csv']

In [16]:
def child_id_from_path(fp):
    return int(os.path.basename(fp).replace("c_", "").replace(".csv", ""))

def load_child_text(fp):
    df = pd.read_csv(fp)
    return df.to_string(index=False)

LIMIT = 2000  # increase later
corpus_ids = []
corpus_texts = []

for fp in child_files[:LIMIT]:
    corpus_ids.append(str(child_id_from_path(fp)))
    corpus_texts.append(load_child_text(fp))

print("Corpus size:", len(corpus_ids))

Corpus size: 2000


In [8]:
# %pip install rank-bm25 sentence-transformers

In [9]:
from rank_bm25 import BM25Okapi
import re

def tokenize(text):
    return re.findall(r"\w+", (text or "").lower())

# tokenize corpus
tokenized_corpus = [tokenize(text) for text in corpus_texts]

bm25 = BM25Okapi(tokenized_corpus)

query_tokens = tokenize(parent_text)
bm25_scores = bm25.get_scores(query_tokens)

# take top 50 candidates
top_idx = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:50]
bm25_candidates = [corpus_ids[i] for i in top_idx]

bm25_candidates[:10]

['793584',
 '830636',
 '1013872',
 '1220596',
 '896783',
 '1362000',
 '781802',
 '1239776',
 '1292481',
 '799056']

In [None]:
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("BAAI/bge-reranker-v2-m3")

# map id → article text
id2text = {doc_id: text for doc_id, text in zip(corpus_ids, corpus_texts)}

pairs = [(parent_text, id2text[doc_id]) for doc_id in bm25_candidates]

ce_scores = reranker.predict(pairs)

reranked = sorted(
    zip(bm25_candidates, ce_scores),
    key=lambda x: x[1],
    reverse=True
)

reranked[:10]

In [None]:
THRESHOLD = 0.5  # can be tuned later

final_predictions = [
    {
        "child_id": int(doc_id),
        "score": float(score),
        "uses_cbs": bool(score >= THRESHOLD)
    }
    for doc_id, score in reranked
]

final_predictions[:10]

In [None]:
model_c_df = pd.DataFrame(final_predictions)

model_c_df.to_csv(
    "model_c_predictions_0.5.csv",
    index=False
)

print("Saved model_c_predictions_0.5.csv")