The legacy dataset (`final_trainset.csv`) was (I think) built by a script we don't have, using English stemming on Dutch text. On top of that, some raw source files are missing. Since I can't rebuild their pipeline from scratch (missing files), I wrote this script to audit the existing master list. The code:
1.  Re-calculates similarity scores using a proper Dutch model (`Spacy`) instead of the English stemmer.
2.  Merges these new scores with their original metadata (dates, keywords) to carry on the useful parts of existing work.
3.  Filters out low-scoring (dare I say hallucinated) matches (where it says 'Match' but the text is totally different; for examples of what I'm talking about, consult 'match_verification.ipynb') and generates real negative samples.

The result is a single file 'final_balanced_training_set.csv', a training set that contains both old information (in columns date_binary, jac_total, etc.) and new information (in columns title_similarity, content_similarity).

In [1]:
#!pip install spacy
#!python -m spacy download nl_core_news_sm

In [2]:
import pandas as pd
import os
import re
import spacy
import numpy as np
from tqdm import tqdm
import warnings
import random

In [4]:
path_to_children = "data/"

# Load the Dutch spaCy pipeline used for all similarity scores in this notebook.
# 'ner' and 'parser' are disabled because only vectors/lemmas are needed here.
# NOTE(review): spaCy's docs say the `sm` pipelines ship without static word
# vectors, which makes Doc.similarity less reliable - confirm whether
# nl_core_news_md/lg should be used instead.
try:
    nlp = spacy.load("nl_core_news_sm", disable=['ner', 'parser'])
except OSError:
    # spacy.load raises OSError when the model package is not installed;
    # anything else (e.g. a keyboard interrupt) should not trigger a download.
    from spacy.cli import download

    # install if its missing (plain python, so it also works outside a notebook)
    download("nl_core_news_sm")
    nlp = spacy.load("nl_core_news_sm", disable=['ner', 'parser'])

In [4]:
def clean_text_dutch(text):
    """Turn raw Dutch text into a space-joined string of lowercase lemmas.

    Missing values (None / NaN) and blank strings yield "". Otherwise every
    character that is neither a word character nor whitespace is stripped,
    the result is run through the module-level spaCy pipeline `nlp`, and the
    lowercase lemma of each token that is not a stop word or punctuation is
    kept.
    """
    # the original function used english stemming on dutch data, hence spacy
    is_blank = pd.isna(text) or str(text).strip() == ""
    if is_blank:
        return ""

    # drop anything that is not a word character or whitespace
    stripped = re.sub(r'[^\w\s]', '', str(text))

    lemmas = []
    for token in nlp(stripped):
        # skip stopwords and any leftover punctuation tokens
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_.lower())
    return " ".join(lemmas)

# similarity calculations
def calc_cosine_sim(text1, text2):
    """Return the spaCy `Doc.similarity` score of two texts, or 0.0 if unusable.

    Returns 0.0 when either input is missing (None / NaN), empty, or produces
    a document whose vector norm is zero. Otherwise both texts are run
    through the module-level pipeline `nlp` and compared.
    """
    def _is_missing(value):
        # NaN is truthy, so a plain `not value` check would let an empty csv
        # cell through to nlp(); NaN is the only float that differs from itself
        return value is None or (isinstance(value, float) and value != value)

    if _is_missing(text1) or _is_missing(text2):
        return 0.0
    if not text1 or not text2:
        return 0.0

    doc1 = nlp(text1)
    doc2 = nlp(text2)
    # a zero-norm vector makes the similarity undefined
    if doc1.vector_norm == 0 or doc2.vector_norm == 0:
        return 0.0
    return doc1.similarity(doc2)

def calc_jaccard(text1, text2):
    """Return the Jaccard overlap of the whitespace-separated words of two texts.

    The score is |intersection| / |union| of the two word sets. Missing
    values (None / NaN) count as empty text, and 0.0 is returned whenever
    either side has no words.
    """
    def _words(text):
        # str(None) / str(nan) would otherwise become the fake words
        # "None" / "nan" and two missing texts would score a perfect 1.0
        if text is None or (isinstance(text, float) and text != text):
            return set()
        return set(str(text).split())

    words1 = _words(text1)
    words2 = _words(text2)
    if not words1 or not words2:
        return 0.0
    # intersection over union
    return len(words1 & words2) / len(words1 | words2)

### DATA PREPROCESSING
This cell is going to take forever to run, so use it with caution:

In [None]:
# WARNING SILENCER (keeps the browser from crashing due to memory overload)
# NOTE: this hides every warning for the rest of the session, including
# spacy's own warnings about similarity scores
warnings.filterwarnings("ignore")


def _as_text(value):
    """Return a csv cell as a string, mapping missing values (None / NaN) to ""."""
    # str(NaN) is the literal word "nan", which would be scored as real text
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _score_row(cid, pid, title_similarity=0.0, content_similarity=0.0):
    """Build one output record; the 0.0 defaults mark rows that could not be scored."""
    return {
        'child_id': cid,
        'parent_id': pid,
        'title_similarity': title_similarity,
        'content_similarity': content_similarity,
        'match': 1,
    }


# load the masterlist
dataset_og = pd.read_csv('dataset_og.csv')

# load parents for the lookup (first occurrence of each id wins)
df_parents = pd.read_csv('data/all_parents.csv').drop_duplicates(subset='id').set_index('id')

results = []

print("starting the dutch extraction, may the odds be in your laptop's favor")

# we iterate through every row to make sure we keep the 0s for tracking
for _, row in tqdm(dataset_og.iterrows(), total=len(dataset_og)):
    cid = int(row['child_id'])
    pid = int(row['parent_id'])

    # try finding the normal file
    target_file = f"data/c_{cid}.csv"

    # if normal file missing, try the _output version (fallback)
    if not os.path.exists(target_file):
        target_file = f"data/c_{cid}_output.csv"

    if not os.path.exists(target_file):
        # file is totally missing, keep row but 0 score
        results.append(_score_row(cid, pid))
        continue

    try:
        c_df = pd.read_csv(target_file)
        p_df = df_parents.loc[[pid]]  # KeyError if the parent id is unknown

        child_first = c_df.iloc[0]
        parent_first = p_df.iloc[0]

        # DUTCH CHECK: logic to handle e.g. 'title' vs 'titel' and 'content' vs 'content_no_numbers'
        # makes sure we don't crash on slightly different csv headers
        if 'title' in c_df.columns:
            child_title = child_first['title']
        elif 'titel' in c_df.columns:
            child_title = child_first['titel']
        else:
            child_title = ""

        if 'content' in c_df.columns:
            child_text = child_first['content']
        else:
            child_text = child_first.get('content_no_numbers', '')

        # CALCULATE SCORES using the dutch spacy model (nlp), replacing the old cosine similarity functions.
        # empty / missing text scores 0.0 instead of being compared as the word "nan"
        title_similarity = calc_cosine_sim(
            _as_text(child_title), _as_text(parent_first['title'])
        )
        content_similarity = calc_cosine_sim(
            _as_text(child_text), _as_text(parent_first['content'])
        )
    except Exception:
        # if file is corrupted (or the parent is missing), keep row but 0 score
        # (for future re-processing). `Exception` rather than a bare except so
        # that interrupting the kernel still stops this very long loop.
        results.append(_score_row(cid, pid))
    else:
        results.append(_score_row(cid, pid, title_similarity, content_similarity))

dataset_modified = pd.DataFrame(results)
dataset_modified.to_csv('dataset_modified.csv', index=False)
print(f"done! processed {len(dataset_modified)} rows in dataset_modified.csv.")

starting the dutch extraction, may the odds be in your laptop's favor


100%|██████████| 103952/103952 [1:48:06<00:00, 16.03it/s] 


done! processed 103952 rows in dataset_modified.csv.


**ONLY RUN THE BELOW ONCE ON THE SAME CSV**

The code above kept crashing, so I dropped the columns that were not needed for the re-calculation to make it lighter (that worked), and now we add them back to get the full file:

In [None]:
# merge columns back: re-attach the legacy metadata columns from dataset_og
# to the freshly computed scores in dataset_modified
key_cols = ['child_id', 'parent_id']

dataset_modified = pd.read_csv('dataset_modified.csv')
dataset_og = pd.read_csv('dataset_og.csv')

# fix ids so both sides of the merge use the same dtype
for col in key_cols:
    dataset_modified[col] = dataset_modified[col].astype(int)
    dataset_og[col] = dataset_og[col].astype(int)

# keep everything from og except scores and match
cols_drop = ['title_similarity', 'content_similarity', 'match']

# cols_keep ALREADY contains child_id and parent_id
cols_keep = [c for c in dataset_og.columns if c not in cols_drop]

# trim to the keys + recomputed columns before merging, so that running this
# cell on an already-merged file does not produce duplicated _x/_y columns
dataset_modified = dataset_modified[key_cols + cols_drop]

# merge
# NOTE(review): if dataset_og contains the same (child_id, parent_id) pair more
# than once, this left merge multiplies those rows - verify the keys are unique
dataset_modified = pd.merge(
    dataset_modified,
    dataset_og[cols_keep],
    on=key_cols,
    how='left'
)

# save final
dataset_modified.to_csv('dataset_modified.csv', index=False)
print("merged columns and saved.")

merged columns and saved.


Creating final files for the model (so it's a fair game).
1. Dropping ids with missing raw files (i.e. rows with zero scores) AND the low-scoring entries (content similarity of 0.4 or lower) in our modified file
2. Dropping the respective entries/rows in the og file as well, for fair comparison
3. Adding negative examples (1:1 ratio) to create 'final_balanced_training_set.csv'. This is the master textbook for the model.  

In [None]:
# cleaning up the results to remove rows where files were missing (did this as a separate step in case files reappear)
MIN_CONTENT_SIMILARITY = 0.4   # rows at or below this content score are dropped
RANDOM_SEED = 42               # fixed so the negatives and the shuffle are reproducible
MAX_RESAMPLE_TRIES = 100       # upper bound on redraws when picking a random parent


def _as_text(value):
    """Return a csv cell as a string, mapping missing values (None / NaN) to ""."""
    # str(NaN) is the literal word "nan", which would be scored as real text
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


dataset_modified = pd.read_csv('dataset_modified.csv')

# thresholding: remove rows where files were missing (0.0)
# and remove the "junk" (anything at or under 0.4 content similarity)
mask = (dataset_modified['content_similarity'] > MIN_CONTENT_SIMILARITY)
dataset_modified_cleaned = dataset_modified[mask].copy()

# syncing up with the og file for accuracy comparison later
dataset_og = pd.read_csv('dataset_og.csv')
# force ints to avoid merge issues
for col in ('child_id', 'parent_id'):
    dataset_og[col] = dataset_og[col].astype(int)
    dataset_modified_cleaned[col] = dataset_modified_cleaned[col].astype(int)

# merging valid rows: keep only the og rows whose pair survived the filter
dataset_og_cleaned = pd.merge(
    dataset_og,
    dataset_modified_cleaned[['child_id', 'parent_id']],
    on=['child_id', 'parent_id'],
    how='inner'
)

# loading spacy for the negatives calculation (hard mode)
# must be the SAME pipeline that scored the positives, otherwise positive and
# negative similarity scores are not comparable
try:
    nlp
except NameError:
    print("loading spacy...")
    nlp = spacy.load("nl_core_news_sm", disable=['ner', 'parser'])

# add the real negatives (1:1 ratio) - taking clean matches and pairing with random parent
# calculating ACTUAL scores this time so the model doesnt cheat
negatives = []

# reload parents in case variable is lost
df_parents = pd.read_csv('data/all_parents.csv').drop_duplicates(subset='id').set_index('id')
all_parent_ids = list(df_parents.index)

# every parent the legacy list pairs with each child; none of these may be
# drawn as a "negative" for that child
known_parents = {}
for child, parent in zip(dataset_og['child_id'], dataset_og['parent_id']):
    known_parents.setdefault(int(child), set()).add(int(parent))

rng = random.Random(RANDOM_SEED)
skipped = 0

print("generating hard negatives...")
for _, row in tqdm(dataset_modified_cleaned.iterrows(), total=len(dataset_modified_cleaned)):
    cid = int(row['child_id'])
    excluded = known_parents.get(cid, set()) | {int(row['parent_id'])}

    # pick a random parent that isnt a known match for this child
    # (bounded, so a child matched to every parent cannot loop forever)
    for _attempt in range(MAX_RESAMPLE_TRIES):
        random_pid = rng.choice(all_parent_ids)
        if random_pid not in excluded:
            break
    else:
        skipped += 1
        continue

    # calculate real similarity for this random pair
    # if files missing, default to 0
    try:
        # get child text
        target_file = f"data/c_{cid}.csv"
        if not os.path.exists(target_file):
            target_file = f"data/c_{cid}_output.csv"

        c_df = pd.read_csv(target_file)
        child_first = c_df.iloc[0]
        c_title = child_first['title'] if 'title' in c_df.columns else child_first.get('titel', '')
        c_text = child_first['content'] if 'content' in c_df.columns else child_first.get('content_no_numbers', '')

        # get parent text
        parent_first = df_parents.loc[[random_pid]].iloc[0]

        # spacy calc; missing cells become "" instead of the word "nan"
        t_sim = nlp(_as_text(c_title)).similarity(nlp(_as_text(parent_first['title'])))
        c_sim = nlp(_as_text(c_text)).similarity(nlp(_as_text(parent_first['content'])))
    except Exception:
        # `Exception` rather than a bare except so the loop stays interruptible
        t_sim = 0.0
        c_sim = 0.0

    # create a negative row
    # NOTE(review): every column other than the ids and the two spacy scores
    # (i.e. the legacy feature columns) is hard-coded to 0 for negatives; if
    # those columns are non-zero for positives, a model can separate the
    # classes from them alone - confirm before training on those columns
    neg_row = {col: 0 for col in dataset_modified_cleaned.columns}
    neg_row['child_id'] = cid
    neg_row['parent_id'] = int(random_pid)
    neg_row['match'] = 0
    neg_row['title_similarity'] = t_sim
    neg_row['content_similarity'] = c_sim

    negatives.append(neg_row)

if skipped:
    print(f"could not find a negative parent for {skipped} rows, skipped them")

# combine and shuffle for training
df_negatives = pd.DataFrame(negatives)
final_balanced_training_set = pd.concat([dataset_modified_cleaned, df_negatives], ignore_index=True)
final_balanced_training_set = final_balanced_training_set.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# saving the outputs
dataset_og_cleaned.to_csv('dataset_og_cleaned.csv', index=False)
dataset_modified_cleaned.to_csv('dataset_modified_cleaned.csv', index=False)
final_balanced_training_set.to_csv('final_balanced_training_set.csv', index=False)

print(f"saved final files. training set has {len(final_balanced_training_set)} rows")

generating hard negatives...


  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).similarity(nlp(str(p_text)))
  t_sim = nlp(str(c_title)).similarity(nlp(str(p_title)))
  c_sim = nlp(str(c_text)).sim