# Prerequisite

If you are not familiar with `pip install -r requirements.txt`, use the code block below to install the required packages.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve
import sklearn.metrics as metrics
import pickle
import os
import subprocess
import sys

In [None]:
# !pip install jupyter
# !pip install scikit-learn
# !pip install pandas
# !pip install numpy

In [None]:
# The spaCy Dutch model cannot be installed through requirements.txt,
# so run this cell once on its own to fetch it.
try:
    import spacy
    dutch_model = "nl_core_news_lg"
    model_installed = spacy.util.is_package(dutch_model)
    if not model_installed:
        print("Downloading spaCy Dutch model.")
        # sys.executable is the interpreter running this kernel, so the
        # download is performed by that same Python.
        download_cmd = [sys.executable, "-m", "spacy", "download", dutch_model]
        subprocess.check_call(download_cmd)
except ImportError:
    print("spaCy not found. Please run 'pip install spacy' first.")

# NLTK resources for the legacy stemming/stopwords features
try:
    import nltk
    for resource in ('punkt', 'stopwords'):
        nltk.download(resource)
except ImportError:
    print("NLTK not found. Please run 'pip install nltk' first.")

# Functions for evaluation purposes

In [None]:
def resultClassifierfloat(row, threshold=0.5):
    """Classify one scored row as 'TP', 'TN', 'FN' or 'FP'.

    Parameters
    ----------
    row : mapping
        Must provide ``row['prediction']`` (a numeric score) and
        ``row['label']`` (truthy/falsy ground truth compared against
        ``True`` / ``False``).
    threshold : float, default 0.5
        Scores greater than or equal to this value count as a positive
        prediction.

    Returns
    -------
    str or None
        One of 'TP', 'TN', 'FN', 'FP'. ``None`` when the score is not
        comparable (NaN) or the label equals neither ``True`` nor ``False``.
    """
    score = row['prediction']
    if score >= threshold:
        # A score exactly on the threshold used to match no branch at all and
        # fall through to None; it now counts as a positive prediction.
        predicted_positive = True
    elif score < threshold:
        predicted_positive = False
    else:
        # NaN compares False both ways: there is no prediction to classify.
        return None

    label = row['label']
    if label == True:
        return 'TP' if predicted_positive else 'FN'
    if label == False:
        return 'FP' if predicted_positive else 'TN'
    return None


def resultClassifierint(row):
    """Classify one row with a hard (0/1) prediction as 'TP', 'TN', 'FN' or 'FP'.

    ``row['label']`` is the ground truth and ``row['prediction']`` the
    predicted class. Returns ``None`` when the label equals neither ``True``
    nor ``False``.
    """
    truth = row['label']
    agrees = truth == row['prediction']

    if truth == True:
        # Positive ground truth: either a hit or a miss.
        return 'TP' if agrees else 'FN'
    if truth == False:
        # Negative ground truth: either a correct rejection or a false alarm.
        return 'TN' if agrees else 'FP'
    return None

# ! CHANGED THIS TO HANDLE 0 FP
def evaluation(model, name, X_test, y_test):
    """Print headline binary-classification metrics and the confusion matrix.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict`` and ``predict_proba``; column 1 of
        ``predict_proba`` is used as the positive-class score.
    name : str
        Label shown in the report header.
    X_test, y_test
        Held-out features and their 0/1 labels.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    predictions = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    print(f"\n--- {name} Results ---")
    print(f"Accuracy:  {accuracy_score(y_test, predictions):.4f}")
    # zero_division=0 keeps the reported value at 0 when a metric is undefined
    # (e.g. no positive predictions) without emitting a warning.
    print(f"Precision: {precision_score(y_test, predictions, zero_division=0):.4f}")
    print(f"Recall:    {recall_score(y_test, predictions, zero_division=0):.4f}")
    print(f"F1 Score:  {f1_score(y_test, predictions, zero_division=0):.4f}")

    # ROC AUC is undefined when y_test holds a single class; report that
    # instead of aborting the whole evaluation.
    try:
        auc_text = f"{roc_auc_score(y_test, probs):.4f}"
    except ValueError:
        auc_text = "n/a (only one class present in y_test)"
    print(f"AUC Score: {auc_text}")

    # Pinning labels forces a 2x2 matrix, so the four-way unpacking below also
    # works when a class is absent from both y_test and the predictions.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
    print(f"Matrix:    TP={tp}, FP={fp}, TN={tn}, FN={fn}")

# Load preprocessed data set

Load the preprocessed data, filter and balance it, then split it into a training set and a test set.

In [None]:
print("Loading dataset.")

# Minimum similarity score (the '%' column) a labelled match needs in order to
# be kept as a clean positive.
MATCH_SCORE_THRESHOLD = 0.88

# Load the preprocessed file (which already contains the '%' column)
final_trainset = pd.read_csv('trainset_reconstructed.csv').fillna(0)

# Convert the score to numeric (just in case); unparseable values become 0
final_trainset['%'] = pd.to_numeric(final_trainset['%'], errors='coerce').fillna(0)

# Keep matches scoring >= MATCH_SCORE_THRESHOLD and ALL non-matches (match == 0)
clean_positives = final_trainset[(final_trainset['match'] == 1) & (final_trainset['%'] >= MATCH_SCORE_THRESHOLD)]
clean_negatives = final_trainset[final_trainset['match'] == 0]

# Re-balance to 50/50 by downsampling the larger class
if len(clean_positives) > 0:
    # sample(n=...) raises when n exceeds the population, so cap the per-class
    # size at the smaller class instead of assuming negatives are the majority.
    n_per_class = min(len(clean_positives), len(clean_negatives))
    if len(clean_positives) > n_per_class:
        clean_positives = clean_positives.sample(n=n_per_class, random_state=42)
    clean_negatives = clean_negatives.sample(n=n_per_class, random_state=42)
    final_trainset = pd.concat([clean_positives, clean_negatives]).sample(frac=1, random_state=42).fillna(0)
    print(f"Filtered Dataset Size: {len(final_trainset)} rows.")
else:
    # NOTE(review): in this branch the split below runs on the unfiltered,
    # unbalanced data - confirm that is intended.
    print("No matches found above the threshold.")

# Define the target
target = 'match'

# 80/20 train/test split. Grouping by child_id keeps all rows of one child on
# the same side of the split (prevents leakage between train and test).
splitter = GroupShuffleSplit(test_size=0.20, n_splits=1, random_state=42)
train_inds, test_inds = next(splitter.split(final_trainset, groups=final_trainset['child_id']))

train_set = final_trainset.iloc[train_inds]
test_set = final_trainset.iloc[test_inds]

print(f"Training set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

# Training Phase (English / Initial Model)

In [None]:
# Columns that make up the legacy feature set
feature_cols_legacy = ['title_sim_legacy', 'content_sim_legacy', 'date_binary']

# Slice features and target out of both splits
X_train_legacy, X_test_legacy = (
    split[feature_cols_legacy] for split in (train_set, test_set)
)
y_train, y_test = train_set[target], test_set[target]

# Random forest (same settings as before)
rf_legacy = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf_legacy.fit(X_train_legacy, y_train)

In [None]:

filename = 'model_rf_initial.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call leaves the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_legacy, model_file)
print(f"Saved: {filename}")


# Training Phase (Dutch / Fixed Model)

In [None]:
# Columns that make up the Dutch feature set
feature_cols_dutch = ['title_sim_dutch', 'content_sim_dutch', 'date_binary']

# Slice features and target out of both splits
X_train_dutch, X_test_dutch = (
    split[feature_cols_dutch] for split in (train_set, test_set)
)
y_train, y_test = train_set[target], test_set[target]

# Same random forest settings as the legacy model
rf_dutch = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf_dutch.fit(X_train_dutch, y_train)

In [None]:

filename = 'model_rf_fixed.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call leaves the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_dutch, model_file)
print(f"Saved: {filename}")


# Evaluation Phase (Both)

In [None]:
# Held-out metrics for the legacy feature set
evaluation(
    model=rf_legacy,
    name="Legacy Model (Jaccard/Stemming)",
    X_test=X_test_legacy,
    y_test=y_test,
)

In [None]:
# Held-out metrics for the Dutch feature set
evaluation(
    model=rf_dutch,
    name="New Model (Dutch Vectors)",
    X_test=X_test_dutch,
    y_test=y_test,
)

Comparing the models:

In [None]:
# Positive-class probabilities drive both curves
probs_legacy = rf_legacy.predict_proba(X_test_legacy)[:, 1]
probs_dutch = rf_dutch.predict_proba(X_test_dutch)[:, 1]

# Curve coordinates: suffix "_a" is the legacy model, "_b" the Dutch model
fpr_a, tpr_a, _ = roc_curve(y_test, probs_legacy)
fpr_b, tpr_b, _ = roc_curve(y_test, probs_dutch)
prec_a, rec_a, _ = precision_recall_curve(y_test, probs_legacy)
prec_b, rec_b, _ = precision_recall_curve(y_test, probs_dutch)

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: ROC. The legacy curve is drawn in red with a thicker line
# (3.5 vs 2) so it is easy to spot.
ax_roc.plot(fpr_a, tpr_a, label=f'Legacy Model (AUC: {auc(fpr_a, tpr_a):.3f})', linestyle='-', color='red', linewidth=3.5)
ax_roc.plot(fpr_b, tpr_b, label=f'New Dutch Model (AUC: {auc(fpr_b, tpr_b):.3f})', color='blue', linewidth=2)
ax_roc.plot([0, 1], [0, 1], 'k--', label='Reference (AUC = 0.5)', alpha=0.5)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve (Accuracy Comparison)')
ax_roc.legend()
ax_roc.grid(alpha=0.3)

# Right panel: Precision-Recall
ax_pr.plot(rec_a, prec_a, label='Legacy Model', linestyle='-', color='red', linewidth=3.5)
ax_pr.plot(rec_b, prec_b, label='New Dutch Model', color='blue', linewidth=2)
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve (Reliability)')
ax_pr.legend()
ax_pr.grid(alpha=0.3)

fig.tight_layout()
#plt.savefig('model_comparison.png')
plt.show()

# Overfitting check

In [None]:
def check_overfitting(model, name, X_train, y_train, X_test, y_test):
    """Print train vs. test accuracy for ``model`` and flag a large gap.

    ``model.score`` is called once on each split. A train-minus-test
    difference above 0.05 prints a warning; otherwise a reassuring message is
    printed. Nothing is returned.
    """
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    gap = train_acc - test_acc

    if gap > 0.05:
        verdict = "Warning: Model might be overfitting (Gap > 5%)"
    else:
        verdict = "Model generalization looks healthy."

    report_lines = (
        f"--- Overfitting Check: {name} ---",
        f"Training Accuracy: {train_acc:.4f}",
        f"Test Accuracy:     {test_acc:.4f}",
        f"Difference:        {gap:.4f}",
        verdict,
        "-" * 30,
    )
    for line in report_lines:
        print(line)

# Train/test accuracy gap for the legacy model
check_overfitting(
    model=rf_legacy,
    name="Legacy Model",
    X_train=X_train_legacy,
    y_train=y_train,
    X_test=X_test_legacy,
    y_test=y_test,
)

# Train/test accuracy gap for the Dutch vector model
check_overfitting(
    model=rf_dutch,
    name="Dutch Vector Model",
    X_train=X_train_dutch,
    y_train=y_train,
    X_test=X_test_dutch,
    y_test=y_test,
)

# Error Analysis

In [None]:
# Show full cell text (no column-width truncation) and at most 50 rows
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)

# Read the large CSVs once up front so later cells can reuse them
print("loading raw text files.")

# Only id/title/content are read; the columns are renamed straight away so
# the later merges can join on child_id / parent_id.
children_df = (
    pd.read_csv('data/full_children.csv', usecols=['id', 'title', 'content'])
    .rename(columns={'id': 'child_id', 'title': 'child_title', 'content': 'child_content'})
)
parents_df = (
    pd.read_csv('data/full_parents.csv', usecols=['id', 'title', 'content'])
    .rename(columns={'id': 'parent_id', 'title': 'parent_title', 'content': 'parent_content'})
)

In [None]:
print("\n--- LEGACY MODEL: CORRECT PREDICTIONS ---")

# Reload the pickled legacy forest from disk.
# NOTE: pickle.load can execute arbitrary code - only load model files you
# created yourself.
with open('model_rf_initial.pkl', 'rb') as f:
    model_legacy = pickle.load(f)

# Predict on the held-out rows and attach the result as a 'pred' column
cols_legacy = ['title_sim_legacy', 'content_sim_legacy', 'date_binary']
preds_legacy = model_legacy.predict(test_set[cols_legacy])
df_legacy = test_set.assign(pred=preds_legacy)

# Rows where prediction and label agree
legacy_hit_mask = df_legacy['pred'].eq(1) & df_legacy['match'].eq(1)
legacy_reject_mask = df_legacy['pred'].eq(0) & df_legacy['match'].eq(0)
tp_legacy = df_legacy[legacy_hit_mask]  # true positives
tn_legacy = df_legacy[legacy_reject_mask]  # true negatives

print(f"True Positives (hits): {len(tp_legacy)}")
print(f"True Negatives (correct rejections): {len(tn_legacy)}")

# Columns shown for every example table in this cell
legacy_view_cols = ['child_id', 'parent_id', 'title_sim_legacy', 'child_title', 'parent_title', 'child_content', 'parent_content']

# True positives joined with the article text
print("\nTop 5 True Positives (Matches it found):")
tp_text = (
    tp_legacy
    .merge(children_df, on='child_id', how='left')
    .merge(parents_df, on='parent_id', how='left')
)
display(tp_text[legacy_view_cols].head())

# True negatives joined with the article text
print("\nTop 5 True Negatives (Correctly said NO):")
tn_text = (
    tn_legacy
    .merge(children_df, on='child_id', how='left')
    .merge(parents_df, on='parent_id', how='left')
)
display(tn_text[legacy_view_cols].head())

In [None]:
print("\n--- LEGACY MODEL: ERRORS ---")

# Reuses 'df_legacy' (test rows plus 'pred') built in the previous cell

# Rows where prediction and label disagree
legacy_false_alarm_mask = df_legacy['pred'].eq(1) & df_legacy['match'].eq(0)
legacy_miss_mask = df_legacy['pred'].eq(0) & df_legacy['match'].eq(1)
fp_legacy = df_legacy[legacy_false_alarm_mask]  # false positives
fn_legacy = df_legacy[legacy_miss_mask]  # false negatives (misses)

print(f"False Positives: {len(fp_legacy)}")
print(f"False Negatives: {len(fn_legacy)}")

# Columns shown for every example table in this cell
legacy_error_cols = ['child_id', 'parent_id', 'title_sim_legacy', 'child_title', 'parent_title', 'child_content', 'parent_content']

# False positives joined with the article text
print("\nTop 5 False Positives (Hallucinations?):")
if len(fp_legacy):
    fp_text = (
        fp_legacy
        .merge(children_df, on='child_id', how='left')
        .merge(parents_df, on='parent_id', how='left')
    )
    display(fp_text[legacy_error_cols].head())
else:
    print("none found")

# False negatives joined with the article text
print("\nTop 5 False Negatives (Missed matches):")
if len(fn_legacy):
    fn_text = (
        fn_legacy
        .merge(children_df, on='child_id', how='left')
        .merge(parents_df, on='parent_id', how='left')
    )
    display(fn_text[legacy_error_cols].head())
else:
    print("none found")

In [None]:
print("\n--- DUTCH MODEL: CORRECT PREDICTIONS ---")

# Reload the pickled Dutch forest from disk.
# NOTE: pickle.load can execute arbitrary code - only load model files you
# created yourself.
with open('model_rf_fixed.pkl', 'rb') as f:
    model_dutch = pickle.load(f)

# Predict on the held-out rows and attach the result as a 'pred' column
cols_dutch = ['title_sim_dutch', 'content_sim_dutch', 'date_binary']
preds_dutch = model_dutch.predict(test_set[cols_dutch])
df_dutch = test_set.assign(pred=preds_dutch)

# Rows where prediction and label agree
dutch_hit_mask = df_dutch['pred'].eq(1) & df_dutch['match'].eq(1)
dutch_reject_mask = df_dutch['pred'].eq(0) & df_dutch['match'].eq(0)
tp_dutch = df_dutch[dutch_hit_mask]
tn_dutch = df_dutch[dutch_reject_mask]

print(f"True Positives (hits): {len(tp_dutch)}")
print(f"True Negatives (correct rejections): {len(tn_dutch)}")

# Columns shown for every example table in this cell
dutch_view_cols = ['child_id', 'parent_id', 'title_sim_dutch', 'child_title', 'parent_title', 'child_content', 'parent_content']

# True positives joined with the article text
print("\nTop 5 True Positives (Matches it found):")
tp_dutch_text = (
    tp_dutch
    .merge(children_df, on='child_id', how='left')
    .merge(parents_df, on='parent_id', how='left')
)
display(tp_dutch_text[dutch_view_cols].head())

# True negatives joined with the article text
print("\nTop 5 True Negatives (Correctly said NO):")
tn_dutch_text = (
    tn_dutch
    .merge(children_df, on='child_id', how='left')
    .merge(parents_df, on='parent_id', how='left')
)
display(tn_dutch_text[dutch_view_cols].head())

In [None]:
print("\n--- DUTCH MODEL: ERRORS ---")

# Reuses 'df_dutch' (test rows plus 'pred') built in the previous cell

# Rows where prediction and label disagree
dutch_false_alarm_mask = df_dutch['pred'].eq(1) & df_dutch['match'].eq(0)
dutch_miss_mask = df_dutch['pred'].eq(0) & df_dutch['match'].eq(1)
fp_dutch = df_dutch[dutch_false_alarm_mask]
fn_dutch = df_dutch[dutch_miss_mask]

print(f"False Positives: {len(fp_dutch)}")
print(f"False Negatives: {len(fn_dutch)}")

# Columns shown for every example table in this cell
dutch_error_cols = ['child_id', 'parent_id', 'title_sim_dutch', 'child_title', 'parent_title', 'child_content', 'parent_content']

# False positives joined with the article text
print("\nTop 5 False Positives (Checking for hidden matches):")
if len(fp_dutch):
    fp_dutch_text = (
        fp_dutch
        .merge(children_df, on='child_id', how='left')
        .merge(parents_df, on='parent_id', how='left')
    )
    display(fp_dutch_text[dutch_error_cols].head())
else:
    print("none found")

# False negatives joined with the article text
print("\nTop 5 False Negatives (Missed matches):")
if len(fn_dutch):
    fn_dutch_text = (
        fn_dutch
        .merge(children_df, on='child_id', how='left')
        .merge(parents_df, on='parent_id', how='left')
    )
    display(fn_dutch_text[dutch_error_cols].head())
else:
    print("none found")

Looking at the actual articles where the models messed up (False Positives and False Negatives) to see what's actually going on:

### 1. The Legacy Model is kind of "blind"
The model misses obvious matches because it's too focused on exact words:
* Example (False Negative): child_id 0 vs parent_id 658211
* Child Article: "High surplus value for homeowners" (Hoog bedrag aan overwaarde)
* Parent Article: "Homes almost 9% more expensive" (Koopwoningen bijna 9 procent duurder)

Both articles are clearly about the housing market and rising prices. But because the titles didn't share exact keywords (one said "surplus", the other "expensive"), the legacy model gave it a score of 0 - it was completely blind to the connection. This confirms that the old Jaccard approach has poor recall and ignores anything that isn't basically a copy-paste.

### 2. The New Model: Hallucinations vs. Hidden Gems
The new Dutch Vector model fixes the blindness, but it introduces new kinds of issues.

Bad part: it gets confused by numbers (Hallucinations)
Because every CBS article is written in the same "bureaucratic style," the model sometimes thinks different stats are the same thing.
* Example (False Positive): child_id 1120849 vs parent_id 916785
* Child: "Inflation fell to 9.9 percent"
* Parent: "Industrial production grows by nearly 10 percent"

My model gave this a high score (0.72). It probably saw "9.9" or "10" percent and words like "Growth/Decline", and got too excited to make the match. It struggles to tell the difference between types of economic stats when the sentence structure is identical.

Good part: it found matches the og trainset (whatever code was used to do the matching) missed (Hidden Gems in a sense). So, in a sense, the model is smart.
* Example (False Positive): child_id 4129186 vs parent_id 667719
* Child: "CBS figures on inflow/outflow employees in care..."
* Parent: "Labor market care and welfare"

The dataset labeled this as a "Non-Match" (0), but my model gave it a 0.72. Reading the text, both explicitly discuss the "1.4 million employees" in the care sector. The model was right, the label was wrong.

### Comparison / Conclusion
On that Housing example where the Legacy model scored 0, the Dutch model gave it a 0.39. It didn't quite hit the match threshold, but at least it saw something. It wasn't blind.

* Legacy Model: "Safe but blind" (high precision, but misses anything with synonyms).
* New Dutch Model: "Smart but easily confused" (finds the hidden meanings and corrects human labeling errors, but struggles to differentiate between different statistics because the writing style is so similar).

Recommendation: We need to keep the threshold high (like the >0.88 rule) to filter out the "Statistical Hallucinations," while still catching the "Hidden Gems" that the old system missed.

# Feature analysis

In [None]:
# Reload the legacy model from disk.
# NOTE: pickle.load can execute arbitrary code - only load model files you
# created yourself.
with open('model_rf_initial.pkl', 'rb') as f:
    model_legacy = pickle.load(f)

# Feature names, listed in the same order as the training columns
features_legacy = ['title_sim_legacy', 'content_sim_legacy', 'date_binary']

# One importance score per feature
importances_legacy = model_legacy.feature_importances_

# Tabulate and rank from most to least important
feature_df_legacy = (
    pd.DataFrame({'feature': features_legacy, 'importance': importances_legacy})
    .sort_values(by='importance', ascending=False)
)

print("Feature Importance - Legacy Model:")
print(feature_df_legacy)

# Bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(
    feature_df_legacy['feature'],
    feature_df_legacy['importance'],
    color=['#9467bd', '#8c564b', '#e377c2'],
)
ax.set_title("Feature Importance - Legacy Model (Initial)")
ax.set_ylabel("Importance Score")
plt.show()

In [None]:
# Reload the Dutch model from disk.
# NOTE: pickle.load can execute arbitrary code - only load model files you
# created yourself.
with open('model_rf_fixed.pkl', 'rb') as f:
    model_dutch = pickle.load(f)

# Feature names, listed in the same order as the training columns
features = ['title_sim_dutch', 'content_sim_dutch', 'date_binary']

# One importance score per feature
importances = model_dutch.feature_importances_

# Tabulate and rank from most to least important
feature_df = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("Feature Importance (What the model actually cares about):")
print(feature_df)

# Bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(
    feature_df['feature'],
    feature_df['importance'],
    color=['#1f77b4', '#ff7f0e', '#2ca02c'],
)
ax.set_title("Feature Importance - Dutch Model")
ax.set_ylabel("Importance Score")
plt.show()

Both are kinda lazy.

### both models are obsessed with dates
`date_binary` is by far the most important feature for both systems.
* legacy model: ~74% importance on date
* dutch model: ~79% importance on date

Basically, both models learned a *shortcut* - metadata, "if these articles were published on the same day, they are probably a match." if the dates don't match, they almost definitely say no.

### hallucinations explained (the traffic vs inflation error)
since both models rely so much on the date, the difference comes down to how they handle the text.
* legacy model: if the date matches, it checks for exact words. if it finds none (score 0), it rejects the match - plays it **safe**
* Dutch Model: if the date matches, it checks for context/vibe (vectors). As mentioned, CBS articles use similar jargon ("percent", "increase", "statistics"), so the vector score is rarely zero; it’s usually around 0.3 or 0.4.

So, the dutch model sees a date match + a "meh" text score (0.4), so it gets over-confident and marks it as a match. This explains why it matched the traffic deaths with inflation stats—they happened in the same week and sounded bureaucratic.

### POTENTIAL FIX: raise the threshold
So, at the default threshold (0.5), the model is too excited and makes those "traffic vs inflation" errors. But what if we raise the threshold? To 0.8 for example - we could filter out the lazy date-matches (which usually score around 0.4-0.6) and only keep the strong semantic matches (like the housing market example).

# THRESHOLD EXPERIMENTS

In [None]:
# --- COMPARING LEGACY VS STRICT DUTCH ---
print("\n--- 1. LEGACY MODEL PERFORMANCE (BASELINE) ---")

# Legacy match probabilities, cut at the standard 0.5 threshold
probs_legacy = model_legacy.predict_proba(test_set[cols_legacy])[:, 1]
legacy_preds = (probs_legacy >= 0.5).astype(int)

# Test rows with the thresholded prediction attached as 'pred'
df_legacy_check = test_set.assign(pred=legacy_preds)

# Error/hit subsets selected by (prediction, label) combination
leg_pred = df_legacy_check['pred']
leg_label = df_legacy_check['match']
fp_leg = df_legacy_check[leg_pred.eq(1) & leg_label.eq(0)]
fn_leg = df_legacy_check[leg_pred.eq(0) & leg_label.eq(1)]
tp_leg = df_legacy_check[leg_pred.eq(1) & leg_label.eq(1)]

print(f"False positives (hallucinations): {len(fp_leg)}")
print(f"False negatives (misses): {len(fn_leg)}")
print(f"True positives (hits): {len(tp_leg)}")


# Thresholds being compared: the default cut-off and the stricter one under test
default_threshold = 0.5
strict_threshold = 0.8

print(f"\n--- 2. DUTCH MODEL WITH STRICT THRESHOLD ({strict_threshold}) ---")

# Probability of the positive class (column 1) from the Dutch model
probs_dutch = model_dutch.predict_proba(test_set[cols_dutch])[:, 1]

# False positives at the default threshold, computed here instead of being
# hard-coded so the comparison stays correct when the data or model change.
label_is_negative = (test_set['match'] == 0).to_numpy()
fp_default_count = int(((probs_dutch >= default_threshold) & label_is_negative).sum())

# New rule: only say 'yes' when the score is at least strict_threshold
strict_preds = (probs_dutch >= strict_threshold).astype(int)

# Test rows with the strict prediction attached as 'pred'
df_strict = test_set.copy()
df_strict['pred'] = strict_preds

# Recompute the error/hit subsets under the strict rule
fp_strict = df_strict[(df_strict['pred'] == 1) & (df_strict['match'] == 0)]
fn_strict = df_strict[(df_strict['pred'] == 0) & (df_strict['match'] == 1)]
tp_strict = df_strict[(df_strict['pred'] == 1) & (df_strict['match'] == 1)]

print(f"False positives (hallucinations): {len(fp_strict)} (was {fp_default_count} at threshold {default_threshold})")
print(f"False negatives (misses): {len(fn_strict)}")
print(f"True positives (hits): {len(tp_strict)}")

# Verify whether the specific 'inflation vs production' error is gone.
# NOTE(review): the written analysis in this notebook lists this example as
# child_id 1120849, not 1208491 - confirm which ID is correct.
target_child = 1208491 # "Inflation fell..."
target_parent = 916785 # "Production grows..."

# Look up what the model predicts for this specific pair now
check_error = df_strict[(df_strict['child_id'] == target_child) & (df_strict['parent_id'] == target_parent)]

if not check_error.empty:
    is_match = check_error.iloc[0]['pred']
    # A prediction of 0 means the model said "NO MATCH"
    print(f"\nDid we fix the inflation error (IDs {target_child} & {target_parent})?")
    print(f"Prediction is now: {is_match} ({'FIXED: Model says NO' if is_match == 0 else 'FAIL: Model says YES'})")
else:
    # Say so explicitly: printing nothing would look the same as "fixed".
    print(f"\nPair (child_id={target_child}, parent_id={target_parent}) is not in the test set; cannot verify the inflation error.")

A stricter threshold:

1. Kills the lazy errors

The fix worked exactly like it was supposed to - the specific error where it confused inflation with industrial production is completely gone now. Total hallucinations dropped by more than half, going from 61 down to 30. This basically proves that a lot of those errors were indeed just "lazy date matches" where the model wasn't totally sure (score was probably 0.6 or 0.7) but guessed yes anyway. By raising the bar to 0.8, we forced it to stop guessing on those low-confidence pairs.

2. Trade-off

To get rid of the noise, we had to sacrifice some real matches. Our false negatives went up from 265 to 315, which means we lost about 50 real matches. These were likely weak matches where the articles were technically related but didn't have strong vector overlap, so the model wasn't confident enough to pass the new 0.8 bar.

3. The verdict

Honestly this is good enough; better safe than sorry in this case. In a system like this, showing the user wrong info destroys trust immediately. It is way worse to tell a user that "traffic deaths" are the same thing as "inflation" than it is to miss a few marginal matches. The model is no longer too excited - it's strict! A potential improvement could be manually reviewing the grey-area "meh" matches.

**Future fix:** If we really want those 50 matches back, we could retrain the model on "hard negatives" (articles with the same date but different topics) so it learns to trust the text more than the calendar.

# Confidence Threshold Distribution

In [None]:
# matplotlib is already imported as plt in the notebook's import cell, so it
# is not re-imported here.

# Positive-class probabilities for both models on the test set
probs_legacy = rf_legacy.predict_proba(X_test_legacy)[:, 1]
probs_dutch = rf_dutch.predict_proba(X_test_dutch)[:, 1]

# Side-by-side histograms with a shared y-axis so the bar heights are comparable
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6), sharey=True)

hist_panels = (
    (ax1, probs_legacy, 'lightcoral', "Legacy Model A Decisiveness"),
    (ax2, probs_dutch, 'skyblue', "Dutch Model B Decisiveness"),
)
for panel_ax, panel_probs, panel_color, panel_title in hist_panels:
    panel_ax.hist(panel_probs, bins=50, color=panel_color, edgecolor='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Probability")

# Only the left panel carries the y label because the axis is shared
ax1.set_ylabel("Frequency")

plt.suptitle("Comparison of Model Decisiveness")
plt.show()

# Hard Negative Impact

In [None]:
# Restrict the legacy test set to rows with date_binary == 1 (same-day pairs)
same_day_legacy = X_test_legacy['date_binary'] == 1
X_hard_legacy = X_test_legacy[same_day_legacy]
y_hard_legacy = y_test[same_day_legacy]

# Legacy model predictions on that subset
y_hard_pred_legacy = rf_legacy.predict(X_hard_legacy)

print("--- Hard Negative Analysis: Legacy Model A (Same-Day Pairs Only) ---")
legacy_hard_report = metrics.classification_report(y_hard_legacy, y_hard_pred_legacy)
print(legacy_hard_report)

In [None]:
# Restrict the Dutch test set to rows with date_binary == 1 (same-day pairs)
same_day_dutch = X_test_dutch['date_binary'] == 1
X_hard_dutch = X_test_dutch[same_day_dutch]
y_hard_dutch = y_test[same_day_dutch]

# Dutch vector model predictions on that subset
y_hard_pred_dutch = rf_dutch.predict(X_hard_dutch)

print("--- Hard Negative Analysis: Dutch Model B (Same-Day Pairs Only) ---")
dutch_hard_report = metrics.classification_report(y_hard_dutch, y_hard_pred_dutch)
print(dutch_hard_report)