# Prerequisite

If you are not familiar with `pip install -r requirements.txt`, uncomment and run the `pip install` cell below to install the required packages instead.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GroupShuffleSplit
import sklearn.metrics as metrics
import pickle
import os
import subprocess
import sys

In [None]:
# !pip install jupyter
# !pip install scikit-learn
# !pip install pandas
# !pip install numpy

In [None]:
# The spaCy Dutch model does not install through requirements.txt, so it is
# fetched here instead (running this cell once per environment is enough).
try:
    import spacy

    dutch_model = "nl_core_news_lg"
    model_installed = spacy.util.is_package(dutch_model)
    if not model_installed:
        print("Downloading spaCy Dutch model.")
        # Invoke spaCy's downloader through the interpreter running this kernel.
        download_cmd = [sys.executable, "-m", "spacy", "download", dutch_model]
        subprocess.check_call(download_cmd)
except ImportError:
    print("spaCy not found. Please run 'pip install spacy' first.")

# NLTK resources used by the legacy stemming / stop-word code.
try:
    import nltk

    for nltk_resource in ('punkt', 'stopwords'):
        nltk.download(nltk_resource)
except ImportError:
    print("NLTK not found. Please run 'pip install nltk' first.")

# Functions for evaluation purposes

In [3]:
def resultClassifierfloat(row, threshold=0.5):
    """Classify one row with a float prediction as 'TP', 'TN', 'FP' or 'FN'.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'prediction' (a numeric score) and 'label'
        (compared against True / False), e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``.
    threshold : float, default 0.5
        Decision boundary. A prediction greater than OR EQUAL to the
        threshold counts as a positive prediction.

    Returns
    -------
    str or None
        The confusion-matrix cell for the row. None is returned only when
        the prediction is not comparable to the threshold (NaN) or the label
        equals neither True nor False.
    """
    score = row['prediction']
    actual = row['label']

    # A score sitting exactly on the threshold used to match none of the
    # strict comparisons and fell through to None, silently dropping the row
    # from the confusion counts. Ties are now treated as positive predictions.
    if score >= threshold:
        predicted_positive = True
    elif score < threshold:
        predicted_positive = False
    else:
        # NaN compares False both ways; keep reporting "no category".
        return None

    # '== True' / '== False' (rather than truthiness) so that 1/0 labels and
    # numpy booleans are accepted while other values still yield None.
    if actual == True:
        return 'TP' if predicted_positive else 'FN'
    if actual == False:
        return 'FP' if predicted_positive else 'TN'
    return None


def resultClassifierint(row):
    """Map one row with a hard (0/1 or bool) prediction to 'TP', 'TN', 'FN' or 'FP'.

    ``row`` must be indexable by 'label' and 'prediction'. The label decides
    positive vs. negative; agreement between label and prediction decides
    true vs. false. Returns None when the label equals neither True nor False.
    """
    actual = row['label']
    agrees = actual == row['prediction']

    # Explicit comparison with True/False (not truthiness) so that labels
    # other than 1/0/True/False fall through to None.
    if actual == True:
        return 'TP' if agrees else 'FN'
    if actual == False:
        return 'TN' if agrees else 'FP'
    return None

# ! CHANGED THIS TO HANDLE 0 FP
def evaluation(classifier, name, X_test, y_test):
    """Print a confusion breakdown and summary metrics for a fitted classifier.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict(X_test)``.
    name : str
        Label printed as the first line of the report.
    X_test : array-like
        Features passed to ``classifier.predict``.
    y_test : object with ``.values`` and ``.index`` (e.g. a pandas Series)
        True labels; its index is reused for the per-row results frame.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    predicted = classifier.predict(X_test)

    # One row per test sample, tagged with its confusion-matrix cell.
    outcome_frame = pd.DataFrame(
        {'label': y_test.values, 'prediction': predicted},
        index=y_test.index,
    )
    outcome_frame['confusion_matrix'] = outcome_frame.apply(resultClassifierint, axis=1)
    outcome_counts = outcome_frame['confusion_matrix'].value_counts()

    # A category that never occurs is absent from value_counts(), so default to 0.
    tp, tn, fp, fn = (outcome_counts.get(cell, 0) for cell in ('TP', 'TN', 'FP', 'FN'))

    print(name)
    print(outcome_counts)

    # Guard the denominators: with no positive predictions (or no positive
    # labels) the ratio is reported as 0 instead of dividing by zero.
    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    print(f'Precision: {precision}')
    print(f'Recall:    {recall}')
    print("Accuracy:  ", metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))

# Load preprocessed data set

Split the data into training and test sets, keeping all rows that share a `parent_id` on the same side of the split.

In [None]:
# Read the preprocessed, balanced training set from the working directory.
df = pd.read_csv('final_balanced_training_set.csv')

# --- Feature sets -------------------------------------------------------------
# Initial ("English") model: metadata / match-count features only, because its
# similarity scores were replaced.
cols_initial = [
    'date_binary',
    'sleutelwoorden_lenmatches',
    'BT_TT_lenmatches',
    'title_no_stop_lenmatches',
    '1st_paragraph_no_stop_lenmatches',
    'numbers_lenmatches',
    'jac_total',
]

# Fixed model: the same metadata plus the Dutch spaCy similarity scores.
cols_fixed = [*cols_initial, 'title_similarity', 'content_similarity']

# --- Group-aware train/test split ---------------------------------------------
# The split is done once on the full frame so both models are trained and
# tested on exactly the same rows. X still holds every column of df here; the
# model cells pick the cols_initial / cols_fixed subsets from it.
X = df.fillna(0)
y = df['match']
groups = df['parent_id']

# Hold out a 0.2 share by group rather than hard-coding row counts, which
# crashed whenever the master file changed size.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train_full = X.iloc[train_idx]
X_test_full = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Downstream code still expects 'x_validation' / 'y_validation'; for now they
# are simply aliases of the test split.
x_validation_full, y_validation = X_test_full, y_test

print(f"Original Rows: {len(df)}")
print(f"Training Rows: {len(X_train_full)} (Parents seen by model)")
print(f"Test Rows:     {len(X_test_full)} (Parents NEVER seen by model)")

Original Rows: 101974
Training Rows: 80721 (Parents seen by model)
Test Rows:     21253 (Parents NEVER seen by model)


# Training Phase (English / Initial Model)

In [5]:
# Select only the English columns
X_train_initial = X_train_full[cols_initial]
X_test_initial = X_test_full[cols_initial]
x_validation_initial = x_validation_full[cols_initial]

rf_initial = RandomForestClassifier(
    bootstrap=False, 
    criterion='gini',
    max_depth=40, 
    max_features='sqrt', 
    n_estimators=150,
    n_jobs=-1, 
    random_state=42,
    verbose=0
)

In [6]:
# Train the initial (English-feature) forest on the training split.
rf_initial.fit(X_train_initial, y_train)

# Evaluation Phase (Eng)

In [7]:
# Print confusion counts, precision, recall, accuracy and the classification
# report for the initial model on the held-out test split.
evaluation(rf_initial, 'initial_model_forest', X_test_initial, y_test)

initial_model_forest
confusion_matrix
TP    11166
TN    10030
FN       57
Name: count, dtype: int64
Precision: 1.0
Recall:    0.9949211440791232
Accuracy:   0.9973180256904908
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     10030
           1       1.00      0.99      1.00     11223

    accuracy                           1.00     21253
   macro avg       1.00      1.00      1.00     21253
weighted avg       1.00      1.00      1.00     21253



In [8]:
# Persist the trained initial model to the working directory.
filename = 'model_rf_initial.pkl'
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; a bare open() passed straight to pickle.dump was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_initial, model_file)
print(f"Saved: {filename}")

Saved: model_rf_initial.pkl


# Training Phase (Dutch / Fixed Model)

In [9]:
# Select the Fixed columns (Dutch scores + metadata)
X_train_fixed = X_train_full[cols_fixed]
X_test_fixed = X_test_full[cols_fixed]
x_validation_fixed = x_validation_full[cols_fixed]

rf_fixed = RandomForestClassifier(
    bootstrap=False, 
    criterion='gini',
    max_depth=40, 
    max_features='sqrt', 
    n_estimators=150,
    n_jobs=-1, 
    random_state=42,
    verbose=0
)

print("Training Fixed (Dutch) Model...")
rf_fixed.fit(X_train_fixed, y_train)
print("Done.")

Training Fixed (Dutch) Model...
Done.


# Evaluation Phase (Fixed)
### -> given test data

In [10]:
# Print confusion counts, precision, recall, accuracy and the classification
# report for the fixed model on the held-out test split.
evaluation(rf_fixed, 'fixed_model_forest', X_test_fixed, y_test)

fixed_model_forest
confusion_matrix
TP    11166
TN    10024
FN       57
FP        6
Name: count, dtype: int64
Precision: 0.9994629430719656
Recall:    0.9949211440791232
Accuracy:   0.9970357126052792
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     10030
           1       1.00      0.99      1.00     11223

    accuracy                           1.00     21253
   macro avg       1.00      1.00      1.00     21253
weighted avg       1.00      1.00      1.00     21253



In [11]:
# Persist the trained fixed model to the working directory.
filename = 'model_rf_fixed.pkl'
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; a bare open() passed straight to pickle.dump was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_fixed, model_file)
print(f"Saved: {filename}")

Saved: model_rf_fixed.pkl
