# Prerequisite

If you are not familiar with `pip install -r requirements.txt`, uncomment and run the code block below to install the required packages.

In [1]:
# !pip install jupyter
# !pip install scikit-learn
# !pip install pandas
# !pip install numpy

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Functions for evaluation purposes

In [None]:
def resultClassifierfloat(row, threshold=0.5):
    """Label one result row with its confusion-matrix cell for a score-valued prediction.

    Args:
        row: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) with a numeric ``'prediction'``
            score and a ``'label'`` that compares equal to ``True`` or ``False``.
        threshold: Scores at or above this value count as a positive
            prediction. Defaults to 0.5, the cut-off that used to be hard-coded.

    Returns:
        ``'TP'``, ``'TN'``, ``'FP'`` or ``'FN'``. ``None`` is returned when the
        score is NaN or the label equals neither ``True`` nor ``False``.
    """
    score = row['prediction']
    if score >= threshold:
        # A score exactly on the threshold is treated as positive; previously
        # such rows matched no branch and fell through to None.
        predicted_positive = True
    elif score < threshold:
        predicted_positive = False
    else:
        # NaN compares false in both directions: leave the row unclassified.
        return None

    if row['label'] == True:
        return 'TP' if predicted_positive else 'FN'
    if row['label'] == False:
        return 'FP' if predicted_positive else 'TN'
    return None


def resultClassifierint(row):
    """Label one result row with its confusion-matrix cell for a hard (0/1) prediction.

    Args:
        row: Mapping-like row with a ``'label'`` and a ``'prediction'`` entry.

    Returns:
        ``'TP'`` / ``'TN'`` when label and prediction agree, ``'FN'`` / ``'FP'``
        when they differ, chosen by whether the label equals ``True`` or
        ``False``. ``None`` when the label equals neither.
    """
    actual = row['label']
    agrees = actual == row['prediction']

    # Branch on the true label first, then on whether the model agreed with it.
    if actual == True:
        return 'TP' if agrees else 'FN'
    if actual == False:
        return 'TN' if agrees else 'FP'
    return None

# ! CHANGED THIS TO HANDLE 0 FP
def evaluation(classifier, name, X_test, y_test):
    """Print confusion-matrix counts and summary metrics for a fitted classifier.

    Args:
        classifier: Fitted estimator exposing ``predict(X)``.
        name: Label printed as the header of the report.
        X_test: Feature rows handed to ``classifier.predict``.
        y_test: pandas Series of true labels for ``X_test``; its ``.values``
            and ``.index`` are used to build the per-row results frame.

    Returns:
        None. All results are printed.
    """
    y_pred = classifier.predict(X_test)
    results = pd.DataFrame({'label': y_test.values, 'prediction': y_pred}, index=y_test.index)
    results['confusion_matrix'] = results.apply(resultClassifierint, axis=1)
    results_counts = results['confusion_matrix'].value_counts()

    # .get(KEY, 0) keeps the arithmetic working when an outcome never occurs.
    tp = results_counts.get('TP', 0)
    fp = results_counts.get('FP', 0)
    fn = results_counts.get('FN', 0)

    print(name)
    print(results_counts)

    # Guarded division: report 0 instead of failing on 0/0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    print(f'Precision: {precision}')
    print(f'Recall:    {recall}')
    # accuracy_score / classification_report are imported by name at the top of
    # the notebook; the `metrics` module itself is never imported.
    print("Accuracy:  ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Load preprocessed data set

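Split the data into train and test sets, grouped by `parent_id` so that rows sharing a parent never end up on both sides of the split.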

In [None]:
# Load the preprocessed data set.
df = pd.read_csv('cleaned_initial.csv')

# Columns fed to the model as features.
feature_cols = [
    'date_binary',
    'title_similarity',
    'content_similarity',
    'sleutelwoorden_lenmatches',
    'BT_TT_lenmatches',
    'title_no_stop_lenmatches',
    '1st_paragraph_no_stop_lenmatches',
    'numbers_lenmatches',
    'jac_total'
]

X = df[feature_cols].fillna(0)
y = df['match']
groups = df['parent_id']

# With a single class the forest can only ever predict that class, so every
# metric reported further down would be trivially perfect. Warn (instead of
# raising) so the rest of the notebook still runs.
class_counts = y.value_counts()
if len(class_counts) < 2:
    print(
        f"WARNING: 'match' contains only one class ({class_counts.index.tolist()}); "
        "the evaluation below cannot measure anything meaningful."
    )

# 20% group-wise split (the previous hand-written row counts crashed whenever
# the size of the master file changed).
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

# Back the "never seen" claim printed below with an explicit check.
assert set(groups.iloc[train_idx]).isdisjoint(groups.iloc[test_idx]), \
    "parent_id values leaked across the train/test split"

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Downstream cells still expect 'x_validation' / 'y_validation'; for now they
# are plain aliases of the test split.
x_validation = X_test
y_validation = y_test

print(f"Original Rows: {len(df)}")
print(f"Training Rows: {len(X_train)} (Parents seen by model)")
print(f"Test Rows:     {len(X_test)} (Parents NEVER seen by model)")

Original Rows: 51250
Training Rows: 40319 (Parents seen by model)
Test Rows:     10931 (Parents NEVER seen by model)


# Training Phase
```
# Note ->  Changed in version 1.1: The default of max_features changed from "auto" to "sqrt".
```

In [None]:
# Build the forest only. The single fit lives in its own cell further down, so
# the model is trained once per run instead of twice with identical results
# (random_state is fixed).
rf = RandomForestClassifier(
    bootstrap=False,       # every tree is built on the whole training set
    criterion='gini',
    max_depth=40,
    max_features='sqrt',   # set explicitly; the default changed in scikit-learn 1.1
    n_estimators=150,
    n_jobs=-1,             # use all available cores
    random_state=42,
    verbose=0
)

In [None]:
# Display the estimator and its hyperparameters.
rf

In [None]:
# Fit the forest on the training split.
rf.fit(X_train, y_train)

# Evaluation Phase
### -> given test data

In [None]:
# Report the in-memory forest's metrics on the held-out test split.
evaluation(rf, 'default_grid_forest', X_test, y_test)

default_grid_forest
confusion_matrix
TP    10931
Name: count, dtype: int64
Precision: 1.0
Recall:    1.0
Accuracy:   1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     10931

    accuracy                           1.00     10931
   macro avg       1.00      1.00      1.00     10931
weighted avg       1.00      1.00      1.00     10931



### -> given validation data

In [None]:
# x_validation / y_validation are aliases of X_test / y_test, so this repeats
# the evaluation on the test split.
evaluation(rf, 'default_grid_forest', x_validation, y_validation)

default_grid_forest
confusion_matrix
TP    10931
Name: count, dtype: int64
Precision: 1.0
Recall:    1.0
Accuracy:   1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     10931

    accuracy                           1.00     10931
   macro avg       1.00      1.00      1.00     10931
weighted avg       1.00      1.00      1.00     10931



# Save model weights

In [None]:
import pickle

filename = 'pretrained_model_initial.pkl'
# Context manager: the handle is flushed and closed even if dumping fails.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# Load pretrained model weights

In [None]:
# NOTE: pickle.load can execute arbitrary code. Only load model files that you
# created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Redo evaluation

### -> test data

In [None]:
# Repeat the test-split evaluation with the model restored from the pickle file.
evaluation(loaded_model, 'default_grid_forest', X_test, y_test)

default_grid_forest
confusion_matrix
TP    10931
Name: count, dtype: int64
Precision: 1.0
Recall:    1.0
Accuracy:   1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     10931

    accuracy                           1.00     10931
   macro avg       1.00      1.00      1.00     10931
weighted avg       1.00      1.00      1.00     10931



### -> validation data

In [None]:
# x_validation / y_validation are aliases of X_test / y_test, so this repeats
# the test-split evaluation of the restored model.
evaluation(loaded_model, 'default_grid_forest', x_validation, y_validation)

default_grid_forest
confusion_matrix
TP    10931
Name: count, dtype: int64
Precision: 1.0
Recall:    1.0
Accuracy:   1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     10931

    accuracy                           1.00     10931
   macro avg       1.00      1.00      1.00     10931
weighted avg       1.00      1.00      1.00     10931



In [None]:
# Sanity check: count the 1s (matches) and 0s (non-matches) overall, then the
# number of 0s that landed in each side of the split.
print("--- Class Distribution ---")
print(df['match'].value_counts())

print("\n--- Split Check ---")
print(f"y_train 0s: {int((y_train == 0).sum())}")
print(f"y_test 0s:  {int((y_test == 0).sum())}")

--- Class Distribution ---
match
1    51250
Name: count, dtype: int64

--- Split Check ---
y_train 0s: 0
y_test 0s:  0
