# Prerequisite

When u are not familiar with -> `pip install -r requirements.txt`, use this codeblock for installing the required packages

In [None]:
# !pip install jupyter
# !pip install scikit-learn
# !pip install pandas
# !pip install numpy

In [44]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Functions for evaluation purposes

In [4]:
def resultClassifierfloat(row):
    threshold = 0.5
    if (row['prediction'] > threshold and row['label'] == True):
        return 'TP'
    if (row['prediction'] < threshold and row['label'] == False):
        return 'TN'
    if (row['prediction'] < threshold and row['label'] == True):
        return 'FN'
    if (row['prediction'] > threshold and row['label'] == False):
        return 'FP'


def resultClassifierint(row):
    if (row['label'] == row['prediction'] and row['label'] == True):
        return 'TP'
    if (row['label'] == row['prediction'] and row['label'] == False):
        return 'TN'
    if (row['label'] != row['prediction'] and row['label'] == True):
        return 'FN'
    if (row['label'] != row['prediction'] and row['label'] == False):
        return 'FP'


def evaluation(classifier, name, X_test, y_test):
    # Predict the response for test dataset
    y_pred = classifier.predict(X_test)
    results = pd.DataFrame({'label': y_test.values, 'prediction': y_pred}, index=y_test.index)
    results['confusion_matrix'] = results.apply(resultClassifierint, axis=1)
    results_counts = results['confusion_matrix'].value_counts()

    print(name)
    print(results_counts)
    print('Precision: ', (results_counts.loc['TP']) / (results_counts.loc['TP'] + results_counts.loc['FP']))
    print('Recall: ', (results_counts.loc['TP']) / (results_counts.loc['TP'] + results_counts.loc['FN']))
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))

# Load preprocessed data set

Split into train, test etc

In [45]:
trainset = pd.read_csv('./final_trainset.csv')
trainset.iloc[52283:, trainset.columns.get_loc('match')] = 0
trainset1 = trainset[:52283]
trainset2 = trainset[52283:]

train_test_set1 = trainset1[:41581]
train_test_set2 = trainset2[:41581]
train_test_set = pd.concat([train_test_set1, train_test_set2], axis=0, join='inner')

validation_set1 = trainset1[41581:]
validation_set2 = trainset2[41581:]
validation_set = pd.concat([validation_set1, validation_set2], axis=0, join='inner')

In [46]:
feature_cols = ['date_binary',
                'jac_total',
                'title_similarity',
                'content_similarity',
                'sleutelwoorden_lenmatches',
                'BT_TT_lenmatches',
                'title_no_stop_lenmatches',
                '1st_paragraph_no_stop_lenmatches',
                'numbers_lenmatches']
X = train_test_set[feature_cols]
X = X.fillna(0)
y = train_test_set['match']

In [47]:
X[X.isna()].sum().sum()

np.float64(0.0)

In [48]:
len(X[~X.isna()]) - len(X[X.isna()])

0

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Traininig Phase
```
# Note ->  Changed in version 1.1: The default of max_features changed from "auto" to "sqrt".
```

In [54]:
rf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                            max_depth=40, max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=150,
                            n_jobs=-1, oob_score=False, random_state=None,
                            verbose=0, warm_start=False)

In [55]:
rf

In [56]:
rf.fit(X_train, y_train)

# Evaluation Phase
### -> given test data

In [57]:
evaluation(rf, 'default_grid_forest', X_test, y_test)

default_grid_forest
confusion_matrix
TP    11892
TN    11669
FP      729
FN      659
Name: count, dtype: int64
Precision:  0.9422391252674115
Recall:  0.9474942235678432
Accuracy:  0.9443665076756583
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     12398
           1       0.94      0.95      0.94     12551

    accuracy                           0.94     24949
   macro avg       0.94      0.94      0.94     24949
weighted avg       0.94      0.94      0.94     24949



### -> given validation data

In [59]:
feature_cols = ['date_binary',
                'jac_total',
                'title_similarity',
                'content_similarity',
                'sleutelwoorden_lenmatches',
                'BT_TT_lenmatches',
                'title_no_stop_lenmatches',
                '1st_paragraph_no_stop_lenmatches',
                'numbers_lenmatches']

X = validation_set[feature_cols]
X = X.fillna(0)
y = validation_set['match']
x_validation = X
y_validation = y

evaluation(rf, 'default_grid_forest', x_validation, y_validation)

default_grid_forest
confusion_matrix
TP    10139
TN     9223
FP      865
FN      563
Name: count, dtype: int64
Precision:  0.9213922210105416
Recall:  0.9473930106522145
Accuracy:  0.9313131313131313
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     10088
           1       0.92      0.95      0.93     10702

    accuracy                           0.93     20790
   macro avg       0.93      0.93      0.93     20790
weighted avg       0.93      0.93      0.93     20790



# Save model weights

In [60]:
import pickle

filename = 'rf_model_pretrained_11_11_24.pkl'
pickle.dump(rf, open(filename, 'wb'))

# Load pretrained model weights

In [62]:
loaded_model = pickle.load(open(filename, 'rb'))

# Redo evaluation

### -> test data

In [65]:
evaluation(loaded_model, 'default_grid_forest', X_test, y_test)

default_grid_forest
confusion_matrix
TP    11892
TN    11669
FP      729
FN      659
Name: count, dtype: int64
Precision:  0.9422391252674115
Recall:  0.9474942235678432
Accuracy:  0.9443665076756583
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     12398
           1       0.94      0.95      0.94     12551

    accuracy                           0.94     24949
   macro avg       0.94      0.94      0.94     24949
weighted avg       0.94      0.94      0.94     24949



### -> validation data

In [64]:
evaluation(loaded_model, 'default_grid_forest', x_validation, y_validation)

default_grid_forest
confusion_matrix
TP    10139
TN     9223
FP      865
FN      563
Name: count, dtype: int64
Precision:  0.9213922210105416
Recall:  0.9473930106522145
Accuracy:  0.9313131313131313
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     10088
           1       0.92      0.95      0.93     10702

    accuracy                           0.93     20790
   macro avg       0.93      0.93      0.93     20790
weighted avg       0.93      0.93      0.93     20790

