# Random Forest Classifier

By Steven Sison on December 16, 2023

## Description

This document will be used for the preliminary training and evaluation of the random forest classifier. The document includes the necessary processes taken to train the model with the default hyperparameters. This also evaluates the performance of the classifier in terms of accuracy, precision, recall, F1-score, training time, and detection time. Furthermore, this document will only use lexical features and will observe the effect of increasing the number of features used in the model. As this is only for preliminary work, no optimizations, except a simple train-test validation, will be carried out.

### Preliminaries

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib

# Load the dataset that already contains the generated lexical features.
dataset = pd.read_csv("final_unbalanced_withLexical.csv")

# Hold out 20% of the rows for testing; `url_type` is the label column and
# every other column is used as a feature. The seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.drop(columns=['url_type']),
    dataset['url_type'],
    test_size=0.2,
    random_state=42,
)

## Model Training and Evaluation

### Base Model using All Lexical Features Generated

In [None]:
# Baseline model: a one-step pipeline wrapping a random forest that uses
# scikit-learn's default hyperparameters.
pipeline = Pipeline(steps=[
    ('classifier', RandomForestClassifier()),
])

# Fit on the training split of the original (unbalanced) dataset.
pipeline.fit(x_train, y_train)

In [None]:
# Predict the held-out split and print the per-class precision / recall / F1.
y_pred = pipeline.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for the baseline model; rows and columns follow the class
# order learned by the fitted pipeline.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=pipeline.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=pipeline.classes_)
disp.plot()
plt.show()

### Evaluating the Effect of Balanced and Unbalanced Datasets

In [None]:
# Number of rows per `url_type` label in the original dataset.
dataset.loc[:, 'url_type'].value_counts()

In [None]:
# Upsampling

from sklearn.utils import resample


def _resample_to(class_frame, n_rows):
    """Draw `n_rows` rows from `class_frame` with replacement (fixed seed 15)."""
    return resample(class_frame, replace=True, n_samples=n_rows, random_state=15)


# One frame per label value of `url_type` (0-3, named after the class each holds).
dataset_benign = dataset[dataset['url_type'] == 0]
dataset_defacement = dataset[dataset['url_type'] == 1]
dataset_phishing = dataset[dataset['url_type'] == 2]
dataset_malware = dataset[dataset['url_type'] == 3]

# Every class is resampled to the size of the benign class.
# NOTE(review): this presumes benign is the largest class - confirm against the
# value counts of the original dataset. Also note the benign rows themselves are
# bootstrap-resampled (same size, with replacement) rather than kept as-is.
target_rows = dataset_benign.shape[0]

dataset_benign_upsampled = _resample_to(dataset_benign, target_rows)
dataset_defacement_upsampled = _resample_to(dataset_defacement, target_rows)
dataset_phishing_upsampled = _resample_to(dataset_phishing, target_rows)
dataset_malware_upsampled = _resample_to(dataset_malware, target_rows)

# The concatenation order is kept fixed because the seeded split below depends
# on the row order of the combined frame.
dataset_upsampled = pd.concat([
    dataset_benign_upsampled,
    dataset_defacement_upsampled,
    dataset_malware_upsampled,
    dataset_phishing_upsampled,
])

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(dataset_upsampled['url_type'].value_counts())

# NOTE(review): the resampling happens before the train/test split, so copies of
# the same row can land in both splits and inflate the test metrics. Consider
# splitting first and upsampling only the training portion.
x_up_train, x_up_test, y_up_train, y_up_test = train_test_split(
    dataset_upsampled.drop(columns=['url_type']),
    dataset_upsampled['url_type'],
    test_size=0.2,
    random_state=42,
)

In [None]:

# Same default random forest as the baseline, trained on the upsampled split.
pipeline_up = Pipeline(steps=[
    ('classifier', RandomForestClassifier()),
])
pipeline_up.fit(x_up_train, y_up_train)

# Score on the held-out part of the upsampled data and print the report.
y_up_pred = pipeline_up.predict(x_up_test)
print(classification_report(y_true=y_up_test, y_pred=y_up_pred))

In [None]:
# Confusion matrix for the model trained on the upsampled data.
cm_up = confusion_matrix(y_true=y_up_test, y_pred=y_up_pred, labels=pipeline_up.classes_)
disp = ConfusionMatrixDisplay(cm_up, display_labels=pipeline_up.classes_)
disp.plot()
plt.show()

### Evaluating the Effect of Adding more Lexical Features

In [None]:
results = []

# Train, save and score one model per feature-subset size: the first 25, 50
# and 75 columns of the upsampled data.
for i in range(3):
    n_features = 25 * (i + 1)

    pipeline = Pipeline([('classifier', RandomForestClassifier())])

    # Positional slice, so the subset depends on the column order of the frame.
    temp_url_features = x_up_train.iloc[:, :n_features]

    pipeline.fit(temp_url_features, y_up_train)

    # Persist each fitted model under a name that carries its feature count.
    filename = 'rf_lexical_{}.sav'.format(n_features)
    joblib.dump(pipeline, filename)

    url_type_predict = pipeline.predict(x_up_test.iloc[:, :n_features])

    # Class-weighted averages, so each class contributes according to its support.
    accuracy = accuracy_score(y_up_test, url_type_predict)
    recall = recall_score(y_up_test, url_type_predict, average='weighted')
    precision = precision_score(y_up_test, url_type_predict, average='weighted', zero_division=1)
    f1 = f1_score(y_up_test, url_type_predict, average='weighted')

    # Record the number of columns actually used for this model, so the
    # results table lines up with the slices above.
    results.append((n_features, accuracy, recall, precision, f1))

In [None]:
# Turn the collected tuples into a table ordered by feature count and print it.
results = pd.DataFrame(
    results,
    columns=['Number of Features', 'Accuracy', 'Recall', 'Precision', 'F1-Score'],
).sort_values(by='Number of Features', ascending=True)
print(results)

Observations:
- Increasing the number of features improves all of the model's class-weighted metrics, at the cost of a higher training time.

## Model Optimizations

### With Cross Validation (No Hyperparameter Tuning and Using Upsampled Dataset)

In [None]:
from numpy import mean
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from matplotlib import pyplot

def evaluating_model(cv):
    """Cross-validate a default random forest on the upsampled dataset.

    Args:
        cv: Value passed straight through as the ``cv`` argument of
            ``cross_val_score`` (e.g. a ``KFold`` splitter).

    Returns:
        A ``(mean, min, max)`` tuple of the per-fold accuracy scores.
    """
    features = dataset_upsampled.drop(columns=['url_type'])
    labels = dataset_upsampled['url_type']

    fold_scores = cross_val_score(
        RandomForestClassifier(),
        features,
        labels,
        scoring="accuracy",
        cv=cv,
        n_jobs=1,
    )

    return mean(fold_scores), fold_scores.min(), fold_scores.max()

# Getting the Ideal Score (leave-one-out reference run, currently disabled):
# ideal, _, _ = evaluating_model(LeaveOneOut())
# print('Ideal: %.3f' % ideal)

# Fold counts to try; this range contains only k = 10.
folds = range(10, 11)

# Mean accuracy per k, plus the distances from the mean down to the worst fold
# and up to the best fold (the form `errorbar` expects for asymmetric bars).
means, mins, maxs = [], [], []

for k in folds:
    # Shuffled k-fold splitter with a fixed seed.
    cv = KFold(n_splits=k, shuffle=True, random_state=1)

    k_mean, k_min, k_max = evaluating_model(cv)
    print('> folds=%d, accuracy=%.3f (%.3f,%.3f)' % (k, k_mean, k_min, k_max))

    means.append(k_mean)
    mins.append(k_mean - k_min)
    maxs.append(k_max - k_mean)

# One point per k at the mean accuracy, with min/max error bars.
pyplot.errorbar(folds, means, yerr=[mins, maxs], fmt='o')
# Reference line for the leave-one-out score (disabled together with the run above):
# pyplot.plot(folds, [ideal for _ in range(len(folds))], color='r')
pyplot.show()


### Hyperparameter Tuning

In [8]:
from bayes_opt import BayesianOptimization

def rf_cl_bo(max_depth, n_estimators, min_samples_leaf, min_samples_split):
    """Objective function for the Bayesian optimiser.

    Builds a random forest from the proposed hyperparameters and scores it
    with 5-fold cross-validation on the upsampled training split.

    Args:
        max_depth: Proposed maximum tree depth.
        n_estimators: Proposed number of trees.
        min_samples_leaf: Proposed minimum number of samples per leaf.
        min_samples_split: Proposed minimum number of samples to split a node.

    Returns:
        Mean accuracy across the 5 cross-validation folds.
    """
    # The four proposed values are rounded to whole numbers before use;
    # `max_features` is held fixed at 'sqrt'.
    hyperparams = {
        'max_features': 'sqrt',
        'max_depth': round(max_depth),
        'n_estimators': round(n_estimators),
        'min_samples_leaf': round(min_samples_leaf),
        'min_samples_split': round(min_samples_split),
    }

    fold_scores = cross_val_score(
        RandomForestClassifier(random_state=123, **hyperparams),
        x_up_train,
        y_up_train,
        scoring='accuracy',
        cv=5,
    )
    # Average once over the folds.
    return fold_scores.mean()
# Run Bayesian Optimization
start = time.time()

# (lower, upper) search bounds for each argument of `rf_cl_bo`.
params_rf = {
    'max_depth': (2, 20),
    'n_estimators': (100, 1500),
    'min_samples_leaf': (1, 100),
    'min_samples_split': (2, 100),
}

rf_bo = BayesianOptimization(rf_cl_bo, params_rf, random_state=111)
rf_bo.maximize(init_points=20, n_iter=4)

# Report the wall-clock duration of the whole search.
elapsed_minutes = (time.time() - start) / 60
print('It takes %s minutes' % elapsed_minutes)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| 1         | 0.9115    | 13.02     | 17.74     | 44.73     | 1.177e+03 |
| 2         | 0.8584    | 7.316     | 15.77     | 4.203     | 688.3     |
| 3         | 0.8432    | 6.296     | 34.43     | 99.09     | 432.8     |
| 4         | 0.7562    | 3.461     | 67.29     | 62.88     | 484.0     |
| 5         | 0.8918    | 10.39     | 12.72     | 9.248     | 1.361e+03 |
| 6         | 0.9166    | 16.29     | 84.22     | 81.89     | 1.487e+03 |
| [0m7        [0m | [0m0.8999   [0m | [0m12.39    [0m | [0m81.56    [0m | [0m43.29    [0m | [0m138.4   