# RF Classifier with Filter-Based FS (Purely Lexical)

Steven Sison | March 9, 2024

## Description

This document will be used to train a model using the reduced feature set obtained by using the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os

# Read the engineered-feature CSV into memory.
DATASET_PATH = "../../../02_feature-engineering/final-datasets/binary_unbalanced_with_lexicalcontent.csv"
dataset = pd.read_csv(DATASET_PATH)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# 'url_type' is the label column; every other column is treated as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.drop(columns=['url_type']),
    dataset['url_type'],
    test_size=0.2,
    random_state=42,
)

#### 2. Preprocessing (Balancing)

In [None]:
# Number of samples per class in the label column; this cell only inspects
# the distribution and does not modify the dataset.
class_counts = dataset['url_type'].value_counts()
class_counts

#### 3. Removing Unnecessary Features

In [None]:
# Columns retained for this model; everything else in the dataset is dropped.
important_features_wrapper = [
    'url_host_length',
    'url_is_https',
    'url_ip_in_domain',
    'has_php_in_string',
    'url_number_of_parameters',
    'has_exe_in_string',
    'url_has_port',
    'url_is_digits_in_domain',
    'url_path_length',
    'url_num_question_mark',
    'url_query_length',
    'url_string_entropy',
    'url_num_periods',
    'get_tld',
    'url_scheme',
]

# Restrict both splits to the selected columns, in the order listed above.
X_train = x_train[important_features_wrapper]
X_test = x_test[important_features_wrapper]

#### Hyper-parameter Tuning

##### 15 Features (Purely Lexical)

In [4]:
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial of Random Forest hyper-parameters.

    Samples a hyper-parameter set from ``trial``, evaluates a
    ``RandomForestClassifier`` built from it with 5-fold cross-validation on
    the notebook-level ``X_train`` / ``y_train``, and returns the mean score.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean cross-validated ``neg_mean_absolute_error``. Higher is
        better, so the study must be created with ``direction='maximize'``.
    """
    # Search space. Every trial name must equal the estimator keyword it
    # feeds, because study.best_params is later unpacked directly into
    # RandomForestClassifier(**best_params).
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 100),
        'n_estimators': trial.suggest_int('n_estimators', 300, 3000),
        # scikit-learn requires a float max_samples to lie in (0, 1], so the
        # lower bound is kept strictly positive.
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),
        'max_features': trial.suggest_int('max_features', 1, len(important_features_wrapper))
    }

    # cross_val_score fits a fresh clone of the estimator on every fold, so
    # no separate validation split is needed and warm_start would have no
    # effect here.
    # NOTE(review): an absolute-error metric on class labels only works if
    # 'url_type' is numerically encoded -- confirm, or switch to a
    # classification scorer such as 'accuracy' or 'f1'.
    fold_scores = cross_val_score(
        RandomForestClassifier(random_state=123, **param, n_jobs=16),
        X_train, y_train, scoring='neg_mean_absolute_error', cv=5)

    return fold_scores.mean()

# Create an Optuna study and optimize the objective function.
# The objective returns a negated error, hence direction='maximize'.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)  # Control the number of trials

# Report the best hyper-parameters and the best objective value
# (the mean cross-validated negated MAE returned by `objective`).
best_params = study.best_params
best_error = study.best_value

# Derive the feature count from the actual feature list so the printed
# labels cannot drift from the data being used.
n_features = len(important_features_wrapper)
print(f"Best Hyperparameters ({n_features} Features): ", best_params)
print(f"Best Error ({n_features} Features): ", best_error)

KeyboardInterrupt: 

#### Model Training

In [None]:
from sklearn.model_selection import KFold

# Cross-validation splitter: 10 shuffled folds with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Random Forest configured with the tuned hyper-parameters, wrapped in a
# single-step pipeline.
forest = RandomForestClassifier(random_state=123, **best_params, warm_start=True, n_jobs=16)
rf_classifier = Pipeline(steps=[('classifier', forest)])

# Fit on the training split, then predict the held-out test split.
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

#### Evaluation

In [None]:
# Per-class precision, recall and F1 on the held-out test split.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Disabled cell: the confusion-matrix code below is wrapped in a string
# literal, so none of it is executed.
# NOTE(review): it references `y_pred_10` and `rf_classifier_19`, which are not
# defined in the visible cells -- presumably `y_pred` and `rf_classifier` are
# intended; confirm before re-enabling.
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred_10, labels=rf_classifier_19.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = rf_classifier_19.classes_)
disp.plot()
plt.show()'''

In [None]:
# Disabled cell: the cross-validation code below is wrapped in a string
# literal, so none of it is executed.
# NOTE(review): it uses `XGBClassifier` and `params_gbm`, neither of which is
# imported or defined in the visible cells -- presumably carried over from
# another notebook; adapt it to the Random Forest model before re-enabling.
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(random_state=45, **params_gbm),
                        x_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

In [None]:
# Persist the fitted pipeline to disk so it can be reloaded for later evaluation.
model_path = 'rf_filter_lexical.sav'
joblib.dump(rf_classifier, model_path)

In [None]:
import lexical_generator_filter_lexical
import time

def rf_predict_maliciousness(url):
    """Classify a single URL with the trained Random Forest pipeline.

    The URL is converted to model features by
    ``lexical_generator_filter_lexical.lexical_generator`` and passed to the
    notebook-level ``rf_classifier``.

    Args:
        url: The URL string to classify.

    Returns:
        ``"Benign"``, ``"Defacement"``, ``"Phishing"`` or ``"Malware"`` for
        predicted labels 0-3 respectively, or ``None`` for any other label.
    """
    # NOTE(review): assumes lexical_generator returns a single-row, 2-D
    # feature structure accepted by predict() -- confirm against the module.
    numerical_values = lexical_generator_filter_lexical.lexical_generator(url)

    # predict() returns an array with one label per input row. Take the
    # scalar for the single URL explicitly rather than comparing the whole
    # array against integer literals.
    predicted_label = rf_classifier.predict(numerical_values)[0]

    # NOTE(review): the model is trained on a dataset file named "binary_...",
    # so labels 2 and 3 may never be produced -- verify the label encoding.
    label_names = {
        0: "Benign",
        1: "Defacement",
        2: "Phishing",
        3: "Malware",
    }
    return label_names.get(predicted_label)

# Sample URL used for a quick single-prediction timing check.
url = "www.youtube.com/watch?v=RJM5rF-aluM"
print(f"Current URL: {url}")

# Time one end-to-end call (feature generation plus model prediction).
start = time.perf_counter()
prediction = rf_predict_maliciousness(url)
end = time.perf_counter()

# Report the predicted class and the elapsed wall-clock time in seconds.
print("------------- Filter-Based (Lexical) -------------")
print(prediction)
print(end - start)