# RF Classifier with Wrapper-Based FS

Steven Sison | March 9, 2024

## Description

This document is used to train a model on the reduced feature set obtained with the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os

# Show the working directory so the relative dataset path below can be checked.
print(os.getcwd())

# Load the lexical-feature dataset.
dataset = pd.read_csv(
    "../../../02_feature-engineering/final-datasets/binary_new_Bacud_unbalanced_lexical.csv"
)

# Hold out 20% of the rows for testing. 'url_type' is the target column and
# every remaining column is used as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.drop(columns=['url_type']),
    dataset['url_type'],
    test_size=0.2,
    random_state=42,
)

#### 2. Preprocessing (Balancing)

In [None]:
# Count the rows per 'url_type' label so the class balance can be inspected.
# This cell only reports the counts; it does not resample the data.
dataset['url_type'].value_counts()

#### 3. Removing Unnecessary Features

In [None]:
# The 19 feature columns kept for the smaller model (order is preserved
# because it fixes the column order of the training matrix).
important_features_wrapper_19 = [
    # domain / host level
    'url_domain_entropy', 'url_is_digits_in_domain', 'url_number_of_digits',
    'url_is_https', 'url_path_length', 'url_host_length', 'get_tld',
    'url_domain_len', 'url_num_subdomain',
    # character counts
    'url_num_periods', 'url_num_of_hyphens', 'url_num_underscore',
    'url_num_equal', 'url_num_forward_slash',
    # keyword flags
    'has_login_in_string', 'has_exe_in_string', 'has_linkeq_in_string',
    'has_paypal_in_string', 'has_php_in_string',
]

# The 30 feature columns kept for the larger model (order is preserved
# because it fixes the column order of the training matrix).
important_features_wrapper_30 = [
    # domain / host level
    'url_domain_entropy', 'url_is_digits_in_domain', 'url_number_of_digits',
    'url_is_https', 'url_path_length', 'url_host_length', 'get_tld',
    'url_domain_len', 'url_num_subdomain', 'url_is_encoded',
    # character counts
    'url_num_periods', 'url_num_of_hyphens', 'url_num_underscore',
    'url_num_equal', 'url_num_forward_slash', 'url_num_question_mark',
    'url_num_semicolon', 'url_num_at',
    # keyword flags
    'has_secure_in_string', 'has_login_in_string', 'has_ebayisapi_in_string',
    'has_exe_in_string', 'has_jpg_in_string', 'has_linkeq_in_string',
    'has_paypal_in_string', 'has_mailphp_in_string', 'has_php_in_string',
    'has_admin_in_string', 'has_personal_in_string', 'has_update_in_string',
]

# Restrict the train/test feature matrices to each selected column list.
X_train_19 = x_train[important_features_wrapper_19]
X_train_30 = x_train[important_features_wrapper_30]

X_test_19 = x_test[important_features_wrapper_19]
X_test_30 = x_test[important_features_wrapper_30]

#### Hyper-parameter Tuning

##### 19 Features (Purely Lexical)

In [None]:
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Define the objective function for Optuna
def objective_19(trial):
    """Score one Optuna trial for the 19-feature random forest.

    Samples RandomForestClassifier hyperparameters from `trial`, evaluates
    them with 5-fold cross-validation on `X_train_19` / `y_train`, and
    returns the mean 'neg_mean_absolute_error' over the folds. The value is
    a negated error, so larger (closer to zero) is better.
    """
    # Each trial name equals the RandomForestClassifier keyword it feeds, so
    # `study.best_params` can be unpacked straight into the constructor.
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 100),
        'n_estimators': trial.suggest_int('n_estimators', 300, 3000),
        # A float max_samples must lie in (0, 1], so the range starts above 0.
        'max_samples': trial.suggest_float('max_samples', 0.01, 1),
        'max_features': trial.suggest_int('max_features', 1, len(important_features_wrapper_19))
    }

    # cross_val_score builds its own folds and fits a fresh clone per fold,
    # so no separate validation split is needed here.
    scores = cross_val_score(
        RandomForestClassifier(random_state=123, n_jobs=16, **param),
        X_train_19, y_train, scoring='neg_mean_absolute_error', cv=5)

    return scores.mean()

# Create an Optuna study and optimize the objective function.
# The objective returns a negated error, so the study maximizes it.
study_19 = optuna.create_study(direction='maximize')
study_19.optimize(objective_19, n_trials=25)  # Control the number of trials

# Print the best hyperparameters and the best cross-validated score
best_params_19 = study_19.best_params
best_error_19 = study_19.best_value
print("Best Hyperparameters (19 Features): ", best_params_19)
print("Best Error (19 Features): ", best_error_19)

##### 30 Features (Purely Lexical)

In [None]:
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Define the objective function for Optuna
def objective_30(trial):
    """Score one Optuna trial for the 30-feature random forest.

    Samples RandomForestClassifier hyperparameters from `trial`, evaluates
    them with 5-fold cross-validation on `X_train_30` / `y_train`, and
    returns the mean 'neg_mean_absolute_error' over the folds. The value is
    a negated error, so larger (closer to zero) is better.
    """
    # Each trial name equals the RandomForestClassifier keyword it feeds, so
    # `study.best_params` can be unpacked straight into the constructor.
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 100),
        'n_estimators': trial.suggest_int('n_estimators', 300, 3000),
        # A float max_samples must lie in (0, 1], so the range starts above 0.
        'max_samples': trial.suggest_float('max_samples', 0.01, 1),
        # Bound by the 30-feature list this model is actually trained on.
        'max_features': trial.suggest_int('max_features', 1, len(important_features_wrapper_30))
    }

    # cross_val_score builds its own folds and fits a fresh clone per fold,
    # so no separate validation split is needed here.
    scores = cross_val_score(
        RandomForestClassifier(random_state=123, n_jobs=16, **param),
        X_train_30, y_train, scoring='neg_mean_absolute_error', cv=5)

    return scores.mean()

# Create an Optuna study and optimize the objective function.
# The objective returns a negated error, so the study maximizes it.
study_30 = optuna.create_study(direction='maximize')
study_30.optimize(objective_30, n_trials=25)  # Control the number of trials

# Print the best hyperparameters and the best cross-validated score
best_params_30 = study_30.best_params
best_error_30 = study_30.best_value
print("Best Hyperparameters (30 Features): ", best_params_30)
print("Best Error (30 Features): ", best_error_30)

#### Model Training

In [None]:
from sklearn.model_selection import KFold

# Initialize CV
cv = KFold(n_splits=10, shuffle=True, random_state=1)


def _rf_params(best_params):
    """Return tuned parameters keyed by RandomForestClassifier keyword names.

    Optuna reports each value under the name that was passed to
    `trial.suggest_*`. A study tuned under the name 'max_sample' has to be
    mapped to the estimator keyword 'max_samples'; otherwise the constructor
    raises TypeError for an unexpected keyword argument. Names that already
    match are passed through unchanged.
    """
    renamed = {'max_sample': 'max_samples'}
    return {renamed.get(name, name): value for name, value in best_params.items()}


# Initialize the model
rf_classifier_19 = Pipeline([
    ('classifier', RandomForestClassifier(random_state=123, **_rf_params(best_params_19), warm_start = True, n_jobs = 16))
])

rf_classifier_30 = Pipeline([
    ('classifier', RandomForestClassifier(random_state=123, **_rf_params(best_params_30), warm_start = True, n_jobs = 16))
])

# Train each model on its feature subset, then predict the held-out test set
rf_classifier_19.fit(X_train_19, y_train)
y_pred_19 = rf_classifier_19.predict(X_test_19)

rf_classifier_30.fit(X_train_30, y_train)
y_pred_30 = rf_classifier_30.predict(X_test_30)

#### Evaluation

In [None]:
# Classification Report
# Print the per-class report for the 19-feature model, then the 30-feature model.
for predictions in (y_pred_19, y_pred_30):
    print(classification_report(y_test, predictions))

In [None]:
# Disabled cell: the confusion-matrix code below is wrapped in a string
# literal, so it is not executed.
# NOTE(review): it refers to `y_pred_10`, which is not defined in the visible
# notebook (the training cell produces `y_pred_19` / `y_pred_30`), and its
# heading says 12 features; fix both before re-enabling.
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred_10, labels=rf_classifier_19.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = rf_classifier_19.classes_)
disp.plot()
plt.show()'''

In [None]:
# Disabled cell: the cross-validation code below is wrapped in a string
# literal, so it is not executed.
# NOTE(review): `XGBClassifier` and `params_gbm` are neither imported nor
# defined in the visible notebook; this looks carried over from another
# model's notebook. Confirm before re-enabling.
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(random_state=45, **params_gbm),
                        x_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

In [None]:
# Dumping the model
# Persist both fitted pipelines with joblib so they can be reloaded later.
# NOTE(review): the file names say 12 and 33, while the models are trained on
# 19 and 30 features. Confirm whether other code loads these exact paths
# before renaming them.
joblib.dump(rf_classifier_19, 'rf_ffs_12.sav')
joblib.dump(rf_classifier_30, 'rf_ffs_33.sav')

In [1]:
import lexical_generator_19
import lexical_generator_30
import time

def rf_predict_maliciousness_19(url):
    """Classify `url` with the 19-feature random forest and return a label name.

    The URL is turned into model inputs by
    `lexical_generator_19.lexical_generator` and passed to
    `rf_classifier_19.predict`. Predictions equal to 0, 1, 2 or 3 map to
    "Benign", "Defacement", "Phishing" and "Malware"; anything else gives None.
    """
    features = lexical_generator_19.lexical_generator(url)
    predicted = rf_classifier_19.predict(features)

    # Check each class code in ascending order; the first equal one wins.
    for code, label in enumerate(("Benign", "Defacement", "Phishing", "Malware")):
        if predicted == code:
            return label
    return None
        
def rf_predict_maliciousness_30(url):

    numerical_values = lexical_generator_30.lexical_generator(url)

    numerical_values.head()

    match rf_classifier_30.predict(numerical_values):
        case 0:
            return "Benign"
        case 1:
            return "Defacement"
        case 2:
            return "Phishing"
        case 3:
            return "Malware"

url = "www.youtube.com/watch?v=RJM5rF-aluM"
print(f"Current URL: {url}")

# Time one prediction per model; only the predictor call sits between the
# two perf_counter readings.
for banner, classify in (
    ("------- 19 Features -------------", rf_predict_maliciousness_19),
    ("------- 30 Features -------------", rf_predict_maliciousness_30),
):
    start = time.perf_counter()
    prediction = classify(url)
    end = time.perf_counter()
    print(banner)
    print(prediction)
    print(end - start)

ModuleNotFoundError: No module named 'lexical_generator_19'