# Capstone Project: Malicious URL detection using Machine Learning and Artificial Intelligence


### Model Experiments
#### Explore model hyperparameters

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
import time
import warnings
from colorama import Fore
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report, accuracy_score,f1_score, precision_score, recall_score, roc_auc_score,roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.compose import make_column_transformer
from sklearn.metrics import precision_recall_fscore_support
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lightgbm as lgb


# Ignore warnings
# NOTE(review): with no category or module given, this filter suppresses every
# warning raised after this point — consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the processed malicious-URL dataset (path is relative to the working directory)
df4 = pd.read_csv(
    filepath_or_buffer='../data/finaldataset/maliciousurl_processed.csv',
)


In [3]:
# Preview the first rows to sanity-check the loaded columns
df4.head()

Unnamed: 0,url,type,category,url_len,domain,root_domain,count_at,count-question,count_hyphen,count_equal,...,count_https,count_http,count_www,digits_count,hostname_length,sus_url,letters_count,short_url,use_of_ip,url_region
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,1310791,0,0,1,0,...,0,0,0,0,0,0,13,0,0,26
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,58335668,0,0,0,0,...,0,0,0,1,0,0,29,0,0,72
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,28611805,0,0,0,0,...,0,0,0,1,0,0,25,0,0,72
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,89045308,0,1,1,4,...,0,1,0,7,17,0,60,0,0,18
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,76838614,0,1,1,3,...,0,1,0,22,23,0,199,0,0,72


In [4]:
# (rows, columns) of the loaded frame
df4.shape

(651191, 31)

In [5]:
# Every column label except the final one, followed by how many labels that leaves.
# NOTE(review): this slice keeps 'url', 'type' and 'category' and drops only the
# last column, so it is not the same column set as the model matrix X — confirm intent.
features = df4.columns[:-1]
print(features, 'Number of features: {}'.format(len(features)), sep='\n')

Index(['url', 'type', 'category', 'url_len', 'domain', 'root_domain',
       'count_at', 'count-question', 'count_hyphen', 'count_equal',
       'count_dot', 'count_hash', 'count_percent', 'count_plus',
       'count_dollarsign', 'count_exlamation', 'count_star', 'count_comma',
       'count_double_slash', 'count_slash', 'abnormal_url', 'count_https',
       'count_http', 'count_www', 'digits_count', 'hostname_length', 'sus_url',
       'letters_count', 'short_url', 'use_of_ip'],
      dtype='object')
Number of features: 30


###  Train/Test Split

With the data prepared, split it into training and test sets.

In [6]:
# Target: the numeric `category` label.
y = df4['category']
# Predictors: every remaining column once the raw URL text, both label columns
# and the domain string have been removed.
X = df4.drop(columns=['url', 'type', 'category', 'domain'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Refining the selected model

### RandomForest Classifier

In [8]:
# Tune a RandomForestClassifier with an exhaustive, cross-validated grid search.
# RandomForestClassifier, Pipeline, GridSearchCV and time are already imported in
# the notebook's first cell, so they are not re-imported here.

# Single-step pipeline; the step name 'classifier' is the prefix used in param_grid.
# No n_estimators is passed to the constructor because every grid candidate sets
# its own value, so a constructor default would never be used.
pipeline = Pipeline([
    ('classifier', RandomForestClassifier())
])

# Hyperparameter grid: 3 * 3 * 3 * 3 * 2 = 162 candidates, each fitted on 5 folds.
# random_state is pinned to a single value so every candidate is reproducible.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False],
    'classifier__random_state': [42]
}

# Perform grid search with cross-validation.
# n_jobs=-1 spreads the fits over all CPU cores; scores are unaffected because
# the forests are seeded through the grid.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
grid_search.fit(X_train, y_train)
elapsed_time = (time.perf_counter() - start_time) / 60  # convert to minutes

# Best hyperparameter combination (a dict) and its mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Best pipeline, refit on the whole training split (GridSearchCV's default refit=True)
classifier = grid_search.best_estimator_

print("RandomForestClassifier: %s | Time: %s" % (best_score, elapsed_time))
# best_params is a dict, so it must not be formatted with a float spec such as {:.2f}
print("best_params: {}".format(best_params))


In [None]:
# Evaluate the tuned model on the held-out test set with one-vs-rest ROC curves.
# `category` takes more than two values (0, 1 and 3 appear in df4.head()), and
# roc_curve only accepts binary targets, so each class is scored against the rest.
best_model = grid_search.best_estimator_  # pipeline refit on the training split

# Class-membership probabilities, one column per entry of best_model.classes_
y_pred_prob = best_model.predict_proba(X_test)

fig, ax = plt.subplots()
for class_index, class_label in enumerate(best_model.classes_):
    # Binary indicator: 1 where the true label is this class, 0 otherwise
    is_class = (y_test == class_label).astype(int)
    if is_class.nunique() < 2:
        # ROC is undefined when the test split holds only one outcome for this class
        continue
    class_scores = y_pred_prob[:, class_index]

    # False positive rate (FPR), true positive rate (TPR) and thresholds for this class
    fpr, tpr, thresholds = roc_curve(is_class, class_scores)

    # Area under this class's ROC curve (AUC)
    roc_auc = roc_auc_score(is_class, class_scores)

    ax.plot(fpr, tpr, label='Class {} (AUC = {:.2f})'.format(class_label, roc_auc))

# Diagonal reference line: the ROC of a random classifier
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()
