# Capstone Project: Malicious URL detection using Machine Learning and Artificial Intelligence


### Model Experiments
#### Explore models hyperparameters

### Import libraries

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
import time
import warnings
from colorama import Fore
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report, accuracy_score,f1_score, precision_score, recall_score, roc_auc_score,roc_curve
from sklearn.compose import make_column_transformer
from sklearn.metrics import precision_recall_fscore_support
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lightgbm as lgb


# Ignore warnings
warnings.filterwarnings('ignore')

In [51]:
#Collect Initial Data
df4 = pd.read_csv('../data/finaldataset/maliciousurl_processed.csv')


In [52]:
df4.head()

Unnamed: 0,url,type,category,url_len,domain,root_domain,count_at,count-question,count_hyphen,count_equal,...,count_https,count_http,count_www,digits_count,hostname_length,sus_url,letters_count,short_url,use_of_ip,url_region
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,1310791,0,0,1,0,...,0,0,0,0,0,0,13,0,0,26
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,58335668,0,0,0,0,...,0,0,0,1,0,0,29,0,0,72
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,28611805,0,0,0,0,...,0,0,0,1,0,0,25,0,0,72
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,89045308,0,1,1,4,...,0,1,0,7,17,0,60,0,0,18
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,76838614,0,1,1,3,...,0,1,0,22,23,0,199,0,0,72


In [53]:
df4.shape

(651191, 31)

In [54]:
features = df4.columns[:-1]
print(features)
print('Number of features: {}'.format(len(features)))

Index(['url', 'type', 'category', 'url_len', 'domain', 'root_domain',
       'count_at', 'count-question', 'count_hyphen', 'count_equal',
       'count_dot', 'count_hash', 'count_percent', 'count_plus',
       'count_dollarsign', 'count_exlamation', 'count_star', 'count_comma',
       'count_double_slash', 'count_slash', 'abnormal_url', 'count_https',
       'count_http', 'count_www', 'digits_count', 'hostname_length', 'sus_url',
       'letters_count', 'short_url', 'use_of_ip'],
      dtype='object')
Number of features: 30


###  Train/Test Split

With your data prepared, split it into a train and test set.

In [55]:
X = df4.drop(['url','type','category','domain'],axis=1)
y = df4['category']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Refining selected model 

### Light GBM Classifier

In [59]:

fit_params={"early_stopping_rounds":30, 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'verbose': 100}

param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'scale_pos_weight':[1,2,6,12]}


n_HP_points_to_test = 100

#n_estimators is set to a "large value". 
# The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier( n_estimators=5000)

gs = RandomizedSearchCV(
    estimator = clf,
    param_distributions = param_test, 
    n_iter = n_HP_points_to_test,
    scoring = 'balanced_accuracy',
    cv = 3,
    refit = False,
    random_state = 314,
    verbose = 10)

gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3; 1/100] START colsample_bytree=0.9501241488957805, min_child_samples=301, min_child_weight=0.1, num_leaves=28, reg_alpha=0, reg_lambda=100, scale_pos_weight=6, subsample=0.8618840095715408
[100]	valid's multi_logloss: 0.211363
[200]	valid's multi_logloss: 0.193682
[300]	valid's multi_logloss: 0.18598
[400]	valid's multi_logloss: 0.181293
[500]	valid's multi_logloss: 0.177657
[600]	valid's multi_logloss: 0.174922
[700]	valid's multi_logloss: 0.172606
[800]	valid's multi_logloss: 0.170491
[900]	valid's multi_logloss: 0.169052
[1000]	valid's multi_logloss: 0.16762
[1100]	valid's multi_logloss: 0.16648
[1200]	valid's multi_logloss: 0.165333
[1300]	valid's multi_logloss: 0.164458
[1400]	valid's multi_logloss: 0.163473
[1500]	valid's multi_logloss: 0.162691
[1600]	valid's multi_logloss: 0.162116
[1700]	valid's multi_logloss: 0.161537
[1800]	valid's multi_logloss: 0.160983
[1900]	valid's multi_logloss: 0.160533
[2000]	vali

In [None]:
# Make predictions on the test set
y_pred_prob = classifier.predict_proba(X_test)[:, 1]

# Calculate the false positive rate (FPR), true positive rate (TPR), and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Calculate the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()
