# Capstone Project: Malicious URL detection using Machine Learning and Artificial Intelligence


### Model Experiments
#### Explore model hyperparameters

### Import libraries

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
import time
import warnings
from colorama import Fore
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report, accuracy_score,f1_score, precision_score, recall_score, roc_auc_score,roc_curve
from sklearn.compose import make_column_transformer
from sklearn.metrics import precision_recall_fscore_support

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
# Ignore warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the processed malicious-URL dataset from the final-dataset folder
df4 = pd.read_csv(filepath_or_buffer='../data/finaldataset/maliciousurl_processed.csv')


In [6]:
# Preview the first five rows to sanity-check the loaded columns
df4.head(n=5)

Unnamed: 0,url,type,category,url_len,domain,root_domain,count_at,count-question,count_hyphen,count_equal,...,count_https,count_http,count_www,digits_count,hostname_length,sus_url,letters_count,short_url,use_of_ip,url_region
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,1310791,0,0,1,0,...,0,0,0,0,0,0,13,0,0,26
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,58335668,0,0,0,0,...,0,0,0,1,0,0,29,0,0,72
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,28611805,0,0,0,0,...,0,0,0,1,0,0,25,0,0,72
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,89045308,0,1,1,4,...,0,1,0,7,17,0,60,0,0,18
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,76838614,0,1,1,3,...,0,1,0,22,23,0,199,0,0,72


In [7]:
# Report the (rows, columns) dimensions of the loaded frame
df4.shape

(651191, 31)

In [8]:
# Every column name except the last one in the frame.
# NOTE(review): this listing still contains 'url', 'type', 'category' and 'domain',
# which are excluded from X (or used as the target) in the train/test split section,
# and it omits the final column - confirm whether that is intended.
features = df4.columns[:-1]
print(features, 'Number of features: {}'.format(len(features)), sep='\n')

Index(['url', 'type', 'category', 'url_len', 'domain', 'root_domain',
       'count_at', 'count-question', 'count_hyphen', 'count_equal',
       'count_dot', 'count_hash', 'count_percent', 'count_plus',
       'count_dollarsign', 'count_exlamation', 'count_star', 'count_comma',
       'count_double_slash', 'count_slash', 'abnormal_url', 'count_https',
       'count_http', 'count_www', 'digits_count', 'hostname_length', 'sus_url',
       'letters_count', 'short_url', 'use_of_ip'],
      dtype='object')
Number of features: 30


###  Train/Test Split

With your data prepared, split it into a train and test set.

In [9]:
# Target: the integer-coded class label
y = df4['category']
# Predictors: all remaining columns once the raw URL, both label columns
# and the domain string have been removed
X = df4.drop(columns=['url', 'type', 'category', 'domain'])

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Refining selected model 

### XGBClassifier

In [12]:
# Further explore XGBClassifier hyperparameters with an exhaustive grid search.
#
# The target `category` has more than two classes, so:
#   * the objective is the multiclass 'multi:softprob' rather than 'binary:logistic';
#   * the scorer is 'roc_auc_ovr' (one-vs-rest). The plain 'roc_auc' scorer only
#     supports binary targets, so every candidate would fail to score and the
#     selected "best" parameters would be meaningless.
estimator = XGBClassifier(
    objective='multi:softprob',
    nthread=4,
    seed=42
)

# 1 x 2 x 3 x 2 x 2 x 2 = 48 candidate combinations
parameters = {
    'max_depth': [19],
    'n_estimators': [400, 500],
    'learning_rate': [0.1, 0.01, 0.05],
    "subsample": [0.75, 1],
    "colsample_bytree": [0.75, 1],
    "min_child_weight": [1, 5]
}

# 10-fold CV over 48 candidates = 480 fits.
# Note: n_jobs=10 parallel fits, each allowed nthread=4, can use up to 40 threads.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc_ovr',
    n_jobs=10,
    cv=10,
    verbose=True
)

start_time = time.time()
grid_search.fit(X_train, y_train)
elapsed_time = (time.time() - start_time) / 60  # convert to minutes

# Get the best hyperparameters
best_params = grid_search.best_params_

# best_score_ is the mean cross-validated one-vs-rest ROC AUC of the best candidate
print("XGBClassifier: %s | Time: %s" % (grid_search.best_score_, elapsed_time))
print("best_params:", best_params)

# best_estimator_ has already been refitted on the full training split;
# .score() reports plain accuracy (not the ROC AUC used for model selection)
best_estimator = grid_search.best_estimator_
train_accuracy = best_estimator.score(X_train, y_train)
test_accuracy = best_estimator.score(X_test, y_test)

# One-row summary of this experiment
results_df = pd.DataFrame({
    'Name': ['XGBClassifier'],
    'Train Time': [elapsed_time],
    'Train Accuracy': [train_accuracy],
    'Test Accuracy': [test_accuracy],
    'Best Params': [best_params]
})

# Print the train and test accuracy, as well as the best parameters
print("Train accuracy: {:.2f}".format(train_accuracy))
print("Test accuracy: {:.2f}".format(test_accuracy))
print(results_df)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [None]:
import sklearn.metrics as metrics

# Refit a fresh XGBClassifier with the best grid-search parameters.
# nthread/seed are repeated here so the refit is reproducible and consistent
# with the estimator used during the grid search.
xgb_c = XGBClassifier(**best_params, nthread=4, seed=42)

start_time = time.time()
xgb_c.fit(X_train, y_train)
# Parenthesised so the *difference* is divided by 60 (the previous expression
# divided only start_time, producing a meaningless value)
elapsed_time = (time.time() - start_time) / 60  # convert to minutes
y_pred_x = xgb_c.predict(X_test)

# Derive the class names from the data, ordered by their integer code, instead of
# hard-coding them: the dataset encodes 'phishing' as 3, so a hand-written
# ['benign', 'defacement', 'phishing', 'malware'] list mislabels the last two rows.
class_names = df4.groupby('category')['type'].first()
print(classification_report(
    y_test,
    y_pred_x,
    labels=class_names.index.tolist(),
    target_names=class_names.tolist(),
))


score = metrics.accuracy_score(y_test, y_pred_x)
print("accuracy:   %s" % score)
print("%s : %s |Time:%s" % ("XGBClassifier", score, elapsed_time))


In [None]:
# Score the refitted XGBClassifier on the training and the held-out test split
train_accuracy, test_accuracy = (
    xgb_c.score(X_train, y_train),
    xgb_c.score(X_test, y_test),
)
# Report both accuracies rounded to two decimals
print("Train accuracy: {:.2f}".format(train_accuracy))
print("Test accuracy: {:.2f}".format(test_accuracy))

In [None]:

# Pair each training column with the importance score exposed by the fitted model
feat_importances = pd.Series(xgb_c.feature_importances_, index=X_train.columns)

# Horizontal bar chart, least important feature at the bottom
fig, ax = plt.subplots(figsize=(10, 6))
feat_importances.sort_values().plot(kind="barh", ax=ax)
# The x-axis carries the importance score (it was previously mislabelled 'Type')
ax.set(xlabel='Importance', ylabel='Feature', title='Feature importance for XGBClassifier Model')
fig.tight_layout()
# Output file name intentionally left unchanged so existing references to the image keep working
fig.savefig("images/fearture-importance-XGBClassifier-improved")


In the first phase, XGBClassifier with default parameters achieved an accuracy of 93.664%; after the hyperparameter experimentation above, the accuracy improved to 95.37%.