# Detecting Maclicious URLs using Machine Learning<br>
The malicious urls can be detected using the lexical features along with tokenization of the url strings

In [1]:
# Importing libraries
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import urlparse
from tld import get_tld

# Importing ML libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Joblib import to save model
import pickle

In [2]:
# Reading the data
originalData = pd.read_csv("urldata.csv", index_col=0)
originalData.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [3]:
# Data info
print(f'Shape of data : {originalData.shape}')
print('\n====================\n')
print("Data info:")
originalData.info()

Shape of data : (450176, 3)


Data info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 450176 entries, 0 to 450175
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
 2   result  450176 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 13.7+ MB


In [4]:
# Checking missing values
print(f'Null value columns wise count: \n{originalData.isnull().sum()}')

Null value columns wise count: 
url       0
label     0
result    0
dtype: int64


## 1. Data Preprocessing

Majorly 3 categories will be used to classify the url:

1. **Length related features:** Length of URL, Hostname, Path, First Directory and Top Level Domain (TLD).
2. **Count related features:** Count of special charachters, http, www, digits, letters and number of directories.
3. **Binary features:** Whether the URL used IP or not

### Length Features

In [5]:
# Extracting all the length features and saving them in the
# dataset as new features.

# Length of URL
originalData['url_length'] = originalData['url'].apply(lambda i: len(str(i)))

# Hostname Length
originalData['hostname_length'] = originalData['url'].apply(lambda i: len(urlparse(i).netloc))

# Path Length
originalData['path_length'] = originalData['url'].apply(lambda i: len(urlparse(i).path))

# First Directory Length
def firstDirLength(url):
    urlpath= urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

originalData['fd_length'] = originalData['url'].apply(lambda i: firstDirLength(i))


# Length of Top Level Domain. Extracted using TLD library
originalData['tld'] = originalData['url'].apply(lambda i: get_tld(i,fail_silently=True))
def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

originalData['tld_length'] = originalData['tld'].apply(lambda i: tld_length(i))
originalData = originalData.drop("tld",1)   # Removing the unwanted feature used for count

  originalData = originalData.drop("tld",1)   # Removing the unwanted feature used for count


In [6]:
originalData.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,tld_length
0,https://www.google.com,benign,0,22,14,0,0,3
1,https://www.youtube.com,benign,0,23,15,0,0,3
2,https://www.facebook.com,benign,0,24,16,0,0,3
3,https://www.baidu.com,benign,0,21,13,0,0,3
4,https://www.wikipedia.org,benign,0,25,17,0,0,3


### Count Features

In [7]:
# Extracting all the "count features" and saving them as new feautres
# in the dataset.

originalData['numberOf-'] = originalData['url'].apply(lambda i: i.count('-'))
originalData['numberOf@'] = originalData['url'].apply(lambda i: i.count('@'))
originalData['numberOf?'] = originalData['url'].apply(lambda i: i.count('?'))
originalData['numberOf%'] = originalData['url'].apply(lambda i: i.count('%'))
originalData['numberOf.'] = originalData['url'].apply(lambda i: i.count('.'))
originalData['numberOf='] = originalData['url'].apply(lambda i: i.count('='))
originalData['numberOfhttp'] = originalData['url'].apply(lambda i : i.count('http'))
originalData['numberOfhttps'] = originalData['url'].apply(lambda i : i.count('https'))
originalData['numberOfwww'] = originalData['url'].apply(lambda i: i.count('www'))

def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits
originalData['numberOfdigits']= originalData['url'].apply(lambda i: digit_count(i))

def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
originalData['numberOfletters']= originalData['url'].apply(lambda i: letter_count(i))

def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')
originalData['numberOfdir'] = originalData['url'].apply(lambda i: no_of_dir(i))

In [8]:
originalData.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,tld_length,numberOf-,numberOf@,numberOf?,numberOf%,numberOf.,numberOf=,numberOfhttp,numberOfhttps,numberOfwww,numberOfdigits,numberOfletters,numberOfdir
0,https://www.google.com,benign,0,22,14,0,0,3,0,0,0,0,2,0,1,1,1,0,17,0
1,https://www.youtube.com,benign,0,23,15,0,0,3,0,0,0,0,2,0,1,1,1,0,18,0
2,https://www.facebook.com,benign,0,24,16,0,0,3,0,0,0,0,2,0,1,1,1,0,19,0
3,https://www.baidu.com,benign,0,21,13,0,0,3,0,0,0,0,2,0,1,1,1,0,16,0
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,0,0,0,2,0,1,1,1,0,20,0


### Binary Features

In [9]:
# Checking the use of IP in domain
def havingIP(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    if match:
        return -1
    else:
        return 1
    
originalData['use_of_ip'] = originalData['url'].apply(lambda i: havingIP(i))

# Cheking wether the URL used a shortening service or not
def shorteningService(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return -1
    else:
        return 1

originalData['short_url'] = originalData['url'].apply(lambda i: shorteningService(i))

In [10]:
# Final data after preprocessing
originalData.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,tld_length,numberOf-,numberOf@,...,numberOf.,numberOf=,numberOfhttp,numberOfhttps,numberOfwww,numberOfdigits,numberOfletters,numberOfdir,use_of_ip,short_url
0,https://www.google.com,benign,0,22,14,0,0,3,0,0,...,2,0,1,1,1,0,17,0,1,1
1,https://www.youtube.com,benign,0,23,15,0,0,3,0,0,...,2,0,1,1,1,0,18,0,1,1
2,https://www.facebook.com,benign,0,24,16,0,0,3,0,0,...,2,0,1,1,1,0,19,0,1,1
3,https://www.baidu.com,benign,0,21,13,0,0,3,0,0,...,2,0,1,1,1,0,16,0,1,1
4,https://www.wikipedia.org,benign,0,25,17,0,0,3,0,0,...,2,0,1,1,1,0,20,0,1,1


## 3. Building Models

Following models will be used for classification:
1. Logistic Regression
2. Decision Trees
3. Random Forest

In [11]:
originalData.columns

Index(['url', 'label', 'result', 'url_length', 'hostname_length',
       'path_length', 'fd_length', 'tld_length', 'numberOf-', 'numberOf@',
       'numberOf?', 'numberOf%', 'numberOf.', 'numberOf=', 'numberOfhttp',
       'numberOfhttps', 'numberOfwww', 'numberOfdigits', 'numberOfletters',
       'numberOfdir', 'use_of_ip', 'short_url'],
      dtype='object')

In [12]:
# Predictor Variables
x = originalData[['hostname_length',
       'path_length', 'fd_length', 'tld_length', 'numberOf-', 'numberOf@',
       'numberOf?', 'numberOf%', 'numberOf.', 'numberOf=', 'numberOfhttp',
       'numberOfhttps', 'numberOfwww', 'numberOfdigits', 'numberOfletters',
       'numberOfdir', 'use_of_ip']]

# Target Variable
y = originalData['result']

In [13]:
# Details of variables

print(f'Shape of predictor(x) variable is : {x.shape}')
print(f'Shape of target(y) variable is : {y.shape}')

Shape of predictor(x) variable is : (450176, 17)
Shape of target(y) variable is : (450176,)


In [14]:
# Splitting the data into Training and Testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.3, random_state=42)

#### Decision Tree

In [15]:
decissionTree = DecisionTreeClassifier()
decissionTree.fit(x_train,y_train)

prediction_DT = decissionTree.predict(x_test)
print(f'Accuracy using Decission Tree is: {accuracy_score(y_test,prediction_DT)}')

Accuracy using Decission Tree is: 0.9953795966032419


In [16]:
print(f'Confusion Matrix using Decission Tree:\n\n{confusion_matrix(y_test,prediction_DT)}')

Confusion Matrix using Decission Tree:

[[241204    748]
 [   708  72464]]


### Random Forest

In [17]:
randomForest = RandomForestClassifier()
randomForest.fit(x_train, y_train)

prediction_RF = randomForest.predict(x_test)
print(f'Accuracy using Random Forest is: {accuracy_score(y_test, prediction_RF)}')

Accuracy using Random Forest is: 0.9972455287442403


In [18]:
print(f'Confusion Matrix using Random Forest:\n\n{confusion_matrix(y_test,prediction_RF)}')

Confusion Matrix using Random Forest:

[[241664    288]
 [   580  72592]]


### Logistic Regression

In [19]:
logRegressionModel = LogisticRegression()
logRegressionModel.fit(x_train,y_train)

prediction_LR = logRegressionModel.predict(x_test)
print(f'Accuracy using Random Forest is: {accuracy_score(y_test,prediction_LR)}')

Accuracy using Random Forest is: 0.9957001053553521


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
print(f'Confusion Matrix using Logistic regression:\n\n{print(confusion_matrix(y_test,prediction_LR))}')

[[241386    566]
 [   789  72383]]
Confusion Matrix using Logistic regression:

None


In [21]:
# As we can see that highest accuracy is
# provided by Random forest classifier, we
# will priceed to save it and use it in our
# application.

pickle.dump(randomForest, open('URLModel.pkl', 'wb'))

['URLModel.pkl']