## <font color="Blue">Phishing-URL Detection Model using Deep Learning algorithm - ConvXGB </font>

<b>GOAL:</b> This study applies a new deep learning algorithm named "ConvXGB" to the field of cybersecurity for detecting phishing URLs.<br>
<b>Author :</b> Saravanan Muthuramalingam <br>
<b>Purpose of this notebook :</b> This notebook handles the following:
<ul>
    <li> Understanding the datasets </li>
    <li> Data Cleaning and Formatting </li>
    <li> Inferential Statistics </li>
    <li> Data Pre-processing </li>
</ul>
    


In [1]:
# import all required python libraries
#-------------------------------------#
# Statistics Libraries
import numpy as np

# Dataset related Libraries
import pandas as pd 
import csv

# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# URL Parsing Libraries
import urllib.parse
from urllib.parse import urlparse
from urllib.parse import urlsplit
from urlpath import URL

# OS and regular expression Libraries
import re
import glob
import os

# Image processing related Libraries
from PIL import Image
import cv2
import skimage.measure
import imghdr

# Image validation related Libraries
from difPy import dif
from sklearn.preprocessing import OneHotEncoder

# To Build CNN in Keras 
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD
#from keras.layers.normalization import BatchNormalization
from tensorflow.keras.layers import BatchNormalization
from keras.layers import LeakyReLU
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam, Adadelta, RMSprop

# XGBoost classification algorithm
from xgboost import XGBClassifier

# Performance evaluation Libraries
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


#### <font color=blue> Data Validation </font>

In [14]:
# Load the raw dataset from CSV into a pandas DataFrame.
# NOTE(review): encoding='unicode_escape' is presumably needed because the file
# contains bytes that do not decode as UTF-8 -- confirm against the CSV.
Phishing_url_df = pd.read_csv('phishing_dataset.csv', encoding='unicode_escape')

In [33]:
# Preview the first rows of the dataset (head() shows five rows by default)
# as a quick sanity check that the file was parsed into the expected columns.
Phishing_url_df.head()

Unnamed: 0,rec_id,url,website,result,created_date
0,1,https://www.screamingfrog.co.uk/pay-per-click/,1635703174277606.html,0,2021-10-31 17:59:34
1,2,https://sms-labanquepostale-sms-labanqueposta...,1613573052480813.html,1,2021-02-17 20:14:12
2,3,http://mrsbt213.yolasite.com/,1607095600394378.html,1,2020-12-04 20:56:40
3,4,http://234.50.198.35.bc.googleusercontent.com...,1620759901211522.html,1,2021-05-12 00:35:01
4,5,https://3-138-183-243.cprapid.com/secureNetfl...,1626464266508342.html,1,2021-07-16 19:37:46


In [34]:
# List the column (feature) names available in the raw dataset
Phishing_url_df.columns

Index(['rec_id', 'url', 'website', 'result', 'created_date'], dtype='object')

In [7]:
# Total row count of the raw dataset, taken as the number of entries in the
# 'url' column (one entry per record).
Phishing_url_df['url'].size

79741

In [8]:
# Print each column's dtype and non-null count, plus the DataFrame's memory usage
Phishing_url_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79741 entries, 0 to 79740
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   rec_id        79741 non-null  int64 
 1   url           79741 non-null  object
 2   website       79741 non-null  object
 3   result        79741 non-null  int64 
 4   created_date  79741 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.0+ MB


In [9]:
# Count the missing (null) values in every column of the dataset
Phishing_url_df.isnull().sum()

rec_id          0
url             0
website         0
result          0
created_date    0
dtype: int64

#### a. Data Similarity Check

In this data validation step, we check whether the dataset contains similar/fake (duplicate) screenshot images, because non-unique screenshots will lead to poor test results.

In [4]:
# Function that searches the folder for image files, converts them to a tensor
def create_imgs_matrix(directory, px_size=50):
    """Load every decodable image in *directory* into one stacked matrix.

    Each image is reduced to its first three channels, resized to
    ``px_size`` x ``px_size`` and concatenated along axis 0, so the result
    has shape ``(n_images * px_size, px_size, 3)``.

    Side effect: the module-level list ``image_files`` is reset and filled
    with the file names of the images that were added, in matrix order.

    Args:
        directory: Folder to scan (non-recursive). A trailing path separator
            is optional.
        px_size: Edge length, in pixels, each image is resized to.

    Returns:
        The stacked image matrix. When the folder holds no decodable image an
        empty ``(0, px_size, 3)`` array is returned instead of raising.
    """
    global image_files
    image_files = []
    resized_images = []

    for filename in os.listdir(directory):
        # os.path.join works whether or not `directory` ends with a separator
        path = os.path.join(directory, filename)

        # skip sub-directories and files that imghdr does not identify as images
        if os.path.isdir(path) or not imghdr.what(path):
            continue

        # decode the raw file bytes; a non-ndarray result means decoding failed
        img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
        if not isinstance(img, np.ndarray):
            continue

        # single-channel (2-D) images are expanded to three identical channels
        # so they can be stacked together with colour images
        if img.ndim == 2:
            img = np.stack((img,) * 3, axis=-1)

        # keep the first three channels only (drops e.g. an alpha channel)
        img = img[..., 0:3]
        # resize the image based on the given compression value
        img = cv2.resize(img, dsize=(px_size, px_size), interpolation=cv2.INTER_CUBIC)

        resized_images.append(img)
        image_files.append(filename)

    if not resized_images:
        return np.empty((0, px_size, 3), dtype=np.uint8)

    # one concatenation at the end avoids re-copying the matrix per image
    return np.concatenate(resized_images)

# Function for rotating an image matrix by a 90 degree angle
def rotate_img(image):
    """Return *image* rotated once by 90 degrees in the plane of axes 0 and 1.

    The rotation is delegated to ``np.rot90``; the input is not modified.
    """
    quarter_turns = 1
    rotation_plane = (0, 1)
    return np.rot90(image, k=quarter_turns, axes=rotation_plane)

# Function for checking the quality of compared images, appends the lower quality image to the list
def check_img_quality(directory, imageA, imageB, list):
    """Pass the smaller of two image files on to ``add_to_list``.

    File size on disk is used as the quality measure: the file with fewer
    bytes is treated as the lower-quality image. When both files have the
    same size, ``imageA`` is the one passed on.

    Args:
        directory: Folder containing both files. A trailing path separator
            is optional.
        imageA: File name of the first image, relative to *directory*.
        imageB: File name of the second image, relative to *directory*.
        list: Collection handed to ``add_to_list`` together with the chosen
            file name (the parameter name shadows the builtin but is kept
            for compatibility with existing callers).

    Raises:
        OSError: If either file cannot be stat'ed.
    """
    # os.path.join works whether or not `directory` ends with a separator
    size_imgA = os.path.getsize(os.path.join(directory, imageA))
    size_imgB = os.path.getsize(os.path.join(directory, imageB))

    lower_quality = imageB if size_imgA > size_imgB else imageA
    # NOTE(review): add_to_list is defined outside this section; presumably it
    # appends the name to the given collection -- confirm.
    add_to_list(lower_quality, list)

In [6]:
# Run difPy over the converted screenshot folder to find duplicate/similar images;
# the search result is kept in `search`.
# NOTE(review): the path is an absolute, machine-specific Windows path -- adjust
# it before re-running this cell elsewhere.
search = dif(r"C:\Users\msara\Desktop\dataset\converted_images")

DifPy preparing files: [14999/14999] [100%]
DifPy comparing images: [14999/14999] [100%]
Found 2869 images with one or more duplicate/similar images in 25555.3503 seconds.


#### b. Checking missing values

In [4]:
# Build a per-column summary of missing values: absolute count and share of rows.
# Number of null entries per column, largest first.
total = Phishing_url_df.isnull().sum().sort_values(ascending=False)
# Null count divided by the number of entries per column. Note this is a
# fraction in [0, 1], not a percentage, despite the 'Percent' label below.
percent = (Phishing_url_df.isnull().sum())/Phishing_url_df.isnull().count().sort_values(ascending=False)
# Combine both series side by side under the column names 'Total' and 'Percent',
# ordered by the number of missing values (descending).
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
# Show the first five rows of the summary.
missing_data.head(5)

Unnamed: 0,Total,Percent
rec_id,0,0.0
url,0,0.0
website,0,0.0
result,0,0.0
created_date,0,0.0


In [5]:
# Keep only the columns that actually contain missing values (non-zero share);
# an empty result means no column has missing data.
missing_data[missing_data['Percent'] != 0.0]

Unnamed: 0,Total,Percent


#### c. Data duplication Check

In [6]:
# Dimensions of the raw dataset as (rows, columns)
Phishing_url_df.shape

(79741, 5)

In [11]:
# Number of distinct URLs; a value below the row count means some URLs repeat
Phishing_url_df['url'].unique().size

79564

In [23]:
# Show the dataset with repeated URLs removed, keeping the first occurrence of
# each URL. The result is only displayed; Phishing_url_df itself is unchanged.
Phishing_url_df.drop_duplicates(subset="url")

Unnamed: 0,rec_id,url,website,result,created_date
0,1,https://www.screamingfrog.co.uk/pay-per-click/,1635703174277606.html,0,2021-10-31 17:59:34
1,2,https://sms-labanquepostale-sms-labanqueposta...,1613573052480813.html,1,2021-02-17 20:14:12
2,3,http://mrsbt213.yolasite.com/,1607095600394378.html,1,2020-12-04 20:56:40
3,4,http://234.50.198.35.bc.googleusercontent.com...,1620759901211522.html,1,2021-05-12 00:35:01
4,5,https://3-138-183-243.cprapid.com/secureNetfl...,1626464266508342.html,1,2021-07-16 19:37:46
...,...,...,...,...,...
79736,79737,https://bestjobmanage.com/adminservicedesk/True/,1622125898152592.html,1,2021-05-27 14:31:38
79737,79738,http://vieuxshack.com/download/adobe/b51f1807...,160822961118687.html,1,2020-12-17 23:56:51
79738,79739,https://www.tumblr.com/search/gas%20mask%20ta...,163570726902772.html,0,2021-10-31 19:07:49
79739,79740,https://www.magnetic-shield.com/pdf/wc_4.pdf,1635701992217159.html,0,2021-10-31 17:39:52
