## ENGG* 6600 Security of Cyber Grids
### Prof. Hadis Karimipour

### Project Title - Identifying suspicious URLs using Supervised Learning and Lexical Analysis

#### Coded by Ruthvik Raja M.V (1162634) and Debanjan Mitra (1126062)

### Binary Classification using KNN ###

In [1]:
# Importing all the necessary libraries

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Read the four labelled URL datasets (Malware, Spam, Phishing, Defacement).
# Every file mixes its attack class with benign URLs.
# NOTE(review): the doubled slash in the Malware path looks like a typo; it is kept as-is here.

malware = pd.read_csv(filepath_or_buffer="Dataset//Malware.csv")
spam = pd.read_csv(filepath_or_buffer="Dataset/Spam.csv")
phishing = pd.read_csv(filepath_or_buffer="Dataset/Phishing.csv")
defacement = pd.read_csv(filepath_or_buffer="Dataset/Defacement.csv")

In [3]:
# Data Cleaning: remove leading/trailing whitespace from every column name.
# (Rows with missing values are dropped later, separately for each dataset.)

malware = malware.rename(columns=str.strip)
spam = spam.rename(columns=str.strip)
phishing = phishing.rename(columns=str.strip)
defacement = defacement.rename(columns=str.strip)

In [4]:
malware.info() # NumberRate_Extension has many missing values
spam.info() # NumberRate_Extension has many missing values
phishing.info() # NumberRate_Extension has many missing values
defacement.info() # Entropy_DirectoryName and NumberRate_Extension both have many missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14493 entries, 0 to 14492
Data columns (total 80 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14493 non-null  int64  
 1   domain_token_count               14493 non-null  int64  
 2   path_token_count                 14493 non-null  int64  
 3   avgdomaintokenlen                14493 non-null  float64
 4   longdomaintokenlen               14493 non-null  int64  
 5   avgpathtokenlen                  14482 non-null  float64
 6   tld                              14493 non-null  int64  
 7   charcompvowels                   14493 non-null  int64  
 8   charcompace                      14493 non-null  int64  
 9   ldl_url                          14493 non-null  int64  
 10  ldl_domain                       14493 non-null  int64  
 11  ldl_path                         14493 non-null  int64  
 12  ldl_filename      

### Malware and Benign ###

In [5]:
# NumberRate_Extension is null for a large share of the Malware rows (nearly 40%
# according to the original analysis), so the column is removed first and only
# the rows that still contain nulls afterwards are dropped.

malware1 = malware.drop(columns=["NumberRate_Extension"]).dropna()

# Seed the shuffle: the later train/test split picks rows by position, so an
# unseeded row order would change the split and every metric from run to run.
malware1 = shuffle(malware1, random_state=0).reset_index(drop=True)
malware1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12442 entries, 0 to 12441
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      12442 non-null  int64  
 1   domain_token_count               12442 non-null  int64  
 2   path_token_count                 12442 non-null  int64  
 3   avgdomaintokenlen                12442 non-null  float64
 4   longdomaintokenlen               12442 non-null  int64  
 5   avgpathtokenlen                  12442 non-null  float64
 6   tld                              12442 non-null  int64  
 7   charcompvowels                   12442 non-null  int64  
 8   charcompace                      12442 non-null  int64  
 9   ldl_url                          12442 non-null  int64  
 10  ldl_domain                       12442 non-null  int64  
 11  ldl_path                         12442 non-null  int64  
 12  ldl_filename      

In [6]:
# Functions:-

# If URL_Type_obf_Type == "malware" [class label 0], else [class label 1]
# Function to label the classes in a DataFrame
def class_labels_malware(class_name):
    """Return 0 when ``class_name`` is exactly "malware", otherwise 1."""
    return 0 if class_name == "malware" else 1
   
# Function for Data Scaling
def dataset_scaling(dataset, label_column="URL_Type_obf_Type"):
    """Min-max scale every feature column of ``dataset`` into the range [0, 1].

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns plus one label column.
    label_column : str, optional
        Name of the column that is left unscaled. Defaults to
        "URL_Type_obf_Type".

    Returns
    -------
    tuple
        ``(scaled_dataset, scaler)``. ``scaled_dataset`` is a 2-D NumPy array
        with the scaled features in their original column order followed by the
        unscaled label column as the LAST column. ``scaler`` is the fitted
        ``MinMaxScaler``, usable later for ``inverse_transform``.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``dataset``.

    Notes
    -----
    The scaler is fitted on every row passed in. If the result is split into
    train and test sets afterwards, the test rows have already influenced the
    scaling range; fit on the training rows only when that matters.
    """
    features = dataset.drop(columns=[label_column])
    labels = dataset[[label_column]]  # double brackets keep it 2-D for concatenation

    scaler = MinMaxScaler(feature_range=(0, 1))  # Scaling object for features
    scaled_features = scaler.fit_transform(features)

    # The label column is appended after the features, so it always ends up last
    scaled_dataset = np.concatenate((scaled_features, labels), axis=1)

    return scaled_dataset, scaler

In [7]:
# Map the class names to integer labels: 0 = malware, 1 = everything else.
# NOTE: the column is overwritten in place, so re-running this cell on labels that
# are already numeric would turn every row into class 1; re-create malware1 first.
malware1["URL_Type_obf_Type"]=malware1["URL_Type_obf_Type"].apply(class_labels_malware)

# dataset_scaling returns the label as the last array column, so its name is placed
# last explicitly rather than assuming it is already the final column of malware1.
malware_columns=malware1.columns.drop("URL_Type_obf_Type").append(pd.Index(["URL_Type_obf_Type"]))
malware2, scaler=dataset_scaling(malware1) # Scaling the Dataset

# Scaling returns a NumPy array, so wrap it back into a DataFrame
malware2=pd.DataFrame(malware2,columns=malware_columns)

In [8]:
# Hold out 25% of the rows for testing; the split itself is seeded
train_x, test_x, train_y, test_y = train_test_split(
    malware2.drop(columns="URL_Type_obf_Type"),  # feature columns
    malware2["URL_Type_obf_Type"],               # label column
    test_size=0.25,
    random_state=0,
)

In [9]:
# Machine Learning Models: k-nearest neighbours with k=7
# perf_counter is the clock intended for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predicting the Output for Train data (outside the timed region)
predicted_train=knn.predict(train_x)

1.076361894607544


In [12]:
# Performance Metrics
# NOTE(review): f1_score is called without pos_label; with sklearn's default (1) it scores
# the class this notebook encodes as 1, i.e. the non-malware URLs -- confirm that is intended.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy in percent (97.85 in the recorded output)
print(accuracy_score(train_y, predicted_train) * 100) # train accuracy in percent (98.70 in the recorded output)
print(f1_score(test_y, predicted_test)) # F1 score on the test data (0.98 in the recorded output)

97.84635165541627
98.70324724038151
0.9797031202665859


In [None]:
# [Optional] Undo the min-max scaling to recover the original feature values.
# `scaler` is re-assigned by each later dataset section, so this cell refers to the
# Malware scaler only while it runs before those sections. Repeat the same pattern
# for the other datasets if the unscaled inputs are needed.
X_invert = scaler.inverse_transform(malware2.drop(columns="URL_Type_obf_Type"))
train_x_invert = scaler.inverse_transform(train_x)
test_x_invert = scaler.inverse_transform(test_x)

### Spam and Benign ###

In [27]:
# NumberRate_Extension is null for a large share of the Spam rows (nearly 34%
# according to the original analysis), so the column is removed first and only
# the rows that still contain nulls afterwards are dropped.

spam1 = spam.drop(columns=["NumberRate_Extension"]).dropna()

# Seed the shuffle: the later train/test split picks rows by position, so an
# unseeded row order would change the split and every metric from run to run.
spam1 = shuffle(spam1, random_state=0).reset_index(drop=True)
spam1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12420 entries, 0 to 12419
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      12420 non-null  int64  
 1   domain_token_count               12420 non-null  int64  
 2   path_token_count                 12420 non-null  int64  
 3   avgdomaintokenlen                12420 non-null  float64
 4   longdomaintokenlen               12420 non-null  int64  
 5   avgpathtokenlen                  12420 non-null  float64
 6   tld                              12420 non-null  int64  
 7   charcompvowels                   12420 non-null  int64  
 8   charcompace                      12420 non-null  int64  
 9   ldl_url                          12420 non-null  int64  
 10  ldl_domain                       12420 non-null  int64  
 11  ldl_path                         12420 non-null  int64  
 12  ldl_filename      

In [28]:
# If URL_Type_obf_Type == "spam" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_spam(class_name):
    """Return 0 when ``class_name`` is exactly "spam", otherwise 1."""
    return 0 if class_name == "spam" else 1

In [29]:
# Map the class names to integer labels: 0 = spam, 1 = everything else.
# NOTE: the column is overwritten in place, so re-running this cell on labels that
# are already numeric would turn every row into class 1; re-create spam1 first.
spam1["URL_Type_obf_Type"]=spam1["URL_Type_obf_Type"].apply(class_labels_spam)

# dataset_scaling returns the label as the last array column, so its name is placed
# last explicitly rather than assuming it is already the final column of spam1.
spam_columns=spam1.columns.drop("URL_Type_obf_Type").append(pd.Index(["URL_Type_obf_Type"]))
spam2, scaler=dataset_scaling(spam1) # Scaling the Dataset

# Scaling returns a NumPy array, so wrap it back into a DataFrame
spam2=pd.DataFrame(spam2,columns=spam_columns)

In [30]:
# Hold out 25% of the rows for testing; the split itself is seeded
train_x, test_x, train_y, test_y = train_test_split(
    spam2.drop(columns="URL_Type_obf_Type"),  # feature columns
    spam2["URL_Type_obf_Type"],               # label column
    test_size=0.25,
    random_state=0,
)

In [31]:
# Machine Learning Models: k-nearest neighbours with k=7
# perf_counter is the clock intended for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predicting the Output for Train data (outside the timed region)
predicted_train=knn.predict(train_x)


1.0912067890167236


In [32]:
# Performance Metrics
# NOTE(review): f1_score is called without pos_label; with sklearn's default (1) it scores
# the class this notebook encodes as 1, i.e. the non-spam URLs -- confirm that is intended.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy in percent (99.71 in the recorded output)
print(accuracy_score(train_y, predicted_train) * 100) # train accuracy in percent (99.52 in the recorded output)

# The KNN cell above took about 1.09 seconds in the recorded run

print(f1_score(test_y, predicted_test)) # F1 score on the test data (0.997 in the recorded output)

99.71014492753623
99.51690821256038
0.9972316210396801


### Phishing and Benign ###

In [33]:
# NumberRate_Extension is null for a large share of the Phishing rows (nearly 48%
# according to the original analysis), so the column is removed first and only
# the rows that still contain nulls afterwards are dropped.

phishing1 = phishing.drop(columns=["NumberRate_Extension"]).dropna()

# Seed the shuffle: the later train/test split picks rows by position, so an
# unseeded row order would change the split and every metric from run to run.
phishing1 = shuffle(phishing1, random_state=0).reset_index(drop=True)
phishing1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13084 entries, 0 to 13083
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      13084 non-null  int64  
 1   domain_token_count               13084 non-null  int64  
 2   path_token_count                 13084 non-null  int64  
 3   avgdomaintokenlen                13084 non-null  float64
 4   longdomaintokenlen               13084 non-null  int64  
 5   avgpathtokenlen                  13084 non-null  float64
 6   tld                              13084 non-null  int64  
 7   charcompvowels                   13084 non-null  int64  
 8   charcompace                      13084 non-null  int64  
 9   ldl_url                          13084 non-null  int64  
 10  ldl_domain                       13084 non-null  int64  
 11  ldl_path                         13084 non-null  int64  
 12  ldl_filename      

In [34]:
# If URL_Type_obf_Type == "phishing" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_phishing(class_name):
    """Return 0 when ``class_name`` is exactly "phishing", otherwise 1."""
    return 0 if class_name == "phishing" else 1

In [35]:
# Map the class names to integer labels: 0 = phishing, 1 = everything else.
# NOTE: the column is overwritten in place, so re-running this cell on labels that
# are already numeric would turn every row into class 1; re-create phishing1 first.
phishing1["URL_Type_obf_Type"]=phishing1["URL_Type_obf_Type"].apply(class_labels_phishing)

# dataset_scaling returns the label as the last array column, so its name is placed
# last explicitly rather than assuming it is already the final column of phishing1.
phishing_columns=phishing1.columns.drop("URL_Type_obf_Type").append(pd.Index(["URL_Type_obf_Type"]))
phishing2, scaler=dataset_scaling(phishing1) # Scaling the Dataset

# Scaling returns a NumPy array, so wrap it back into a DataFrame
phishing2=pd.DataFrame(phishing2,columns=phishing_columns)

In [36]:
# Hold out 25% of the rows for testing; the split itself is seeded
train_x, test_x, train_y, test_y = train_test_split(
    phishing2.drop(columns="URL_Type_obf_Type"),  # feature columns
    phishing2["URL_Type_obf_Type"],               # label column
    test_size=0.25,
    random_state=0,
)


In [37]:
# Machine Learning Models: k-nearest neighbours with k=5
# perf_counter is the clock intended for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predicting the Output for Train data (outside the timed region)
predicted_train=knn.predict(train_x)

1.2158730030059814


In [38]:
# Performance Metrics
# NOTE(review): f1_score is called without pos_label; with sklearn's default (1) it scores
# the class this notebook encodes as 1, i.e. the non-phishing URLs -- confirm that is intended.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy in percent (96.67 in the recorded output)
print(accuracy_score(train_y, predicted_train) * 100) # train accuracy in percent (97.86 in the recorded output)

print(f1_score(test_y, predicted_test)) # F1 score on the test data (0.968 in the recorded output)


96.66768572302048
97.85998165698562
0.9677228309150134


### Defacement and Benign ###

In [46]:
# In the Defacement dataset nearly 39% of the Entropy_DirectoryName values and
# nearly 32% of the NumberRate_Extension values are null (per the original analysis).
#
# Calling dropna() directly would shrink the data from 15711 to 5186 rows, whereas
# dropping these two columns first and then calling dropna() only reduces it from
# 15711 to 15477 rows, so the two columns are dropped.

defacement1 = defacement.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()

# Seed the shuffle: the later train/test split picks rows by position, so an
# unseeded row order would change the split and every metric from run to run.
defacement1 = shuffle(defacement1, random_state=0).reset_index(drop=True)
defacement1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15477 entries, 0 to 15476
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      15477 non-null  int64  
 1   domain_token_count               15477 non-null  int64  
 2   path_token_count                 15477 non-null  int64  
 3   avgdomaintokenlen                15477 non-null  float64
 4   longdomaintokenlen               15477 non-null  int64  
 5   avgpathtokenlen                  15477 non-null  float64
 6   tld                              15477 non-null  int64  
 7   charcompvowels                   15477 non-null  int64  
 8   charcompace                      15477 non-null  int64  
 9   ldl_url                          15477 non-null  int64  
 10  ldl_domain                       15477 non-null  int64  
 11  ldl_path                         15477 non-null  int64  
 12  ldl_filename      

In [47]:
# If URL_Type_obf_Type == "Defacement" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_defacement(class_name):
    """Return 0 when ``class_name`` is exactly "Defacement" (capital D), otherwise 1."""
    return 0 if class_name == "Defacement" else 1

In [48]:
# Map the class names to integer labels: 0 = Defacement, 1 = everything else.
# NOTE: the column is overwritten in place, so re-running this cell on labels that
# are already numeric would turn every row into class 1; re-create defacement1 first.
defacement1["URL_Type_obf_Type"]=defacement1["URL_Type_obf_Type"].apply(class_labels_defacement)

# dataset_scaling returns the label as the last array column, so its name is placed
# last explicitly rather than assuming it is already the final column of defacement1.
defacement_columns=defacement1.columns.drop("URL_Type_obf_Type").append(pd.Index(["URL_Type_obf_Type"]))
defacement2, scaler=dataset_scaling(defacement1) # Scaling the Dataset

# Scaling returns a NumPy array, so wrap it back into a DataFrame
defacement2=pd.DataFrame(defacement2,columns=defacement_columns)


In [49]:
# Hold out 25% of the rows for testing; the split itself is seeded
train_x, test_x, train_y, test_y = train_test_split(
    defacement2.drop(columns="URL_Type_obf_Type"),  # feature columns
    defacement2["URL_Type_obf_Type"],               # label column
    test_size=0.25,
    random_state=0,
)


In [50]:
# Machine Learning Models: k-nearest neighbours with k=5
# perf_counter is the clock intended for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predicting the Output for Train data (outside the timed region)
predicted_train=knn.predict(train_x)

1.6376173496246338


In [51]:
# Performance Metrics
# NOTE(review): f1_score is called without pos_label; with sklearn's default (1) it scores
# the class this notebook encodes as 1, i.e. the non-Defacement URLs -- confirm that is intended.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy in percent (99.20 in the recorded output)
print(accuracy_score(train_y, predicted_train) * 100) # train accuracy in percent (99.53 in the recorded output)

# With k=5 the recorded run gives 99.20% (test) and 99.53% (train); the KNN cell above took about 1.64 seconds

print(f1_score(test_y, predicted_test)) # F1 score on the test data (0.99 in the recorded output)

99.19896640826873
99.5261480141294
0.9919333853760083


### <---------------------- THE END -----------------------> ###