## ENGG* 6600 Security of Cyber Grids
### Prof. Hadis Karimipour

### Project Title - Identifying suspicious URLs using Supervised Learning and Lexical Analysis

#### Coded by Ruthvik Raja M.V (1162634) and Debanjan Mitra (1126062)

### Multi Classification using KNN and PCA ###

In [2]:
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA

In [3]:
# Loading Malware, Spam, Phishing and Defacement Datasets
# Each Dataset also consists of Benign URL's

malware=pd.read_csv("Dataset/Malware.csv")
spam=pd.read_csv("Dataset/Spam.csv")
phishing=pd.read_csv("Dataset/Phishing.csv")
defacement=pd.read_csv("Dataset/Defacement.csv")


In [5]:
# Data Cleaning: Strip whitespaces from the column names and drop NA values

malware = malware.rename(str.strip, axis='columns')
spam=spam.rename(str.strip, axis="columns")
phishing=phishing.rename(str.strip, axis="columns")
defacement=defacement.rename(str.strip, axis="columns")


In [6]:
# In Malware Dataset nearly 40% of values are NULL values in the NumberRate_Extension column
 # and also from the Binary classification analysis it is clear that the Entropy_DirectoryName has no 
  # impact on the output column so, dropping the column NumberRate_Extension from Spam, Malware and 
   # Phishing Datasets

malware1=malware.drop(["NumberRate_Extension", "Entropy_DirectoryName"], axis=1)
malware1=malware1.dropna()
malware1=shuffle(malware1)
malware1=malware1.reset_index(drop=True)
malware1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14312 entries, 0 to 14311
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14312 non-null  int64  
 1   domain_token_count               14312 non-null  int64  
 2   path_token_count                 14312 non-null  int64  
 3   avgdomaintokenlen                14312 non-null  float64
 4   longdomaintokenlen               14312 non-null  int64  
 5   avgpathtokenlen                  14312 non-null  float64
 6   tld                              14312 non-null  int64  
 7   charcompvowels                   14312 non-null  int64  
 8   charcompace                      14312 non-null  int64  
 9   ldl_url                          14312 non-null  int64  
 10  ldl_domain                       14312 non-null  int64  
 11  ldl_path                         14312 non-null  int64  
 12  ldl_filename      

In [7]:
# In Spam Dataset nearly 34% of values are NULL values in the NumberRate_Extension column

spam1=spam.drop(["NumberRate_Extension", "Entropy_DirectoryName"], axis=1)
spam1=spam1.dropna()
spam1=shuffle(spam1)
spam1=spam1.reset_index(drop=True)
spam1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14294 entries, 0 to 14293
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14294 non-null  int64  
 1   domain_token_count               14294 non-null  int64  
 2   path_token_count                 14294 non-null  int64  
 3   avgdomaintokenlen                14294 non-null  float64
 4   longdomaintokenlen               14294 non-null  int64  
 5   avgpathtokenlen                  14294 non-null  float64
 6   tld                              14294 non-null  int64  
 7   charcompvowels                   14294 non-null  int64  
 8   charcompace                      14294 non-null  int64  
 9   ldl_url                          14294 non-null  int64  
 10  ldl_domain                       14294 non-null  int64  
 11  ldl_path                         14294 non-null  int64  
 12  ldl_filename      

In [8]:
# In Phishing Dataset nearly 48% of values are NULL values in the NumberRate_Extension column

phishing1=phishing.drop(["NumberRate_Extension", "Entropy_DirectoryName"], axis=1)
phishing1=phishing1.dropna()
phishing1=shuffle(phishing1)
phishing1=phishing1.reset_index(drop=True)
phishing1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14889 entries, 0 to 14888
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14889 non-null  int64  
 1   domain_token_count               14889 non-null  int64  
 2   path_token_count                 14889 non-null  int64  
 3   avgdomaintokenlen                14889 non-null  float64
 4   longdomaintokenlen               14889 non-null  int64  
 5   avgpathtokenlen                  14889 non-null  float64
 6   tld                              14889 non-null  int64  
 7   charcompvowels                   14889 non-null  int64  
 8   charcompace                      14889 non-null  int64  
 9   ldl_url                          14889 non-null  int64  
 10  ldl_domain                       14889 non-null  int64  
 11  ldl_path                         14889 non-null  int64  
 12  ldl_filename      

In [9]:
# In Defacement Dataset nearly 39% of values are NULL values in the Entropy_DirectoryName
 # and 32% of values are NULL values in the NumberRate_Extension column

defacement1=defacement.drop(["NumberRate_Extension", "Entropy_DirectoryName"], axis=1)
defacement1=defacement1.dropna()
defacement1=shuffle(defacement1) 
defacement1=defacement1.reset_index(drop=True)
defacement1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15477 entries, 0 to 15476
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      15477 non-null  int64  
 1   domain_token_count               15477 non-null  int64  
 2   path_token_count                 15477 non-null  int64  
 3   avgdomaintokenlen                15477 non-null  float64
 4   longdomaintokenlen               15477 non-null  int64  
 5   avgpathtokenlen                  15477 non-null  float64
 6   tld                              15477 non-null  int64  
 7   charcompvowels                   15477 non-null  int64  
 8   charcompace                      15477 non-null  int64  
 9   ldl_url                          15477 non-null  int64  
 10  ldl_domain                       15477 non-null  int64  
 11  ldl_path                         15477 non-null  int64  
 12  ldl_filename      

### Merging all the four Data sets ###

In [10]:
# The final Dataset consists of 58972 rows, 77 features and 1 output feature
 # Appending all the DataFrames along rows

all_files=np.concatenate((malware1, spam1, phishing1, defacement1), axis=0)
all_files=pd.DataFrame(all_files, columns=malware1.columns)

all_files.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58972 entries, 0 to 58971
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Querylength                      58972 non-null  object
 1   domain_token_count               58972 non-null  object
 2   path_token_count                 58972 non-null  object
 3   avgdomaintokenlen                58972 non-null  object
 4   longdomaintokenlen               58972 non-null  object
 5   avgpathtokenlen                  58972 non-null  object
 6   tld                              58972 non-null  object
 7   charcompvowels                   58972 non-null  object
 8   charcompace                      58972 non-null  object
 9   ldl_url                          58972 non-null  object
 10  ldl_domain                       58972 non-null  object
 11  ldl_path                         58972 non-null  object
 12  ldl_filename                    

In [11]:
# Functions:-

# Function for Data Scaling
def dataset_scaling(dataset):
        
  # Scaling dataset
  scaler = MinMaxScaler(feature_range=(0, 1)) # Scaling object for features 
  
  df_X = dataset.loc[:, dataset.columns!="URL_Type_obf_Type"]
  df_Y = dataset[["URL_Type_obf_Type"]]
  
  df_X = scaler.fit_transform(df_X)
  dataset1=np.concatenate((df_X, df_Y), axis=1)
  
  return dataset1, scaler    

# Function for appending the features with high correlation
def columns_highcorr(corr, columns):
    index_highcorr=[]
    for i in range(0, corr.shape[0]):
        for j in range(i, corr.shape[1]):
            if(0.8<=corr.iloc[i][j]<1):
                print(columns[i],"and",columns[j],"with Correlation",round(corr.iloc[i][j],2))
                if(i not in index_highcorr):
                    index_highcorr.append(i);
                 
    return index_highcorr

In [12]:
# From the Previous Analysis it is clear that there are 45 features with no impact on the output variable
 # So, dropping all the 45 columns from the final Dataset
 
names_columns_worst=['Querylength', 'path_token_count', 'avgdomaintokenlen', 'longdomaintokenlen', 
                     'avgpathtokenlen', 'charcompvowels', 'charcompace', 'ldl_url', 'ldl_path', 
                     'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path', 
                     'dld_filename', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 
                     'this.fileExtLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 
                     'argPathRatio', 'executable', 'isPortEighty', 'ISIpAddressInDomainName', 
                     'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount', 'Directory_DigitCount', 
                     'Extension_DigitCount', 'Query_LetterCount', 'Path_LongestWordLength', 'URL_sensitiveWord', 
                     'URLQueries_variable', 'spcharUrl', 'delimeter_Count', 'NumberRate_Domain', 'NumberRate_DirectoryName', 
                     'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_FileName', 'SymbolCount_Extension'] 
 

In [13]:
all_files1=all_files.drop(columns=names_columns_worst) # Dropping all the above features
 
# Saving the above Dataframe as a csv file and loading it for further evaluation

all_files1.to_csv("/Users/ruthvikrajam.v/Desktop/All Files.csv",index=False)
all_files1=pd.read_csv("/Users/ruthvikrajam.v/Desktop/All Files.csv")


In [14]:
# Checking further if there are any features with high Correlation 

corr=all_files1.corr()
index_allfiles_highcorr=columns_highcorr(corr, corr.columns)
all_files2=all_files1.drop(columns=corr.columns[index_allfiles_highcorr])
# 58,972 rows and 27 columns

domain_token_count and SymbolCount_Domain with Correlation 1.0
tld and SymbolCount_Domain with Correlation 1.0
pathDomainRatio and URL_Letter_Count with Correlation 0.91
pathDomainRatio and Extension_LetterCount with Correlation 0.89
pathDomainRatio and LongestPathTokenLength with Correlation 0.92
Query_DigitCount and URL_Letter_Count with Correlation 0.86
Query_DigitCount and Extension_LetterCount with Correlation 0.87
Query_DigitCount and LongestPathTokenLength with Correlation 0.89
URL_Letter_Count and Extension_LetterCount with Correlation 0.95
URL_Letter_Count and LongestPathTokenLength with Correlation 0.97
Extension_LetterCount and LongestPathTokenLength with Correlation 0.97


In [16]:
# Assigning labels to the classes

all_files3=all_files2.copy() # Modifications done in all_files3 wont be reflected in all_files2
        
all_files3["URL_Type_obf_Type"]=all_files3["URL_Type_obf_Type"].map({"Defacement":0, "benign":1, "malware":2, "phishing":3, "spam":4 })


In [17]:
# Scaling the Dataset

all_files4, scaler=dataset_scaling(all_files3)
all_files4=pd.DataFrame(all_files4, columns=all_files3.columns)


In [18]:
# Principal Component Analysis -> Dimensionality Reduction

# Separating the input and output features
X=all_files4.loc[:, all_files4.columns!="URL_Type_obf_Type"]
y=all_files4["URL_Type_obf_Type"]
  
# Let's say, components = 10
pca = PCA(n_components = 10)
pca.fit(X)
X_pca = pca.transform(X)


In [19]:
# Splitting the samples
train_x, test_x, train_y, test_y=train_test_split(X_pca, y, random_state=0, test_size=0.25)


In [22]:
# Machine Learning Models
start = time.time()
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.time()
print((end-start)) # Prints Time taken by the Model to train and execute the samples in seconds

# Predicting the Output
predicted_train=knn.predict(train_x)


0.9810070991516113


In [23]:
# Performance Metrics
print(accuracy_score(test_y, predicted_test) * 100) # 96% for testing
print(accuracy_score(train_y, predicted_train) * 100) # 97% for training

# Time taken by PCA to execute -> 0.933 seconds

95.85566031336906
97.11275407538041


## If we consider all the columns in the Dataset i.e 77 features:-

In [24]:
all_files_features=all_files.copy()
all_files_features["URL_Type_obf_Type"]=all_files_features["URL_Type_obf_Type"].map({"Defacement":0, "benign":1, "malware":2, "phishing":3, "spam":4 })


In [25]:
all_files_features1, scaler=dataset_scaling(all_files_features)
all_files_features1=pd.DataFrame(all_files_features1, columns=all_files_features.columns)


In [26]:
# Splitting the samples
train_x, test_x, train_y, test_y=train_test_split(all_files_features1.loc[:, all_files_features1.columns!="URL_Type_obf_Type"], all_files_features1["URL_Type_obf_Type"], random_state=0, test_size=0.25)


In [27]:
# Machine Learning Models
start = time.time()
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(train_x, train_y)
predicted_test=knn.predict(test_x)
end = time.time()
print((end-start)) # Prints Time taken by the Model to train and execute the samples in seconds

# Predicting the Output
predicted_train=knn.predict(train_x)


21.16399383544922


In [30]:
# Performance Metrics
print(accuracy_score(test_y, predicted_test) * 100)
print(accuracy_score(train_y, predicted_train) * 100)

# Takes 22.5 seconds, Testing accuracy score -> 95.9% and Training accuracy score -> 97%, when we try to use all the features


95.81496303330394
96.92961631508739


### <---------------------- THE END -----------------------> ###