## ENGG* 6600 Security of Cyber Grids
### Prof. Hadis Karimipour

### Project Title - Identifying suspicious URLs using Supervised Learning and Lexical Analysis

#### Coded by Ruthvik Raja M.V (1162634) and Debanjan Mitra (1126062)

### Binary Classification using Random Forest Classifier and PCA ###

In [1]:
# Importing all the necessary libraries

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA

In [2]:
# Loading Malware, Spam, Phishing and Defacement Datasets
# Each Dataset also consists of Benign URL's

# Single place to edit when the CSV files live in a different folder
DATA_DIR = "/Users/ruthvikrajam.v/Desktop/ENGG*6600 Project/FinalDataset"

malware = pd.read_csv(f"{DATA_DIR}/Malware.csv")
spam = pd.read_csv(f"{DATA_DIR}/Spam.csv")
phishing = pd.read_csv(f"{DATA_DIR}/Phishing.csv")
defacement = pd.read_csv(f"{DATA_DIR}/Defacement.csv")

In [3]:
# Data Cleaning: strip surrounding whitespace from every column name
# (rows with NA values are dropped later, separately for each dataset)

malware = malware.rename(columns=str.strip)
spam = spam.rename(columns=str.strip)
phishing = phishing.rename(columns=str.strip)
defacement = defacement.rename(columns=str.strip)

In [4]:
malware.info() # author's note: NumberRate_Extension has many missing values (compare the Non-Null Count column)
spam.info() # author's note: NumberRate_Extension has many missing values (compare the Non-Null Count column)
phishing.info() # author's note: NumberRate_Extension has many missing values (compare the Non-Null Count column)
defacement.info() # author's note: Entropy_DirectoryName and NumberRate_Extension have many missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14493 entries, 0 to 14492
Data columns (total 80 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14493 non-null  int64  
 1   domain_token_count               14493 non-null  int64  
 2   path_token_count                 14493 non-null  int64  
 3   avgdomaintokenlen                14493 non-null  float64
 4   longdomaintokenlen               14493 non-null  int64  
 5   avgpathtokenlen                  14482 non-null  float64
 6   tld                              14493 non-null  int64  
 7   charcompvowels                   14493 non-null  int64  
 8   charcompace                      14493 non-null  int64  
 9   ldl_url                          14493 non-null  int64  
 10  ldl_domain                       14493 non-null  int64  
 11  ldl_path                         14493 non-null  int64  
 12  ldl_filename      

### Malware and Benign ###

In [5]:
# Author's note: in the Malware dataset nearly 40% of the NumberRate_Extension
# values are NULL, so the column is dropped before the remaining NA rows are removed.

malware1 = malware.drop(columns=["NumberRate_Extension"]).dropna()
# Seeded shuffle so the row order (and everything derived from it) is reproducible
malware1 = shuffle(malware1, random_state=0).reset_index(drop=True)
malware1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12442 entries, 0 to 12441
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      12442 non-null  int64  
 1   domain_token_count               12442 non-null  int64  
 2   path_token_count                 12442 non-null  int64  
 3   avgdomaintokenlen                12442 non-null  float64
 4   longdomaintokenlen               12442 non-null  int64  
 5   avgpathtokenlen                  12442 non-null  float64
 6   tld                              12442 non-null  int64  
 7   charcompvowels                   12442 non-null  int64  
 8   charcompace                      12442 non-null  int64  
 9   ldl_url                          12442 non-null  int64  
 10  ldl_domain                       12442 non-null  int64  
 11  ldl_path                         12442 non-null  int64  
 12  ldl_filename      

In [6]:
# Functions:-
# If URL_Type_obf_Type == "malware" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_malware(class_name):
    """Return 0 when class_name is exactly "malware", otherwise 1."""
    return 0 if class_name == "malware" else 1
   
# Function for Data Scaling
def dataset_scaling(dataset):
    """Min-max scale every column except "URL_Type_obf_Type".

    Parameters
    ----------
    dataset : DataFrame containing the "URL_Type_obf_Type" column.

    Returns
    -------
    (array, scaler) : the scaled feature columns with the untouched
        "URL_Type_obf_Type" column appended as the last column (a NumPy
        array, so column names are lost), and the fitted MinMaxScaler.
    """
    target = "URL_Type_obf_Type"

    labels = dataset[[target]]
    features = dataset.loc[:, dataset.columns != target]

    # The scaler is fitted on every row that is passed in
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_features = scaler.fit_transform(features)

    return np.concatenate((scaled_features, labels), axis=1), scaler
    
# Function for appending the column indices with correlation value closer to 0
def columns_lowcorr(corr, columns, threshold=0.1):
    """Return positions of features that barely correlate with the target.

    The target is taken to be the last row/column of ``corr``. A feature is
    reported when its correlation with the target lies strictly between
    ``-threshold`` and ``threshold`` or is NaN (e.g. a constant column).

    Parameters
    ----------
    corr : square correlation DataFrame whose last row belongs to the target.
    columns : column labels in the same order as ``corr`` (used for printing).
    threshold : absolute correlation below which a feature is reported.

    Returns
    -------
    list of int : positions of the reported features, in ascending order.
    """
    index_lowcorr = []
    # Stop before the last column: that is the target itself and must never
    # be reported, even when its self-correlation is NaN.
    for i in range(corr.shape[1] - 1):
        # Purely positional lookup; corr.iloc[-1][i] would treat i as a label
        value = corr.iloc[-1, i]
        if pd.isnull(value) or -threshold < value < threshold:
            print(columns[-1],"and",columns[i],"with Correlation",round(value,2))
            index_lowcorr.append(i)
    return index_lowcorr

# Function for appending the features with high correlation i.e >= 0.8
def columns_highcorr(corr, columns, threshold=0.8):
    """Return positions of features that are highly correlated with a later feature.

    Every pair of distinct features (the last row/column of ``corr``, the
    target, is excluded) is inspected once. When the pair's correlation is
    at least ``threshold`` the pair is printed and the position of the
    earlier feature is recorded, so the later one of the pair is kept.

    Parameters
    ----------
    corr : square correlation DataFrame whose last row/column is the target.
    columns : column labels in the same order as ``corr`` (used for printing).
    threshold : minimum correlation for a pair to count as redundant.

    Returns
    -------
    list of int : recorded positions, in ascending order, without duplicates.
    """
    index_highcorr = []
    feature_count = corr.shape[1] - 1
    for i in range(corr.shape[0] - 1):
        # Start at i + 1 so the diagonal is skipped structurally; this lets
        # perfectly correlated (== 1.0) distinct features be detected too.
        for j in range(i + 1, feature_count):
            value = corr.iloc[i, j]
            if value >= threshold:
                print(columns[i],"and",columns[j],"with Correlation",round(value,2))
                if i not in index_highcorr:
                    index_highcorr.append(i)

    return index_highcorr

# Function to determine the final Index column indices with low and high correlation values
def final_index(index_lowcorr, index_highcorr):
    """Merge both position lists into one ascending list without duplicates."""
    merged = set(index_lowcorr)
    merged.update(index_highcorr)
    return sorted(merged)

In [7]:

# Mapping the classes with their respective labels ("malware" -> 0, anything else -> 1)
malware1["URL_Type_obf_Type"]=malware1["URL_Type_obf_Type"].apply(class_labels_malware)

malware_columns=malware1.columns

corr=malware1.corr()

# Removing the columns whose correlation with the output is almost 0 and,
# for every highly correlated pair of features, the first feature of the pair

print("<-----Features with Low Correlation Value----->")    
index_malware_lowcorr=columns_lowcorr(corr, malware_columns)
print("<---------------------------------------------->")
print("<-----Features with High Correlation Value----->")
index_malware_highcorr=columns_highcorr(corr, malware_columns)
index_malware=final_index(index_malware_lowcorr, index_malware_highcorr)

# Index the labels with the flat list of positions. Wrapping the list in a
# second pair of brackets makes it a 2-D index, which NumPy/pandas deprecated
# (the warning in the recorded output) and now reject.
malware3=malware1.drop(columns=malware_columns[index_malware])

malware2, scaler=dataset_scaling(malware3) # Scaling the Dataset

# dataset_scaling returns a NumPy array, so store it as a DataFrame again
malware2=pd.DataFrame(malware2,columns=malware3.columns)

<-----Features with Low Correlation Value----->
URL_Type_obf_Type and Querylength with Correlation -0.09
URL_Type_obf_Type and avgdomaintokenlen with Correlation -0.0
URL_Type_obf_Type and avgpathtokenlen with Correlation 0.07
URL_Type_obf_Type and ldl_filename with Correlation -0.04
URL_Type_obf_Type and dld_domain with Correlation nan
URL_Type_obf_Type and dld_filename with Correlation -0.01
URL_Type_obf_Type and urlLen with Correlation 0.06
URL_Type_obf_Type and fileNameLen with Correlation 0.02
URL_Type_obf_Type and this.fileExtLen with Correlation 0.07
URL_Type_obf_Type and argDomanRatio with Correlation -0.08
URL_Type_obf_Type and executable with Correlation -0.07
URL_Type_obf_Type and isPortEighty with Correlation -0.03
URL_Type_obf_Type and ISIpAddressInDomainName with Correlation nan
URL_Type_obf_Type and LongestVariableValue with Correlation -0.07
URL_Type_obf_Type and Directory_DigitCount with Correlation 0.02
URL_Type_obf_Type and File_name_DigitCount with Correlation -0.0


  result = getitem(key)


In [8]:
# Principal Component Analysis
# NOTE(review): the scaler (inside dataset_scaling) and this PCA are fitted on
# all rows before the train/test split, so held-out rows influence the
# transform; consider fitting both on the training split only.

# Output label first, then every remaining column as input features
y = malware2["URL_Type_obf_Type"]
X = malware2.drop(columns=["URL_Type_obf_Type"])

# Project the features onto 10 principal components
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)

In [9]:
# Hold out 25% of the samples for testing; the seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    X_pca, y, test_size=0.25, random_state=0
)

In [10]:
# Machine Learning Models
# perf_counter is a monotonic clock meant for measuring durations
start = time.perf_counter()
# random_state makes the forest (and therefore the reported scores) reproducible
rfc=RandomForestClassifier(n_estimators=20, criterion="gini", n_jobs=-1, random_state=0)
rfc.fit(train_x, train_y)
predicted_test=rfc.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predictions on the training samples (used for the training accuracy)
predicted_train=rfc.predict(train_x)

0.31168508529663086


In [11]:
# Performance Metrics
# Accuracies are printed as percentages; the figures quoted are from the recorded run.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy, about 98.9% in the recorded run
print(accuracy_score(train_y, predicted_train) * 100) # training accuracy, about 99.98% in the recorded run
 
# NOTE(review): f1_score defaults to pos_label=1, which under class_labels_malware
# is the non-malware class; pass pos_label=0 if the malware class is the one of interest.
print(f1_score(test_y, predicted_test)) # F1 score on the test data, about 0.99 in the recorded run


98.90710382513662
99.97856607008895
0.9897836538461539


### Spam and Benign ###

In [12]:
# Author's note: in the Spam dataset nearly 34% of the NumberRate_Extension
# values are NULL, so the column is dropped before the remaining NA rows are removed.

spam1 = spam.drop(columns=["NumberRate_Extension"]).dropna()
# Seeded shuffle so the row order (and everything derived from it) is reproducible
spam1 = shuffle(spam1, random_state=0).reset_index(drop=True)
spam1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12420 entries, 0 to 12419
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      12420 non-null  int64  
 1   domain_token_count               12420 non-null  int64  
 2   path_token_count                 12420 non-null  int64  
 3   avgdomaintokenlen                12420 non-null  float64
 4   longdomaintokenlen               12420 non-null  int64  
 5   avgpathtokenlen                  12420 non-null  float64
 6   tld                              12420 non-null  int64  
 7   charcompvowels                   12420 non-null  int64  
 8   charcompace                      12420 non-null  int64  
 9   ldl_url                          12420 non-null  int64  
 10  ldl_domain                       12420 non-null  int64  
 11  ldl_path                         12420 non-null  int64  
 12  ldl_filename      

In [13]:
# If URL_Type_obf_Type == "spam" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_spam(class_name):
    """Return 0 when class_name is exactly "spam", otherwise 1."""
    return 0 if class_name == "spam" else 1

In [14]:
# Encode the class names as numeric labels ("spam" -> 0, anything else -> 1).
# Re-running this cell on the already encoded column would turn every label into 1.
spam1["URL_Type_obf_Type"] = spam1["URL_Type_obf_Type"].apply(class_labels_spam)
spam_columns = spam1.columns

# Pairwise correlation between all columns, the label included
corr = spam1.corr()

In [15]:
# Removing the columns whose correlation with the output is almost 0 and,
# for every highly correlated pair of features, the first feature of the pair

print("<-----Features with Low Correlation Value----->")    
index_spam_lowcorr=columns_lowcorr(corr, spam_columns)
print("<---------------------------------------------->")
print("<-----Features with High Correlation Value----->")    
index_spam_highcorr=columns_highcorr(corr, spam_columns)
index_spam=final_index(index_spam_lowcorr, index_spam_highcorr)

# Index the labels with the flat list of positions. Wrapping the list in a
# second pair of brackets makes it a 2-D index, which NumPy/pandas deprecated
# (the warning in the recorded output) and now reject.
spam3=spam1.drop(columns=spam_columns[index_spam])

spam2, scaler=dataset_scaling(spam3) # Scaling the Dataset
# dataset_scaling returns a NumPy array, so store it as a DataFrame again
spam2=pd.DataFrame(spam2,columns=spam3.columns)


<-----Features with Low Correlation Value----->
URL_Type_obf_Type and path_token_count with Correlation 0.1
URL_Type_obf_Type and avgpathtokenlen with Correlation 0.09
URL_Type_obf_Type and dld_domain with Correlation nan
URL_Type_obf_Type and dld_filename with Correlation 0.1
URL_Type_obf_Type and this.fileExtLen with Correlation 0.03
URL_Type_obf_Type and pathurlRatio with Correlation 0.04
URL_Type_obf_Type and executable with Correlation nan
URL_Type_obf_Type and isPortEighty with Correlation nan
URL_Type_obf_Type and ISIpAddressInDomainName with Correlation nan
URL_Type_obf_Type and host_DigitCount with Correlation -0.01
URL_Type_obf_Type and Directory_DigitCount with Correlation 0.1
URL_Type_obf_Type and Directory_LetterCount with Correlation -0.09
URL_Type_obf_Type and Path_LongestWordLength with Correlation 0.02
URL_Type_obf_Type and sub-Directory_LongestWordLength with Correlation -0.02
URL_Type_obf_Type and URL_sensitiveWord with Correlation -0.08
URL_Type_obf_Type and spcharU

pathDomainRatio and URL_Letter_Count with Correlation 0.98
pathDomainRatio and Extension_LetterCount with Correlation 0.97
pathDomainRatio and Query_LetterCount with Correlation 0.96
pathDomainRatio and LongestPathTokenLength with Correlation 0.98
argPathRatio and NumberRate_AfterPath with Correlation 0.83
argPathRatio and Entropy_Afterpath with Correlation 0.8
LongestVariableValue and URL_DigitCount with Correlation 0.92
LongestVariableValue and Extension_DigitCount with Correlation 0.93
LongestVariableValue and Query_DigitCount with Correlation 0.96
LongestVariableValue and URL_Letter_Count with Correlation 0.99
LongestVariableValue and Extension_LetterCount with Correlation 0.99
LongestVariableValue and Query_LetterCount with Correlation 1.0
LongestVariableValue and LongestPathTokenLength with Correlation 0.99
URL_DigitCount and Extension_DigitCount with Correlation 0.97
URL_DigitCount and Query_DigitCount with Correlation 0.94
URL_DigitCount and URL_Letter_Count with Correlation 0.

  result = getitem(key)


In [16]:
# Principal Component Analysis
# NOTE(review): the scaler (inside dataset_scaling) and this PCA are fitted on
# all rows before the train/test split, so held-out rows influence the
# transform; consider fitting both on the training split only.

# Output label first, then every remaining column as input features
y = spam2["URL_Type_obf_Type"]
X = spam2.drop(columns=["URL_Type_obf_Type"])

# Project the features onto 6 principal components
pca = PCA(n_components=6).fit(X)
X_pca = pca.transform(X)

In [17]:
# Hold out 25% of the samples for testing; the seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    X_pca, y, test_size=0.25, random_state=0
)


In [18]:
# Machine Learning Models
# perf_counter is a monotonic clock meant for measuring durations
start = time.perf_counter()
# random_state makes the forest (and therefore the reported scores) reproducible
rfc=RandomForestClassifier(n_estimators=20, criterion="gini", n_jobs=-1, random_state=0)
rfc.fit(train_x, train_y)
predicted_test=rfc.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predictions on the training samples (used for the training accuracy)
predicted_train=rfc.predict(train_x)

0.21056103706359863


In [19]:
# Performance Metrics
# Accuracies are printed as percentages; the figures quoted are from the recorded run.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy, about 99.5% in the recorded run
print(accuracy_score(train_y, predicted_train) * 100) # training accuracy, 100% in the recorded run

# NOTE(review): f1_score defaults to pos_label=1, which under class_labels_spam
# is the non-spam class; pass pos_label=0 if the spam class is the one of interest.
print(f1_score(test_y, predicted_test)) # F1 score on the test data, about 0.995 in the recorded run


99.48470209339774
100.0
0.9951544518473652


### Phishing and Benign ###

In [20]:
# NumberRate_Extension is dropped first, then the remaining rows with NA values
phishing1 = phishing.drop(columns=["NumberRate_Extension"]).dropna()
# Seeded shuffle so the row order (and everything derived from it) is reproducible
phishing1 = shuffle(phishing1, random_state=0).reset_index(drop=True)
phishing1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13084 entries, 0 to 13083
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      13084 non-null  int64  
 1   domain_token_count               13084 non-null  int64  
 2   path_token_count                 13084 non-null  int64  
 3   avgdomaintokenlen                13084 non-null  float64
 4   longdomaintokenlen               13084 non-null  int64  
 5   avgpathtokenlen                  13084 non-null  float64
 6   tld                              13084 non-null  int64  
 7   charcompvowels                   13084 non-null  int64  
 8   charcompace                      13084 non-null  int64  
 9   ldl_url                          13084 non-null  int64  
 10  ldl_domain                       13084 non-null  int64  
 11  ldl_path                         13084 non-null  int64  
 12  ldl_filename      

In [21]:
# If URL_Type_obf_Type == "phishing" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_phishing(class_name):
    """Return 0 when class_name is exactly "phishing", otherwise 1."""
    return 0 if class_name == "phishing" else 1

In [22]:
# Encode the class names as numeric labels ("phishing" -> 0, anything else -> 1).
# Re-running this cell on the already encoded column would turn every label into 1.
phishing1["URL_Type_obf_Type"] = phishing1["URL_Type_obf_Type"].apply(class_labels_phishing)
phishing_columns = phishing1.columns

# Pairwise correlation between all columns, the label included
corr = phishing1.corr()


In [23]:
# Removing the columns whose correlation with the output is almost 0 and,
# for every highly correlated pair of features, the first feature of the pair

print("<-----Features with Low Correlation Value----->")    
index_phishing_lowcorr=columns_lowcorr(corr, phishing_columns)
print("<---------------------------------------------->")
print("<-----Features with High Correlation Value----->")
index_phishing_highcorr=columns_highcorr(corr, phishing_columns)
index_phishing=final_index(index_phishing_lowcorr, index_phishing_highcorr)

# Index the labels with the flat list of positions. Wrapping the list in a
# second pair of brackets makes it a 2-D index, which NumPy/pandas deprecated
# (the warning in the recorded output) and now reject.
phishing3=phishing1.drop(columns=phishing_columns[index_phishing])

phishing2, scaler=dataset_scaling(phishing3) # Scaling the Dataset
# dataset_scaling returns a NumPy array, so store it as a DataFrame again
phishing2=pd.DataFrame(phishing2,columns=phishing3.columns)


<-----Features with Low Correlation Value----->
URL_Type_obf_Type and Querylength with Correlation 0.02
URL_Type_obf_Type and avgpathtokenlen with Correlation -0.05
URL_Type_obf_Type and ldl_domain with Correlation -0.07
URL_Type_obf_Type and ldl_filename with Correlation -0.0
URL_Type_obf_Type and dld_domain with Correlation -0.04
URL_Type_obf_Type and dld_filename with Correlation 0.04
URL_Type_obf_Type and urlLen with Correlation 0.1
URL_Type_obf_Type and ArgLen with Correlation -0.03
URL_Type_obf_Type and ArgUrlRatio with Correlation -0.01
URL_Type_obf_Type and argDomanRatio with Correlation 0.03
URL_Type_obf_Type and argPathRatio with Correlation -0.06
URL_Type_obf_Type and executable with Correlation -0.01
URL_Type_obf_Type and isPortEighty with Correlation nan
URL_Type_obf_Type and ISIpAddressInDomainName with Correlation nan
URL_Type_obf_Type and LongestVariableValue with Correlation 0.01
URL_Type_obf_Type and URL_DigitCount with Correlation -0.06
URL_Type_obf_Type and host_Dig

  result = getitem(key)


In [24]:
# Principal Component Analysis
# NOTE(review): the scaler (inside dataset_scaling) and this PCA are fitted on
# all rows before the train/test split, so held-out rows influence the
# transform; consider fitting both on the training split only.

# Output label first, then every remaining column as input features
y = phishing2["URL_Type_obf_Type"]
X = phishing2.drop(columns=["URL_Type_obf_Type"])

# Project the features onto 10 principal components
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)

In [25]:
# Hold out 25% of the samples for testing; the seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    X_pca, y, test_size=0.25, random_state=0
)


In [26]:
# Machine Learning Models
# perf_counter is a monotonic clock meant for measuring durations
start = time.perf_counter()
# random_state makes the forest (and therefore the reported scores) reproducible
rfc=RandomForestClassifier(n_estimators=20, criterion="gini", n_jobs=-1, random_state=0)
rfc.fit(train_x, train_y)
predicted_test=rfc.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predictions on the training samples (used for the training accuracy)
predicted_train=rfc.predict(train_x)

0.31255483627319336


In [27]:
# Performance Metrics
# Accuracies are printed as percentages; the figures quoted are from the recorded run.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy, about 96.6% in the recorded run
print(accuracy_score(train_y, predicted_train) * 100) # training accuracy, about 99.98% in the recorded run

# NOTE(review): f1_score defaults to pos_label=1, which under class_labels_phishing
# is the non-phishing class; pass pos_label=0 if the phishing class is the one of interest.
print(f1_score(test_y, predicted_test)) # F1 score on the test data, about 0.97 in the recorded run

96.63711403240599
99.97961887292367
0.9663608562691132


### Defacement and Benign ###

In [None]:
# In the Defacement dataset, nearly 39% of the values in the Entropy_DirectoryName column
 # and 32% of the values in the NumberRate_Extension column are NULL.

# Applying dropna() directly to the Defacement dataset reduces it from 15711 to 5186 rows,
 # whereas dropping these two columns first and then applying dropna() only reduces it
  # from 15711 to 15477 rows, so it is better to drop the two columns.


In [28]:
# Both sparsely populated columns are dropped first, then the remaining rows with NA values
defacement1 = defacement.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()
# Seeded shuffle so the row order (and everything derived from it) is reproducible
defacement1 = shuffle(defacement1, random_state=0).reset_index(drop=True)
defacement1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15477 entries, 0 to 15476
Data columns (total 78 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      15477 non-null  int64  
 1   domain_token_count               15477 non-null  int64  
 2   path_token_count                 15477 non-null  int64  
 3   avgdomaintokenlen                15477 non-null  float64
 4   longdomaintokenlen               15477 non-null  int64  
 5   avgpathtokenlen                  15477 non-null  float64
 6   tld                              15477 non-null  int64  
 7   charcompvowels                   15477 non-null  int64  
 8   charcompace                      15477 non-null  int64  
 9   ldl_url                          15477 non-null  int64  
 10  ldl_domain                       15477 non-null  int64  
 11  ldl_path                         15477 non-null  int64  
 12  ldl_filename      

In [29]:
# If URL_Type_obf_Type == "Defacement" [class label 0], else [class label 1]

# Function to label the classes in a DataFrame
def class_labels_defacement(class_name):
    """Return 0 when class_name is exactly "Defacement" (capital D), otherwise 1."""
    return 0 if class_name == "Defacement" else 1
    

In [30]:
# Encode the class names as numeric labels ("Defacement" -> 0, anything else -> 1).
# Re-running this cell on the already encoded column would turn every label into 1.
defacement1["URL_Type_obf_Type"] = defacement1["URL_Type_obf_Type"].apply(class_labels_defacement)
defacement_columns = defacement1.columns

# Pairwise correlation between all columns, the label included
corr = defacement1.corr()

In [31]:
# Removing the columns whose correlation with the output is almost 0 and,
# for every highly correlated pair of features, the first feature of the pair

print("<-----Features with Low Correlation Value----->")    
index_defacement_lowcorr=columns_lowcorr(corr, defacement_columns)
print("<---------------------------------------------->")
print("<-----Features with High Correlation Value----->")
index_defacement_highcorr=columns_highcorr(corr, defacement_columns)
index_defacement=final_index(index_defacement_lowcorr, index_defacement_highcorr)

# Index the labels with the flat list of positions. Wrapping the list in a
# second pair of brackets makes it a 2-D index, which NumPy/pandas deprecated
# (the warning in the recorded output) and now reject.
defacement3=defacement1.drop(columns=defacement_columns[index_defacement])

defacement2, scaler=dataset_scaling(defacement3) # Scaling the Dataset
# dataset_scaling returns a NumPy array, so store it as a DataFrame again
defacement2=pd.DataFrame(defacement2,columns=defacement3.columns)


<-----Features with Low Correlation Value----->
URL_Type_obf_Type and charcompvowels with Correlation -0.02
URL_Type_obf_Type and charcompace with Correlation 0.03
URL_Type_obf_Type and ldl_url with Correlation -0.01
URL_Type_obf_Type and ldl_path with Correlation -0.02
URL_Type_obf_Type and dld_url with Correlation 0.05
URL_Type_obf_Type and dld_domain with Correlation -0.01
URL_Type_obf_Type and dld_path with Correlation 0.05
URL_Type_obf_Type and dld_filename with Correlation 0.1
URL_Type_obf_Type and dld_getArg with Correlation -0.09
URL_Type_obf_Type and pathLength with Correlation -0.01
URL_Type_obf_Type and subDirLen with Correlation -0.01
URL_Type_obf_Type and this.fileExtLen with Correlation 0.09
URL_Type_obf_Type and executable with Correlation nan
URL_Type_obf_Type and isPortEighty with Correlation -0.01
URL_Type_obf_Type and ISIpAddressInDomainName with Correlation nan
URL_Type_obf_Type and host_DigitCount with Correlation -0.02
URL_Type_obf_Type and URL_sensitiveWord with 

  result = getitem(key)


In [32]:
# Principal Component Analysis
# NOTE(review): the scaler (inside dataset_scaling) and this PCA are fitted on
# all rows before the train/test split, so held-out rows influence the
# transform; consider fitting both on the training split only.

# Output label first, then every remaining column as input features
y = defacement2["URL_Type_obf_Type"]
X = defacement2.drop(columns=["URL_Type_obf_Type"])

# Project the features onto 10 principal components
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)

In [33]:
# Hold out 25% of the samples for testing; the seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    X_pca, y, test_size=0.25, random_state=0
)


In [34]:
# Machine Learning Models
# perf_counter is a monotonic clock meant for measuring durations
start = time.perf_counter()
# random_state makes the forest (and therefore the reported scores) reproducible
rfc=RandomForestClassifier(n_estimators=20, criterion="gini", n_jobs=-1, random_state=0)
rfc.fit(train_x, train_y)
predicted_test=rfc.predict(test_x)
end = time.perf_counter()
print((end-start)) # Seconds spent fitting the model and predicting the test samples

# Predictions on the training samples (used for the training accuracy)
predicted_train=rfc.predict(train_x)


0.28610682487487793


In [35]:
# Performance Metrics
# Accuracies are printed as percentages; the figures quoted are from the recorded run.
print(accuracy_score(test_y, predicted_test) * 100) # test accuracy, about 99.4% in the recorded run
print(accuracy_score(train_y, predicted_train) * 100) # training accuracy, 100% in the recorded run

# NOTE(review): f1_score defaults to pos_label=1, which under class_labels_defacement
# is the non-defacement class; pass pos_label=0 if the defacement class is the one of interest.
print(f1_score(test_y, predicted_test)) # F1 score on the test data, about 0.99 in the recorded run
 

99.43152454780362
100.0
0.994204425711275


In [36]:
# Reducing the n_components value of PCA also reduces the accuracy score, so
 # n_components is chosen large enough to capture the variability of the features.


### Analysis of all the features for Multi-Classification ###

In [79]:
# One counter per column position (78 positions), all starting at zero
l = [0] * 78

# For every column position, count in how many of the four datasets that
# position was flagged as low-correlation or highly correlated.
# NOTE(review): the positions in index_defacement refer to defacement1's columns,
# where "Entropy_DirectoryName" was dropped, whereas the other three lists refer
# to frames that still contain it. The same position can therefore denote
# different features across the lists; confirm before relying on these counts.
for flagged_positions in (index_defacement, index_malware, index_spam, index_phishing):
    for position in flagged_positions:
        l[position] += 1

In [80]:
# Extracting the feature indices that has appeared in all the Dataframes for most of the times

# Start from an empty list of feature names that are considered useless
names_columns_worst = list()

# Column labels of the Malware frame; a position in the counter list is
# translated to a feature name through this index
columns = malware1.columns

In [81]:
# A feature is treated as useless when its position was flagged in three or
# in all four of the datasets
names_columns_worst.extend(
    columns[position] for position in range(78) if l[position] in (3, 4)
)


In [82]:
# "Entropy_DirectoryName" is taken out explicitly because, per the author's note,
# it is dropped from all the DataFrames in the further analysis anyway.
# list.remove raises ValueError when the name is absent (for instance when this
# cell is run a second time), so only remove it when it is present.
if "Entropy_DirectoryName" in names_columns_worst:
    names_columns_worst.remove("Entropy_DirectoryName")

print(names_columns_worst)

['Querylength', 'domain_token_count', 'path_token_count', 'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld', 'charcompvowels', 'charcompace', 'ldl_url', 'ldl_path', 'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path', 'dld_filename', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'argPathRatio', 'executable', 'isPortEighty', 'ISIpAddressInDomainName', 'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount', 'Directory_DigitCount', 'Extension_DigitCount', 'Query_LetterCount', 'Path_LongestWordLength', 'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl', 'delimeter_Count', 'NumberRate_Domain', 'NumberRate_DirectoryName', 'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_FileName', 'SymbolCount_Extension']


In [None]:
# The output above therefore lists the features (47 in the recorded run) that were flagged in at least three of the four datasets and are considered to have no impact on the output variable

### <---------------------- THE END -----------------------> ###