# Testing the Machine Learning Model 

In this notebook, we ran ML algorithms ( Decision tree, Kernal SVM and Random Forest ) on our URL features for classification.  The results from each of these algorithms are:

Decision Tree: 0.867

KNN          : 0.8655

Random Forest: 0.879

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [3]:
training_feature_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Training/Features/features(short).csv")
training_label_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Training/Data/merged_whois_verified_training(short).csv")
#testing data set containing new URLs that are not present in the training dataset.
testing_feature_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Testing/Features/features(short).csv")
testing_label_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Testing/Data/merged_whois_verified_testing.csv")

In [4]:
training_feature_dataset.head()

Unnamed: 0,whois_regDate,whois_expDate,whois_updatedDate,dot_count,url_len,digit_count,special_count,hyphen_count,double_slash,single_slash,at_the_rate,protocol,protocol_count
0,747,346,2,6,225,58,12,4,0,10,0,0,0
1,3864,153,210,7,177,47,0,1,0,11,0,0,0
2,6568,5,358,6,60,0,0,0,0,2,0,0,0
3,237,127,234,1,116,21,1,1,1,10,0,0,0
4,-1,-1,-1,3,36,0,0,0,0,1,0,0,0


In [5]:
training_label_dataset.head()

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
2,mail.printakid.com/www.online.americanexpress....,1
3,thewhiskeydregs.com/wp-content/themes/widescre...,1
4,smilesvoegol.servebbs.org/voegol.php,1


In [6]:
testing_label_dataset.head()

Unnamed: 0,url,label
0,www.auraform.com.au/news/ramax/remax/index.htm,1
1,bit.ly/Oceask,1
2,'www.edyshsdf32.hut4.ru/Redirecionamento.html?...,1
3,paypal.com.us.cgi.bin.webscr.cmd.login.member....,1
4,www.wightman.ca/~blitz/,1


In [7]:
testing_feature_dataset.head()

Unnamed: 0,whois_regDate,whois_expDate,whois_updatedDate,dot_count,url_len,digit_count,special_count,hyphen_count,double_slash,single_slash,at_the_rate,protocol,protocol_count
0,-1,-1,76,4,46,0,0,0,0,4,0,0,0
1,5115,3650,32,1,13,0,0,0,0,1,0,0,0
2,6342,231,-1,10,146,88,2,0,0,1,0,0,0
3,4620,127,272,12,124,32,0,0,0,3,0,0,0
4,7827,102,29,2,23,0,0,0,0,2,0,0,0


In [8]:
Xtr = training_feature_dataset.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12] ].values #not including protocol feature
ytr = training_label_dataset.iloc[:, [1]].values
Xte = testing_feature_dataset.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12] ].values #not including protocol feature
yte = testing_label_dataset.iloc[:, [1]].values

Protocol feature which labels 1 for HTTP ( Malicious ) and 0 for HTTPS ( Benign ). Here, we dropped the because we noticed that majority of benign URLs are neither starting from HTTP nor HTTPS. For example, WWW.EXAMPLE.COM

In [9]:
Xtr

array([[ 747,  346,    2, ...,   10,    0,    0],
       [3864,  153,  210, ...,   11,    0,    0],
       [6568,    5,  358, ...,    2,    0,    0],
       ...,
       [9368, 3048,  419, ...,    2,    0,    0],
       [8402,  363,   61, ...,    1,    0,    0],
       [9002,  127,  241, ...,    2,    0,    0]])

In [10]:
ytr

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    

In [11]:
Xte

array([[  -1,   -1,   76,    4,   46,    0,    0,    0,    0,    4,    0,
           0],
       [5115, 3650,   32,    1,   13,    0,    0,    0,    0,    1,    0,
           0],
       [6342,  231,   -1,   10,  146,   88,    2,    0,    0,    1,    0,
           0],
       [4620,  127,  272,   12,  124,   32,    0,    0,    0,    3,    0,
           0],
       [7827,  102,   29,    2,   23,    0,    0,    0,    0,    2,    0,
           0],
       [  87,  276,   87,    3,   31,    1,    1,    0,    0,    1,    0,
           0],
       [  -1,   -1,   47,    3,   26,    4,    0,    0,    0,    2,    0,
           0],
       [  -1,   -1,   -1,    3,   30,    0,    1,    0,    0,    1,    0,
           0],
       [8869,  625, 1597,    2,   22,    0,    0,    0,    0,    2,    0,
           0],
       [8455,  676, 1087,    2,   36,    0,    0,    0,    0,    2,    0,
           0],
       [8994,  866,  204,    2,   26,    0,    0,    0,    0,    2,    0,
           0],
       [7288,   16,  

In [12]:
yte

array([[1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [13]:
print(f'X training shape: {Xtr.shape}')
print(f'y training shape: {ytr.shape}')
print(f'X testing shape: {Xte.shape}')
print(f'y testing shape: {yte.shape}')

X training shape: (501, 12)
y training shape: (501, 1)
X testing shape: (15, 12)
y testing shape: (15, 1)


In [14]:
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=10,random_state=1)

In [15]:
# print('X_train', X_train.shape)
# print('X_test', X_test.shape)
# print('y_train', y_train.shape)
# print('y_test', y_test.shape)

In [16]:
# X_test

In [17]:
# y_test

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tr = scaler.fit_transform(Xtr)
X_te_scaled = scaler.transform(Xte)

In [19]:
X_tr

array([[-1.08965116, -0.09276029, -0.66850999, ...,  4.27333095,
        -0.04472136, -0.07761505],
       [-0.27287724, -0.39441061, -0.16779059, ...,  4.85416079,
        -0.04472136, -0.07761505],
       [ 0.43567479, -0.62572796,  0.18849053, ..., -0.3733078 ,
        -0.04472136, -0.07761505],
       ...,
       [ 1.16938252,  4.13034423,  0.33533612, ..., -0.3733078 ,
        -0.04472136, -0.07761505],
       [ 0.91625335, -0.06619005, -0.52647901, ..., -0.95413765,
        -0.04472136, -0.07761505],
       [ 1.07347643, -0.43504744, -0.09316414, ..., -0.3733078 ,
        -0.04472136, -0.07761505]])

In [20]:
X_te_scaled

array([[-1.28565594, -0.63510569, -0.49036943,  0.2972589 , -0.21367927,
        -0.35573721, -0.36747952, -0.45549363, -0.10040242,  0.78835188,
        -0.04472136, -0.07761505],
       [ 0.05493289,  5.07124316, -0.59629085, -1.01680031, -0.89515292,
        -0.35573721, -0.36747952, -0.45549363, -0.10040242, -0.95413765,
        -0.04472136, -0.07761505],
       [ 0.37645409, -0.27250012, -0.67573191,  2.92537734,  1.85139238,
         3.72644445,  0.48094094, -0.45549363, -0.10040242, -0.95413765,
        -0.04472136, -0.07761505],
       [-0.07477616, -0.43504744, -0.01853769,  3.80141682,  1.39707662,
         1.12869249, -0.36747952, -0.45549363, -0.10040242,  0.20752204,
        -0.04472136, -0.07761505],
       [ 0.76558123, -0.47412132, -0.60351276, -0.57878057, -0.68864575,
        -0.35573721, -0.36747952, -0.45549363, -0.10040242, -0.3733078 ,
        -0.04472136, -0.07761505],
       [-1.26259656, -0.20216714, -0.46388908, -0.14076083, -0.52344002,
        -0.30934878,  

In [21]:
Xte

array([[  -1,   -1,   76,    4,   46,    0,    0,    0,    0,    4,    0,
           0],
       [5115, 3650,   32,    1,   13,    0,    0,    0,    0,    1,    0,
           0],
       [6342,  231,   -1,   10,  146,   88,    2,    0,    0,    1,    0,
           0],
       [4620,  127,  272,   12,  124,   32,    0,    0,    0,    3,    0,
           0],
       [7827,  102,   29,    2,   23,    0,    0,    0,    0,    2,    0,
           0],
       [  87,  276,   87,    3,   31,    1,    1,    0,    0,    1,    0,
           0],
       [  -1,   -1,   47,    3,   26,    4,    0,    0,    0,    2,    0,
           0],
       [  -1,   -1,   -1,    3,   30,    0,    1,    0,    0,    1,    0,
           0],
       [8869,  625, 1597,    2,   22,    0,    0,    0,    0,    2,    0,
           0],
       [8455,  676, 1087,    2,   36,    0,    0,    0,    0,    2,    0,
           0],
       [8994,  866,  204,    2,   26,    0,    0,    0,    0,    2,    0,
           0],
       [7288,   16,  

In [22]:
def get_random_forest_results():
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=20,criterion='entropy',random_state=1)
    classifier.fit(X_tr,ytr)
    y_pred = classifier.predict(X_te_scaled)
    print("Predicted_label \t Tested label")
    for i in range(10):
      if y_pred[i] == yte[i,0]:
        print("            ",y_pred[i]," \t ",yte[i,0],"     \t: URL classified correctly")
      else:
        print("            ",y_pred[i]," \t ",yte[i,0],"     \t: URL not classified correctly")
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(yte,y_pred)
    accuracy = accuracy_score(yte,y_pred)
    precision = precision_score(yte,y_pred)
    f1score = f1_score(yte,y_pred)
    recall = recall_score(yte,y_pred)
    result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
    print(result_dict)

In [23]:
get_random_forest_results()

Predicted_label 	 Tested label
             1  	  1      	: URL classified correctly
             1  	  1      	: URL classified correctly
             1  	  1      	: URL classified correctly
             1  	  1      	: URL classified correctly
             0  	  1      	: URL not classified correctly
             0  	  0      	: URL classified correctly
             0  	  0      	: URL classified correctly
             0  	  0      	: URL classified correctly
             0  	  0      	: URL classified correctly
             0  	  0      	: URL classified correctly
{'accuracy': 0.9333333333333333, 'precision': 1.0, 'f1score': 0.888888888888889, 'recall': 0.8, 'cm': array([[10,  0],
       [ 1,  4]])}


  after removing the cwd from sys.path.


In [24]:
# def get_dt_results():
#     from sklearn.tree import DecisionTreeClassifier
#     classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [25]:
# def get_knn_results():
#     #Fitting into KNN
#     from sklearn.neighbors import KNeighborsClassifier
#     classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2) #To select which method to use to calculate 
#     #distance, we need to define metric first and then p value 1 for manhattan distance, 2 for euclidian distance
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict
    

In [26]:
# def get_kernel_SVM_results():
#     from sklearn.svm import SVC
#     classifier = SVC(kernel="rbf",random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [27]:
# def get_logistic_reg_results():
#     from sklearn.linear_model import LogisticRegression
#     classifier = LogisticRegression(random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [28]:
# def get_naive_bayes_results():
#     from sklearn.naive_bayes import GaussianNB
#     classifier = GaussianNB()
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [29]:
# def get_svm_results():
#     #Fitting SVM to Training set
#     from sklearn.svm import SVC
#     classifier = SVC(kernel='linear',random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [30]:
# def get_classification_results():
#     results_dict = {}
#     dt = get_dt_results()
#     knn = get_knn_results()
#     kernelsvm = get_kernel_SVM_results()
#     logreg = get_logistic_reg_results()
#     nb = get_naive_bayes_results()
#     rf = get_random_forest_results()
#     svm = get_svm_results()
#     results_dict = {"Decision Tree":dt,"KNN":knn,"Kernel SVM":kernelsvm,"Log Regression":logreg,"Naive Bayes":nb,"Random Forest":rf,"SVM":svm}
#     #results_dict = {"Decision Tree":dt,"Kernel SVM":kernelsvm,"Random Forest":rf}
#     return results_dict


In [31]:
# get_classification_results()

In [32]:
# classification_results

In [33]:
# for k,v in classification_results.items():
#     print(f"{k}: {v['accuracy'],v['cm']}")

***