# Validating Machine Learning Model for URL Classification 

In [1]:
# Mount Google Drive (Colab-only helper) so the CSV files read later from
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [3]:
# Load the two training inputs from the mounted Drive folder:
#   feature_dataset - numeric per-URL features (WHOIS dates and URL character counts)
#   label_dataset   - the URL string together with its 0/1 label
# The two frames are paired purely by row position further down, so both files
# must list the same URLs in the same order.
feature_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Training/Features/features(short).csv")
label_dataset = pd.read_csv("/content/drive/MyDrive/URL_classification using ML/Training/Data/merged_whois_verified_training(short).csv")

In [4]:
# Preview the first rows of the extracted URL features.
feature_dataset.head()

Unnamed: 0,whois_regDate,whois_expDate,whois_updatedDate,dot_count,url_len,digit_count,special_count,hyphen_count,double_slash,single_slash,at_the_rate,protocol,protocol_count
0,747,346,2,6,225,58,12,4,0,10,0,0,0
1,3864,153,210,7,177,47,0,1,0,11,0,0,0
2,6568,5,358,6,60,0,0,0,0,2,0,0,0
3,237,127,234,1,116,21,1,1,1,10,0,0,0
4,-1,-1,-1,3,36,0,0,0,0,1,0,0,0


In [5]:
# Preview the first rows of the labelled URLs.
label_dataset.head()

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
2,mail.printakid.com/www.online.americanexpress....,1
3,thewhiskeydregs.com/wp-content/themes/widescre...,1
4,smilesvoegol.servebbs.org/voegol.php,1


In [6]:
# Keep column positions 0-10 and 12; position 11 is skipped because it holds the
# protocol feature, which is deliberately left out of the model inputs.
feature_positions = [*range(11), 12]
X = feature_dataset.iloc[:, feature_positions].values

# Column position 1 of the label file is the class label; the list selector
# keeps the result two-dimensional, i.e. one column of labels.
label_positions = [1]
y = label_dataset.iloc[:, label_positions].values

The `protocol` feature is 1 for HTTP (treated as malicious) and 0 for HTTPS (treated as benign). We dropped it because we noticed that the majority of benign URLs start with neither HTTP nor HTTPS, for example `WWW.EXAMPLE.COM`.

In [7]:
# Inspect the selected feature matrix (NumPy abbreviates the printout).
X

array([[ 747,  346,    2, ...,   10,    0,    0],
       [3864,  153,  210, ...,   11,    0,    0],
       [6568,    5,  358, ...,    2,    0,    0],
       ...,
       [9368, 3048,  419, ...,    2,    0,    0],
       [8402,  363,   61, ...,    1,    0,    0],
       [9002,  127,  241, ...,    2,    0,    0]])

In [8]:
# Inspect the label column.
# NOTE(review): this echoes the entire single-column array into the notebook;
# a short slice such as y[:5] would be enough for a sanity check.
y

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    

In [9]:
# Report the dimensions of the feature matrix and the label array, one per line.
print(f'X shape: {X.shape}', f'y shape: {y.shape}', sep='\n')

X shape: (501, 12)
y shape: (501, 1)


In [10]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so
# the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [11]:
# Print "<name> <shape>" for each of the four arrays produced by the split.
print(
    *(
        f"{label} {array.shape}"
        for label, array in (
            ('X_train', X_train),
            ('X_test', X_test),
            ('y_train', y_train),
            ('y_test', y_test),
        )
    ),
    sep='\n',
)

X_train (400, 12)
X_test (101, 12)
y_train (400, 1)
y_test (101, 1)


In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both splits so the test rows never
# influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Inspect the standardized training features.
X_train

array([[ 0.93966186, -0.32537699, -0.32469702, ..., -0.96097979,
         0.        , -0.08692914],
       [-0.07080975, -0.12291348, -0.57150101, ..., -0.96097979,
         0.        , -0.08692914],
       [-1.34524237, -0.63511593, -0.70808186, ...,  0.78097753,
         0.        , -0.08692914],
       ...,
       [ 1.29330023, -0.11686979,  0.59303047, ..., -0.38032735,
         0.        , -0.08692914],
       [-0.18077126, -0.60338658, -0.12581612, ..., -0.96097979,
         0.        , -0.08692914],
       [-1.34524237, -0.63511593, -0.70808186, ...,  0.20032509,
         0.        , -0.08692914]])

In [14]:
# Inspect the test features after scaling with the statistics fitted on the training split.
X_test

array([[-1.34524237, -0.63511593, -0.70808186, ..., -0.38032735,
         0.        , -0.08692914],
       [ 1.09579654,  1.00121226, -0.61702796, ...,  0.20032509,
         0.        , -0.08692914],
       [-1.34524237, -0.63511593, -0.70808186, ...,  1.36162998,
         0.        , -0.08692914],
       ...,
       [-1.34524237, -0.63511593, -0.70808186, ..., -0.96097979,
         0.        , -0.08692914],
       [ 0.33300524, -0.20148141, -0.44929709, ..., -0.38032735,
         0.        , -0.08692914],
       [-0.01209243, -0.45531625, -0.07789302, ...,  0.20032509,
         0.        , -0.08692914]])

In [15]:
def get_random_forest_results(n_estimators=20, criterion='entropy', random_state=0):
    """Fit a random forest on the training split and score it on the test split.

    The function reads the notebook-level arrays ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must exist before it is called.

    Parameters
    ----------
    n_estimators : int, default 20
        Number of trees in the forest.
    criterion : str, default 'entropy'
        Split-quality measure handed to ``RandomForestClassifier``.
    random_state : int, default 0
        Seed handed to ``RandomForestClassifier`` so repeated runs match.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"f1score"``, ``"recall"``
        (scores on the test split) and ``"cm"`` (the confusion matrix).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, accuracy_score

    # The label arrays are single-column 2-D arrays. scikit-learn estimators
    # expect a 1-D target and emit a conversion warning when handed a column
    # vector, so flatten once here and use the flat views everywhere below.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        random_state=random_state,
    )
    classifier.fit(X_train, y_train_flat)
    y_pred = classifier.predict(X_test)

    result_dict = {
        "accuracy": accuracy_score(y_test_flat, y_pred),
        "precision": precision_score(y_test_flat, y_pred),
        "f1score": f1_score(y_test_flat, y_pred),
        "recall": recall_score(y_test_flat, y_pred),
        "cm": confusion_matrix(y_test_flat, y_pred),
    }
    return result_dict

In [16]:
# Train and evaluate the random forest once and keep the returned metrics.
classification_results = get_random_forest_results()

  after removing the cwd from sys.path.


In [17]:
# Show the metrics dictionary returned by get_random_forest_results().
classification_results

{'accuracy': 0.8613861386138614, 'cm': array([[51,  7],
        [ 7, 36]]), 'f1score': 0.8372093023255814, 'precision': 0.8372093023255814, 'recall': 0.8372093023255814}

In [None]:
# def get_dt_results():
#     from sklearn.tree import DecisionTreeClassifier
#     classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [None]:
# def get_knn_results():
#     #Fitting into KNN
#     from sklearn.neighbors import KNeighborsClassifier
#     classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2) # The distance measure is chosen by setting
#     # metric first and then p: p=1 gives the Manhattan distance, p=2 the Euclidean distance
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict
    

In [None]:
# def get_kernel_SVM_results():
#     from sklearn.svm import SVC
#     classifier = SVC(kernel="rbf",random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [None]:
# def get_logistic_reg_results():
#     from sklearn.linear_model import LogisticRegression
#     classifier = LogisticRegression(random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [None]:
# def get_naive_bayes_results():
#     from sklearn.naive_bayes import GaussianNB
#     classifier = GaussianNB()
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [None]:
# def get_svm_results():
#     #Fitting SVM to Training set
#     from sklearn.svm import SVC
#     classifier = SVC(kernel='linear',random_state=0)
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_test)
#     from sklearn.metrics import confusion_matrix,accuracy_score
#     cm = confusion_matrix(y_test,y_pred)
#     accuracy = accuracy_score(y_test,y_pred)
#     precision = precision_score(y_test,y_pred)
#     f1score = f1_score(y_test,y_pred)
#     recall = recall_score(y_test,y_pred)
#     result_dict = {"accuracy":accuracy,"precision":precision,"f1score":f1score,"recall":recall,"cm":cm}
#     return result_dict

In [None]:
# def get_classification_results():
#     results_dict = {}
#     dt = get_dt_results()
#     knn = get_knn_results()
#     kernelsvm = get_kernel_SVM_results()
#     logreg = get_logistic_reg_results()
#     nb = get_naive_bayes_results()
#     rf = get_random_forest_results()
#     svm = get_svm_results()
#     results_dict = {"Decision Tree":dt,"KNN":knn,"Kernel SVM":kernelsvm,"Log Regression":logreg,"Naive Bayes":nb,"Random Forest":rf,"SVM":svm}
#     #results_dict = {"Decision Tree":dt,"Kernel SVM":kernelsvm,"Random Forest":rf}
#     return results_dict


In [None]:
# for k,v in classification_results.items():
#     print(f"{k}: {v['accuracy'],v['cm']}")

***