In [49]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import OneHotEncoder
import pickle
import matplotlib.pyplot as plt

In [50]:
def getProcesssedDataset(fp):
    dataset = arff.loadarff(fp)
    dataframe = pd.DataFrame(dataset[0])
    selectiveDF = dataframe.select_dtypes(include=[np.object])
    selectiveDF =  selectiveDF.stack().str.decode('utf-8').unstack()

    for col in selectiveDF.columns:
        selectiveDF[col] = selectiveDF[col].astype(int)
    return selectiveDF


In [51]:
trainingSet = getProcesssedDataset("./TrainingDataset.arff")
trainingSet

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,-1,1,0,1,-1
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,1,1,1,-1,1,1,1,-1


In [52]:
reduced_dataframe =  trainingSet[['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Redirect', 'on_mouseover', 'RightClick', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Statistical_report', 'Result']]

In [53]:
def convertToPositiveValues(neg_dataframe):
    map = {-1: 2, 0: 0, 1: 1}   # sus -> 0 phishing-> 2 safe -> 1
    column_mapping = {}

    for col in neg_dataframe:
        column_mapping[col] = map
    
    for i in range(neg_dataframe.shape[0]):
        for j in range(neg_dataframe.shape[1]):
            neg_dataframe.loc[i][j] = map[neg_dataframe.loc[i][j]]

In [54]:
convertToPositiveValues(reduced_dataframe)

In [55]:
reduced_dataframe

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,Redirect,on_mouseover,RightClick,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Statistical_report,Result
0,2,1,1,1,2,2,2,2,2,1,...,0,1,1,1,2,2,2,2,2,2
1,1,1,1,1,1,2,0,1,2,1,...,0,1,1,1,2,2,0,2,1,2
2,1,0,1,1,1,2,2,2,2,1,...,0,1,1,1,1,2,1,2,2,2
3,1,0,1,1,1,2,2,2,1,1,...,0,1,1,1,2,2,1,2,1,2
4,1,0,2,1,1,2,1,1,2,1,...,0,2,1,1,2,2,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1,2,1,2,1,1,1,1,2,2,...,0,2,2,2,1,1,2,2,1,1
11051,2,1,1,2,2,2,1,2,2,2,...,1,2,1,1,1,1,1,1,1,2
11052,1,2,1,1,1,2,1,2,2,1,...,0,1,1,1,1,1,1,2,1,2
11053,2,2,1,1,1,2,2,2,1,2,...,0,2,1,1,1,1,1,2,1,2


In [56]:
X = reduced_dataframe.iloc[:,0:25]
y = reduced_dataframe.iloc[:,-1]

In [60]:
X_train , X_test, y_train, y_test =  train_test_split(X,y, test_size=0.2,random_state=7,stratify=y)

X_test.shape

(2211, 25)

In [61]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [13]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [63]:
def getOptimalParams(X_trn, y_train):
    params_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel':['rbf','linear']}

    gridSearchSVM = GridSearchCV(svm.SVC(),params_grid,cv=3)
    gridSearchResults = gridSearchSVM.fit(X_trn,y_train)

    return gridSearchResults.best_params_

bestparams_SVM = getOptimalParams(X_train,y_train)
print(bestparams_SVM)


{'C': 1, 'gamma': 1, 'kernel': 'rbf'}


In [64]:
kf = KFold(n_splits=5, shuffle=True, random_state=786)

In [66]:
def predictUsingSVM(X_trn,y_t):
    for train , val in kf.split(X_trn):
        svm_classifier = svm.SVC(kernel='rbf',gamma=1,C = 1, probability=True)
        svm_classifier = svm_classifier.fit(X_trn[train],y_t[train])
        tp, fn, fp, tn = confusion_matrix(y_t[val], svm_classifier.predict(X_trn[val])).ravel()

        accuracy = (tn+tp)/(fp+fn+tp+tn)
        return accuracy

print(predictUsingSVM(X_train,y_train))

0.9604296212549462


In [68]:
def testSVMmodel(X_tst,y_tst,X_trn,y_trn):
    svm_classifier = svm.SVC(kernel='rbf', gamma= 1, C = 1 , probability=True)
    svm_classifier = svm_classifier.fit(X_trn,y_trn)

    prediction = svm_classifier.predict(X_tst)
    accuracy = metrics.accuracy_score(y_tst,prediction)

    pickle.dump(svm_classifier, open("SVM_MODEL", 'wb'))

    return accuracy

print(testSVMmodel(X_test,y_test,X_train,y_train))

0.9642695612844867


In [69]:
loaded_model = pickle.load(open('./SVM_MODEL',"rb"))
loaded_prediction = loaded_model.predict(X_test)

print(loaded_prediction)
print(metrics.accuracy_score(y_test,loaded_prediction))

[2 1 2 ... 1 1 1]
0.9642695612844867
