In [1]:
import random
import math
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn import svm
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
class FireflyAlgorithm():

    def __init__(self, function):
        #Initialize parameters of the firefly algorithm
        self.D = 111  # Number of dimensions. This is equivalent to the number of features in the dataset
        self.NP = 30  # This is the number of fireflies in the model
        self.nFES = 1  # number of function evaluations (repeate number)
        self.alpha = 1  # alpha parameter,(randomization parameter)
        self.betamin = 0.8  # beta parameter
        self.gamma = 1  # gamma parameter (light intensity coefficency)
        # sort of fireflies according to fitness value
        self.Index = [0] * self.NP
        self.Fireflies = [[np.random.rand() for i in range(self.D)] for j in range(self.NP)]  # firefly agents,
        self.Fireflies_tmp = [[np.random.rand() for i in range(self.D)] for j in range(
            self.NP)]  # intermediate pop
        self.Fitness = [0.0] * self.NP  # fitness values (Accuracy)
        self.I = [0.0] * self.NP  # light intensity
        self.nbest = [0.0] * self.NP  # the best solution found so far
        self.LB = 0  # lower bound
        self.UB = 1  # upper bound
        self.fbest = None  # the best
        self.evaluations = 0
        self.Fun = function

    def alpha_new(self, a):
        delta = 1.0 - math.pow((math.pow(10.0, -4.0) / 0.9), 1.0 / float(a))
        return (1 - delta) * self.alpha

    def sort_ffa(self):  # implementation of bubble sort

        for i in range(self.NP):
            self.Index[i] = i

        for i in range(0, (self.NP - 1)):
            j = i + 1
            for j in range(j, self.NP):
                if (self.I[i] > self.I[j]):
                    z = self.I[i]  # exchange attractiveness
                    self.I[i] = self.I[j]
                    self.I[j] = z
                    z = self.Fitness[i]  # exchange fitness
                    self.Fitness[i] = self.Fitness[j]
                    self.Fitness[j] = z
                    z = self.Index[i]  # exchange indexes
                    self.Index[i] = self.Index[j]
                    self.Index[j] = z


    def replace_ffa(self):  # replace the old population according to the new Index values
        # copy original population to a temporary area
        for i in range(self.NP):
            for j in range(self.D):
                self.Fireflies_tmp[i][j] = self.Fireflies[i][j]

        # generational selection in the sense of an EA
        for i in range(self.NP):
            for j in range(self.D):
                self.Fireflies[i][j] = self.Fireflies_tmp[self.Index[i]][j]

    def FindLimits(self, k):
        for i in range(self.D):
            if self.Fireflies[k][i] < self.LB:
                self.Fireflies[k][i] = self.LB
            if self.Fireflies[k][i] > self.UB:
                self.Fireflies[k][i] = self.UB

    def move_ffa(self):
        for i in range(self.NP):
            scale = abs(self.UB - self.LB)
            for j in range(self.NP):
                r = 0.0
                for k in range(self.D):
                    r += (self.Fireflies[i][k] - self.Fireflies[j][k]) * \
                        (self.Fireflies[i][k] - self.Fireflies[j][k])
                r = math.sqrt(r)
                if self.I[i] > self.I[j]:  # brighter and more attractive
                    beta0 = 1.0
                    beta = (beta0 - self.betamin) * math.exp(-self.gamma * math.pow(r, 2.0)) + self.betamin
                    for k in range(self.D):
                        r = random.uniform(0, 1)
                        tmpf = self.alpha * (r - 0.5) * scale
                        self.Fireflies[i][k] = self.Fireflies[i][
                            k] * (1.0 - beta) + self.Fireflies_tmp[j][k] * beta + tmpf
            self.FindLimits(i)

    def Run(self):
        while self.evaluations < self.nFES:

            # optional reducing of alpha
            #self.alpha = self.alpha_new(self.nFES/self.NP)
            self.evaluations = self.evaluations + 1
            # evaluate new solutions
            for i in range(self.NP):
                self.Fitness[i] = self.Fun(self.Fireflies[i])

                self.I[i] = self.Fitness[i]


            # ranking fireflies by their light intensit
            self.sort_ffa()
            # replace old population
            self.replace_ffa()
            # move all fireflies to the better locations
            self.move_ffa()

        bestFirefly = self.Fireflies[self.NP - 1]

        return bestFirefly

In [5]:
Phishing = pd.read_csv ("C:/Users/hp/Desktop/NEWDATA.csv")
Phishing.head(3)

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0


In [4]:
Phishing.to_excel(r'C:\Users\hp\Desktop\Firefly_Data.xlsx', sheet_name='FirelyData', index = True)

In [6]:

y = Phishing['phishing'].values
X = Phishing.drop('phishing', axis=1).values

In [7]:
def evaluation(feature_possibilities):
        feature_possibilities = np.round(feature_possibilities)

        feature_possibilities = feature_possibilities > np.float32(0.7)

        selectedX = X[:, feature_possibilities]

        s = svm.SVC(kernel="linear", C=1)

        loocv = LeaveOneOut()
        evaluation = cross_val_score(s, selectedX, y,  cv=loocv)

        return evaluation.mean()

In [8]:
Algorithm = FireflyAlgorithm(evaluation)

In [10]:
Best = Algorithm.Run()

In [14]:
a = np.round(Best)

Feature_Filter = a > np.float32(0.7)

print(Feature_Filter)

[False  True  True  True False  True False False False  True  True False
  True False  True  True  True  True  True  True False False False False
  True False  True  True  True  True False False  True  True False  True
  True False  True False False  True  True  True False  True False  True
  True  True  True False False  True False  True False  True False False
 False False  True  True False False  True False  True False  True False
  True  True  True  True  True False  True False False False False False
  True False  True False  True  True False False False False False  True
 False False  True  True False False  True False  True  True False  True
 False False  True]


In [15]:
#The actual feature selection where best features are given the label 'TRUE'
#Not best features are given the label 'FALSE'
#Best features have an objective score greater than 0.7
Feature_Filter_Appended_to_main_Data = pd.DataFrame(Feature_Filter)
New_Features = Feature_Filter_Appended_to_main_Data.T
New_Features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,False,True,True,True,False,True,False,False,False,True,...,False,True,False,True,True,False,True,False,False,True


In [16]:
X_new = pd.DataFrame(X)
New_Features.to_excel(r'C:\Users\hp\Desktop\Features.xlsx', sheet_name='Features', index = True)

In [17]:
print(Best)

[0.4936804494053556, 0.5640331858507205, 0.9213918536413664, 0.8397352788200455, 0.27852558635790037, 0.5549552355552607, 0.2279516430222417, 0.12134061449246658, 0.22638261980081453, 0.7697781796916838, 0.8915553556719583, 0.42827897031699846, 0.7000285157096126, 0.028525921962541223, 0.5416527329577543, 0.5659082512320062, 0.9814228849005089, 0.6945352236426431, 0.7927486246552883, 0.7895175748827638, 0.4256077251575685, 0.21988010355068266, 0.37503321872395023, 0.4564365905980924, 0.9298678479747491, 0.043749123301875525, 0.6882044780707866, 0.9152439123660097, 0.8071919041919956, 0.8623096187375372, 0.03419583309469232, 0.4704362938607767, 0.5253557091347384, 0.9262395971403914, 0.44831483667028016, 0.6339406792808162, 0.932344347824846, 0.24140159824803487, 0.5272957238255057, 0.13046440638908063, 0.4611275996318892, 0.6741669596952115, 0.7843411866021388, 0.6110574402413614, 0.036876719463489915, 0.6200896614056416, 0.08212735671313276, 0.5463306674724184, 0.689684862668256, 0.69

In [24]:
New_Best = pd.DataFrame(Best)

In [18]:
true_number = np.array(np.unique(Feature_Filter, return_counts=True)).T
bestX = X[:, Feature_Filter]
print(true_number)

[[ 0 54]
 [ 1 57]]


In [2]:
#Elimination of FALSE features was done manually in Excel
#The new dataset with only best features is loaded in this cell
#As per the true number above, the number of best features with scores above 70% was 59
Filtered_Dataset1 = pd.read_csv ("C:/Users/hp/Desktop/SVM_FEATURES.csv")
Filtered_Dataset1.head(4)

Unnamed: 0,qty_dot_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_plus_url,qty_hashtag_url,qty_percent_url,length_url,qty_hyphen_domain,...,qty_exclamation_params,qty_tilde_params,qty_hashtag_params,qty_dollar_params,email_in_url,domain_spf,asn_ip,tls_ssl_certificate,url_google_index,phishing
0,3,0,0,0,0,0,0,0,25,0,...,-1,-1,-1,-1,0,0,60781,0,0,1
1,5,3,0,2,0,0,0,0,223,0,...,0,0,0,0,0,-1,36024,1,0,1
2,2,0,0,0,0,0,0,0,15,0,...,-1,-1,-1,-1,0,0,4766,1,0,0
3,4,0,0,0,0,0,0,0,81,0,...,-1,-1,-1,-1,0,0,20454,1,0,1


In [4]:
#Replace all missing and zero values with NaNs
Filtered_Dataset2 = Filtered_Dataset1.replace(np.nan, 0)
Filtered_Dataset2

Unnamed: 0,qty_dot_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_plus_url,qty_hashtag_url,qty_percent_url,length_url,qty_hyphen_domain,...,qty_exclamation_params,qty_tilde_params,qty_hashtag_params,qty_dollar_params,email_in_url,domain_spf,asn_ip,tls_ssl_certificate,url_google_index,phishing
0,3,0,0,0,0,0,0,0,25,0,...,-1,-1,-1,-1,0,0,60781,0,0,1
1,5,3,0,2,0,0,0,0,223,0,...,0,0,0,0,0,-1,36024,1,0,1
2,2,0,0,0,0,0,0,0,15,0,...,-1,-1,-1,-1,0,0,4766,1,0,0
3,4,0,0,0,0,0,0,0,81,0,...,-1,-1,-1,-1,0,0,20454,1,0,1
4,2,0,0,0,0,0,0,0,19,0,...,-1,-1,-1,-1,0,0,53831,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1623,4,0,0,0,0,0,0,0,47,0,...,-1,-1,-1,-1,0,0,20013,0,0,1
1624,1,0,0,0,0,0,0,0,18,0,...,-1,-1,-1,-1,0,1,46606,1,0,0
1625,1,0,0,0,0,0,0,0,33,1,...,-1,-1,-1,-1,0,0,8560,0,0,1
1626,2,0,0,0,0,0,0,0,20,0,...,-1,-1,-1,-1,0,0,6354,0,0,0


In [12]:
y_data = Filtered_Dataset2['phishing']
X_data1 = Filtered_Dataset2.drop('phishing', axis=1)

In [14]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# creating object
stand= StandardScaler()
# fit data
Fit= stand.fit(X_data1)
# transform data
X_data = Fit.transform(X_data1)
X_data

array([[ 0.68967291, -0.22245027, -0.14775547, ...,  0.73674121,
        -1.04653624, -0.04688897],
       [ 2.36297769,  3.10001793, -0.14775547, ...,  0.13724127,
         0.95553309, -0.04688897],
       [-0.14697947, -0.22245027, -0.14775547, ..., -0.6196828 ,
         0.95553309, -0.04688897],
       ...,
       [-0.98363186, -0.22245027, -0.14775547, ..., -0.52780968,
        -1.04653624, -0.04688897],
       [-0.14697947, -0.22245027, -0.14775547, ..., -0.58122879,
        -1.04653624, -0.04688897],
       [ 1.5263253 ,  0.88503913,  5.865892  , ...,  4.04364069,
        -1.04653624, -0.04688897]])

In [15]:
##We will use two machine learning algorithms to perform classification
#The first one is the random forest classifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix


In [16]:
# Split into training and test set 
##In this cell, we use the random forest classifier to predict whether a link is phishing or not
X_train, X_test, y_train, y_test = train_test_split( 
             X_data, y_data, test_size = 0.4,random_state=5)

In [17]:
from sklearn import svm
#create a classifier
cls = svm.SVC(kernel="linear")
#train the model
cls.fit(X_train,y_train)
#predict the response
pred = cls.predict(X_test)

In [18]:
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[399  39]
 [ 33 181]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.92       438
           1       0.82      0.85      0.83       214

    accuracy                           0.89       652
   macro avg       0.87      0.88      0.88       652
weighted avg       0.89      0.89      0.89       652



In [20]:
import matplotlib.pyplot as plt
y_pred_proba = cls.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  pred_proba)
auc = metrics.roc_auc_score(y_test, pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

AttributeError: predict_proba is not available when  probability=False