In [1]:
#Import packages
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
#Load the phishing dataset from the CSV file next to this notebook
df = pd.read_csv(filepath_or_buffer='./Phishing.csv')

In [3]:
#Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [4]:
#List the distinct label values present in the 'Result' column
pd.unique(df['Result'])

array([-1,  1])

The 'Result' column identifies an email as phishing (value = 1) or non-phishing (value = -1).
We want to call the 'Result' column 'Class' instead.
Also, we want to change the -1 values to 0 because negative values can affect model performance.
In the next code block we take care of both of these issues.

In [5]:
#Create new column 'Class' from 'Result', recoding the -1 label as 0.
#This is done in one vectorised step instead of chained indexing
#(df['Class'][mask] = 0): the chained form assigns through an intermediate
#Series, which triggers SettingWithCopyWarning and leaves df unmodified
#when pandas Copy-on-Write is enabled.
df['Class'] = df['Result'].replace(-1, 0)

Let's reuse some of our code from Chapter 2 to validate the 'Class' column we just made.

In [6]:
#Tally the number of observations for each value of the original 'Result' column
phishing_results = dict(Counter(df['Result']))
ds = pd.Series(data=phishing_results, name='Num_Observations').rename_axis('Class')
phishing_summary = ds.reset_index()
phishing_summary

Unnamed: 0,Class,Num_Observations
0,-1,4898
1,1,6157


In [7]:
#Tally the number of observations for each value of the recoded 'Class' column
phishing_results = dict(Counter(df['Class']))
ds = pd.Series(data=phishing_results, name='Num_Observations').rename_axis('Class')
phishing_summary = ds.reset_index()
phishing_summary

Unnamed: 0,Class,Num_Observations
0,0,4898
1,1,6157


We can see that we have successfully created and changed the 'Class' column.

Finally, we check whether the dataset contains any null (i.e. missing) values.

In [8]:
#Per-column total of missing entries (isna flags each missing cell, sum adds the flags up)
df.isna().sum()

having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Result    

We can see that the data does NOT have any missing values.

# Split the data into test and training sets

In [10]:
#Build the label set: a one-column DataFrame holding only 'Class'
y = df['Class'].to_frame()
y.head(n=5)

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,1


In [11]:
#Build the predictor set: every column except the two label columns
X = df.drop(labels=['Class', 'Result'], axis='columns')
X.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,1,-1,-1,-1,-1,1,1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,1,-1,-1,0,-1,1,1,1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,1,-1,1,-1,1,0,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,1,-1,1,-1,1
4,1,0,-1,1,1,-1,1,1,-1,1,...,1,-1,1,-1,-1,0,-1,1,1,1


In [12]:
#Hold out 20% of the rows for testing. Stratifying on y keeps the class
#proportions the same in both partitions, and the fixed random_state makes
#the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    stratify=y,
)

In [13]:
#Rebuild the one-column label DataFrame from 'Class'
#NOTE(review): this appears to repeat the label construction already done
#before the train/test split; the cell looks redundant and can probably be removed.
y = df['Class'].to_frame()

Next, let's preview the outputs of the split.

In [14]:
#Rebuild the predictor DataFrame (all columns except the two label columns)
#NOTE(review): this appears to repeat the predictor construction already done
#before the train/test split; the cell looks redundant and can probably be removed.
X = df.drop(labels=['Class', 'Result'], axis='columns')

In [15]:
#First ten rows of the training predictors
X_train.head(n=10)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
3339,1,1,1,1,1,-1,-1,1,-1,-1,...,1,1,1,-1,1,1,1,1,0,-1
7971,1,1,1,1,1,-1,0,-1,-1,1,...,1,1,1,1,1,0,-1,1,0,1
9448,1,-1,1,1,1,-1,1,1,-1,1,...,1,1,1,-1,1,-1,-1,-1,0,1
4277,1,-1,1,1,1,-1,1,1,1,1,...,1,1,1,1,1,-1,-1,1,0,1
575,1,-1,1,1,1,1,-1,1,-1,1,...,1,1,1,1,1,1,1,1,1,1
9673,1,-1,1,1,1,-1,0,1,-1,1,...,1,1,1,1,1,-1,-1,1,0,1
8852,1,-1,1,1,1,-1,1,1,-1,1,...,1,1,1,1,1,0,-1,-1,0,1
1377,1,1,1,1,1,-1,1,1,-1,1,...,1,1,1,1,1,0,-1,-1,0,1
4466,1,-1,1,1,1,-1,-1,1,-1,1,...,1,1,1,-1,1,1,1,-1,0,1
6830,1,-1,1,1,1,-1,1,1,-1,1,...,1,1,1,-1,1,1,1,1,0,1


In [16]:
#First ten rows of the test predictors
X_test.head(n=10)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
6526,-1,-1,1,1,1,-1,1,-1,1,1,...,1,1,1,-1,1,-1,1,1,0,1
1210,1,-1,1,1,1,-1,0,1,1,1,...,1,1,1,-1,1,-1,-1,1,0,1
2314,-1,-1,-1,1,-1,-1,0,0,1,1,...,1,1,1,1,1,-1,-1,1,0,1
2153,1,1,1,1,1,-1,1,1,-1,-1,...,-1,-1,-1,1,-1,0,-1,1,1,-1
5133,-1,-1,-1,1,1,-1,0,-1,1,1,...,1,1,1,-1,1,0,1,1,1,-1
7711,-1,-1,1,1,1,-1,1,0,-1,1,...,1,1,1,-1,1,0,-1,1,0,-1
2392,-1,1,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,-1,0,-1,1,1,1
1640,1,-1,1,1,1,1,-1,1,-1,1,...,1,1,1,1,1,1,1,1,0,1
3960,1,-1,1,1,1,-1,0,1,-1,-1,...,1,-1,1,-1,-1,1,-1,1,1,-1
3045,-1,-1,-1,1,-1,-1,-1,1,1,1,...,1,1,1,1,-1,1,-1,1,1,-1


In [17]:
#First five labels of the test partition
y_test.head(n=5)

Unnamed: 0,Class
6526,0
1210,1
2314,0
2153,1
5133,0


In [18]:
#First five labels of the training partition
y_train.head(n=5)

Unnamed: 0,Class
3339,1
7971,0
9448,1
4277,1
575,1
