In [28]:
#Import packages
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [29]:
#Read the phishing dataset into a DataFrame (path is relative to the current working directory)
df = pd.read_csv('./Phishing.csv')

In [30]:
#Preview the first rows of the dataset
df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [31]:
#List the distinct values that appear in the 'Result' column
df['Result'].unique()

array([-1,  1])

The 'Result' column identifies an email as phishing (value = 1) or non-phishing (value = -1).
We want to call the 'Result' column 'Class' instead.
Also, we want to change the -1 values to 0 because negative values can affect model performance.
In the next code block we take care of both of these issues.

In [32]:
#Create new column 'Class' from 'Result', replacing every -1 with 0.
#The column is derived in a single assignment rather than via chained indexing
#(df['Class'][mask] = 0), which triggers SettingWithCopyWarning and does not
#modify df when pandas Copy-on-Write is enabled. Deriving from 'Result' also
#makes the cell idempotent: re-running it always yields the same 'Class' values.
df['Class'] = df['Result'].replace(-1, 0)

Let's reuse some of our code from Chapter 2 to validate the 'Class' column we just made.

In [33]:
#Tally the rows for each distinct value of the original 'Result' column
phishing_results = dict(Counter(df['Result']))
ds = pd.Series(phishing_results, name='Num_Observations').rename_axis('Class')
phishing_summary = ds.reset_index()
phishing_summary

Unnamed: 0,Class,Num_Observations
0,-1,4898
1,1,6157


In [34]:
#Tally the rows for each distinct value of the new 'Class' column
phishing_results = dict(Counter(df['Class']))
ds = pd.Series(phishing_results, name='Num_Observations').rename_axis('Class')
phishing_summary = ds.reset_index()
phishing_summary

Unnamed: 0,Class,Num_Observations
0,0,4898
1,1,6157


We can see that we have successfully created and changed the 'Class' column.

Finally, we check if the dataset contains any null (i.e. missing) values.

In [57]:
#Count the null (missing) values in each column; a count of 0 means the column has none
df.isnull().sum()

having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Result    

We can see that the data does NOT have any missing values.

Let's also take a look at all of the distinct values for the columns to double-check the null analysis and see if there's any other possible weirdness with the data.

In [59]:
def summarize_df(df,column_name):
    '''Output a dataframe containing the row count for each unique value
    in column column_name for dataframe df.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarize.
    column_name : label
        Name of the column in df whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Two columns: column_name (each distinct value) and
        'Num_Observations' (number of rows holding that value).
        Missing values are reported as a single NaN row.
    '''
    # dropna=False groups all missing values into one bucket. Counting with
    # collections.Counter would instead treat every NaN as its own key,
    # because NaN never compares equal to itself, producing one row per NaN.
    # sort=False skips the default ordering by frequency.
    counts = df[column_name].value_counts(sort=False, dropna=False)
    # Set both labels explicitly: the default name of the Series returned by
    # value_counts differs between pandas versions.
    counts = counts.rename('Num_Observations').rename_axis(column_name)
    return counts.reset_index()

#Show the value-count table for every column of df, one table per column
for column_name in df.columns:
    column_summary = summarize_df(df, column_name)
    display(column_summary)


Unnamed: 0,having_IP_Address,Num_Observations
0,-1,3793
1,1,7262


Unnamed: 0,URL_Length,Num_Observations
0,1,1960
1,0,135
2,-1,8960


Unnamed: 0,Shortining_Service,Num_Observations
0,1,9611
1,-1,1444


Unnamed: 0,having_At_Symbol,Num_Observations
0,1,9400
1,-1,1655


Unnamed: 0,double_slash_redirecting,Num_Observations
0,-1,1429
1,1,9626


Unnamed: 0,Prefix_Suffix,Num_Observations
0,-1,9590
1,1,1465


Unnamed: 0,having_Sub_Domain,Num_Observations
0,-1,3363
1,0,3622
2,1,4070


Unnamed: 0,SSLfinal_State,Num_Observations
0,-1,3557
1,1,6331
2,0,1167


Unnamed: 0,Domain_registeration_length,Num_Observations
0,-1,7389
1,1,3666


Unnamed: 0,Favicon,Num_Observations
0,1,9002
1,-1,2053


Unnamed: 0,port,Num_Observations
0,1,9553
1,-1,1502


Unnamed: 0,HTTPS_token,Num_Observations
0,-1,1796
1,1,9259


Unnamed: 0,Request_URL,Num_Observations
0,1,6560
1,-1,4495


Unnamed: 0,URL_of_Anchor,Num_Observations
0,-1,3282
1,0,5337
2,1,2436


Unnamed: 0,Links_in_tags,Num_Observations
0,1,2650
1,-1,3956
2,0,4449


Unnamed: 0,SFH,Num_Observations
0,-1,8440
1,1,1854
2,0,761


Unnamed: 0,Submitting_to_email,Num_Observations
0,-1,2014
1,1,9041


Unnamed: 0,Abnormal_URL,Num_Observations
0,-1,1629
1,1,9426


Unnamed: 0,Redirect,Num_Observations
0,0,9776
1,1,1279


Unnamed: 0,on_mouseover,Num_Observations
0,1,9740
1,-1,1315


Unnamed: 0,RightClick,Num_Observations
0,1,10579
1,-1,476


Unnamed: 0,popUpWidnow,Num_Observations
0,1,8918
1,-1,2137


Unnamed: 0,Iframe,Num_Observations
0,1,10043
1,-1,1012


Unnamed: 0,age_of_domain,Num_Observations
0,-1,5189
1,1,5866


Unnamed: 0,DNSRecord,Num_Observations
0,-1,3443
1,1,7612


Unnamed: 0,web_traffic,Num_Observations
0,-1,2655
1,0,2569
2,1,5831


Unnamed: 0,Page_Rank,Num_Observations
0,-1,8201
1,1,2854


Unnamed: 0,Google_Index,Num_Observations
0,1,9516
1,-1,1539


Unnamed: 0,Links_pointing_to_page,Num_Observations
0,1,4351
1,0,6156
2,-1,548


Unnamed: 0,Statistical_report,Num_Observations
0,-1,1550
1,1,9505


Unnamed: 0,Result,Num_Observations
0,-1,4898
1,1,6157


Unnamed: 0,Class,Num_Observations
0,0,4898
1,1,6157


From this analysis we can see that the values seem to be well behaved and match the data spec.

# Extra stuff

In [56]:
#Show only the columns that contain at least one missing value,
#checked once with isnull() and once with isna()
null_counts = df.isnull().sum()
display(null_counts[null_counts > 0])
na_counts = df.isna().sum()
display(na_counts[na_counts > 0])

Series([], dtype: int64)

Series([], dtype: int64)