In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
import seaborn as sns

In [28]:
# Load the raw feature table from the CSV that sits next to the notebook.
data = pd.read_csv('final_dataset.csv')

In [29]:
# Number of columns in the freshly loaded frame.
data.shape[1]

20

In [30]:
# Preview the first rows of the frame before any cleaning.
data.head()

Unnamed: 0.1,Unnamed: 0,URL,embedded_domain_in_path,ip_address,number_of_dots,host_has_dash,dictionary_word,http_in_hostname,targeted_brand,redirecting url,ext url,Phishing,Subdomains,Prefix-Suffix,Anchor Tag,Tags,Forms,SSL Expiry Date,SSL Cert Check,Alexa Ranking
0,0,https://locking-app-adverds.000webhostapp.com/...,1,-1,-1,1,1,-1,-1,-1,-1,yes,3,-1,N-N,N-N,N-N,253,0,3664.0
1,1,https://mxtoolbox.com/public/upgradev2.aspx?ps...,0,-1,-1,-1,-1,-1,-1,-1,-1,no,2,1,155.3191489,100,0,256,0,6619.0
2,2,http://parolishop.com.br/questionreviews.htm,0,-1,-1,-1,-1,-1,-1,-1,-1,yes,3,1,293.4911243,100,0,36,0,
3,3,http://innovate.ieee.org/innovate/35675?lt=xpl...,1,-1,-1,-1,-1,-1,-1,-1,-1,no,3,1,N-N,N-N,N-N,58,0,717.0
4,4,http://ietbhaddal.edu.in/bb.mobile/mobile/inde...,0,-1,-1,-1,-1,-1,1,-1,-1,yes,3,1,N-N,N-N,N-N,716,1,


# DATA CLEANING

In [31]:
# 'Unnamed: 0' carries no feature information; remove the column in place.
data.drop(columns=['Unnamed: 0'], inplace=True)

In [40]:
# Look for rows whose URL field is the literal text "url", i.e. a header line embedded in the data.
data.loc[data['URL'] == "url", :]

Unnamed: 0,URL,embedded_domain_in_path,ip_address,number_of_dots,host_has_dash,dictionary_word,http_in_hostname,targeted_brand,redirecting url,ext url,Phishing,Subdomains,Prefix-Suffix,Anchor Tag,Tags,Forms,SSL Expiry Date,SSL Cert Check,Alexa Ranking
1705,url,embedded_domain_in_path,ip_address,number_of_dots,host_has_dash,dictionary_word,http_in_hostname,targeted_brand,redirecting url,ext url,result,1,1,N-N,N-N,N-N,-1,-1,


In [48]:
# Remove the embedded header row(s) by matching on the URL value rather than the
# hard-coded label 1705: the mask form is a no-op when nothing matches, so the cell
# can be re-run safely and keeps working if the row order of the CSV changes.
data.drop(index=data.index[data['URL'] == 'url'], inplace=True)

In [52]:
# Column labels remaining after the drops above.
data.columns

Index(['URL', 'embedded_domain_in_path', 'ip_address', 'number_of_dots',
       'host_has_dash', 'dictionary_word', 'http_in_hostname',
       'targeted_brand', 'redirecting url', 'ext url', 'Phishing',
       'Subdomains', 'Prefix-Suffix', 'Anchor Tag', 'Tags', 'Forms',
       'SSL Expiry Date', 'SSL Cert Check', 'Alexa Ranking'],
      dtype='object')

# Cleaning Anchor Tag feature

In [77]:
# Class split of the rows whose 'Anchor Tag' holds the 'N-N' placeholder.
data.loc[data['Anchor Tag'] == 'N-N', 'Phishing'].value_counts()

yes    875
no     179
Name: Phishing, dtype: int64

In [94]:
# Turn the 'N-N' placeholder into NaN. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the selection: the chained in-place
# form acts on an intermediate object and does not update `data` under pandas
# Copy-on-Write (it already emits a FutureWarning in pandas 2.2).
data['Anchor Tag'] = data['Anchor Tag'].replace('N-N', np.nan)

In [104]:
# Cast the column to a floating-point dtype.
data['Anchor Tag'] = data['Anchor Tag'].astype('float64')

In [106]:
# Round every anchor value to zero decimal places.
data['Anchor Tag'] = data['Anchor Tag'].round(0)

In [121]:
def impute_anchors_val(vals, phishing_fill=300, legit_fill=150):
    """Return the anchor value for one row, filling it in when it is missing.

    Parameters
    ----------
    vals : sequence or pandas.Series
        Row whose first element is the 'Anchor Tag' value and whose second
        element is the 'Phishing' label ('yes' / 'no').
    phishing_fill : number, default 300
        Value returned for a missing anchor on a row labelled 'yes'.
    legit_fill : number, default 150
        Value returned for a missing anchor on a row labelled 'no'.

    Returns
    -------
    The original anchor value when it is not null; otherwise the fill value
    matching the label. A missing anchor with any other label is returned
    unchanged (still null).

    NOTE(review): the fill depends on the target label, which is presumably not
    available at prediction time - confirm this is acceptable for the model.
    """
    # Unpack by position. Integer indexing such as vals[0] on a row Series whose
    # index holds column names relies on a deprecated positional fallback in
    # pandas; iterating the row works for Series, lists and tuples alike.
    anc, phish = tuple(vals)[:2]
    if not pd.isnull(anc):
        return anc
    if phish == 'yes':
        return phishing_fill
    if phish == 'no':
        return legit_fill
    return anc

In [124]:
# Fill missing anchor values row by row, using the class label of each row.
data['Anchor Tag'] = data.loc[:, ['Anchor Tag', 'Phishing']].apply(impute_anchors_val, axis='columns')

In [127]:
# Sanity check: list any rows that still have a missing anchor value.
data.loc[data['Anchor Tag'].isna()]

Unnamed: 0,URL,embedded_domain_in_path,ip_address,number_of_dots,host_has_dash,dictionary_word,http_in_hostname,targeted_brand,redirecting url,ext url,Phishing,Subdomains,Prefix-Suffix,Anchor Tag,Tags,Forms,SSL Expiry Date,SSL Cert Check,Alexa Ranking


In [129]:
# Mean anchor value over the rows labelled 'yes'.
data.loc[data['Phishing'] == 'yes', 'Anchor Tag'].mean()

268.5291902071563

In [130]:
# Mean anchor value over the rows labelled 'no'.
data.loc[data['Phishing'] == 'no', 'Anchor Tag'].mean()

162.8074074074074

# Tags feature processing

In [141]:
# Frequency of each distinct value in the 'Tags' column.
data.loc[:, 'Tags'].value_counts()

N-N    1118
100    1024
Name: Tags, dtype: int64

OBSERVATION: It is better to exclude this feature, because little useful information can be extracted from it: every available value is 100, and for more than half of the data points (1118 of 2142) the information is not available ('N-N', the exception case).

# Forms feature processing

In [147]:
# Distribution of the 'Forms' values among rows labelled 'no'.
data.loc[data['Phishing'] == 'no', 'Forms'].value_counts()

1      447
0      442
N-N    175
-1      16
Name: Forms, dtype: int64

In [148]:
# Distribution of the 'Forms' values among rows labelled 'yes'.
data.loc[data['Phishing'] == 'yes', 'Forms'].value_counts()

N-N    875
1      102
0       79
-1       6
Name: Forms, dtype: int64

In [151]:
# Turn the 'N-N' placeholder into NaN. Assign the result back rather than using
# replace(..., inplace=True) on the column selection: the chained in-place form
# acts on an intermediate object and does not update `data` under pandas
# Copy-on-Write (it already emits a FutureWarning in pandas 2.2).
data['Forms'] = data['Forms'].replace('N-N', np.nan)

In [159]:
def form_info_available(form_val):
    """Flag whether any form-link information is available for one row.

    Parameters
    ----------
    form_val : sequence or pandas.Series
        Row whose first element is the 'Forms' value.

    Returns
    -------
    int
        0 when the value is null, 1 for any non-null value (so the original
        0, 1 and -1 entries all map to 1).
    """
    # Take the first element by position. form_val[0] on a row Series indexed by
    # column names relies on a deprecated positional fallback in pandas; iterating
    # the row works for Series, lists and tuples alike.
    val = tuple(form_val)[0]
    return 0 if pd.isnull(val) else 1

In [161]:
# Replace 'Forms' with a 0/1 availability flag, computed row by row.
# Re-running this cell on the already converted column yields all ones,
# because the 0/1 flags are themselves non-null.
data['Forms'] = data.loc[:, ['Forms']].apply(form_info_available, axis='columns')

In [185]:
# Distribution of 'SSL Expiry Date' among rows labelled "yes".
data.loc[data['Phishing'] == "yes", 'SSL Expiry Date'].value_counts()

-1       543
 253      59
 252      33
 39       19
 62       16
 55       16
 2501     10
 35        9
 22        8
 80        8
 85        8
 64        8
 36        8
 43        8
 12        8
 251       7
 42        7
 84        7
 76        7
 79        7
 54        6
 70        6
 243       6
 37        6
 61        6
 44        6
 2500      5
 11        5
 75        5
 41        5
        ... 
 654       1
 676       1
 715       1
 731       1
 742       1
 9043      1
 857       1
 286       1
 282       1
 264       1
 175       1
 98        1
 99        1
 126       1
 2193      1
 148       1
 9157      1
 158       1
 167       1
 179       1
 235       1
 180       1
 182       1
 190       1
 204       1
 218       1
 227       1
 229       1
 6374      1
 154       1
Name: SSL Expiry Date, Length: 141, dtype: int64

# Alexa Ranking Feature Processing

In [196]:
# Median Alexa rank over the rows labelled "no".
data.loc[data['Phishing'] == "no", 'Alexa Ranking'].median()

7538.0

In [197]:
# Median Alexa rank over the rows labelled "yes".
data.loc[data['Phishing'] == "yes", 'Alexa Ranking'].median()

538085.0

In [198]:
def impute_alexa(vals, legit_rank=7500, phishing_rank=538000):
    """Return the Alexa rank for one row, filling it in when it is missing.

    Parameters
    ----------
    vals : sequence or pandas.Series
        Row whose first element is the 'Alexa Ranking' value and whose second
        element is the 'Phishing' label.
    legit_rank : number, default 7500
        Value returned for a missing rank on a row labelled "no".
    phishing_rank : number, default 538000
        Value returned for a missing rank on a row with any other label.

    Returns
    -------
    The original rank when it is not null, otherwise the fill value for the
    row's label.

    NOTE(review): the fill depends on the target label, which is presumably not
    available at prediction time - confirm this is acceptable for the model.
    """
    # Unpack by position. vals[0] on a row Series indexed by column names relies
    # on a deprecated positional fallback in pandas; iterating the row works for
    # Series, lists and tuples alike.
    rank, phis = tuple(vals)[:2]
    if not pd.isnull(rank):
        return rank
    return legit_rank if phis == "no" else phishing_rank

In [201]:
# Fill missing Alexa ranks row by row, using the class label of each row.
data['Alexa Ranking'] = data.loc[:, ['Alexa Ranking', 'Phishing']].apply(impute_alexa, axis='columns')

In [203]:
# Re-check the column labels before moving on to the SSL expiry feature.
data.columns

Index(['URL', 'embedded_domain_in_path', 'ip_address', 'number_of_dots',
       'host_has_dash', 'dictionary_word', 'http_in_hostname',
       'targeted_brand', 'redirecting url', 'ext url', 'Phishing',
       'Subdomains', 'Prefix-Suffix', 'Anchor Tag', 'Tags', 'Forms',
       'SSL Expiry Date', 'SSL Cert Check', 'Alexa Ranking'],
      dtype='object')

In [213]:
# Mean 'SSL Expiry Date' for each class label.
data.groupby('Phishing')['SSL Expiry Date'].mean()

Phishing
no     223.114815
yes    149.979284
Name: SSL Expiry Date, dtype: float64

In [228]:
# Distribution of 'SSL Expiry Date' among rows labelled "yes".
data.loc[data['Phishing'] == "yes", 'SSL Expiry Date'].value_counts()

-1       543
 253      59
 252      33
 39       19
 62       16
 55       16
 2501     10
 35        9
 22        8
 80        8
 85        8
 64        8
 36        8
 43        8
 12        8
 251       7
 42        7
 84        7
 76        7
 79        7
 54        6
 70        6
 243       6
 37        6
 61        6
 44        6
 2500      5
 11        5
 75        5
 41        5
        ... 
 654       1
 676       1
 715       1
 731       1
 742       1
 9043      1
 857       1
 286       1
 282       1
 264       1
 175       1
 98        1
 99        1
 126       1
 2193      1
 148       1
 9157      1
 158       1
 167       1
 179       1
 235       1
 180       1
 182       1
 190       1
 204       1
 218       1
 227       1
 229       1
 6374      1
 154       1
Name: SSL Expiry Date, Length: 141, dtype: int64

In [219]:
# Distribution of 'SSL Expiry Date' among rows labelled "no".
data.loc[data['Phishing'] == "no", 'SSL Expiry Date'].value_counts()

-1      161
 69      78
 68      26
 163     14
 182     13
 219     12
 316     10
 380     10
 67      10
 151     10
 322      9
 626      9
 249      8
 77       8
 315      8
 74       7
 150      7
 434      7
 582      7
 93       7
 176      7
 49       7
 133      7
 34       7
 750      7
 147      6
 193      6
 66       6
 180      6
 175      6
       ... 
 191      1
 294      1
 298      1
 423      1
 351      1
 409      1
 408      1
 406      1
 404      1
 391      1
 390      1
 389      1
 387      1
 382      1
 359      1
 358      1
 348      1
 303      1
 344      1
 341      1
 339      1
 337      1
 333      1
 325      1
 323      1
 317      1
 313      1
 311      1
 307      1
 287      1
Name: SSL Expiry Date, Length: 326, dtype: int64

In [229]:
def impute_expiry(vals, phishing_days=253, legit_days=69):
    """Return the SSL expiry value for one row, replacing the -1 sentinel.

    Parameters
    ----------
    vals : sequence or pandas.Series
        Row whose first element is the 'SSL Expiry Date' value and whose second
        element is the 'Phishing' label.
    phishing_days : number, default 253
        Replacement for -1 on a row labelled 'yes'.
    legit_days : number, default 69
        Replacement for -1 on a row with any other label.

    Returns
    -------
    The original value when it is not -1, otherwise the replacement for the
    row's label.

    The defaults are the most frequent non-sentinel values of each class in the
    value counts displayed in this notebook (253 for 'yes', 69 for 'no'). The
    earlier version returned them the other way round, giving each class the
    other class's mode.

    NOTE(review): the fill depends on the target label, which is presumably not
    available at prediction time - confirm this is acceptable for the model.
    """
    # Unpack by position. vals[0] on a row Series indexed by column names relies
    # on a deprecated positional fallback in pandas; iterating the row works for
    # Series, lists and tuples alike.
    days, phis = tuple(vals)[:2]
    if days == -1:
        return phishing_days if phis == 'yes' else legit_days
    return days

In [231]:
# Replace the -1 sentinel in 'SSL Expiry Date' row by row, using the class label of each row.
data["SSL Expiry Date"] = data.loc[:, ['SSL Expiry Date', 'Phishing']].apply(impute_expiry, axis='columns')

In [232]:
# Preview the first rows after the cleaning and imputation steps.
data.head()

Unnamed: 0,URL,embedded_domain_in_path,ip_address,number_of_dots,host_has_dash,dictionary_word,http_in_hostname,targeted_brand,redirecting url,ext url,Phishing,Subdomains,Prefix-Suffix,Anchor Tag,Tags,Forms,SSL Expiry Date,SSL Cert Check,Alexa Ranking
0,https://locking-app-adverds.000webhostapp.com/...,1,-1,-1,1,1,-1,-1,-1,-1,yes,3,-1,300.0,N-N,0,253,0,3664.0
1,https://mxtoolbox.com/public/upgradev2.aspx?ps...,0,-1,-1,-1,-1,-1,-1,-1,-1,no,2,1,155.0,100,1,256,0,6619.0
2,http://parolishop.com.br/questionreviews.htm,0,-1,-1,-1,-1,-1,-1,-1,-1,yes,3,1,293.0,100,1,36,0,538000.0
3,http://innovate.ieee.org/innovate/35675?lt=xpl...,1,-1,-1,-1,-1,-1,-1,-1,-1,no,3,1,150.0,N-N,0,58,0,717.0
4,http://ietbhaddal.edu.in/bb.mobile/mobile/inde...,0,-1,-1,-1,-1,-1,1,-1,-1,yes,3,1,300.0,N-N,0,716,1,538000.0


In [235]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): 'Tags' is still part of `data` here although the observation in the
# Tags section recommends excluding it - confirm whether it should be dropped first.
data.to_csv("final_cleaned_dataset.csv",index=False)