# Perceptron Learning for Phishing Website Classification

In [1]:
%run Perceptron.py

In [2]:
from urllib.request import urlopen
import numpy as np
import pandas as pd

## Reading the Data

In [3]:
def parse_row(line, len_row):
    """Turn one comma-separated line of integers into a fixed-length vector.

    Parameters
    ----------
    line : str
        Text of the form "v0,v1,...". Each field is converted with ``int``,
        which tolerates surrounding whitespace such as a trailing newline.
    len_row : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len_row``. Positions beyond the number of
        fields found in ``line`` remain 0.

    Raises
    ------
    ValueError
        If a field is not a valid integer literal.
    IndexError
        If ``line`` holds more than ``len_row`` fields.
    """
    parsed = np.zeros(len_row)
    tokens = line.split(",")
    for position in range(len(tokens)):
        parsed[position] = int(tokens[position])
    return parsed

num_cols = 0
columns = []
rows = []
len_attr = len('@attribute')


# Stream the ARFF file line by line: "@attribute" header lines supply the
# column names, and lines that begin with -1, 0 or 1 are treated as data rows.
# Every other line (comments, @relation, @data, blanks) is ignored.
with urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/00327/Training%20Dataset.arff") as f:
    for line in f:
        line = line.decode('utf-8')
        if line.startswith(("-1","0","1")):
            # Assumes every @attribute line precedes the first data row, so
            # num_cols already equals the full column count here.
            rows.append(parse_row(line, num_cols))
        elif line.startswith("@attribute "):
            # The column name is the first whitespace-delimited token after the
            # keyword. Cutting at "{" first also handles a value list written
            # directly against the name ("name{-1,1}"). The previous
            # find("{}") never matched and only worked by slicing off the
            # final character of the line.
            col_name = line[len_attr:].split("{", 1)[0].split()[0]
            columns.append(col_name)
            num_cols += 1

df = pd.DataFrame(data=rows, columns=columns)

df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0,1.0,-1.0,0.0,1.0,-1.0,1.0,...,1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,1.0,-1.0
2,1.0,0.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,0.0,-1.0,-1.0
3,1.0,0.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0
4,1.0,0.0,-1.0,1.0,1.0,-1.0,1.0,1.0,-1.0,1.0,...,-1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,1.0,1.0


## Initial Summary Statistics

In [13]:
# (number of rows, number of columns) of the parsed data set.
df.shape

(11055, 31)

In [12]:
# Per-column mean, standard deviation and median (the 50% quantile reported by
# describe()), transposed so that each data-set column becomes one row.
(
    df.describe()
    .loc[["mean", "std", "50%"]]
    .transpose()
    .rename(columns={"50%" : "median"})
)

Unnamed: 0,mean,std,median
having_IP_Address,0.313795,0.949534,1.0
URL_Length,-0.633198,0.766095,-1.0
Shortining_Service,0.738761,0.673998,1.0
having_At_Symbol,0.700588,0.713598,1.0
double_slash_redirecting,0.741474,0.671011,1.0
Prefix_Suffix,-0.734962,0.678139,-1.0
having_Sub_Domain,0.063953,0.817518,0.0
SSLfinal_State,0.250927,0.911892,1.0
Domain_registeration_length,-0.336771,0.941629,-1.0
Favicon,0.628584,0.777777,1.0


In [20]:
# Pairwise correlation between all columns of df (pandas default settings).
df.corr()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
having_IP_Address,1.0,-0.052411,0.403461,0.158699,0.397389,-0.005257,-0.080745,0.071414,-0.022739,0.087025,...,0.096882,0.054694,-0.010446,-0.050733,0.002922,-0.091774,0.029153,-0.339065,-0.019103,0.09416
URL_Length,-0.052411,1.0,-0.097881,-0.075108,-0.081247,0.055247,0.003997,0.048754,-0.221892,-0.042497,...,-0.049381,-0.013838,0.179426,-0.040823,0.008993,0.183518,0.002902,-0.022987,-0.067153,0.05743
Shortining_Service,0.403461,-0.097881,1.0,0.104447,0.842796,-0.080471,-0.041916,-0.061426,0.060923,0.006101,...,0.036616,0.016581,-0.052596,0.436064,-0.047074,0.014591,0.155844,-0.19841,0.085461,-0.067966
having_At_Symbol,0.158699,-0.075108,0.104447,1.0,0.08696,-0.011726,-0.058976,0.03122,0.015522,0.304899,...,0.290893,0.28441,-0.005499,-0.047872,0.032918,-0.064735,0.037061,-0.00608,-0.080357,0.052948
double_slash_redirecting,0.397389,-0.081247,0.842796,0.08696,1.0,-0.08559,-0.043079,-0.0362,0.047464,0.0351,...,0.054463,0.010459,-0.050107,0.431409,-0.062369,-0.003132,0.178415,-0.194165,0.07039,-0.038608
Prefix_Suffix,-0.005257,0.055247,-0.080471,-0.011726,-0.08559,1.0,0.087891,0.261391,-0.096799,-0.007504,...,-0.014733,-0.036904,0.074116,-0.016556,0.110598,-0.006834,0.067781,0.067423,-0.002763,0.348606
having_Sub_Domain,-0.080745,0.003997,-0.041916,-0.058976,-0.043079,0.087891,1.0,0.267649,-0.082839,-0.016704,...,-0.025312,0.010637,0.119254,0.125493,-0.005764,0.12073,0.057673,-0.010526,0.081627,0.298323
SSLfinal_State,0.071414,0.048754,-0.061426,0.03122,-0.0362,0.261391,0.267649,1.0,-0.193622,-0.014757,...,-0.013005,-0.002773,0.162809,0.050972,0.258768,0.074545,0.096051,-0.01171,0.063411,0.714741
Domain_registeration_length,-0.022739,-0.221892,0.060923,0.015522,0.047464,-0.096799,-0.082839,-0.193622,1.0,0.054253,...,0.05141,0.004393,-0.062851,-0.010477,-0.134454,-0.059898,-0.039766,0.122672,-0.002212,-0.225789
Favicon,0.087025,-0.042497,0.006101,0.304899,0.0351,-0.007504,-0.016704,-0.014757,0.054253,1.0,...,0.939633,0.627607,-0.002628,0.088211,-0.050922,0.011699,-0.016668,-0.127243,0.300917,-0.00028


For specific information on how these categorical variables were determined, refer to the "Phishing Website Features" file.

In [None]:
# Build the integer matrix from the DataFrame parsed above. np.array copies,
# so shuffling `data` in place leaves `df` untouched. (The earlier version read
# `data` and `meta` before either was defined and failed on a fresh kernel.)
data = np.array(df.values, dtype=np.int8)
num_examples = data.shape[0]
# Every column except the last is a feature; the last column is the label.
num_features = data.shape[1] - 1
# Shuffle rows with a fixed seed so the split is identical on every
# Restart & Run All.
np.random.RandomState(0).shuffle(data)
# 70% of the rows go to training; half of the remainder goes to the dev set
# (`test_examples` is the dev-set size) and whatever is left is the test set.
train_examples = round(num_examples*.7)
test_examples = ((num_examples - train_examples) // 2)
train_labels = data[:train_examples,num_features]
dev_labels = data[train_examples:train_examples+test_examples,num_features]
test_labels = data[train_examples+test_examples:,num_features]
train_data = data[:train_examples,:num_features]
dev_data = data[train_examples:train_examples+test_examples,:num_features]
test_data = data[train_examples+test_examples:,:num_features]

In [None]:
# Train the simple perceptron; this is the only variant here that is also given
# the dev split.
# NOTE(review): SimplePerceptron is not defined in this notebook (presumably it
# comes from Perceptron.py); element [0] of train()'s result is kept as the
# weights — confirm against that file.
PhishingSimplePerceptron = SimplePerceptron(num_features, epochs=100)
SimpleWeights = PhishingSimplePerceptron.train(train_data, train_labels, dev_data, dev_labels)[0]

In [None]:
# Train the dynamic perceptron on the training split only.
# NOTE(review): DynamicPerceptron is not defined in this notebook (presumably
# it comes from Perceptron.py); element [0] of train()'s result is kept as the
# weights — confirm against that file.
PhishingDynamicPerceptron = DynamicPerceptron(num_features, epochs=100)
DynamicWeights = PhishingDynamicPerceptron.train(train_data, train_labels)[0]

In [None]:
# Train the margin perceptron on the training split only.
# NOTE(review): MarginPerceptron is not defined in this notebook (presumably it
# comes from Perceptron.py); element [0] of train()'s result is kept as the
# weights — confirm against that file.
PhishingMarginPerceptron = MarginPerceptron(num_features, epochs=100)
MarginWeights = PhishingMarginPerceptron.train(train_data, train_labels)[0]

In [None]:
# Train the averaged perceptron on the training split only.
# NOTE(review): AveragedPerceptron is not defined in this notebook (presumably
# it comes from Perceptron.py); element [0] of train()'s result is kept as the
# weights — confirm against that file.
PhishingAveragedPerceptron = AveragedPerceptron(num_features, epochs=100)
AveragedWeights = PhishingAveragedPerceptron.train(train_data, train_labels)[0]

In [None]:
# Train the aggressive-margin perceptron on the training split only.
# NOTE(review): AggressiveMarginPerceptron is not defined in this notebook
# (presumably it comes from Perceptron.py); element [0] of train()'s result is
# kept as the weights — confirm against that file.
PhishingAggressiveMarginPerceptron = AggressiveMarginPerceptron(num_features, epochs=100)
AggressiveMarginWeights = PhishingAggressiveMarginPerceptron.train(train_data, train_labels)[0]

In [None]:
# Evaluate every trained perceptron on the held-out test split.
# NOTE(review): a notebook cell only displays the value of its last expression,
# so if test() returns its result instead of printing it, the first four
# results are discarded — confirm against Perceptron.py.
PhishingSimplePerceptron.test(test_data, test_labels)
PhishingDynamicPerceptron.test(test_data, test_labels)
PhishingMarginPerceptron.test(test_data, test_labels)
PhishingAveragedPerceptron.test(test_data, test_labels)
PhishingAggressiveMarginPerceptron.test(test_data, test_labels)