In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as ss # statistical functions
import matplotlib.pyplot as plt # plotting
import seaborn as sns # pretty plots
import math
import warnings

from collections import Counter
from sklearn.model_selection import train_test_split # data splitting

import tensorflow as tf  # building NN model
from tensorflow.keras.layers.experimental import preprocessing # input preprocessing

# model evaluation metrics
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score,confusion_matrix

warnings.filterwarnings('ignore')


In [None]:
# Show the installed TensorFlow version in the notebook output.
# NOTE(review): the preprocessing layers are imported from an `experimental` namespace,
# so this notebook is presumably sensitive to the TensorFlow release — confirm before upgrading.
print(tf.__version__)


### **Phishing** refers to a malicious attempt to acquire a user's personal data by using deceptive emails or webpages.
### The objective of this notebook is to use machine learning to filter out such phishing webpages from the legitimate ones.

#### **This notebook includes the following**

##### 1. Analysis of the features included in the dataset
##### 2. Development of an MLP to determine the legitimacy of a webpage
##### 3. Evaluation of the developed model

In [None]:
# load the full dataset into a single DataFrame
# NOTE(review): hard-coded absolute Kaggle input path — presumably this only resolves inside a
# Kaggle kernel with the dataset attached; adjust the path when running elsewhere.
data = pd.read_csv("/kaggle/input/phishing-dataset-for-machine-learning/Phishing_Legitimate_full.csv")

In [None]:
# list of features: prints each column with its non-null count and dtype
data.info()



### **Feature Analysis**
#### In the following section, we will separate the categorical and numerical features and analyse them separately

In [None]:
# separate the target variable
# `pop` removes the column from `data` in place, so after this cell `data` holds only feature
# columns. Re-running this cell on the same `data` raises KeyError; reload the CSV first.
label = data.pop('CLASS_LABEL')
ids   = data.pop('id')

# list of categorical features (hand-maintained list of column names treated as nominal)
categorical_features = [
    'AtSymbol','TildeSymbol','NoHttps','RandomString','IpAddress','DomainInSubdomains','DomainInPaths',
    'HttpsInHostname','DoubleSlashInPath','EmbeddedBrandName','ExtFavicon','InsecureForms','RelativeFormAction',
    'ExtFormAction','AbnormalFormAction','FrequentDomainNameMismatch','FakeLinkInStatusBar','RightClickDisabled',
    'PopUpWindow','SubmitInfoToEmail','IframeOrFrame','MissingTitle','ImagesOnlyInForm','SubdomainLevelRT',
    'UrlLengthRT','PctExtResourceUrlsRT','AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT'
]

# list of numerical features: every remaining column that is not listed as categorical
numerical_features = [x for x in data.columns if x not in categorical_features]


In [None]:
# report how many columns fall into each feature group
print("Number of Categorical(Nominal) Features : ",len(categorical_features))
print("Number of Numerical(Ordinal) Features : ",len(numerical_features))

In [None]:
# Select the categorical columns and cast all of them to the pandas `category` dtype in one call.
# `DataFrame.astype` returns a new frame, so nothing is assigned column-by-column into a slice
# of `data` (the pattern that triggers SettingWithCopyWarning).
categorical_data = data[categorical_features].astype('category')

# summary statistics for categorical features
categorical_data.describe()




### **Checking Correlation of categorical features with the target variable**

#### The Uncertainty Coefficient (Theil's U) measures the following relation between two nominal random variables X and Y: given X, how well can we predict Y?



In [None]:
def conditional_entropy(x,y):
    """
    Conditional entropy H(X|Y) of two paired sequences, in nats.

    Computed over the observed (x, y) pairs as the sum of
    p(x, y) * ln(p(y) / p(x, y)).
    wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy

    Returns 0.0 when no pairs are observed.
    """
    marginal_y = Counter(y)       # frequency of every distinct value of y
    joint = Counter(zip(x, y))    # frequency of every observed (x, y) pair
    n_obs = sum(marginal_y.values())

    result = 0.0
    for (_, y_value), pair_count in joint.items():
        p_joint = pair_count / n_obs
        p_marginal = marginal_y[y_value] / n_obs
        result += p_joint * math.log(p_marginal / p_joint, math.e)

    return result

In [None]:
def theils_u(x,y):
    """
    Theil's uncertainty coefficient U(X|Y) = (H(X) - H(X|Y)) / H(X).

    wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient

    Returns 1 when X has zero entropy (a single distinct value), which avoids
    dividing by zero.
    """
    cond_entropy = conditional_entropy(x, y)  # H(X|Y)

    # empirical distribution of X and its entropy H(X)
    value_counts = Counter(x)
    n_obs = sum(value_counts.values())
    marginal_entropy = ss.entropy([count / n_obs for count in value_counts.values()])

    if marginal_entropy == 0:
        return 1
    return (marginal_entropy - cond_entropy) / marginal_entropy
    

In [None]:
# Theil's U of `label` given each categorical feature, keyed by feature name

uc_scores = {
    feature: theils_u(label, categorical_data[feature])
    for feature in categorical_features
}

# rank from highest to lowest score; ties are ordered by feature name, also descending
uc_scores = sorted(uc_scores.items(), key=lambda item: (item[1], item[0]), reverse=True)
for name, score in uc_scores:
    print("{} : {}".format(name, score))
    


### The categorical features with the highest correlation with target variable are

#### '**PctExtNullSelfRedirectHyperlinksRT**', '**FrequentDomainNameMismatch**', '**ExtMetaScriptLinkRT**', '**SubmitInfoToEmail**'

In [None]:
# categorical features kept for the model
filtered_cats = ['PctExtNullSelfRedirectHyperlinksRT','ExtMetaScriptLinkRT','FrequentDomainNameMismatch','SubmitInfoToEmail']

# `categorical_data` holds `category` dtype columns; cast the selected ones back to int32 in a
# single call. `astype` returns a new frame, so nothing is assigned column-by-column into a
# slice of `categorical_data` (the pattern that triggers SettingWithCopyWarning).
filtered_cat_data = categorical_data[filtered_cats].astype('int32')

# sample
filtered_cat_data.head()

In [None]:
# analysis of numerical features

# view of `data` restricted to the columns not listed as categorical
numerical_data = data[numerical_features]

# summary statistics for numerical features
numerical_data.describe()

### **Checking Correlation of numerical features with the target variable**

#### The Point Biserial Correlation Coefficient measures the correlation between a dichotomous variable Y and a continuous variable X.


In [None]:
# absolute point-biserial correlation between each numerical feature and the target, keyed by feature name

rpb_scores = {
    feature: abs(ss.pointbiserialr(numerical_data[feature], label).correlation)
    for feature in numerical_features
}

# rank from highest to lowest score; ties are ordered by feature name, also descending
rpb_scores = sorted(rpb_scores.items(), key=lambda item: (item[1], item[0]), reverse=True)
for name, score in rpb_scores:
    print("{} : {}".format(name, score))
    
    


### The numerical features with the highest correlation with the target variable are

#### '**NumDash**', '**PctNullSelfRedirectHyperlinks**', '**NumDots**', '**PctExtHyperlinks**', '**NumSensitiveWords**', '**PathLevel**', '**HostnameLength**', '**NumDashInHostname**', '**NumQueryComponents**'

In [None]:
# filter the above features: numerical columns kept for the model
filtered_nums = ['NumDash', 'PctNullSelfRedirectHyperlinks', 'NumDots', 'PctExtHyperlinks', 'NumSensitiveWords', 'PathLevel', 'HostnameLength', 'NumDashInHostname', 'NumQueryComponents']
# selected directly from `data`; a missing column name raises KeyError here
filtered_num_data = data[filtered_nums]



In [None]:
# merge the filtered categorical and numerical data
# axis=1 places the two frames side by side, aligned on the row index
filtered_data = pd.concat([filtered_num_data,filtered_cat_data],axis=1)
# re-attach the target under the column name 'label', which later cells pop again
filtered_data['label'] = label

In [None]:
# preview the first rows of the merged feature table (selected features plus 'label')
filtered_data.head()




## **Model Creation**

In [None]:
# splitting the data into train, validation and test sets:
# 20% is held out for test, then 20% of the remainder for validation.
# A fixed random_state makes both splits, and every metric computed from them, identical on
# each Restart & Run All.
SPLIT_SEED = 42
train, test = train_test_split(filtered_data, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')



In [None]:
def dataframe_to_dataset(dataframe, shuffle=True, batch_size=32):
    """
    Build a batched tf.data.Dataset of (features, label) pairs from a DataFrame.

    The input frame is copied, so the caller's frame keeps its 'label' column.
    Features are passed as a dict of column name -> values; the 'label' column
    is the target. When `shuffle` is true the shuffle buffer spans the whole frame.
    """
    features = dataframe.copy()
    target = features.pop('label')  # target variable

    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size).prefetch(batch_size)


In [None]:

# creating datasets
batch_size = 256
# only the training set is shuffled (dataframe_to_dataset defaults to shuffle=True)
train_ds = dataframe_to_dataset(train, batch_size=batch_size)
# validation and test keep the row order of their source frames
val_ds = dataframe_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = dataframe_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# preprocessing layer generators

# normalization layer for numerical features
def get_normalization_layer(feature,dataset):
    """
    Return a Normalization layer adapted to one feature of `dataset`.

    `dataset` yields (features_dict, label) elements; only `features_dict[feature]`
    is used to adapt the layer.
    """
    def select_feature(features, _label):
        return features[feature]

    layer = preprocessing.Normalization(axis=None)
    layer.adapt(dataset.map(select_feature))
    return layer

# encoding layer for categorical features to convert class to onehot encodings
def get_encoding_layer(feature,dataset,dtype,max_tokens):
    """
    Return a callable that maps an integer categorical input to its encoded form.

    An IntegerLookup layer (at most `max_tokens` values, out-of-vocabulary value -2)
    is adapted to `features_dict[feature]` from `dataset`, and a CategoryEncoding
    layer is sized to the resulting vocabulary. The returned callable applies the
    lookup first and the encoding second.

    `dtype` is accepted but not used by this function.
    """
    lookup = preprocessing.IntegerLookup(max_values=max_tokens,oov_value=-2)
    lookup.adapt(dataset.map(lambda features, _label: features[feature]))

    vocabulary_size = len(lookup.get_vocabulary())
    category_encoder = preprocessing.CategoryEncoding(max_tokens = vocabulary_size)

    def encode(inputs):
        return category_encoder(lookup(inputs))

    return encode
    

In [None]:
## model preprocessing layers
# Builds one Keras Input per selected feature plus its preprocessing layer.
# `all_inputs` collects the Input tensors and `encoded_features` the preprocessed tensors,
# in the same order (numerical features first, then categorical).

all_inputs = list()
encoded_features = list()

# add input layers for the numerical features
for header in filtered_nums:
    # the Input name matches the feature-dict key produced by dataframe_to_dataset
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    # statistics are adapted on the training dataset only
    normalization_layer = get_normalization_layer(header, train_ds)
    normalized_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(normalized_numeric_col)
    
# add input layers for the categorical features
for header in filtered_cats:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int32')
    # vocabulary is adapted on the training dataset only
    # NOTE(review): max_tokens=5 is a hard-coded cap — presumably larger than the number of
    # distinct values in each selected feature; confirm against the data.
    encoding_layer = get_encoding_layer(header, train_ds, dtype='int32',max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)



In [None]:
# model layers

# join every preprocessed feature tensor into one input vector
all_features = tf.keras.layers.concatenate(encoded_features)

x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
# single unit with no activation: the model outputs a raw logit, not a probability
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The string "accuracy" resolves to binary accuracy with a 0.5 threshold, which
              # assumes probabilities. The output here is a logit, so the decision boundary
              # is 0.0 (sigmoid(0) == 0.5). name="accuracy" keeps the history keys
              # 'accuracy' / 'val_accuracy' unchanged.
              metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)])


In [None]:
# draw the model graph with tensor shapes, laid out left-to-right (rankdir="LR")
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")







## **Model Evaluation**

In [None]:
# train on the training dataset and evaluate on the validation dataset after every epoch;
# `history.history` holds the per-epoch loss/metric values.
# NOTE(review): fixed 500 epochs with no callbacks — consider early stopping if validation
# loss plateaus or rises.
history = model.fit(train_ds, epochs=500, validation_data=val_ds)

In [None]:
# training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# the second curve is the 'val_accuracy' series, i.e. the validation split — not the test set
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# training vs. validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# the second curve is the 'val_loss' series, i.e. the validation split — not the test set
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# evaluation on test data
# evaluate() returns the loss followed by the compiled metrics (a single metric here)
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)


### **Evaluation Metrics**

In [None]:
# ground-truth labels of the test split
testy = test['label']
# test_ds was built with shuffle=False, so predictions follow the row order of `test`
y_pred = model.predict(test_ds)
# The model is compiled with from_logits=True, so its outputs are logits:
# a positive logit corresponds to a sigmoid probability above 0.5.
# Thresholding the flattened array in one vectorized step yields the same list of 0/1 ints
# as a per-row Python loop.
yhat_classes = (np.ravel(y_pred) > 0).astype(int).tolist()
        

In [None]:
# classification metrics comparing the test labels with the thresholded predictions
accuracy = accuracy_score(testy, yhat_classes)      # (tp + tn) / (p + n)
precision = precision_score(testy, yhat_classes)    # tp / (tp + fp)
recall = recall_score(testy, yhat_classes)          # tp / (tp + fn)
f1 = f1_score(testy, yhat_classes)                  # 2 tp / (2 tp + fp + fn)
matrix = confusion_matrix(testy, yhat_classes)

# print the scalar metrics in a fixed order, one per line
report_lines = (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
)
for template, value in report_lines:
    print(template % value)

print("Confusion Matrix : ")
print(matrix)