In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import sys
import os
!pip install tldextract -q
import tldextract
import warnings
import regex as re
import eli5
from typing import *

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRKAt3ZQq4ZYJEMWpqcILtjKPXToNMeGTu0KaDeeURlhgYv00tdTLMD8WpSx2JseUfEMOcJ8J7Ou3PRZw&usqp=CAU)ashish-pal.medium.com

#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt
import seaborn as sns 
from urllib.parse import urlparse
from nltk.tokenize import RegexpTokenizer

# Install a global filter that ignores every warning raised from here on.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Read the phishing dataset from the Kaggle input directory and preview the first rows.
dataset_path = r'/kaggle/input/web-page-phishing-detection-dataset/dataset_phishing.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Collapse the data to one row per URL, keeping only the label column.
# `first()` keeps a single label for each URL. Summing the label column would
# concatenate string labels (or add numeric ones) whenever a URL occurs more
# than once, which invents classes that do not exist in the data.
df_grp = df.groupby(["url"])[["status"]].first().reset_index()
df_grp.head()

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models

def parse_url(url: str) -> Optional[Dict[str, Optional[str]]]:
    """Split a URL into its components.

    URLs that do not start with ``http://`` or ``https://`` are parsed with a
    temporary ``http://`` prefix so that the host ends up in ``netloc`` rather
    than in ``path``; for those URLs the returned ``scheme`` is ``None``.

    Args:
        url: The URL to parse.

    Returns:
        A dict with the keys ``scheme``, ``netloc``, ``path``, ``params``,
        ``query`` and ``fragment``, or ``None`` when ``url`` is not a string
        or cannot be parsed.
    """
    try:
        has_scheme = url.startswith(('https://', 'http://'))
        parsed_url = urlparse(url if has_scheme else f"http://{url}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: `url` is not a str (e.g. NaN or bytes).
        # ValueError: urlparse rejected the URL (e.g. malformed IPv6 host).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return None

    return {
        # The placeholder prefix is not a real scheme, so report None for it.
        "scheme": parsed_url.scheme if has_scheme else None,
        "netloc": parsed_url.netloc,
        "path": parsed_url.path,
        "params": parsed_url.params,
        "query": parsed_url.query,
        "fragment": parsed_url.fragment,
    }

In [None]:
# Store the component dict (or None) produced by parse_url for every URL.
df_grp["parsed_url"] = df_grp["url"].apply(parse_url)
df_grp

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


# Turn each parsed-URL dict into its own set of columns, then swap those
# columns in for the original 'parsed_url' column.
url_components = df_grp['parsed_url'].apply(pd.Series)
df_grp = pd.concat(
    [df_grp.drop(columns=['parsed_url']), url_components],
    axis=1,
)
df_grp

In [None]:
# Keep only the rows that have a netloc value.
# `.copy()` makes the result an independent frame: the following cells add
# columns to `df_grp`, and doing that on a filtered slice is what pandas
# reports as SettingWithCopyWarning.
df_grp = df_grp[~df_grp.netloc.isnull()].copy()
df_grp

#The first meaningful bit of data to extract is the length of the URL.

In [None]:
# Number of characters in the full URL string.
df_grp["length"] = df_grp["url"].str.len()

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


#The TLD is then extracted using a python library, and if no TLD is present simply add 'None'.

# Public suffix of each host; an empty suffix is stored as the string 'None'.
df_grp["tld"] = (
    df_grp["netloc"]
    .apply(lambda nl: tldextract.extract(nl).suffix)
    .replace('', 'None')
)

In [None]:
#Next is a regex to determine if the URL is an IP address.

# Four dot-separated runs of digits; the whole netloc has to match.
IPV4_PATTERN = r"\d+\.\d+\.\d+\.\d+"
df_grp["is_ip"] = df_grp["netloc"].str.fullmatch(IPV4_PATTERN)

The next few sections relate to certain punctuation in the URL which may be an indicator, one way or another, that a URL is malicious. My reasoning is that typosquatted domains (which are almost always malicious) may contain this punctuation to appear similar to a legitimate domain. Conversely, the path of a legitimate URL may contain more of these characters, as blogs often use underscores in their URLs.

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


# New column -> (source column, character to count in that column).
punctuation_counts = {
    'domain_hyphens': ('netloc', '-'),
    'domain_underscores': ('netloc', '_'),
    'path_hyphens': ('path', '-'),
    'path_underscores': ('path', '_'),
    'slashes': ('path', '/'),
}
for feature_name, (source_column, symbol) in punctuation_counts.items():
    df_grp[feature_name] = df_grp[source_column].str.count(symbol)

Full stops in the path could indicate that there is an attempt to fool a user into thinking a domain is legitimate. For example, attacker.com/paypal.com may be used to trick a user. Full stops may also be a sign of files in the URL, such as shell.exe.

In [None]:
# `Series.str.count` interprets its argument as a regular expression, so the
# dot has to be escaped; an unescaped '.' matches every character and would
# simply return the length of the path.
df_grp['full_stops'] = df_grp.path.str.count(r'\.')

Similar to the previous datapoint, getting the full stops in a subdomain will count how many subdomains are present. Lots may be another visual trick such as paypal.com.attacker.com/

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


def get_num_subdomains(netloc: str) -> int:
    """Count the subdomain labels of ``netloc``.

    The subdomain part is taken from ``tldextract.extract``. An empty
    subdomain yields 0; otherwise the result is the number of dot-separated
    labels in it.
    """
    subdomain = tldextract.extract(netloc).subdomain
    return subdomain.count('.') + 1 if subdomain else 0

# One subdomain count per host.
df_grp['num_subdomains'] = df_grp['netloc'].apply(get_num_subdomains)

As previous notebooks have shown, the lexical features of the URL will be important. In this instance, I have decided to separate the tokens from the path and the domain itself. My thinking here is that the same word in a path and in a domain may have very different meanings. By this I mean that if you see 'paypal' in a URL path, it may be a malicious URL which is trying to seem legitimate, whereas 'paypal' in the domain is more likely to be legitimate.

#Don't change any word in the snippet below! 

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


# Tokens are maximal runs of ASCII letters; digits and punctuation act as separators.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
def tokenize_domain(netloc: str) -> str:
    """Return the letter-only tokens of the host, without its TLD.

    The subdomain and domain parts reported by ``tldextract.extract`` are
    joined with a dot, tokenized with the module-level ``tokenizer`` and
    returned as a single space-separated string.
    """
    parts = tldextract.extract(netloc)
    host_without_tld = f"{parts.subdomain}.{parts.domain}"
    return " ".join(tokenizer.tokenize(host_without_tld))
         
# Space-separated word tokens of each host.
df_grp['domain_tokens'] = df_grp['netloc'].apply(tokenize_domain)

In [None]:
# Space-separated word tokens of each URL path, using the same tokenizer as for the domain.
df_grp['path_tokens'] = df_grp['path'].apply(
    lambda path: " ".join(tokenizer.tokenize(path))
)

In [None]:
# Show the column names that are available at this point.
list(df_grp.columns)

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


#The labels are now extracted and the URL column removed.

# The target is the 'status' column.
df_grp_y = df_grp['status']

# Remove the target, the raw URL and the raw URL components so that only the
# engineered feature columns remain in df_grp.
df_grp = df_grp.drop(columns=[
    'status',
    'url',
    'scheme',
    'netloc',
    'path',
    'params',
    'query',
    'fragment',
])
df_grp

#Training

When using pipelines and vectorizers, you need a converter to feed the vectorizer every word of that column. It cannot add the values one row at a time and so a converter class must be created.

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


class Converter(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens a DataFrame selection to a 1-D array."""

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data_frame):
        """Return the values of ``data_frame`` flattened to one dimension."""
        values = data_frame.values
        return values.ravel()

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every score reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_grp, df_grp_y, test_size=0.2, random_state=42
)

The numeric features need their own pipeline to scale the data. MinMaxScaler was used because MultinomialNB cannot work with negative values.

In [None]:
# Columns that are scaled by the numeric pipeline.
numeric_features = [
    'length',
    'domain_hyphens',
    'domain_underscores',
    'path_hyphens',
    'path_underscores',
    'slashes',
    'full_stops',
    'num_subdomains',
]
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

The categorical features are the TLD and the is_ip flag (domain_tokens and path_tokens are vectorized separately), and one-hot encoding is used for them. Interestingly, there is no difference between using this and converting the TLD and using the TfidfVectorizer. However, using one-hot encoding makes the TLD obvious in the feature importance section.

In [None]:
# Columns that are one-hot encoded; categories not seen during fitting are ignored at transform time.
categorical_features = ['tld', 'is_ip']
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[onehot_step])

In [None]:
# Token columns that are vectorized as text.
vectorizer_features = ['domain_tokens', 'path_tokens']
# Flatten the selected column with Converter, then apply TF-IDF to the token strings.
vectorizer_transformer = Pipeline(
    steps=[
        ('con', Converter()),
        ('tf', TfidfVectorizer()),
    ]
)

CountVectorizer and TfidfVectorizer produce very similar results, but with the best performing model (spoiler: it's SVC) Tfidf slightly improved the score.

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


# NOTE(review): this cell re-declares `vectorizer_features` and
# `vectorizer_transformer` exactly as an earlier cell in this notebook does;
# one of the two definitions looks redundant.
vectorizer_features = ['domain_tokens','path_tokens']
# Flatten the selected column with Converter, then apply TF-IDF to the token strings.
vectorizer_transformer = Pipeline(steps=[
    ('con', Converter()),
    ('tf', TfidfVectorizer())])

The next step is to link all the transformers together in a ColumnTransformer, and create a pipeline for each classifier.

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


# Route every group of columns through its own transformer.
feature_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('domvec', vectorizer_transformer, ['domain_tokens']),
    ('pathvec', vectorizer_transformer, ['path_tokens']),
]
preprocessor = ColumnTransformer(transformers=feature_steps)


def _with_preprocessor(classifier):
    """Chain the shared ``preprocessor`` with ``classifier`` in a Pipeline."""
    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])


svc_clf = _with_preprocessor(LinearSVC())
log_clf = _with_preprocessor(LogisticRegression())
nb_clf = _with_preprocessor(MultinomialNB())

# Fit the three models on the same training split.
svc_clf.fit(X_train, y_train)
log_clf.fit(X_train, y_train)
nb_clf.fit(X_train, y_train)

#Results

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


def results(name: str, model: BaseEstimator) -> None:
    """Print test-set metrics for ``model`` and plot its confusion matrix.

    The evaluation uses the module-level ``X_test`` / ``y_test`` split. The
    function prints the value of ``model.score`` and the classification
    report, then draws the confusion matrix as a heatmap on a new figure.

    Args:
        name: Label used in the printed score line and in the plot title.
        model: Fitted estimator that provides ``predict`` and ``score``.
    """
    preds = model.predict(X_test)

    print(name + " score: %.3f" % model.score(X_test, y_test))
    print(classification_report(y_test, preds))
    # Tick labels are applied by position to the confusion-matrix rows/columns.
    # NOTE(review): assumes exactly two classes, with the benign class sorted first - confirm.
    labels = ['Good', 'Bad']

    conf_matrix = confusion_matrix(y_test, preds)

    # 'normal' is not a font family name, so matplotlib would report that the
    # font cannot be found and fall back to its default; request the generic
    # 'sans-serif' family explicitly instead.
    font = {'family': 'sans-serif',
            'size': 14}

    plt.rc('font', **font)
    plt.figure(figsize=(10, 6))
    sns.heatmap(conf_matrix, xticklabels=labels, yticklabels=labels, annot=True, fmt="d", cmap='Greens')
    plt.title("Confusion Matrix for " + name)
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')

In [None]:
# Report every fitted model in turn.
fitted_models = [
    ("SVC", svc_clf),
    ("Logistic Regression", log_clf),
    ("Naive Bayes", nb_clf),
]
for model_name, fitted_model in fitted_models:
    results(model_name, fitted_model)

"As we see, the SVC performs best out of the three, followed by MultinomialNB. While Logistic Regression performs the worst, we can see it produces fewer false negatives than Naive Bayes."

"Also if numerical features are removed, logistic regression performs better. I wouldn't know why this would be the case and would be interested to hear some ideas for it."

#Feature Importance

Finally, to see what features are most strongly weighted to the SVC classifier I use eli5 to show this. It is worth noting that weights may be high for rarer features and should be taken with a grain of salt.

In [None]:
#Code by Tyler Sullivan and Matthew Franglen https://www.kaggle.com/tylersullivan/classifying-phishing-urls-three-models


def _feature_names_of(transformer, input_features=None):
    """Return the output feature names of a fitted transformer as a list.

    Uses ``get_feature_names_out`` when the transformer provides it and falls
    back to the older ``get_feature_names`` otherwise, so the cell works with
    scikit-learn releases on either side of the removal of the old method.
    """
    getter = getattr(transformer, 'get_feature_names_out', None)
    if getter is None:
        getter = transformer.get_feature_names
    if input_features is None:
        return list(getter())
    return list(getter(input_features=input_features))


fitted_transformers = svc_clf.named_steps['preprocessor'].named_transformers_
onehot_columns = _feature_names_of(
    fitted_transformers['cat'].named_steps['onehot'], categorical_features)
domvect_columns = _feature_names_of(fitted_transformers['domvec'].named_steps['tf'])
pathvect_columns = _feature_names_of(fitted_transformers['pathvec'].named_steps['tf'])

# Despite its name this list ends up holding ALL feature names. The order has
# to match the order in which the preprocessor concatenates its outputs:
# numeric, one-hot, domain tokens, path tokens.
numeric_features_list = list(numeric_features)
numeric_features_list.extend(onehot_columns)
numeric_features_list.extend(domvect_columns)
numeric_features_list.extend(pathvect_columns)
eli5.explain_weights(svc_clf.named_steps['classifier'], top=20, feature_names=numeric_features_list)

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Default colours, Google font names and font sizes for the banner below.
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display ``string`` as an <h1> heading styled with a Google web font.

    A <style> element imports ``font`` (with the 3d-float effect) from Google
    Fonts, and the heading is rendered in that font with the given colour and
    pixel size.
    """
    font_import = ("@import 'https://fonts.googleapis.com/css?family="
                   + font + "&effect=3d-float';")
    inline_style = ("font-family:" + font + "; color:" + fontcolor
                    + "; font-size:" + str(fontsize) + "px;")
    heading = ("<h1 class='font-effect-3d-float' style='" + inline_style
               + "'>%s</h1>" % string)
    markup = "<style>\n    " + font_import + "</style>\n    " + heading
    display(HTML(markup))


dhtml('Thank you Tyler Sullivan and Matthew Franglen for the script')