# ECS 171 Project - Detecting Phishing Websites
# label of 1 means not phishing, label of 0 means phishing

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/phishing_url_website.csv')
df.head(n=20)

Unnamed: 0,URL,Domain,TLD,URLSimilarityIndex,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,Title,DomainTitleMatchScore,URLTitleMatchScore,IsResponsive,HasDescription,HasSocialNet,HasSubmitButton,HasCopyrightInfo,NoOfImage,NoOfJS,NoOfSelfRef,label
0,https://www.southbankmosaics.com,www.southbankmosaics.com,com,100.0,1,0.032,1,558,à¸‚à¹ˆà¸²à¸§à¸ªà¸” à¸‚à¹ˆà¸²à¸§à¸§à¸±à¸™à¸™à¸µ...,0.0,0.0,1,0,0,1,1,34,28,119,1
1,https://www.uni-mainz.de,www.uni-mainz.de,de,100.0,2,0.087,1,618,johannes gutenberg-universitÃ¤t mainz,55.555556,55.555556,0,0,1,1,1,50,8,39,1
2,https://www.voicefmradio.co.uk,www.voicefmradio.co.uk,uk,100.0,2,0.069,1,467,voice fm southampton,46.666667,46.666667,1,1,0,1,1,10,7,42,1
3,https://www.globalreporting.org,www.globalreporting.org,org,100.0,1,0.033,1,1210,gri - home,0.0,0.0,1,1,1,0,1,35,11,86,1
4,https://www.nerdscandy.com,www.nerdscandy.com,com,100.0,1,0.04,1,514,nerds candy,100.0,100.0,1,1,1,1,1,24,22,36,1
5,https://www.hyderabadonline.in,www.hyderabadonline.in,in,100.0,1,0.034,1,2371,hyderabadonline - business listing in hyderaba...,100.0,100.0,1,1,1,1,1,71,9,40,1
6,https://www.aap.org,www.aap.org,org,100.0,1,0.056,1,2730,home,0.0,0.0,1,1,1,0,1,10,12,173,1
7,https://www.religionenlibertad.com,www.religionenlibertad.com,com,100.0,1,0.03,1,2616,religiÃ³n en libertad | noticias de religiÃ³n,55.555556,55.555556,1,1,1,1,1,80,10,376,1
8,http://www.teramill.com,www.teramill.com,com,82.644628,1,0.045,0,2,0,0.0,0.0,0,0,0,0,0,0,0,0,0
9,https://www.aoh61.com,www.aoh61.com,com,100.0,1,0.05,1,5966,0,0.0,0.0,1,0,1,0,0,16,7,4,1


In [3]:
# Equalizing the number of samples for each class
count = df['label'].value_counts()

n = count[0]

df_0 = df[df['label'] == 0]
df_1 = df[df['label'] == 1]
df_0 = df_0.sample(n=n, random_state=42, replace=False)
df_1 = df_1.sample(n=n, random_state=42, replace=False)

df = pd.concat([df_0, df_1])

In [4]:
import re

def tokenizeURL(url):
    tokens = re.split(r'\W+', url)
    tokens = [token.lower() for token in tokens if token]
    return tokens

df['tokenized_url'] = df['URL'].apply(tokenizeURL)
df.head()

Unnamed: 0,URL,Domain,TLD,URLSimilarityIndex,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,Title,DomainTitleMatchScore,...,IsResponsive,HasDescription,HasSocialNet,HasSubmitButton,HasCopyrightInfo,NoOfImage,NoOfJS,NoOfSelfRef,label,tokenized_url
55693,https://safnbyu-8uv.web.app/,safnbyu-8uv.web.app,app,61.717452,4,0.143,1,108,safnbyu-8uvweb,0.0,...,1,0,0,0,0,0,0,0,0,"[https, safnbyu, 8uv, web, app]"
52794,https://stepp17-d761f.web.app/,stepp17-d761f.web.app,app,56.400742,3,0.103,1,9,0,0.0,...,0,0,0,0,0,0,0,0,0,"[https, stepp17, d761f, web, app]"
145305,https://bt8m1f.webwave.dev/,bt8m1f.webwave.dev,dev,47.953216,3,0.111,1,58,bt8m1fwebwave,0.0,...,0,1,0,0,0,0,1,2,0,"[https, bt8m1f, webwave, dev]"
72762,https://www.capitalcomputer.com,www.capitalcomputer.com,com,82.644628,1,0.033,1,9,capitalmputer,0.0,...,0,0,0,0,0,0,0,0,0,"[https, www, capitalcomputer, com]"
28644,https://servizi-id.info/utenza,servizi-id.info,info,47.466762,3,0.1,1,63,servizi-idutenza,0.0,...,1,1,0,0,0,1,1,1,0,"[https, servizi, id, info, utenza]"


In [5]:
from gensim.models import Word2Vec

# Get the tokenized URLs and train the Word2Vec model
tokenized_urls = df['tokenized_url'].tolist()
model = Word2Vec(sentences=tokenized_urls, vector_size=100, window=5, min_count=1, workers=4)

In [6]:
# Function to aggregate token vectors
def aggregate_vectors(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [7]:
# Create URL vectors
df['url_vector'] = df['tokenized_url'].apply(lambda x: aggregate_vectors(x, model))

In [8]:
df.head()

Unnamed: 0,URL,Domain,TLD,URLSimilarityIndex,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,Title,DomainTitleMatchScore,...,HasDescription,HasSocialNet,HasSubmitButton,HasCopyrightInfo,NoOfImage,NoOfJS,NoOfSelfRef,label,tokenized_url,url_vector
55693,https://safnbyu-8uv.web.app/,safnbyu-8uv.web.app,app,61.717452,4,0.143,1,108,safnbyu-8uvweb,0.0,...,0,0,0,0,0,0,0,0,"[https, safnbyu, 8uv, web, app]","[-0.017126203, 0.9795521, -0.13356423, 0.95448..."
52794,https://stepp17-d761f.web.app/,stepp17-d761f.web.app,app,56.400742,3,0.103,1,9,0,0.0,...,0,0,0,0,0,0,0,0,"[https, stepp17, d761f, web, app]","[-0.015961409, 0.9757695, -0.13010903, 0.94992..."
145305,https://bt8m1f.webwave.dev/,bt8m1f.webwave.dev,dev,47.953216,3,0.111,1,58,bt8m1fwebwave,0.0,...,1,0,0,0,0,1,2,0,"[https, bt8m1f, webwave, dev]","[-0.29783905, 0.51891303, 0.19461104, 0.078574..."
72762,https://www.capitalcomputer.com,www.capitalcomputer.com,com,82.644628,1,0.033,1,9,capitalmputer,0.0,...,0,0,0,0,0,0,0,0,"[https, www, capitalcomputer, com]","[-0.35378492, 1.084686, -0.30616823, 0.0597404..."
28644,https://servizi-id.info/utenza,servizi-id.info,info,47.466762,3,0.1,1,63,servizi-idutenza,0.0,...,1,0,0,0,1,1,1,0,"[https, servizi, id, info, utenza]","[0.06294253, 0.6361419, -0.5289027, 0.5133731,..."


In [9]:
# Expand URL vectors into separate columns
url_vector_df = pd.DataFrame(df['url_vector'].to_list(), index=df.index)

# Combine with other features
combined_df = pd.concat([df.drop(columns=['URL', 'tokenized_url', 'url_vector', "TLD", "Domain", "Title"]), url_vector_df], axis=1)

combined_df.head()

Unnamed: 0,URLSimilarityIndex,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,DomainTitleMatchScore,URLTitleMatchScore,IsResponsive,HasDescription,HasSocialNet,...,90,91,92,93,94,95,96,97,98,99
55693,61.717452,4,0.143,1,108,0.0,0.0,1,0,0,...,0.063745,0.711548,-0.270725,-0.061176,1.74511,0.921641,0.846571,-0.052175,0.866068,-0.265184
52794,56.400742,3,0.103,1,9,0.0,0.0,0,0,0,...,0.070377,0.708595,-0.270195,-0.058215,1.736837,0.919246,0.847465,-0.051315,0.862159,-0.258002
145305,47.953216,3,0.111,1,58,0.0,0.0,0,1,0,...,0.817025,1.017004,0.651736,0.122643,2.02669,1.511263,0.158471,-0.606113,0.057375,-0.443249
72762,82.644628,1,0.033,1,9,0.0,0.0,0,0,0,...,1.012025,1.148054,-0.223456,-0.306103,1.240283,1.010656,0.571385,0.217639,0.847472,0.03693
28644,47.466762,3,0.1,1,63,0.0,100.0,1,1,0,...,0.256848,0.835484,0.069049,0.112542,1.334846,0.582669,0.167004,0.026989,0.378382,-0.14933


In [10]:
correlation_matrix = combined_df.corr()

In [None]:
plt.figure(figsize=(75, 75))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.savefig('correlation_matrix.png')
plt.show()

# Display the correlation matrix
print(correlation_matrix)

In [11]:
CORRELATION_THRESHOLD = 0.3

# Get the features that are highly correlated with the target
correlated_features = correlation_matrix['label'][abs(correlation_matrix['label']) > CORRELATION_THRESHOLD].index

# Display the highly correlated features
print(correlated_features)

# Drop all features that are not highly correlated with the target
combined_df = combined_df[correlated_features].drop(columns=['NoOfOtherSpecialCharsInURL', 'URLTitleMatchScore', 'URLSimilarityIndex'])

# Display the new dataframe
combined_df.head()

# change all column labels to strings
combined_df.columns = [str(col) for col in combined_df.columns]

Index([        'URLSimilarityIndex', 'NoOfOtherSpecialCharsInURL',
            'SpacialCharRatioInURL',                    'IsHTTPS',
                       'LineOfCode',      'DomainTitleMatchScore',
               'URLTitleMatchScore',               'IsResponsive',
                   'HasDescription',               'HasSocialNet',
                  'HasSubmitButton',           'HasCopyrightInfo',
                        'NoOfImage',                     'NoOfJS',
                      'NoOfSelfRef',                      'label',
                                  4,                            7,
                                  8,                           10,
                                 12,                           13,
                                 15,                           17,
                                 19,                           20,
                                 22,                           24,
                                 25,                          

In [12]:
combined_df.head()

Unnamed: 0,SpacialCharRatioInURL,IsHTTPS,LineOfCode,DomainTitleMatchScore,IsResponsive,HasDescription,HasSocialNet,HasSubmitButton,HasCopyrightInfo,NoOfImage,...,82,83,84,86,87,89,90,92,98,99
55693,0.143,1,108,0.0,1,0,0,0,0,0,...,-0.525646,0.782921,0.474539,0.761101,0.402395,0.704559,0.063745,-0.270725,0.866068,-0.265184
52794,0.103,1,9,0.0,0,0,0,0,0,0,...,-0.523526,0.782946,0.471213,0.754034,0.402256,0.705806,0.070377,-0.270195,0.862159,-0.258002
145305,0.111,1,58,0.0,0,1,0,0,0,0,...,-1.61647,1.148826,-0.858202,0.299093,0.48005,-0.057359,0.817025,0.651736,0.057375,-0.443249
72762,0.033,1,9,0.0,0,0,0,0,0,0,...,-0.527989,0.11487,1.596691,-0.217577,1.589061,1.070845,1.012025,-0.223456,0.847472,0.03693
28644,0.1,1,63,0.0,1,1,0,0,0,1,...,-0.586474,0.376793,0.90001,0.099361,0.881638,0.402638,0.256848,0.069049,0.378382,-0.14933


In [14]:
print(combined_df.shape[0])

144074


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

RANDOM_STATE = 42

sklearn_models = {
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=combined_df.shape[0]),
    'Support Vector Machine': SVC(random_state=RANDOM_STATE)
}

In [16]:
from sklearn.model_selection import train_test_split

X = combined_df.drop(columns=['label'])
y = combined_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [17]:
def train_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    return accuracy, report, confusion

In [18]:
results = {}

for name, model in sklearn_models.items():
    print(f'Training {name}...')
    y_pred = train_model(model, X_train, y_train, X_test, y_test)
    accuracy, report, confusion = evaluate_model(y_test, y_pred)
    print(f'{name} accuracy: {accuracy}')
    print(report)
    print(confusion)
    print()

    results[name] = {
        'accuracy': accuracy,
        'report': report,
        'confusion': confusion
    }

Training Random Forest...
Random Forest accuracy: 0.9996182543813986
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14561
           1       1.00      1.00      1.00     14254

    accuracy                           1.00     28815
   macro avg       1.00      1.00      1.00     28815
weighted avg       1.00      1.00      1.00     28815

[[14553     8]
 [    3 14251]]

Training Logistic Regression...
Logistic Regression accuracy: 0.9993059170570883
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14561
           1       1.00      1.00      1.00     14254

    accuracy                           1.00     28815
   macro avg       1.00      1.00      1.00     28815
weighted avg       1.00      1.00      1.00     28815

[[14546    15]
 [    5 14249]]

Training Support Vector Machine...
Support Vector Machine accuracy: 0.9819538434842964
              precision    recall  f1-score  