Sources

Datasets
 - https://www.unb.ca/cic/datasets/url-2016.html
 - https://research.aalto.fi/en/datasets/phishstorm-phishing-legitimate-url-dataset
 - https://www.kaggle.com/teseract/urldataset

Papers
 - https://arxiv.org/pdf/1701.07179.pdf

In [None]:
!unzip urlset.csv.zip

Archive:  urlset.csv.zip
  inflating: urlset.csv              


In [None]:
!unzip archive.zip

Archive:  archive.zip
  inflating: malicious_phish.csv     


In [None]:
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from torchvision import transforms
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [None]:
!wget https://downloads.majestic.com/majestic_million.csv -O majestic_million.csv

--2021-12-04 05:07:51--  https://downloads.majestic.com/majestic_million.csv
Resolving downloads.majestic.com (downloads.majestic.com)... 185.17.199.9
Connecting to downloads.majestic.com (downloads.majestic.com)|185.17.199.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81358592 (78M) [text/csv]
Saving to: ‘majestic_million.csv’


2021-12-04 05:07:56 (19.1 MB/s) - ‘majestic_million.csv’ saved [81358592/81358592]



In [None]:
top1m = pd.read_csv('majestic_million.csv')

In [None]:
# First 10,000 rows of the Majestic Million list, used as known-good
# examples; every one of them gets target 0.
goodsites = top1m.iloc[:10000].Domain.to_list()
goodsites_target = [0] * len(goodsites)

In [None]:
df = pd.read_csv('malicious_phish.csv')

In [None]:
# At most the first 100,000 URLs typed 'benign' in malicious_phish.csv,
# all labelled 0.
benign_urls = df[df.type=='benign'].url.to_list()[:100000]
benign_targets = [0]*len(benign_urls)

In [None]:
# URLs typed 'malware' in malicious_phish.csv, all labelled 1.
# NOTE(review): only the 'malware' type is selected here; rows with any other
# non-benign type are not used — confirm this is intended.
bad_urls1 = df[df.type == 'malware'].url.to_list()
bad_labels1 = [1]*len(bad_urls1)

In [None]:
# Hand-rolled, line-by-line load of urlset.csv: the URL is taken from the
# first comma-separated field and the label from the last one, so a single
# malformed row only costs that row instead of aborting the whole load.
labels = []
urls = []
cnt = 0  # not used in this cell; kept in case another cell reads it
with open('urlset.csv', 'r') as f:
  f.readline()  # skip the header row
  while True:
    try:
      row = f.readline().split(',')
      if row == [''] or row == ['\n']:
        # '' is end-of-file; a completely blank line also ends the load.
        break
      url = row[0].strip('"')
      label = int(float(row[-1].strip('\n')))
    except ValueError:
      # Best-effort: skip rows whose label is not numeric or whose bytes
      # cannot be decoded (UnicodeDecodeError is a ValueError). Anything
      # else, e.g. KeyboardInterrupt, is no longer swallowed.
      continue
    # Append both together so urls and labels always stay aligned.
    urls.append(url)
    labels.append(label)

In [None]:
df3 = pd.read_csv('urldata.csv')

In [None]:
# Balance the urldata.csv sample: keep every row labelled 'bad' plus at most
# as many rows (the first ones in file order) carrying any other label.
# NOTE: despite the names, df3_neg holds the 'bad' rows (target 1) and
# df3_pos holds the remaining rows (target 0).
df3_neg = df3[df3.label == 'bad']
df3_pos = df3[df3.label != 'bad']
df3_pos = df3_pos.iloc[:len(df3_neg)]
df3_subset = pd.concat([df3_neg, df3_pos])

# Labels follow the concat order above: all 'bad' rows first, then the rest.
more_urls = df3_subset.url.to_list()
more_labels = ([1]*len(df3_neg)) + ([0]*len(df3_pos))

## Feature Extraction

Statistical Features.

In this project, we extract lexical features from the URLs, i.e. features that describe the URL string itself: for example, URL length, digit-to-letter ratio, host name, and domain.

Features:
  - link length
  - domain name length
  - number of dots
  - number of hyphens
  - number of numerals in domain name
  - number of semicolons
  - number of underscores
  - number of question marks
  - number of equals signs
  - number of ampersands
  - number of @s
  - number of non-alphanumeric characters
  - ratio of digits to letters

In [None]:
from collections import Counter

In [None]:
def extract_features(url):
    """Turn a URL string into one row of lexical count features.

    The markers ``https://``, ``http://`` and ``www.`` are removed first,
    then character statistics are computed over the remaining text and over
    its host part (everything before the first ``/``).

    Args:
        url: The URL (or bare domain) to featurise.

    Returns:
        A float ``numpy`` array of shape ``(1, 36)`` laid out as
        ``[n_dot_js, n_subdomains, n_perc, n_hyph, n_amp, n_perc,
        n_numbers, n_alpha, n_nonalpha, n_num_in_domain,
        count('a'), ..., count('z')]``.
    """
    # NOTE: str.replace removes *every* occurrence of a marker, not only a
    # leading one, so e.g. a second "http://" inside a query string is
    # dropped as well.
    for prefix in ('https://', 'http://', 'www.'):
        url = url.replace(prefix, '')

    counter = Counter(url)
    n_amp = counter['&']
    n_perc = counter['%']

    # Every character lands in exactly one bucket: letter, numeral that is
    # not a letter, or anything else.
    n_alpha = 0
    n_numbers = 0
    for c in url:
        if c.isalpha():
            n_alpha += 1
        elif c.isnumeric():
            n_numbers += 1
    n_nonalpha = len(url) - n_alpha - n_numbers

    # Host part: everything up to the first '/' (a port, if any, is included).
    domain_name = url.partition('/')[0]
    n_hyph = domain_name.count('-')
    n_subdomains = domain_name.count('.')
    n_num_in_domain = sum(1 for c in domain_name if c.isnumeric())
    n_dot_js = url.count('.js')

    # Only lowercase ASCII letters are counted; uppercase letters are not
    # folded into these columns.
    letter_counts = [counter[ch] for ch in 'abcdefghijklmnopqrstuvwxyz']

    features = [
        n_dot_js,
        n_subdomains,
        n_perc,
        n_hyph,
        n_amp,
        # n_perc appears a second time; the duplicate column is kept so the
        # 36-column layout stays the same for models fitted on it.
        n_perc,
        n_numbers,
        n_alpha,
        n_nonalpha,
        n_num_in_domain,
    ] + letter_counts

    # The builtin float replaces the np.float alias, which was removed from
    # NumPy. Reshape from (36,) to (1, 36) so a single URL can be passed
    # straight to predict() / predict_proba().
    return np.array(features, dtype=float).reshape(1, -1)

In [None]:
# Featurise every URL. The source order used here (bad_urls1, urls,
# more_urls, goodsites, benign_urls) has to match the order in which the
# label lists are concatenated into `targets`.
url_features_list = list(map(extract_features, bad_urls1 + urls + more_urls + goodsites + benign_urls))
# Stack the per-URL (1, n_features) rows into one 2-D array.
url_features = np.concatenate(url_features_list)
# n_samples, n_features
url_features.shape

(389717, 36)

In [None]:
# Labels (1 = bad, 0 = good), concatenated in the same source order as the
# URL lists that were featurised into url_features.
targets = np.array(bad_labels1 + labels + more_labels + goodsites_target + benign_targets)
len(targets)

389717

#### Logistic Regression Model

In [None]:
reg = LogisticRegression(max_iter=2**30)

In [None]:
# 5-fold cross-validation of the logistic-regression model.
# X/Y are the training split and x/y the held-out split; after the loop they
# stay bound to the last fold, which is what the metrics cell below reads.
# NOTE: shuffle=True without random_state means the splits differ per run.
for fold_idx, (train_idx, test_idx) in enumerate(KFold(n_splits=5, shuffle=True).split(url_features)):
  X, x = url_features[train_idx], url_features[test_idx]
  Y, y = targets[train_idx], targets[test_idx]
  reg.fit(X,Y)
  # scikit-learn metrics take (y_true, y_pred), in that order.
  print(f"Fold {fold_idx}, Accuracy Score: {accuracy_score(y, reg.predict(x))}")


Fold 0, Accuracy Score: 0.7828825823668275
Fold 1, Accuracy Score: 0.7835753874576619
Fold 2, Accuracy Score: 0.78373939930462
Fold 3, Accuracy Score: 0.7789538508910357
Fold 4, Accuracy Score: 0.781827745916888


In [None]:
# Metrics for the last cross-validation fold only: x, y and the fitted reg
# are whatever the loop above left behind.
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall are silently swapped, so any output recorded that way
# has the two figures transposed.
reg_pred = reg.predict(x)
print(f"Precision\t: {precision_score(y, reg_pred)}")
print(f"Recall  \t: {recall_score(y, reg_pred)}")
print(f"Accuracy\t: {accuracy_score(y, reg_pred)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with hard 0/1 predictions.
print(f"ROC AUC  \t: {roc_auc_score(y, reg.predict_proba(x)[:, 1])}")

Precision	: 0.6055022474417419
Recall  	: 0.8040128682695564
Accuracy	: 0.781827745916888
ROC AUC  	: 0.7880960160490255


In [None]:
probs = reg.predict_proba(extract_features('https://www.google.com/hellow'))[0]

In [None]:
prob_good = probs[0]
prob_bad = probs[1]

In [None]:
print(prob_good, prob_bad)

0.834227591478219 0.16577240852178102


## Random Forest Model

In [None]:
clf = RandomForestClassifier()

In [None]:
fold_idx = 0
for train_idx, test_idx in KFold(n_splits=5, shuffle=True).split(url_features):
  X, x = url_features[train_idx], url_features[test_idx]
  Y, y = targets[train_idx], targets[test_idx]
  clf.fit(X,Y)
  print(f"Fold {fold_idx}, Accuracy Score: {accuracy_score(clf.predict(x), y)}")
  fold_idx += 1

Fold 0, Accuracy Score: 0.9070685119524191
Fold 1, Accuracy Score: 0.9083552556330778
Fold 2, Accuracy Score: 0.9060806656849149
Fold 3, Accuracy Score: 0.9070528859213931
Fold 4, Accuracy Score: 0.9066382625852479


In [None]:
# Metrics for the last cross-validation fold only: x, y and the fitted clf
# are whatever the loop above left behind.
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall are silently swapped, so any output recorded that way
# has the two figures transposed.
clf_pred = clf.predict(x)
print(f"Precision\t: {precision_score(y, clf_pred)}")
print(f"Recall  \t: {recall_score(y, clf_pred)}")
print(f"Accuracy\t: {accuracy_score(y, clf_pred)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with hard 0/1 predictions.
print(f"ROC AUC  \t: {roc_auc_score(y, clf.predict_proba(x)[:, 1])}")

Precision	: 0.8551971096972216
Recall  	: 0.9303975790462277
Accuracy	: 0.9066382625852479
ROC AUC  	: 0.9102271917176081


## Qualitative analysis

In [None]:
clf.predict(extract_features('http://219.155.142.211:55621/Mozi'))

array([1])

In [None]:
reg.predict(extract_features('facebook.com'))

array([0])

In [None]:
clf.predict(extract_features('https://github.com/sjsucmpe272-fall21/PhishBlocker'))

array([0])

In [None]:
clf.predict(extract_features('https://www.cyberciti.biz/faq/how-to-find-public-ip-address-aws-ec2-or-lightsail-vm/'))

array([0])

In [None]:
clf.predict(extract_features('http://211.137.225.95:47475/Mozi.m'))

array([1])

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import joblib

In [None]:
joblib.dump(reg, 'model.joblib')

['model.joblib']

In [None]:
!mv model_test.joblib drive/MyDrive

mv: cannot stat 'model_test.joblib': No such file or directory


In [None]:
!ls -al

total 178976
drwxr-xr-x 1 root root     4096 Dec  4 05:54 .
drwxr-xr-x 1 root root     4096 Dec  4 05:05 ..
-rw-r--r-- 1 root root 17748901 Dec  4 05:07 archive.zip
drwxr-xr-x 4 root root     4096 Nov 18 14:35 .config
drwx------ 5 root root     4096 Dec  4 05:06 drive
drwxr-xr-x 2 root root     4096 Dec  4 05:10 .ipynb_checkpoints
-rw-r--r-- 1 root root 81358592 Dec  3 06:12 majestic_million.csv
-rw-r--r-- 1 root root 45664439 Jul 23 18:03 malicious_phish.csv
-rw-r--r-- 1 root root     1186 Dec  4 05:54 model.joblib
drwxr-xr-x 1 root root     4096 Nov 18 14:36 sample_data
-rw-r--r-- 1 root root 22774334 Dec  4 05:07 urldata.csv
-rwxrwxrwx 1 root root 12283023 Mar 27  2013 urlset.csv
-rw-r--r-- 1 root root  3400239 Dec  4 05:07 urlset.csv.zip
