In [None]:
# Mount Google Drive (Colab-only helper) so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np

In [None]:
import torch
import pickle
import torch.nn as nn
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the URL dataset from the mounted Drive; index_col=False tells pandas
# not to use the first column as the index.
url_df = pd.read_csv('/content/drive/MyDrive/phishing_site_urls.csv', index_col=False)

In [None]:
# Show the last 7 rows of the freshly loaded frame.
url_df.tail(7)

Unnamed: 0,URL,Label
549339,free.ulohapp.info/?oq=CEh3h_PskJLFZaQWwjEKBegU...,bad
549340,mol.com-ho.me/cv_itworx.doc,bad
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [None]:
# Tokenizer whose tokens are runs of ASCII letters/digits; any other character separates tokens.
tokenizer = RegexpTokenizer(r'[A-Za-z0-9]+')
# English Snowball stemmer, applied to each token further down.
stemmer = SnowballStemmer('english')
def join(l):
  """Concatenate the items of ``l`` into one string, each followed by a space.

  Every item is converted with ``str``. The result ends with a trailing
  space when ``l`` is non-empty and is ``''`` when ``l`` is empty.
  """
  return ''.join(str(item) + ' ' for item in l)

In [None]:
# Raw URL column as a NumPy array, then display it.
urls = url_df.URL.values
urls

array(['nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526',
       'www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrcmd=_home-customer&nav=1/loading.php',
       'serviciosbys.com/paypal.cgi.bin.get-into.herf.secure.dispatch35463256rzr321654641dsf654321874/href/href/href/secure/center/update/limit/seccure/4d7a1ff5c55825a2e632a679c2fd5353/',
       ..., 'apple-iclods.org/', 'apple-uptoday.org/',
       'apple-search.info'], dtype=object)

In [None]:
# Split every URL into alphanumeric tokens; str() coerces any non-string cell before tokenizing.
tokenized_urls = [tokenizer.tokenize(str(url)) for url in urls]

In [None]:
# Store the per-URL token lists in column 'TT' (adds the column to url_df in place).
url_df['TT'] = tokenized_urls

In [None]:
# Show the last 7 rows, now including the 'TT' token column.
url_df.tail(7)

Unnamed: 0,URL,Label,TT
549339,free.ulohapp.info/?oq=CEh3h_PskJLFZaQWwjEKBegU...,bad,"[free, ulohapp, info, oq, CEh3h, PskJLFZaQWwjE..."
549340,mol.com-ho.me/cv_itworx.doc,bad,"[mol, com, ho, me, cv, itworx, doc]"
549341,23.227.196.215/,bad,"[23, 227, 196, 215]"
549342,apple-checker.org/,bad,"[apple, checker, org]"
549343,apple-iclods.org/,bad,"[apple, iclods, org]"
549344,apple-uptoday.org/,bad,"[apple, uptoday, org]"
549345,apple-search.info,bad,"[apple, search, info]"


In [None]:
# Stem every token of every URL, keeping one list of stems per URL
# (same order and nesting as tokenized_urls).
stemmed_urls = [
  [stemmer.stem(token) for token in url_tokens]
  for url_tokens in tokenized_urls
]

In [None]:
# Print the space-joined stems of the first URL only; prints nothing when
# there are no URLs.
if stemmed_urls:
  print(join(stemmed_urls[0]))

nobel it 70ffb52d079109dca5664cce6f317373782 login skype com en cgi bin verif login 70ffb52d079109dca5664cce6f317373 index php cmd profil ach outdat page tmpl p gen fail to load nav 0 5 1 login access 1322408526 


In [None]:
# Store the per-URL stem lists in column 'SU' and show the last 7 rows.
# NOTE(review): the recorded output of this cell already lists an 'FT' column,
# which this notebook only creates in a later cell — the saved output appears
# to come from an out-of-order run; re-run from a fresh kernel to confirm.
url_df['SU'] = stemmed_urls
url_df.tail(7)

Unnamed: 0,URL,Label,TT,SU,FT
549339,free.ulohapp.info/?oq=CEh3h_PskJLFZaQWwjEKBegU...,bad,"[free, ulohapp, info, oq, CEh3h, PskJLFZaQWwjE...","[free, ulohapp, info, oq, ceh3h, pskjlfzaqwwje...","[ ' f r e e ' , ' u l o h a p p ' , ' i n ..."
549340,mol.com-ho.me/cv_itworx.doc,bad,"[mol, com, ho, me, cv, itworx, doc]","[mol, com, ho, me, cv, itworx, doc]","[ ' m o l ' , ' c o m ' , ' h o ' , ' m ..."
549341,23.227.196.215/,bad,"[23, 227, 196, 215]","[23, 227, 196, 215]","[ ' 2 3 ' , ' 2 2 7 ' , ' 1 9 6 ' , ' 2 ..."
549342,apple-checker.org/,bad,"[apple, checker, org]","[appl, checker, org]","[ ' a p p l ' , ' c h e c k e r ' , ' o r ..."
549343,apple-iclods.org/,bad,"[apple, iclods, org]","[appl, iclod, org]","[ ' a p p l ' , ' i c l o d ' , ' o r g ' ]"
549344,apple-uptoday.org/,bad,"[apple, uptoday, org]","[appl, uptoday, org]","[ ' a p p l ' , ' u p t o d a y ' , ' o r ..."
549345,apple-search.info,bad,"[apple, search, info]","[appl, search, info]","[ ' a p p l ' , ' s e a r c h ' , ' i n f ..."


In [None]:
# One space-separated string of stems per URL (each string ends with a trailing space, see join()).
final_text=[join(su) for su in stemmed_urls]

In [None]:
# Store the joined stem strings in column 'FT'; this column is used as the model input text below.
url_df['FT'] = final_text

In [None]:
# Show the last 5 rows with all derived columns.
url_df.tail(5)

Unnamed: 0,URL,Label,TT,SU,FT
549341,23.227.196.215/,bad,"[23, 227, 196, 215]","[23, 227, 196, 215]",23 227 196 215
549342,apple-checker.org/,bad,"[apple, checker, org]","[appl, checker, org]",appl checker org
549343,apple-iclods.org/,bad,"[apple, iclods, org]","[appl, iclod, org]",appl iclod org
549344,apple-uptoday.org/,bad,"[apple, uptoday, org]","[appl, uptoday, org]",appl uptoday org
549345,apple-search.info,bad,"[apple, search, info]","[appl, search, info]",appl search info


In [None]:
# Model inputs: the space-joined stemmed text of every URL.
X = url_df.FT.values
# Binary targets: 1 where Label == 'bad', 0 for every other label value.
# A single vectorized comparison replaces the per-row Python loop.
Y = np.where(url_df.Label == 'bad', 1, 0)

In [None]:
# Peek at the first preprocessed URL string.
X[0]

'nobel it ffb d dca cce f login skype com en cgi bin verif login ffb d dca cce f index php cmd profil ach outdat page tmpl p gen fail to load nav login access '

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Keras text tokenizer with default settings; it is fitted on X in a later cell.
tokeniser=Tokenizer()

In [None]:
# Length of the longest preprocessed URL, measured in characters of its string
# form (0 when X is empty).
# NOTE(review): this is a character count, not a token count, so it is only an
# upper bound on the number of tokens per URL — confirm this is the intended
# padding length.
maxlen = max((len(str(url)) for url in X), default=0)

maxlen

2203

In [None]:
# Build the tokenizer's vocabulary from all preprocessed URL strings.
tokeniser.fit_on_texts(X)

In [None]:
# Persist the fitted tokenizer to Drive so it can be reloaded without refitting.
with open('/content/drive/MyDrive/kerasTokenizer.tokeniser', 'wb') as handle:
  pickle.dump(tokeniser, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
class CustomDataset:
    """Map-style dataset yielding one padded token-id sequence and its label per URL.

    Parameters
    ----------
    X : sequence of str
        Preprocessed URL texts, one per sample.
    tokeniser : object
        Fitted tokenizer exposing ``texts_to_sequences(list_of_texts)``.
    targets : sequence
        Label for each entry of ``X``.
    maxlen : int
        Length every encoded sequence is padded/truncated to.
    pad_sequences : callable
        Padding function, called as
        ``pad_sequences(seqs, maxlen=..., padding='post', truncating='post')``.
    """

    def __init__(self, X, tokeniser, targets, maxlen, pad_sequences):
        self.features = X
        self.targets = targets
        # Keep the collaborators on the instance so __getitem__ uses what was
        # passed in rather than notebook globals that happen to share the name.
        self.tokeniser = tokeniser
        self.maxlen = maxlen
        self.pad_sequences = pad_sequences

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``{'url': float tensor of length maxlen, 'target': float scalar tensor}``."""
        url = self.features[idx]
        # texts_to_sequences expects a list of texts; a bare string would be
        # iterated character by character, yielding one row per character.
        tokenised_url = self.tokeniser.texts_to_sequences([url])
        padded_url = self.pad_sequences(
            tokenised_url, maxlen=self.maxlen, padding='post', truncating='post'
        )
        target = self.targets[idx]
        return {
            # Exactly one text was encoded, so take its single padded row.
            'url': torch.tensor(padded_url[0]).float(),
            'target': torch.tensor(target).float()
        }

In [None]:
# Reload the pickled tokenizer from Drive.
# NOTE: pickle.load can execute arbitrary code from the file — only load pickles you trust.
with open('/content/drive/MyDrive/kerasTokenizer.tokeniser', 'rb') as handle:
  tokeniser = pickle.load(handle)



In [None]:
# Sanity check: confirm what type of object was unpickled.
type(tokeniser)

keras.src.preprocessing.text.Tokenizer

In [None]:
# Wrap the full X / Y arrays in the dataset class (no train/test split is applied here).
Xtrain_dataset = CustomDataset(X, tokeniser, Y, maxlen, pad_sequences)

In [None]:
# Peek at the first preprocessed URL string again.
X[0]

'nobel it ffb d dca cce f login skype com en cgi bin verif login ffb d dca cce f index php cmd profil ach outdat page tmpl p gen fail to load nav login access '

[[51],
 [124],
 [14],
 [11],
 [69],
 [],
 [58],
 [52],
 [],
 [9],
 [9],
 [14],
 [],
 [3],
 [],
 [3],
 [8],
 [7],
 [],
 [8],
 [8],
 [11],
 [],
 [9],
 [],
 [69],
 [124],
 [50],
 [58],
 [51],
 [],
 [21],
 [97],
 [105],
 [36],
 [11],
 [],
 [8],
 [124],
 [39],
 [],
 [11],
 [51],
 [],
 [8],
 [50],
 [58],
 [],
 [14],
 [58],
 [51],
 [],
 [25],
 [11],
 [81],
 [58],
 [9],
 [],
 [69],
 [124],
 [50],
 [58],
 [51],
 [],
 [9],
 [9],
 [14],
 [],
 [3],
 [],
 [3],
 [8],
 [7],
 [],
 [8],
 [8],
 [11],
 [],
 [9],
 [],
 [58],
 [51],
 [3],
 [11],
 [64],
 [],
 [36],
 [88],
 [36],
 [],
 [8],
 [39],
 [3],
 [],
 [36],
 [81],
 [124],
 [9],
 [58],
 [69],
 [],
 [7],
 [8],
 [88],
 [],
 [124],
 [91],
 [52],
 [3],
 [7],
 [52],
 [],
 [36],
 [7],
 [50],
 [11],
 [],
 [52],
 [39],
 [36],
 [69],
 [],
 [36],
 [],
 [50],
 [11],
 [51],
 [],
 [9],
 [7],
 [58],
 [69],
 [],
 [52],
 [124],
 [],
 [69],
 [124],
 [7],
 [3],
 [],
 [51],
 [7],
 [25],
 [],
 [69],
 [124],
 [50],
 [58],
 [51],
 [],
 [7],
 [8],
 [8],
 [11],
 [21],
 [21],

In [None]:
# Word -> integer index mapping learned by fit_on_texts.
word2index = tokeniser.word_index
# Encode every URL as its list of vocabulary indices. The vocabulary was built
# from X (stemmed text, lower-cased by the tokenizer's defaults), so indexing
# word2index with the raw mixed-case tokens in tokenized_urls raises KeyError.
# Letting the tokenizer encode X itself keeps the lookup consistent with the
# text it was fitted on.
tokens = tokeniser.texts_to_sequences(X)

KeyError: ignored

In [None]:
# Persist the augmented frame to Drive without the index column.
# NOTE(review): 'TT' and 'SU' hold Python lists, which CSV stores as their text
# representation — reading this file back will presumably yield strings rather
# than lists; confirm before reusing those columns.
url_df.to_csv('/content/drive/MyDrive/modifiedURLdf.csv', index=False)

In [None]:
# Unique stems across all URLs, in first-seen order. dict keys preserve
# insertion order and give constant-time duplicate checks, avoiding a linear
# `in` scan of a growing list for every token (quadratic on this dataset).
vocab = list(dict.fromkeys(word for su in stemmed_urls for word in su))

KeyboardInterrupt: ignored