# Importing libraries

In [23]:
import random

SEED = 32
random.seed(SEED)

import numpy as np 
import pandas as pd
import spacy

from sklearn.metrics import accuracy_score
from sklearn.metrics import  f1_score

from torch import nn
import torch
from torchtext import data
from torch.nn  import functional as F
import torch.optim as  optim 

"""

those are the libraries I use for processing text

"""

import nltk
nltk.download("punkt")

import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

tokenizer = Tokenizer(nlp.vocab)

from nltk import word_tokenize,sent_tokenize
from nltk.stem  import PorterStemmer


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
stops = stopwords.words("english")


[nltk_data] Downloading package punkt to /home/madan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/madan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Activate GPU

In [2]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
  print("gpu up")
else:  
  dev = "cpu"  
device = torch.device(dev)

gpu up


# Defining functions for data preprocessing

In [3]:
def removepunc(my_str): # function to remove punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
snowstem = SnowballStemmer("english")
portstem = PorterStemmer()



def myTokenizer(x):
    """
    this function is the tokenizer we are using, it does basic processing also  like ,
    Lowercase the text
    removing punctuation, stop words and numbers,
    it also removes extra spaces and unwanted characters (I use regex for that)


    before using the tokenizer I was testing it on the train dataframe manually  
    """
    return  [snowstem.stem(word.text)for word in 
              tokenizer(removepunc(re.sub(r"\s+\s+"," ",re.sub(r"[^A-Za-z0-9()!?\'\`\"\r+\n+]"," ",x.lower()))).strip()) 
              if (word.text not in stops and not hasNumbers(word.text)) ]



# Load Datasets

In [5]:
traindata = pd.read_csv("/home/madan/Desktop/All/phishing-detection-transformer/data/final-csv/train.csv")
test = pd.read_csv("/home/madan/Desktop/All/phishing-detection-transformer/data/final-csv/test.csv")
traindata.drop("Unnamed: 0",axis=1,inplace=True)
test.drop("Unnamed: 0",axis=1,inplace=True)

In [6]:
traindata

Unnamed: 0,URL,Label
0,xml.coverpages.org/xmlForms.html,2
1,pawsoft.com/files/,1
2,ibegin.com/directory/ca/quebec/anjou/,2
3,mxp4016.com,1
4,hooksgems.blogspot.com/2009/09/clark-terry-cla...,2
...,...,...
368056,pt-tkbi.com/providernet/provider/provider/webm...,1
368057,wiki.d-addicts.com/Ninomiya_Kazunari,2
368058,jlkc.org/,2
368059,picobong.com/www.redirect.com.htm,1


# Pytorch torchtext



In [15]:
! conda install -c pytorch torchtext==1.12.0

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - torchtext==1.12.0

Current channels:

  - https://conda.anaconda.org/pytorch/linux-64
  - https://conda.anaconda.org/pytorch/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [14]:
import torchtext


print(torchtext.__version__)

0.15.1+cpu


In [12]:
! python --version

Python 3.9.7


In [22]:
print(torch.__version__)

2.0.0+cu117


In [24]:
"""
here I'm using the torchtext fields and dataset classes they can ease the work to get
the dataset ready for the pytorch model

the class DataFrameDataset is the easiest way I found to turn a dataframe into a torchtext dataset

this cell will take sometime to finish
"""
# from torchtext.legacy import data


TEXT = data.Field(tokenize=myTokenizer,batch_first=True,fix_length=140)
LABEL = data.LabelField(dtype=torch.float ,batch_first=True)


class DataFrameDataset(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('URL', text_field), ('Label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.Label 
            text = row.URL
            examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)
  

torchdataset = DataFrameDataset(traindata, TEXT,LABEL)
torchtest = DataFrameDataset(test, TEXT,LABEL)

AttributeError: module 'torchtext.data' has no attribute 'Field'

# Split Data for training and testing

In [10]:
train_data, valid_data = torchdataset.split(split_ratio=0.8, random_state = random.seed(SEED))

In [11]:
"""
this cell build the vocab which means it get all the used words and if also ignores any word 
that only appeared less than 3 times
"""
TEXT.build_vocab(train_data,min_freq=3)  
LABEL.build_vocab(train_data)


In [12]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  


Size of TEXT vocabulary: 36857
Size of LABEL vocabulary: 2
[('com', 210421), ('html', 43390), ('www', 28719), ('org', 26847), ('net', 18407), ('amp', 17995), ('htm', 17088), ('php', 16645), ('en', 16334), ('index', 15289)]


In [13]:
#set batch size
BATCH_SIZE = 128

"""
we are using batches for validation and test set because of memory usage we can't pass the whole set at once 
"""


train_iterator,valid_iterator,test_iterator= data.BucketIterator.splits(
    (train_data,valid_data,torchtest), 
    batch_size = BATCH_SIZE,
    device = device,
    sort =False,
shuffle=False)

In [16]:
for i in test_iterator:
    print(i.URL.shape)

torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size([128, 140])
torch.Size(

# Transformer Algorithm

In [21]:

"""
one major point here is that I encoded the embeddings in a different way 
I made an embedding layer for the position then I concatenated position embeddings with the word embeddings 
just thought it could be a usefull way to encode the positions 

had to reshape the output of the transformer layer to get the prediction
"""
class TextTransformer(nn.Module):
  def __init__(self):
    super(TextTransformer,self).__init__()
    self.wordEmbeddings = nn.Embedding(len(TEXT.vocab),140)
    self.positionEmbeddings = nn.Embedding(140,20)
    self.transformerLayer = nn.TransformerEncoderLayer(160,8) 
    self.linear1 = nn.Linear(160,  64)
    self.linear2 = nn.Linear(64,  1)
    self.linear3 = nn.Linear(140,  16)
    self.linear4 = nn.Linear(16,  1)
  def forward(self,x):
    positions = (torch.arange(0,140).reshape(1,140) + torch.zeros(x.shape[0],140)).to(device) 
    # broadcasting the tensor of positions 
    sentence = torch.cat((self.wordEmbeddings(x.long()),self.positionEmbeddings(positions.long())),axis=2)
    attended = self.transformerLayer(sentence)
    linear1 = F.relu(self.linear1(attended))
    linear2 = F.relu(self.linear2(linear1))
    linear2 = linear2.view(-1,140) # reshaping the layer as the transformer outputs a 2d tensor (or 3d considering the batch size)
    linear3 = F.relu(self.linear3(linear2))
    out = torch.sigmoid(self.linear4(linear3))
    return out

myTransformer = TextTransformer()
myTransformer.to(device)

    


TextTransformer(
  (wordEmbeddings): Embedding(36857, 140)
  (positionEmbeddings): Embedding(140, 20)
  (transformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=160, out_features=160, bias=True)
    )
    (linear1): Linear(in_features=160, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=160, bias=True)
    (norm1): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear1): Linear(in_features=160, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (linear3): Linear(in_features=140, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=1, bias=True)
)

# Define Accuracy Matrics

In [22]:
def calculateMetrics(ypred,ytrue):
  acc  = accuracy_score(ytrue,ypred)
  f1  = f1_score(ytrue,ypred)
  f1_average  = f1_score(ytrue,ypred,average="macro")
  return " f1 score: "+str(round(f1,3))+" f1 average: "+str(round(f1_average,3))+" accuracy: "+str(round(acc,3))
  

# Train Algorithm

In [24]:
"""
using adagrad because it assign bigger updates to less frequently updated weights 
(like words that are not used many times)

"""

optimizer = optim.Adagrad(myTransformer.parameters(),lr = 0.001)

for i in range(20):
  trainpreds = torch.tensor([])
  traintrues = torch.tensor([])
  for  batch in train_iterator:
    X = batch.URL
    y = batch.Label
    myTransformer.zero_grad()
    pred = myTransformer(X).squeeze()
    trainpreds = torch.cat((trainpreds,pred.cpu().detach()))
    traintrues = torch.cat((traintrues,y.cpu().detach()))
    err = F.binary_cross_entropy(pred,y)
    err.backward()
    optimizer.step()
  err = F.binary_cross_entropy(trainpreds,traintrues)
  print("train BCE loss: ",err.item(),calculateMetrics(torch.round(trainpreds).numpy(),traintrues.numpy()))
 

  valpreds = torch.tensor([])
  valtrues = torch.tensor([])
  for batch in valid_iterator:
    X = batch.URL
    y = batch.Label
    valtrues = torch.cat((valtrues,y.cpu().detach()))
    pred = myTransformer(X).squeeze().cpu().detach()
    # print(valtrues.shape)
    valpreds = torch.cat((valpreds,pred))
  err = F.binary_cross_entropy(valpreds,valtrues)
  print("validation BCE loss: ",err.item(),calculateMetrics(torch.round(valpreds).numpy(),valtrues.numpy()))
  

train BCE loss:  0.2852913737297058  f1 score: 0.763 f1 average: 0.84 accuracy: 0.877
validation BCE loss:  0.2725692689418793  f1 score: 0.757 f1 average: 0.839 accuracy: 0.881
train BCE loss:  0.25245267152786255  f1 score: 0.794 f1 average: 0.86 accuracy: 0.891
validation BCE loss:  0.24781163036823273  f1 score: 0.787 f1 average: 0.858 accuracy: 0.892
train BCE loss:  0.23617786169052124  f1 score: 0.81 f1 average: 0.87 accuracy: 0.899
validation BCE loss:  0.23649349808692932  f1 score: 0.802 f1 average: 0.867 accuracy: 0.898
train BCE loss:  0.2259751409292221  f1 score: 0.82 f1 average: 0.877 accuracy: 0.903
validation BCE loss:  0.22922779619693756  f1 score: 0.809 f1 average: 0.871 accuracy: 0.901
train BCE loss:  0.21884658932685852  f1 score: 0.826 f1 average: 0.881 accuracy: 0.906
validation BCE loss:  0.22347918152809143  f1 score: 0.819 f1 average: 0.877 accuracy: 0.904
train BCE loss:  0.2130022495985031  f1 score: 0.831 f1 average: 0.884 accuracy: 0.909
validation BCE l

# Prediction on test data

In [97]:
"""
now getting the results on the test set
"""

testpreds = torch.tensor([])
testtrues = torch.tensor([])
for batch in test_iterator:
#     print(batch)
    X = batch.URL
#     print(X)
    y = batch.Label
    testtrues = torch.cat((testtrues,y.cpu().detach()))
    pred = myTransformer(X).squeeze().cpu().detach()
    # print(valtrues.shape)
    testpreds = torch.cat((testpreds,pred))
err = F.binary_cross_entropy(testpreds,testtrues)
print("test BCE loss: ",err.item(),calculateMetrics(torch.round(testpreds).numpy(),testtrues.numpy()))
  

test BCE loss:  1.1983379125595093  f1 score: 0.43 f1 average: 0.297 accuracy: 0.322


In [26]:
test["predicted"] = torch.round(testpreds).numpy()


"""
this shows that the model understands the language well 

"""

test[test.predicted==1].iloc[32:37]


Unnamed: 0,URL,Label,predicted
128,platinumwindowcleaning.com/wp-content/plugins/...,1,1.0
131,paypal.com-service.confirm.cgi-bin.webscr-cmd....,1,1.0
134,servicepro.in/login/dropbox/,1,1.0
135,trade.mitc.mw/libraries/nls/excel/excel/,1,1.0
146,gkjx168.com/images?http://us.battle.net/login/...,1,1.0


# Save and load the model via state_dict


In [33]:
torch.save(myTransformer.state_dict(),"State_dict_Model")
torch.save(myTransformer.state_dict(), "State_dict_Model.pt")
torch.save(myTransformer.state_dict(), "State_dict_Model.pth")
torch.save(myTransformer.state_dict(), "State_dict_Model.pickle")

In [35]:
# Load
model = TextTransformer()
model.load_state_dict(torch.load("State_dict_Model.pt"))
model.eval()

TextTransformer(
  (wordEmbeddings): Embedding(36857, 140)
  (positionEmbeddings): Embedding(140, 20)
  (transformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=160, out_features=160, bias=True)
    )
    (linear1): Linear(in_features=160, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=160, bias=True)
    (norm1): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear1): Linear(in_features=160, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (linear3): Linear(in_features=140, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=1, bias=True)
)

# Save and load entire model

In [36]:
torch.save(myTransformer,"Entire_Model")
torch.save(myTransformer, "Entire_Model.pt")
torch.save(myTransformer, "Entire_Model.pth")
torch.save(myTransformer, "Entire_Model.pickle")

In [37]:
# Load
model = torch.load("Entire_Model.pt")
model.eval()

TextTransformer(
  (wordEmbeddings): Embedding(36857, 140)
  (positionEmbeddings): Embedding(140, 20)
  (transformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=160, out_features=160, bias=True)
    )
    (linear1): Linear(in_features=160, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=160, bias=True)
    (norm1): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear1): Linear(in_features=160, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (linear3): Linear(in_features=140, out_features=16, bias=True)
  (linear4): Linear(in_features=16, out_features=1, bias=True)
)

# Inference

In [29]:
from torchtext import data

# initialize list of lists
inpu = [['google.com', 1]]
  
# Create the pandas DataFrame
df = pd.DataFrame(inpu, columns=['URL', 'Label'])
dataf = DataFrameDataset(df,TEXT,LABEL)

In [34]:
_,_,test_iterator= data.BucketIterator.splits(
    (_,_,dataf), 
    batch_size = BATCH_SIZE,
    device = device,
    sort =False,
shuffle=False)

In [35]:
TEXT.build_vocab(train_data,min_freq=3)  
LABEL.build_vocab(train_data)


In [37]:
for batch in test_iterator:
#     print(batch)
    X = batch.URL
    print(X)
    y = batch.Label
    model = torch.load("Entire_Model.pt")
    model.eval()
    pred = model(X).squeeze().cpu().detach()
    print(pred)

tensor([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')


FileNotFoundError: [Errno 2] No such file or directory: 'Entire_Model.pt'