In [6]:
import torch
from torch import nn as nn
from torch.nn import functional as F
import pandas as pd
 from tqdm import tqdm

In [7]:
import os
cwd = os.getcwd()
cwd

'/Users/sankalp/Documents/URL Code'

In [8]:
data = pd.read_csv(cwd +'/malicious_phish.csv')
data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
class Label_Index:
    def __init__(self, dataset):
        self.labels = dataset['type'].unique()
        self.label_index = {label: index for index, label in enumerate(self.labels)}
        self.index_label = {index: label for index, label in enumerate(self.labels)}
    
    def indexes_labels(self, dataset):
        return dataset['type'].map(self.index_label)

    def labels_indexes(self, dataset):
        return dataset['type'].map(self.label_index)
    def __call__(self, label):
        return self.label_index[label]

label_index = Label_Index(dataset)
label_index('phishing')

0

In [4]:
class Char_Index:
    def __init__(self, urls) -> None:
        self.char_index = {}
        self.index_char = {}
        for url in urls:
            for char in url:
                if char not in self.char_index:
                    self.char_index[char] = len(self.char_index)
                    self.index_char[len(self.index_char)] = char
    
    def string_indexes(self, string):
        return [self.char_index[char] for char in string]

char_index = Char_Index(dataset['url'])
char_index.string_indexes(dataset.url[0]), len(char_index.char_index)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4, 6, 10, 9, 0, 1], 333)

In [5]:
# shuffle data
dataset = dataset.sample(frac=1).reset_index(drop=True)

# split data into train and test
train_data = dataset[:int(len(dataset)*0.8)]
test_data = dataset[int(len(dataset)*0.8):].reset_index(drop=True)

In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, char_index: Char_Index, label_index: Label_Index) -> None:
        self.df = df
        self.char_index = char_index
        self.label_index = label_index

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        url = self.df.url[index]
        label = self.label_index(self.df.type[index])
        return torch.tensor(self.char_index.string_indexes(url)), torch.tensor(label)

trainDataset = Dataset(train_data, char_index, label_index)
testDataset = Dataset(test_data, char_index, label_index)
len(trainDataset), len(testDataset)

(520952, 130239)

In [7]:
def collate_fn(batch):
    urls, labels = zip(*batch)
    urls = nn.utils.rnn.pad_sequence(urls, batch_first=True)
    return urls, torch.tensor(labels)

trainGenerator = torch.utils.data.DataLoader(trainDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
testGenerator = torch.utils.data.DataLoader(testDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
for inputs, labels in trainGenerator:
    print(inputs, labels)
    break

tensor([[62, 13,  4,  ...,  0,  0,  0],
        [19, 20, 20,  ...,  0,  0,  0],
        [26, 26, 26,  ...,  0,  0,  0],
        ...,
        [ 0,  7,  1,  ...,  0,  0,  0],
        [ 0,  5,  6,  ...,  0,  0,  0],
        [26, 26, 26,  ...,  0,  0,  0]]) tensor([1, 3, 1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1, 1, 1, 0, 3, 1, 0, 2, 1, 1, 2,
        2, 0, 1, 2, 0, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
        0, 1, 1, 3, 1, 1, 1, 0, 1, 0, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 3, 1, 1, 0,
        1, 1, 2, 1, 1, 0, 1, 3, 1, 1, 2, 1, 2, 0, 1, 1, 3, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 3, 1, 0, 1, 1, 1, 2, 2, 1, 2, 1, 3, 1, 1, 0, 2, 1, 1,
        1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1,
        1, 2, 0, 1, 0, 1, 

In [8]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers=1, dropout=0.2, bidirectional=False):
        super(GRU, self).__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)   # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        if self.bidirectional == True:
            h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)   # [num_layers*2, batch_size, hidden_size]
        else:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # [num_layers, batch_size, hidden_size]
        out, _ = self.gru(x, h0)    # [batch_size, seq_len, hidden_size]
        if self.bidirectional == True:
            out = out[:, -1, :self.hidden_size] + out[:, 0, self.hidden_size:]  # [batch_size, hidden_size]
        return self.fc(out)
gru_model = GRU(len(char_index.char_index), 128, 128, len(label_index.labels), bidirectional=True, num_layers=1)
gru_model(inputs)

  "num_layers={}".format(dropout, num_layers))


tensor([[ 0.2571,  0.1513,  0.3357, -0.1672],
        [ 0.2781,  0.1040,  0.0853, -0.1624],
        [ 0.3079,  0.2533,  0.2197, -0.6255],
        ...,
        [ 0.4140,  0.0814,  0.5379, -0.4283],
        [ 0.4566, -0.0758,  0.3949, -0.3915],
        [ 0.3127,  0.1790,  0.1862, -0.6998]], grad_fn=<AddmmBackward0>)

In [9]:
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()

epochs = 10

In [10]:
best_right = 0
early_stop = 0
gru_model.to('cuda')
for epoch in range(epochs):
    loss_value = 0.0
    gru_model.train()
    for inputs, label in tqdm(trainGenerator):
        inputs = inputs.cuda()
        label = label.cuda()
        optimizer.zero_grad()
        output = gru_model(inputs)
        l = loss(output, label)
        l.backward()
        loss_value += l.item()
        optimizer.step()
    print(f'epoch: {epoch+1}, train loss: {loss_value/len(trainGenerator)}')

    # eval
    gru_model.cuda()
    gru_model.eval()
    loss_value = 0.0
    right_num = 0
    for inputs, label in testGenerator:
        inputs = inputs.cuda()
        label = label.cuda()
        output = gru_model(inputs)
        l = loss(output, label)
        loss_value += l.item()
        right_num += (torch.argmax(output, dim=1) == label).sum().item()
    print(f'\t test loss: {loss_value/len(testGenerator)}, test acc: {right_num/len(testDataset)}')
    
    # save model or early stop
    if right_num > best_right:
        best_right = right_num
        torch.save(gru_model.state_dict(), './gru_model.pth')
        print('save model')
        early_stop = 0
    else:
        early_stop += 1
        if early_stop > 3:
            print('early stop')
            break

100%|██████████| 2035/2035 [01:55<00:00, 17.62it/s]

epoch: 1, train loss: 0.152655381333147





	 test loss: 0.08903452890883026, test acc: 0.9716214037269942
save model


100%|██████████| 2035/2035 [01:56<00:00, 17.48it/s]

epoch: 2, train loss: 0.06994125512256961





	 test loss: 0.06669379950799267, test acc: 0.9792074570597133
save model


100%|██████████| 2035/2035 [01:53<00:00, 17.94it/s]

epoch: 3, train loss: 0.05602555510150932





	 test loss: 0.06256626188491557, test acc: 0.9802056219719132
save model


100%|██████████| 2035/2035 [01:55<00:00, 17.69it/s]

epoch: 4, train loss: 0.04785536868630449





	 test loss: 0.0600509934625012, test acc: 0.9809043374104531
save model


100%|██████████| 2035/2035 [01:56<00:00, 17.51it/s]

epoch: 5, train loss: 0.04245174165233896



100%|██████████| 2035/2035 [01:56<00:00, 17.53it/s]

epoch: 6, train loss: 0.038177085626659876





	 test loss: 0.05500464021522837, test acc: 0.9833229677746297
save model


100%|██████████| 2035/2035 [01:56<00:00, 17.48it/s]

epoch: 8, train loss: 0.03147784808549929





	 test loss: 0.0548662106124117, test acc: 0.9831617257503513


100%|██████████| 2035/2035 [01:55<00:00, 17.63it/s]

epoch: 9, train loss: 0.028985044042893322





	 test loss: 0.056610636177064744, test acc: 0.9837222337395096
save model


100%|██████████| 2035/2035 [01:56<00:00, 17.52it/s]

epoch: 10, train loss: 0.02633294252294324





	 test loss: 0.057617524689089046, test acc: 0.9834381406491143
