In [1]:
# imports and settings

import torch
import torch.nn.functional as F
import random
import json
import numpy as np

from torchtext.data.utils import get_tokenizer
import pickle


with open ('../settings.json') as f:
    settings = json.load(f)

db_uri = settings['sqlalchemy_database_uri']

RANDOM_SEED = 4444
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.01
BATCH_SIZE = 16
NUM_EPOCHS = 30
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_CLASSES = 2

In [2]:
# load data
with open('reports.json') as f:
    reports = json.load(f)

# reports = [[r[0], r[1]] for r in reports if r[1] in ['bfs', 'dfs']]

In [3]:
# load vocab and model
with open('model_data/vocab_01.pkl', 'rb') as f:
    vocab = pickle.load(f)
    
model_checkpoint = torch.load('model_data/lstm_01.pt')


In [4]:
from models.lstm import LSTM

model = LSTM(input_dim=len(vocab),
             embedding_dim=EMBEDDING_DIM,
             hidden_dim=HIDDEN_DIM,
             output_dim=NUM_CLASSES)
model = model.to(DEVICE)

model.load_state_dict(model_checkpoint)

<All keys matched successfully>

In [5]:
def custom_tokenizer(line):
    line = line.lower()
    line = line.replace(',', ' ')
    line = line.replace('\\', ' ')
    line = line.replace('\\\\', ' ')
    return line.split()

tokenizer = get_tokenizer(tokenizer=custom_tokenizer)


def get_prediction(model, sentence):

    with torch.no_grad():
        model.eval()
        if type(sentence) == str:
            tokenized = tokenizer(sentence)
        elif type(sentence) == list:
            tokenized = sentence
        else:
            raise TypeError('sentence must be str or list')
        indexed = [vocab[t] for t in tokenized]
        # print(indexed)
        length = [len(indexed)]
        tensor = torch.LongTensor(indexed).to(DEVICE)
        tensor = tensor.unsqueeze(1)
        prediction = F.softmax(model(tensor), dim=1)
    return prediction.to("cpu").squeeze(dim=0).detach().numpy()

In [6]:
# shuffle data
random.shuffle(reports)

label_transform = lambda x: 1 if x == 'dfs' else 0

for i in range(100):
    report = reports[i]
    y_pred = get_prediction(model, report[0])
    print(report[1], label_transform(report[1]), np.argmax(y_pred), y_pred)


dfs 1 1 [5.6039954e-07 9.9999940e-01]
benign 0 0 [9.999298e-01 7.016074e-05]
bfs 0 0 [9.9999988e-01 1.2491958e-07]
bfs 0 0 [9.9999988e-01 1.2491958e-07]
dfs 1 1 [5.6039954e-07 9.9999940e-01]
benign 0 0 [0.9989182  0.00108187]
dfs 1 1 [5.6039954e-07 9.9999940e-01]
dfs 1 1 [5.6039954e-07 9.9999940e-01]
dfs 1 1 [5.603680e-07 9.999994e-01]
dfs 1 1 [5.6039954e-07 9.9999940e-01]
benign 0 0 [9.9999893e-01 1.1318369e-06]
benign 0 0 [9.9987316e-01 1.2684464e-04]
bfs 0 0 [9.9999988e-01 1.2491958e-07]
bfs 0 0 [9.9999988e-01 1.2491958e-07]
dfs 1 1 [5.6039954e-07 9.9999940e-01]


RuntimeError: Expected sequence length to be larger than 0 in RNN