In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
# from keras.utils.data_utils import GeneratorEnqueuer  # We only want this for multithreaded 

from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch import Tensor
import torch
import pickle

In [None]:
# Load the held-out test frame and the trained model saved by the training notebook.
test_df = pd.read_csv("data/test_df_lstm.csv")

# pickle.load() needs an open binary file object, not a path string.
# NOTE: unpickling executes arbitrary code -- only load model files you trust.
with open("data/finalized_model_lstm.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Build a word -> vector lookup from the GloVe text file.
# Each line is "<word> <v1> <v2> ...", separated by single spaces.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('data/glove.840B.300d.txt', encoding = 'utf-8') as f:
    for line in tqdm(f):
        # Split on a literal space (not arbitrary whitespace) so that only the
        # space separator is treated as a field boundary.
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
def text_to_array(text):
    """Turn a question string into a fixed-length stack of word vectors.

    The last character of ``text`` is dropped, the remainder is split on
    whitespace and truncated to the first 120 tokens. Every token is looked up
    in the module-level ``embeddings_index``; tokens that are missing map to a
    300-element zero vector. The sequence is then right-padded with zero
    vectors up to 120 entries.

    Returns:
        A tuple ``(array, n_tokens)`` where ``array`` is the stacked vectors
        (120 rows; presumably 300 columns, assuming every stored embedding is
        300-d -- TODO confirm) and ``n_tokens`` is the number of real
        (unpadded) tokens.
    """
    zero_vec = np.zeros(300)
    tokens = text[:-1].split()[:120]

    vectors = []
    for token in tokens:
        vectors.append(embeddings_index.get(token, zero_vec))
    n_tokens = len(vectors)

    # Right-pad with zero vectors so every sample has exactly 120 rows.
    padding = [zero_vec] * (120 - n_tokens)
    return np.array(vectors + padding), n_tokens

In [None]:
# Build the evaluation arrays from the first 5000 rows of the test frame.
embeddings = [
    text_to_array(X_text)
    for X_text in tqdm(test_df["question_text"][:5000])
]
# Unzip the (matrix, length) pairs into two parallel tuples.
test_x, test_xlen = zip(*embeddings)

# Indices that order the samples from longest to shortest sequence
# (presumably required by the model's use of packed sequences -- TODO confirm).
sorder = np.argsort(test_xlen)[::-1]

# Apply the same permutation to inputs, lengths and labels so they stay aligned.
test_y = np.array(test_df["target"][:5000])[sorder]
test_x = np.array(test_x)[sorder]
test_xlen = np.array(test_xlen)[sorder]

In [None]:
def test_gen(batch_size=256):
    """Yield the test set as consecutive mini-batches.

    Iterates over the module-level ``test_x``, ``test_xlen`` and ``test_y``
    arrays in order, ``batch_size`` rows at a time; the final batch may be
    shorter.

    Yields:
        Tuples ``(inputs, lengths, targets)`` sliced from the three arrays
        with the same row window.
    """
    total_batches = math.ceil(len(test_x) / batch_size)
    for batch_no in range(total_batches):
        window = slice(batch_no * batch_size, (batch_no + 1) * batch_size)
        yield test_x[window], test_xlen[window], test_y[window]

In [None]:
# Run the model over the test set batch by batch and collect its raw outputs.
# Assumes `model` is a torch.nn.Module -- eval() switches layers such as
# dropout to inference behaviour.
model.eval()
all_preds = []
# No gradients are needed for evaluation; skipping autograd saves memory.
with torch.no_grad():
    for x, xlen, _ in test_gen():
        y_pred = model(Tensor(x), xlen)
        all_preds.extend(y_pred.cpu().numpy())

In [None]:
# Threshold the model outputs at 0.5 once and reuse the labels for every metric.
pred_labels = np.array(all_preds).flatten() > 0.5

print("f1 score: ", f1_score(test_y, pred_labels))
print("precision score: ", precision_score(test_y, pred_labels))
print("recall score: ", recall_score(test_y, pred_labels))