In [19]:
import pandas as pd 
import numpy as np 

import torch

import os
import sys
sys.path.append(os.path.abspath('./src'))
sys.path.append(os.path.abspath('./src/data_preprocessing'))
sys.path.append(os.path.abspath('./src/architectures'))

# Data processing
from torch.utils.data import DataLoader
from data_preprocessing.custom_preprocess import custom_preprocess
from data_preprocessing.custom_dataset import CustomDataset
from data_preprocessing.load_cached_or_preprocess_and_cache import load_cached_or_preprocess_and_cache

import spacy

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's random number generator so repeated runs are comparable.
torch.manual_seed(42)

<torch._C.Generator at 0x138df6b6cd0>

In [4]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints that come from a trusted source.
# NOTE(review): assumes the checkpoint stores a whole nn.Module (it is called
# as `model(...)` further down), not just a state_dict - confirm.
#
# map_location puts the restored tensors on `device`, so the model ends up on
# the same device the inputs are later moved to, regardless of where the
# checkpoint was saved. eval() switches the module to evaluation mode before
# any inference (matters for layers such as dropout / batch norm, if present).
model = torch.load('./models/best_model_paral_conv_linear.pt', map_location=device).eval()

In [4]:
df = pd.read_csv('./sampled_df.csv')

# Toggle between reusing subsets cached under `folder` and preprocessing anew.
USE_PREPROCESSED_SUBSETS_CACHING = True
folder = './cached_subsets/'

# Both helpers return the same six-way split, so pick one and unpack once.
splits = (
    load_cached_or_preprocess_and_cache(df, folder)
    if USE_PREPROCESSED_SUBSETS_CACHING
    else custom_preprocess(df, dev_size=0.15, test_size=0.15, even_dist=True)
)
X_train, X_dev, X_test, y_train, y_dev, y_test = splits

# One dataset per split, built in train / dev / test order.
train_ds, dev_ds, test_ds = (
    CustomDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test))
)

batch_size = 64

# Identical loader settings for every split: fixed order, full batches only.
# NOTE(review): drop_last=True discards the final partial batch of each split,
# so anything computed from these loaders skips up to batch_size - 1 samples
# per split - confirm this is intended for evaluation.
train_dl, dev_dl, test_dl = (
    DataLoader(subset, batch_size=batch_size, shuffle=False, drop_last=True)
    for subset in (train_ds, dev_ds, test_ds)
)

# Binary cross-entropy; handed to the evaluation helper together with the loaders.
loss_fn = torch.nn.BCELoss()

All cached subsets were found, loading...
Loaded!


In [5]:
from test_fn import test

# Report loss and accuracy for each split, in train / dev / test order.
evaluation_loaders = [train_dl, dev_dl, test_dl]
test(model=model, dataloaders=evaluation_loaders, loss_fn=loss_fn, model_device=device)

Train
Loss: 0.09492773565104584 Accuracy: 97.418 %
------------
Dev
Loss: 0.15963801041972348 Accuracy: 93.969 %
------------
Test
Loss: 0.1762202772788885 Accuracy: 93.045 %
------------


In [42]:
# spaCy pipeline whose per-token vectors are read by text_2_matrix below.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

def text_2_matrix(text):
    """Turn a raw text into a matrix of spaCy token vectors.

    The text is tokenised with the module-level ``nlp`` pipeline and the
    vector of every token is stacked, then transposed so that each column
    corresponds to one token.

    Args:
        text: The string to embed.

    Returns:
        A 2-D ``torch.Tensor`` of shape ``(vector_dim, n_tokens)``.

    Raises:
        ValueError: If ``text`` yields no tokens (e.g. an empty string).
            Without this check the result would be a 1-D, zero-length tensor
            that cannot be used as model input.
    """
    doc = nlp(text)
    if len(doc) == 0:
        raise ValueError('text_2_matrix: text produced no tokens, nothing to embed')

    # Rows are tokens after stacking; transpose so tokens run along the columns.
    token_vectors = np.array([token.vector for token in doc])
    return torch.tensor(token_vectors).T

def predict_from_text(text):
    """Run the loaded model on a single text and return its output as a float.

    The text is embedded with ``text_2_matrix``, given a leading batch
    dimension of size 1, moved to ``device`` and passed through ``model``.

    Args:
        text: The string to score.

    Returns:
        The model output as a Python float. ``.item()`` requires the model to
        return a single-element tensor for a batch of one.
        NOTE(review): presumably a probability in [0, 1], given that the model
        is evaluated with BCELoss - confirm.
    """
    batch = text_2_matrix(text).unsqueeze(0).to(device)
    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        return model(batch).item()

# Hand-written sample reviews used as a quick sanity check of the model.
texts = [
    'Horrible movie. Definetelly do not recommend',
    'Everything was nice. I really loved this movie',
    'It was average. Not good, not bad. Just decent',
    'Almost perfect. It was just not enough drama for me. But the rest is good',
]

# Score each review and print it next to the model output.
for review, score in zip(texts, map(predict_from_text, texts)):
    print(f'Prediction for text "{review}": {score}')

Prediction for text "Horrible movie. Definetelly do not recommend": 0.026074232533574104
Prediction for text "Everything was nice. I really loved this movie": 0.9911736845970154
Prediction for text "It was average. Not good, not bad. Just decent": 0.04250673949718475
Prediction for text "Almost perfect. It was just not enough drama for me. But the rest is good": 0.7048246264457703
