In [39]:
import pandas as pd
import tiktoken
import re
from train_v2 import Dictionary, Model
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', None) 

In [40]:
max_len = 32
file_path = "imdb_clean.csv"

In [41]:
d = Dictionary(file_path=file_path, max_len=max_len)

In [42]:
len(d.vocab)

59064

In [5]:
df = pd.read_csv(file_path)
max_size = max_len
df['small_review'] = df['review'].str.lower().str.split(n=max_size).str[:max_size].str.join(' ')
df['small_review'] = df['small_review'].apply(lambda x:  re.sub(r'[^a-z ]+', ' ', x))

In [6]:
# Pre pare data

In [7]:
def get_data(df: pd.DataFrame, data: Dictionary) -> (torch.Tensor, torch.Tensor):
    x = []
    y = []
    max_size = 32
    for index, row in df.iterrows():
        encoded = data.encode(row['small_review'])[0:max_size]
        length = len(encoded)
        encoded = np.pad(encoded, (0, max_size - length), 'constant')
        x.append(torch.tensor(encoded))

        y.append(row['sentiment'])
    x = np.vstack(x)
    y = np.vstack(y)

    return torch.tensor(x, dtype=torch.int), torch.tensor(y, dtype=torch.float32)


In [8]:
def encode_custom_text(text, data, max_size):
    encoded = data.encode(text)[0:max_size]
    length = len(encoded)
    encoded = np.pad(encoded, (0, max_size - length), 'constant')
    return encoded

In [9]:
x, y = get_data(df,d)

In [10]:
x.shape

torch.Size([50000, 32])

In [11]:
y.shape

torch.Size([50000, 1])

In [12]:
x[0]

tensor([36943, 35861, 51841, 38579, 44641, 22835, 32095, 51805,  1191, 57185,
        28744, 39485, 17086, 58818, 30471,  6658, 24167, 52011,  4001, 44955,
         4747, 52101, 27803, 18411, 57601, 22658, 57921, 31875, 51841, 19971,
        52035, 51805], dtype=torch.int32)

In [13]:
y[0]

tensor([1.])

In [14]:
embedding_dim = 16
max_len = 32
num_epochs = 50
mini_batch_size = 1000
qkv_dim = 4
hidden = 16
vocab_size = len(d.vocab)

model = Model(embedding_dim=embedding_dim, vocab_size=vocab_size, max_len=max_len,qkv_dim=qkv_dim, hidden=hidden)

In [15]:
model.load_state_dict(torch.load('./imdb_sentiment_model.pth'))


<All keys matched successfully>

In [16]:
def predict_sentiment(text):
    text = text.lower()
    encoded = encode_custom_text(text, d, max_size)
    #print(encoded)
    test_input = torch.tensor(encoded)
    test_input = test_input.reshape(1, test_input.shape[0])
    with torch.inference_mode():
        y_pred = model(test_input)
        sentiment = ""
        if torch.round(y_pred).item() == 1.0 :
            sentiment = "Positive"
        else:
            sentiment = "Negative"
    return sentiment, y_pred

In [24]:
text = "A cinematic disaster! A convoluted plot, cringe-worthy dialogue, and forgettable performances. This film is a painful experience that will leave you questioning your life choices. Avoid at all costs."
sentiment, y_pred = predict_sentiment(text)
print(f"{sentiment} : {y_pred.item()}")

Negative : 0.0002499183174222708


In [35]:
train_index, test_index = train_test_split(list(df.index),test_size=0.1,random_state=42)

In [36]:
print(len(train_index),len(test_index))

45000 5000


Unnamed: 0,review,sentiment,small_review
33553,"I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna defended the world title against Lex Luger this match was boring and it has a terrible ending. However it deserves 8/10",1,i really liked this summerslam due to the look of the arena the curtains and just the look overall was interesting to me for some reason anyways this could have been one
9427,"Not many television shows appeal to quite as many different kinds of fans like Farscape does...I know youngsters and 30/40+ years old;fans both Male and Female in as many different countries as you can think of that just adore this T.V miniseries. It has elements that can be found in almost every other show on T.V, character driven drama that could be from an Australian soap opera; yet in the same episode it has science fact & fiction that would give even the hardiest ""Trekkie"" a run for his money in the brainbender stakes! Wormhole theory, Time Travel in true equational form...Magnificent. It embraces cultures from all over the map as the possibilities are endless having multiple stars and therefore thousands of planets to choose from.With such a broad scope; it would be expected that nothing would be able to keep up the illusion for long, but here is where ""Farscape"" really comes into it's own element...It succeeds where all others have failed, especially the likes of Star Trek (a universe with practically zero Kaos element!) They ran out of ideas pretty quickly + kept rehashing them! Over the course of 4 seasons they manage to keep the audience's attention using good continuity and constant character evolution with multiple threads to every episode with unique personal touches to camera that are specific to certain character groups within the whole. This structure allows for an extremely large area of subject matter as loyalties are forged and broken in many ways on many many issues. I happened to see the pilot (Premiere) in passing and just had to keep tuning in after that to see if Crichton would ever ""Get the girl"", after seeing them all on television I was delighted to see them available on DVD & I have to admit that it was the only thing that kept me sane whilst I had to do a 12 hour night shift and developed chronic insomnia...Farscape was the only thing to get me through those extremely long nights...Do yourself a favour; Watch the pilot and see what I mean...Farscape Comet",1,not many television shows appeal to quite as many different kinds of fans like farscape does i know youngsters and years old fans both male and female in as many different countries as
...,...,...,...
48445,"Chan is in New York and he gets involved with an attempt to sabotage a new aircraft design.The war was over a year away from reaching America but the second world war was already raging everywhere else in the world and so it colored everything since most people probably realized that war was coming. Here the War isn't mentioned but the fact that the film deals with the production of planes at the very least alludes to it. The mystery itself is pretty good, it the notion of plane sabotage lends itself nicely to a couple of rather tense moments. To be certain we are talking about Charlie Chan so we can be certain that he would live to fight another day, but there was no guarantee what condition he would be in, not whether anyone around him would survive.I really like this film a great deal. Its not one of the nest, and far from the worst. It is one of the truly rare things, a truly enjoyable one. Definitely worth a look or six.",1,chan is in new york and he gets involved with an attempt to sabotage a new aircraft design the war was over a year away from reaching america but the second world war
20382,"My wife and I both thought this film a watered-down, made-for-TV (BBC) version of Manhattan Murder Mystery...which is itself good, but not great. The story has little inter- character tension or chemistry, and not much of a plot. Woody Allen's character just sort of wanders around running off at the mouth, and Hugh Jackman and Scarlett Johannsson don't have a lot more to do. It's pretty disappointing, I must say. Ian McShane's role is just an expanded cameo appearance. The first thing that occurred to me was ""I wonder how much the BBC had to pay Woody Allen to dislodge him from Manhatttan?"" He must've needed the money, and they must have needed his appeal to expand their audience beyond the youth market drawn to the two stars. I'm giving this movie 4 stars instead of 3 because it is unbothersome background noise. If you ever want something to have on while you're knitting or sorting your stamp collection, this'll do the job. I wouldn't pay to rent it again.",0,my wife and i both thought this film a watered down made for tv bbc version of manhattan murder mystery which is itself good but not great the story has little inter character tension or chemistry


In [43]:
x,y = get_data(df.iloc[test_index], d)

In [44]:
len(x)

5000

In [45]:
len(y)

5000

In [46]:
x[0]

tensor([24838, 43320, 30217, 52101,     0, 14341, 52611, 51841, 30729, 35861,
        51841,  4029, 51841, 12022,  2840, 28744, 51841, 30729, 39081, 57099,
        27255, 52611, 31875, 20376, 48831, 43376,  3513, 52101, 11316, 22913,
         6754, 36943], dtype=torch.int32)

In [47]:
test_input = x[0]
test_input = test_input.reshape(1, test_input.shape[0])

In [52]:
it_index = 0
with torch.inference_mode():
    
    for test_input,sentiment in zip(x,y):
        text = df.iloc[test_index[it_index]]['small_review']
        test_input = test_input.reshape(1, test_input.shape[0])
        y_pred = model(test_input)
        if torch.round(y_pred).item() == sentiment :
            print('PASS',y_pred, sentiment, text)
        else:
            print('FAIL',y_pred, sentiment, text)
        it_index = it_index + 1

FAIL tensor([[0.4957]]) tensor([1.]) i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me for some reason  anyways  this could have been one
PASS tensor([[0.9792]]) tensor([1.]) i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me for some reason  anyways  this could have been one
PASS tensor([[0.0018]]) tensor([0.]) i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me for some reason  anyways  this could have been one
PASS tensor([[0.9992]]) tensor([1.]) i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me for some reason  anyways  this could have been one
PASS tensor([[0.0248]]) tensor([0.]) i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me f

In [53]:
test_index[it_index]

IndexError: list index out of range

In [57]:
df.iloc[test_index[0]]['small_review']

'i really liked this summerslam due to the look of the arena  the curtains and just the look overall was interesting to me for some reason  anyways  this could have been one'