In [1]:
import pandas as pd
import tiktoken
import re
from train_v2 import Dictionary, Model
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', None) 

In [4]:
max_len = 32
file_path = "imdb_clean.csv"

In [5]:
d = Dictionary(file_path=file_path, max_len=max_len)

In [6]:
len(d.vocab)

41676

In [7]:
df = pd.read_csv(file_path)
max_size = max_len
df['small_review'] = df['review'].str.lower().str.split(n=max_size).str[:max_size].str.join(' ')
df['small_review'] = df['small_review'].apply(lambda x:  re.sub(r'[^a-z ]+', ' ', x))

In [6]:
# Pre pare data

In [8]:
def get_data(df: pd.DataFrame, data: Dictionary) -> (torch.Tensor, torch.Tensor):
    x = []
    y = []
    max_size = 32
    for index, row in df.iterrows():
        encoded = data.encode(row['small_review'])[0:max_size]
        length = len(encoded)
        encoded = np.pad(encoded, (0, max_size - length), 'constant')
        x.append(torch.tensor(encoded))

        y.append(row['sentiment'])
    x = np.vstack(x)
    y = np.vstack(y)

    return torch.tensor(x, dtype=torch.int), torch.tensor(y, dtype=torch.float32)


In [9]:
def encode_custom_text(text, data, max_size):
    encoded = data.encode(text)[0:max_size]
    length = len(encoded)
    encoded = np.pad(encoded, (0, max_size - length), 'constant')
    return encoded

In [10]:
x, y = get_data(df,d)

In [11]:
x.shape

torch.Size([50000, 32])

In [12]:
y.shape

torch.Size([50000, 1])

In [13]:
x[0]

tensor([26010, 25854, 37025, 26279, 30813, 16439, 23333, 37016,   592, 40296,
        19787, 26556, 11997, 41431, 21588,  2948, 17346, 37111,  1729, 31015,
         1918, 37157, 19084, 12374, 40568, 16313, 40902, 23106, 37025, 13525,
        37129, 37016], dtype=torch.int32)

In [14]:
y[0]

tensor([1.])

In [15]:
embedding_dim = 16
max_len = 32
num_epochs = 5
mini_batch_size = 64
qkv_dim = 8
hidden = 16

vocab_size = len(d.vocab)

model = Model(embedding_dim=embedding_dim, vocab_size=vocab_size, max_len=max_len,qkv_dim=qkv_dim, hidden=hidden)

In [16]:
model.load_state_dict(torch.load('./imdb_sentiment_model.pth'))


<All keys matched successfully>

In [17]:
def predict_sentiment(text):
    text = text.lower()
    encoded = encode_custom_text(text, d, max_size)
    #print(encoded)
    test_input = torch.tensor(encoded)
    test_input = test_input.reshape(1, test_input.shape[0])
    with torch.inference_mode():
        y_pred = model(test_input)
        sentiment = ""
        if torch.round(y_pred).item() == 1.0 :
            sentiment = "Positive"
        else:
            sentiment = "Negative"
    return sentiment, y_pred

In [56]:
text = "robot jox is a great little film ok some of the sets are bad and the acting is not that great but the special effects are very good for a film of"
sentiment, y_pred = predict_sentiment(text)
print(f"{sentiment} : {y_pred.item()}")

Negative : 0.47692060470581055


In [22]:
train_index, test_index = train_test_split(df.index,test_size=0.1,random_state=42)

In [23]:
print(len(train_index),len(test_index))

45000 5000


In [28]:
test_df = df.iloc[test_index]

In [33]:
test_df['predicted_sentiment'] =  test_df['small_review'].apply(lambda x : predict_sentiment(x)[1].item()).round()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_sentiment'] =  test_df['small_review'].apply(lambda x : predict_sentiment(x)[1].item()).round()


In [34]:
test_df.to_csv("predicted_test_df.csv",index=False)

In [48]:
result_wrong_df = test_df.loc[abs(test_df['sentiment'].astype(float) - test_df['predicted_sentiment']).astype(bool)]

In [49]:
result_wrong_df.to_csv("result_wrong_df.csv",index=False)