In [88]:
import torch
import torch.nn as nn
import numpy as np 
import pandas as pd 
import re
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [89]:
class LSTMSentiment(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, num_classes):
        super(LSTMSentiment,self).__init__()
        self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_size)
        self.lstm=nn.LSTM(input_size=embedding_size,hidden_size=hidden_size,batch_first=True)
        self.fc=nn.Sequential(
                    nn.Linear(in_features=hidden_size,out_features=64),
                    nn.ReLU(),
                    nn.Linear(in_features=64,out_features=num_classes)
                )
    def forward(self,x):
        output,(h_t,x_t)=self.lstm(self.embedding(x))
        last_hidden=h_t[-1]
        legits=self.fc(last_hidden)
        return legits



In [90]:
df= pd.read_csv('./data/movie_reviews.csv')
print(df.head())

   review_id    customer_name  rating review_date     movie_name  \
0          1     Frank Carter       3   4/30/2025   Pulp Fiction   
1          2    Scott Simmons       3    3/3/2025  The Godfather   
2          3     Marvin Smith       3   5/15/2025      Inception   
3          4  Brittney Taylor       5   2/25/2025        Titanic   
4          5    Allison Smith       2   5/11/2025  The Godfather   

                                        movie_review  
0  Pulp Fiction was okay, not the best but watcha...  
1  The Godfather was okay, not the best but watch...  
2     It was fine. Inception could have been better.  
3  Titanic was an outstanding film with brilliant...  
4           The Godfather was a huge disappointment.  


In [91]:
reviews=df.loc[:,'movie_review'].values

In [92]:
reviews=reviews.sum()
print(reviews)
reviews = re.sub(r"[^\w\s'’]", " ", reviews)
reviews = reviews.lower()
print(reviews)


Pulp Fiction was okay, not the best but watchable.The Godfather was okay, not the best but watchable.It was fine. Inception could have been better.Titanic was an outstanding film with brilliant performances.The Godfather was a huge disappointment.The Lion King was okay, not the best but watchable.I found The Matrix to be average. Some good moments though.The Avengers lacked a compelling story or performance.I didn’t enjoy Avatar at all.I didn’t enjoy Fight Club at all.Pulp Fiction was a thrilling and emotional ride.I absolutely loved Titanic! A masterpiece!A must-watch! Avatar exceeded my expectations.I absolutely loved Interstellar! A masterpiece!The Dark Knight lacked a compelling story or performance.A must-watch! Avatar exceeded my expectations.The Shawshank Redemption delivers an unforgettable cinematic experience.A must-watch! The Shawshank Redemption exceeded my expectations.The Dark Knight was an outstanding film with brilliant performances.The Shawshank Redemption was a thrill

In [93]:
unique_reviews=set(reviews.split())
vocab_size=len(set(reviews.split())) +1
vocab_size

94

In [94]:
#build vocabular dictionary
word_to_index = {word:i+1 for i, word in enumerate(unique_reviews)}
word_to_index
index_to_word={i+1:word for i, word in enumerate(unique_reviews)}

In [95]:
seq_max_len=max([len(review.split()) for review in df['movie_review']])
seq_max_len


13

In [96]:
def encode_review(review):
    review = re.sub(r"[^\w\s'’]", " ", review).lower()
    print(review)
    review=review.split()
    encoded_review= [word_to_index[word] for i,word in enumerate(review)]
    encoded_review+=[0] * (seq_max_len-len(encoded_review))
    return encoded_review

In [97]:
print(encode_review('The Lord of the Rings has potential but didn’t quite hit the mark.'))
print(encode_review('The Godfather was okay, not the best but watchable.'))

the lord of the rings has potential but didn’t quite hit the mark 
[77, 84, 47, 77, 28, 2, 73, 37, 81, 91, 44, 77, 74]
the godfather was okay  not the best but watchable 
[77, 52, 23, 48, 4, 77, 75, 37, 83, 0, 0, 0, 0]


In [98]:
df['sentiment']=df['rating'].apply(lambda r:0 if r<=2 else 1 if r == 3 else 2)

In [99]:
df.loc[:,['rating','sentiment']]

Unnamed: 0,rating,sentiment
0,3,1
1,3,1
2,3,1
3,5,2
4,2,0
...,...,...
95,1,0
96,2,0
97,1,0
98,4,2


In [100]:
X=df['movie_review'].apply(lambda r:encode_review(r))

pulp fiction was okay  not the best but watchable 
the godfather was okay  not the best but watchable 
it was fine  inception could have been better 
titanic was an outstanding film with brilliant performances 
the godfather was a huge disappointment 
the lion king was okay  not the best but watchable 
i found the matrix to be average  some good moments though 
the avengers lacked a compelling story or performance 
i didn’t enjoy avatar at all 
i didn’t enjoy fight club at all 
pulp fiction was a thrilling and emotional ride 
i absolutely loved titanic  a masterpiece 
a must watch  avatar exceeded my expectations 
i absolutely loved interstellar  a masterpiece 
the dark knight lacked a compelling story or performance 
a must watch  avatar exceeded my expectations 
the shawshank redemption delivers an unforgettable cinematic experience 
a must watch  the shawshank redemption exceeded my expectations 
the dark knight was an outstanding film with brilliant performances 
the shawshank rede

In [101]:
X
Y=df['sentiment']

In [102]:
X_train, X_test, y_train,y_test=train_test_split(X,Y,test_size=.2,shuffle=True,random_state=42)
len(X_train)
len(y_train)

80

In [103]:
#Hyper Parameters 
embedding_dim = 32
hidden_size=64
epochs=8
learning_rate=0.01
batch_size = 32
for x in X_train:
    if (len(x)>13):
        print(x)

X_train_tensor = torch.tensor([x for x in X_train],dtype=torch.long)  # long for embeddings
y_train_tensor = torch.tensor(y_train.tolist(),dtype=torch.long)
X_test_tensor  = torch.tensor([x for x in X_test],dtype=torch.long)
y_test_tensor  = torch.tensor(y_test.tolist(),dtype=torch.long)

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

train_loader=DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True)
test_loader=DataLoader(dataset=test_dataset,batch_size=batch_size,shuffle=False)

In [104]:
model = LSTMSentiment(vocab_size,embedding_dim,hidden_size,3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(),lr=learning_rate)

In [105]:
epochs=10
num_steps=len(train_loader)
for epoch in range(epochs):
    model.train()
    for i,(x,y) in enumerate(train_loader):
        optimizer.zero_grad()
        ypred=model(x)
        loss= criterion(ypred,y)
        loss.backward()
        optimizer.step()
        if(i+1)%3 == 0:
            print(f"epoch {epoch+1}/{epochs} step {i+1}/{num_steps} loss:{loss.item():.4f}")

epoch 1/10 step 3/3 loss:1.0969
epoch 2/10 step 3/3 loss:1.1361
epoch 3/10 step 3/3 loss:0.9987
epoch 4/10 step 3/3 loss:0.7565
epoch 5/10 step 3/3 loss:0.9013
epoch 6/10 step 3/3 loss:0.6525
epoch 7/10 step 3/3 loss:0.3509
epoch 8/10 step 3/3 loss:0.3187
epoch 9/10 step 3/3 loss:0.1602
epoch 10/10 step 3/3 loss:0.0179


In [106]:
model.eval()
total_correct=0
total=0
for i,(x,y) in enumerate(test_loader):
        ypred=model(x)
        _,preds=torch.max(ypred,1)
        total_correct+=torch.sum(y==preds).item()
        total+=len(x)

acc=total_correct/total*100
print(f'Accuracy of the model on test data:{acc} %')

Accuracy of the model on test data:95.0 %
