In [None]:
import torch
import torch.nn as nn

import glob
import os
import pandas as pd

from tqdm import tqdm
import time

import kagglehub

In [None]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:01<00:00, 21.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [None]:
!ls /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1

'IMDB Dataset.csv'


In [None]:
path = '/root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1/IMDB Dataset.csv'
os.path.isfile(path)

True

In [None]:
dataset = pd.read_csv(path)
print(len(dataset))
print(dataset.shape)
pd.set_option('display.max_colwidth', 200)
dataset.head(5)

50000
(50000, 2)


Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue i...",positive
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenl...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what mone...",positive


In [None]:
def cleaner(text):
    text = text.replace('<br />', ' ')
    text = text.replace('<br/>', ' ')
    text = text.replace('<br>', ' ')
    text = text.lower()
    for punctuation in ".,!?:;:":
        text = text.replace(punctuation, f'')

    return text

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

tokens = []

def build_vocab_simple(texts, max_words=10000):
    
    for text in texts:
      for word in word_tokenize(cleaner(text)):
        tokens.append(word)

    freq_dist = FreqDist(tokens)

    
    vocab = ['<PAD>', '<UNK>'] + [word for word, _ in freq_dist.most_common(max_words)]

    
    return {word: idx for idx, word in enumerate(vocab)}

In [None]:
time_start = time.time()

vocab = build_vocab_simple(dataset['review'])
print("Vocabulary size:", len(vocab))

time_end = time.time()
print(f'Time to build vocab: {time_end - time_start}')

Vocabulary size: 10002
Time to build vocab: 53.13648462295532


In [None]:
vocab_20 = list(vocab.items())[:20]
for word, idx in vocab_20:
    print(f'{word}: {idx}')

<PAD>: 0
<UNK>: 1
the: 2
and: 3
a: 4
of: 5
to: 6
is: 7
it: 8
in: 9
i: 10
this: 11
that: 12
's: 13
was: 14
as: 15
with: 16
for: 17
movie: 18
but: 19


In [None]:
from torch.utils.data import TensorDataset, DataLoader

def prepare_movie_reviews(reviews, labels, vocab, max_len=200):
    all_input_ids = []
    all_labels = []

    for review, label in zip(reviews, labels): 

        tokens = word_tokenize(review)[:max_len]

        indices = [vocab.get(token, 1) for token in tokens] 

        
        if len(indices) < max_len:

            indices += [0] * (max_len - len(indices))

        all_input_ids.append(indices)
        all_labels.append(1 if label == 'positive' else 0)

    
    inputs_tensor = torch.tensor(all_input_ids)
    labels_tensor = torch.tensor(all_labels)

    
    dataset = TensorDataset(inputs_tensor, labels_tensor)

    return dataset


In [None]:
from sklearn.model_selection import train_test_split

reviews = dataset['review']
labels = dataset['sentiment']

x_train, x_test, y_train, y_test = train_test_split(reviews, labels)

print(f'x_train: {len(x_train)} x_test: {len(x_test)} y_train: {len(y_train)} y_test: {len(y_test)}')

x_train: 37500 x_test: 12500 y_train: 37500 y_test: 12500


In [None]:
start = time.time()
train_dataset = prepare_movie_reviews(x_train, y_train, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
end = time.time()
print(f'Time to prepare data: {end - start}')

Time to prepare data: 55.22710156440735


In [None]:
sample_batch = next(iter(train_loader))
print(f'This is the shape of the reviews: {sample_batch[0].shape}')
print(f'This is the shape of the labels: {sample_batch[1].shape}')


This is the shape of the reviews: torch.Size([32, 200])
This is the shape of the labels: torch.Size([32])


In [None]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=200, hidden_dim=256, n_layers=2):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            batch_first=True)

        self.linear1 = nn.Linear(hidden_dim, 1)  
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        embedded = self.dropout(self.embeddings(x))
        lstm_out, _ = self.lstm(embedded)  

        out = self.linear1(lstm_out[:, -1, :])  
        return torch.sigmoid(out)

model = SentimentLSTM(vocab_size=len(vocab))
model

SentimentLSTM(
  (embeddings): Embedding(10002, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True)
  (linear1): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
model.to(device)

cuda


SentimentLSTM(
  (embeddings): Embedding(10002, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True)
  (linear1): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [None]:
criterion = nn.BCELoss()

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
review, label = next(iter(train_loader))
print(review.shape)
print(label.shape)

torch.Size([32, 200])
torch.Size([32])


In [None]:

def train_model(model, device, train_loader, criterion, optimizer, epochs):
    print(device)
    model.to(device)

    model.train()

    for epoch in range(epochs): 
        total_loss = 0

        for reviews, label in tqdm(train_loader, desc="training"):
            
            inputs = reviews.to(device)
            labels = label.float().to(device)

            
            optimizer.zero_grad()

            
            output = model(inputs)

            output = output.squeeze(1)

            loss = criterion(output, labels)

            
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')


In [None]:
train_model(model, device, train_loader, criterion, optimizer, epochs=15)

cuda


training: 100%|██████████| 1172/1172 [00:27<00:00, 42.97it/s]


Epoch 1, Loss: 0.6896


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.58it/s]


Epoch 2, Loss: 0.6932


training: 100%|██████████| 1172/1172 [00:24<00:00, 46.93it/s]


Epoch 3, Loss: 0.5313


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.68it/s]


Epoch 4, Loss: 0.3847


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.82it/s]


Epoch 5, Loss: 0.3235


training: 100%|██████████| 1172/1172 [00:25<00:00, 46.19it/s]


Epoch 6, Loss: 0.2990


training: 100%|██████████| 1172/1172 [00:25<00:00, 46.07it/s]


Epoch 7, Loss: 0.2689


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.70it/s]


Epoch 8, Loss: 0.2453


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.88it/s]


Epoch 9, Loss: 0.2209


training: 100%|██████████| 1172/1172 [00:26<00:00, 44.71it/s]


Epoch 10, Loss: 0.2014


training: 100%|██████████| 1172/1172 [00:26<00:00, 43.94it/s]


Epoch 11, Loss: 0.1802


training: 100%|██████████| 1172/1172 [00:25<00:00, 45.70it/s]


Epoch 12, Loss: 0.1607


training: 100%|██████████| 1172/1172 [00:25<00:00, 46.00it/s]


Epoch 13, Loss: 0.1465


training: 100%|██████████| 1172/1172 [00:26<00:00, 43.79it/s]


Epoch 14, Loss: 0.1300


training: 100%|██████████| 1172/1172 [00:26<00:00, 44.73it/s]

Epoch 15, Loss: 0.1178





In [None]:
test_dataset = prepare_movie_reviews(x_test, y_test, vocab)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
def test_model(model, device, test_loader, criterion):
    print(device)
    model.to(device)

    model.eval()  
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  
        for reviews, label in tqdm(test_loader, desc="testing"):
            
            inputs = reviews.to(device)
            labels = label.float().to(device)

            
            outputs = model(inputs)
            outputs = outputs.squeeze(1) 

            
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            
            predicted = (outputs >= 0.5).float()  
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

        avg_loss = total_loss / len(test_loader)  
        accuracy = (correct_predictions / total_predictions) * 100  

        print(f'Test Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')


In [None]:
test_model(NEW_MODEL, device, test_loader, criterion)


cuda


testing: 100%|██████████| 391/391 [00:03<00:00, 126.25it/s]

Test Loss: 0.4822, Accuracy: 85.61%





In [None]:
def predict(model, vocab, text, device, max_len=200):
    
    tokens = word_tokenize(text)

    
    indices = [vocab.get(token, 1) for token in tokens]

    
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))

    
    input_tensor = torch.tensor(indices).unsqueeze(0).to(device)  

    
    model.eval()  
    with torch.no_grad():
        output = model(input_tensor)  

    
    predicted_class = torch.round(output).item() 

    
    return "positive" if predicted_class == 1 else "negative"


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
text = '''The movie was so bad that I wanted to throw my popcorn at the person in front of
me and storm out of the theater yelling '''
prediction = predict(model, vocab, text, device)
print(f"Prediction: {prediction}")

Prediction: negative


In [None]:
text = '''This movie was the most beutiful piece of art Ive ever seen in my life'''
prediction = predict(model, vocab, text, device)
print(f"Prediction: {prediction}")

Prediction: positive


In [None]:
text = '''Trash, just trash'''
prediction = predict(model, vocab, text, device)
print(f"Prediction: {prediction}")

Prediction: negative


In [None]:
text = '''Trash, just trash. Never Recomending this movie to anyone'''
prediction = predict(model, vocab, text, device)
print(f"Prediction: {prediction}")

Prediction: negative


In [None]:
#any predictions