In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df=pd.read_csv("/home/sachchida/gitrepo/Deeplearning/data/imdb/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
from sklearn.preprocessing import LabelEncoder
import re

In [4]:
import re
import html
import unicodedata

def clean_text(text):
    if not isinstance(text, str):
        return text

    # Decode HTML entities (&nbsp; &amp; etc.)
    text = html.unescape(text)

    # Remove all HTML tags (<br>, <p>, <div>, etc.)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Normalize unicode (remove unreadable chars like ï¿½)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("utf-8", "ignore")

    # Remove non-alphabetic characters (keep spaces)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    #Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [5]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
df['cleaned']=df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment,cleaned
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei s Love in the Time of Money is a...


In [7]:
lable_enc=LabelEncoder()

In [8]:
df['sentiment_encoded']=lable_enc.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment,cleaned,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there s a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei s Love in the Time of Money is a...,1


In [9]:
texts = df["cleaned"].values
labels = df["sentiment_encoded"].values

In [10]:
import torch.nn as nn
import torch

In [11]:
def tokenize(text):
    return text.lower().split()

tokenized_texts = [tokenize(text) for text in texts]


In [12]:
from collections import Counter
counter = Counter()
for tokens in tokenized_texts:
    counter.update(tokens)

vocab = {word: idx + 2 for idx, (word, _) in enumerate(counter.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

vocab_size = len(vocab)
vocab_size = max(vocab.values()) + 1

In [13]:
MAX_LEN = 300

def encode(tokens):
    seq = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    return seq[:MAX_LEN]

encoded_texts = [encode(tokens) for tokens in tokenized_texts]


In [14]:
def pad_sequence(seq, max_len):
    return seq + [0] * (max_len - len(seq))

padded_texts = [pad_sequence(seq, MAX_LEN) for seq in encoded_texts]


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    padded_texts, labels, test_size=0.2, random_state=42
)

X_train = torch.tensor(X_train)
X_val = torch.tensor(X_val)
y_train = torch.tensor(y_train).float()
y_val = torch.tensor(y_val).float()


In [16]:
from torch.utils.data import Dataset, DataLoader
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(IMDBDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(IMDBDataset(X_val, y_val), batch_size=64)


In [17]:
class LSTMSentiment(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_dim):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embed_dim,padding_idx=0)
        self.lstm=nn.LSTM(embed_dim,hidden_dim,batch_first=True)
        self.fc=nn.Linear(hidden_dim,1)
    
    def forward(self,x):
        x=self.embedding(x)
        _, (hidden,_)=self.lstm(x)
        out=self.fc(hidden[-1])
        return torch.sigmoid(out).squeeze()


In [18]:
device="cuda" if torch.cuda.is_available() else "cpu"
model=LSTMSentiment(vocab_size=vocab_size,embed_dim=128,hidden_dim=128).to(device=device)
criterion=nn.BCELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)

In [19]:
EPOCHS = 5

for epoch in range(EPOCHS):

    # Put model in training mode
    # Enables dropout, gradient tracking
    model.train()

    total_loss = 0

    # Loop over mini-batches
    for X_batch, y_batch in train_loader:

        # Move data to GPU/CPU
        X_batch = X_batch.to(device)   # shape: (batch_size, seq_len)
        y_batch = y_batch.to(device)   # shape: (batch_size)

        # ------------------------------------------------
        # 1. FORWARD PASS (NO LABEL USED YET)
        # ------------------------------------------------
        preds = model(X_batch)
        # preds shape: (batch_size)
        # values between 0 and 1

        # ------------------------------------------------
        # 2. LOSS COMPUTATION (LABEL USED HERE)
        # ------------------------------------------------
        loss = criterion(preds, y_batch)
        # Measures how wrong predictions are

        # ------------------------------------------------
        # 3. BACKWARD PASS (GRADIENT COMPUTATION)
        # ------------------------------------------------
        optimizer.zero_grad()  # clear old gradients
        loss.backward()       # compute gradients

        # At this point:
        # - Embedding gradients exist
        # - LSTM gradients exist
        # - Linear layer gradients exist

        # ------------------------------------------------
        # 4. PARAMETER UPDATE
        # ------------------------------------------------
        optimizer.step()      # update weights

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Train Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Train Loss: 0.6926
Epoch 2, Train Loss: 0.6615
Epoch 3, Train Loss: 0.6481
Epoch 4, Train Loss: 0.5988
Epoch 5, Train Loss: 0.6570


In [20]:
model.eval()   # evaluation mode (no dropout)

correct = 0
total = 0

with torch.no_grad():  # disables gradient computation
    for X_batch, y_batch in val_loader:

        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        preds = model(X_batch)
        predicted = (preds > 0.5).float()

        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

val_accuracy = correct / total
print("Validation Accuracy:", val_accuracy)


Validation Accuracy: 0.5167
