<a href="https://colab.research.google.com/github/rajnishkumar1906/Deep-Learning/blob/main/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Uninstall preinstalled versions
!pip uninstall -y torch torchvision torchaudio torchtext

# Install compatible versions (example for torch 2.1.2)
# Allow pip to choose a compatible torchtext version
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 torchtext

Found existing installation: torch 2.1.2
Uninstalling torch-2.1.2:
  Successfully uninstalled torch-2.1.2
Found existing installation: torchvision 0.16.2
Uninstalling torchvision-0.16.2:
  Successfully uninstalled torchvision-0.16.2
Found existing installation: torchaudio 2.1.2
Uninstalling torchaudio-2.1.2:
  Successfully uninstalled torchaudio-2.1.2
Found existing installation: torchtext 0.16.2
Uninstalling torchtext-0.16.2:
  Successfully uninstalled torchtext-0.16.2
Collecting torch==2.1.2
  Using cached torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.16.2
  Using cached torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.1.2
  Using cached torchaudio-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting torchtext
  Using cached torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
INFO: pip is looking at multiple versions of torchtext to determine which version is com

#This project performs sentiment analysis on IMDB movie reviews using a Feedforward Neural Network (FFNN). The goal is to classify each review as positive or negative based on its text content.

>Tokenize and vectorize the text using a fixed-size vocabulary.

>Convert reviews into binary vectors (indicating presence of words).

>Train a simple FFNN with one hidden layer.

>Evaluate model performance on a test set.

>Use the trained model to predict sentiment on new reviews.

>This project introduces basic deep learning concepts for NLP, using FFNN     instead of more complex models like RNNs or LSTMs.

# Load Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Load IMDB dataset
df = pd.read_csv("/content/drive/MyDrive/DATASETS/IMDB Dataset.csv")

# Convert sentiment labels to binary (positive:1, negative:0)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df = df[['review', 'label']]

In [4]:
rvw = df['review'][0]
len(rvw)

1761

# Tokenize and vocabulary

In [5]:
from torchtext.data.utils import get_tokenizer
from collections import Counter

tokenizer = get_tokenizer('basic_english')

# Build vocabulary from most common tokens
VOCAB_SIZE = 5000
specials = ['<PAD>', '<UNK>']
counter = Counter()

for review in df['review']:
    counter.update(tokenizer(review))

# Assign indices to words
vocab = {word: idx + len(specials) for idx, (word, _) in enumerate(counter.most_common(VOCAB_SIZE - len(specials)))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

# Encode Reviews as Sequences of Token IDs



In [6]:
import torch
from torch.nn.utils.rnn import pad_sequence

MAX_LEN = 200

def encode_review(text):
    tokens = tokenizer(text)
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

encoded = [torch.tensor(encode_review(review), dtype=torch.long) for review in df['review']]

padded = pad_sequence(
    [r[:MAX_LEN] if len(r) > MAX_LEN else torch.cat([r, torch.zeros(MAX_LEN - len(r), dtype=torch.long)])
     for r in encoded],
    batch_first=True
)

X = padded
y = torch.tensor(df['label'].values, dtype=torch.long)


# Train-Test Split

In [7]:
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# DataLoader

In [8]:
from torch.utils.data import DataLoader, TensorDataset

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

# Define Model

In [9]:
import torch.nn as nn

class SimpleTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)        # (batch, seq_len, embed_dim)
        x = x.mean(dim=1)            # average pooling over sequence
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)


# Initialize model

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleTextClassifier(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Training loop

In [11]:
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Loss: {total_loss:.4f}")

Epoch 1/5 - Loss: 314.4650
Epoch 2/5 - Loss: 214.1900
Epoch 3/5 - Loss: 186.8197
Epoch 4/5 - Loss: 172.1896
Epoch 5/5 - Loss: 162.8905


# Evaluate accuracy

In [12]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 86.69%


# Custom predictions




In [13]:
def predict(text):
    model.eval()
    tokens = encode_review(text)
    if len(tokens) < MAX_LEN:
        tokens += [0] * (MAX_LEN - len(tokens))
    else:
        tokens = tokens[:MAX_LEN]
    input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.argmax(output, dim=1).item()
    return "positive" if pred == 1 else "negative"

print(predict("This movie was absolutely amazing!"))
print(predict("Worst plot ever. Waste of time."))


positive
negative
