In [4]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-09-03 09:16:14--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-09-03 09:16:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-09-03 09:16:15--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# Load and Preprocess the Data


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Download NLTK tokenizer models
import nltk
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('IMDB Dataset.csv')

# Encode the labels (positive -> 1, negative -> 0)
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Train-test split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
train_data['tokens'] = train_data['review'].apply(word_tokenize)
test_data['tokens'] = test_data['review'].apply(word_tokenize)

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, "Loading GloVe"):
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_file = 'glove.6B.100d.txt'  # Make sure you have this file downloaded
glove_embeddings = load_glove_embeddings(glove_file)
embedding_dim = 100

# Prepare vocabulary and word embeddings matrix
vocab = set([word for tokens in train_data['tokens'] for word in tokens])
word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}
word_to_idx["<PAD>"] = 0  # Padding index

# Create an embedding matrix
embedding_matrix = np.zeros((len(word_to_idx) + 1, embedding_dim))
for word, idx in word_to_idx.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

# Convert tokens to indices
def tokens_to_indices(tokens, word_to_idx):
    return [word_to_idx.get(token, 0) for token in tokens]

train_data['indices'] = train_data['tokens'].apply(lambda x: tokens_to_indices(x, word_to_idx))
test_data['indices'] = test_data['tokens'].apply(lambda x: tokens_to_indices(x, word_to_idx))

# Padding sequences
def pad_sequences(sequences, maxlen):
    return [seq[:maxlen] + [0]*(maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]

max_len = 200  # Define a maximum length for padding
train_data['padded'] = pad_sequences(train_data['indices'], max_len)
test_data['padded'] = pad_sequences(test_data['indices'], max_len)

# Convert to PyTorch tensors
X_train = torch.tensor(train_data['padded'].tolist())
y_train = torch.tensor(train_data['sentiment'].tolist(), dtype=torch.float32)
X_test = torch.tensor(test_data['padded'].tolist())
y_test = torch.tensor(test_data['sentiment'].tolist(), dtype=torch.float32)

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loading GloVe: 400000it [00:15, 25501.67it/s]


#  Vanilla RNN with GloVe Embeddings

In [6]:
class RNNModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Freeze embedding layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# Model instantiation
hidden_dim = 256
output_dim = 1
vocab_size = len(word_to_idx) + 1
rnn_model = RNNModel(embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(rnn_model.parameters())


# LSTM with GloVe Embeddings


In [7]:
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # Freeze embedding layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]
        return self.fc(hidden)

# Model instantiation
lstm_model = LSTMModel(embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix)

# Loss and optimizer
criterion_l = nn.BCEWithLogitsLoss()
optimizer_l = optim.Adam(lstm_model.parameters())


# Train the Models


In [8]:
def train_model(model, train_loader, optimizer, criterion, n_epochs=5):
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

# Train Vanilla RNN Model
train_model(rnn_model, train_loader, optimizer, criterion)

# Train LSTM Model
train_model(lstm_model, train_loader, optimizer_l, criterion_l)


Epoch 1, Loss: 0.6983772928237915
Epoch 2, Loss: 0.6974422760009765
Epoch 3, Loss: 0.695209515953064
Epoch 4, Loss: 0.6945141072273254
Epoch 5, Loss: 0.6937107044219971
Epoch 1, Loss: 0.6904668976783752
Epoch 2, Loss: 0.6908426870346069
Epoch 3, Loss: 0.6717964559555054
Epoch 4, Loss: 0.5713755627155304
Epoch 5, Loss: 0.3941732023000717


# Evaluation

In [9]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            predictions = torch.round(torch.sigmoid(output.squeeze()))
            y_true.extend(y_batch.tolist())
            y_pred.extend(predictions.tolist())
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, f1

accuracy, f1 = evaluate_model(rnn_model, test_loader)
print(f'Vanilla RNN with GloVe: Accuracy: {accuracy}, F1: {f1}')

accuracy, f1 = evaluate_model(lstm_model, test_loader)
print(f'LSTM with GloVe: Accuracy: {accuracy}, F1: {f1}')


Vanilla RNN with GloVe: Accuracy: 0.5247, F1: 0.4509645373686034
LSTM with GloVe: Accuracy: 0.8274, F1: 0.8339746056175452
