In [1]:
import re
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('IMDB Dataset.csv')

def remove_tags(string):
    removelist = ""
    result = re.sub('<.*?>', '', string)  # Remove HTML tags
    result = re.sub('https://.*', '', result)  # Remove URLs
    result = re.sub(r'[^\w'+removelist+']', ' ', result)  # Remove non-alphanumeric characters
    result = result.lower()
    return result

data['review'] = data['review'].apply(lambda cw: remove_tags(cw))

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st

data['review'] = data['review'].apply(lemmatize_text)

# Encode labels
encoder = LabelEncoder()
data['sentiment'] = encoder.fit_transform(data['sentiment'])

# Split the dataset
reviews = data['review'].values
labels = data['sentiment'].values
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, labels, stratify=labels, test_size=0.2)

# Hyperparameters
vocab_size = 3000
embedding_dim = 100
max_length = 200
batch_size = 32
oov_token = '<OOV>'

# Tokenization and Padding
class Vocabulary:
    def __init__(self, num_words):
        self.word_index = {}
        self.index_word = {}
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index[oov_token] = 1
        self.index_word[1] = oov_token

    def fit_on_texts(self, texts):
        word_counts = {}
        for text in texts:
            for word in text.split():
                word_counts[word] = word_counts.get(word, 0) + 1

        sorted_vocab = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        for i, (word, _) in enumerate(sorted_vocab[:self.num_words - 1], start=2):
            self.word_index[word] = i
            self.index_word[i] = word

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            seq = [
                min(self.word_index.get(word, self.word_index[self.oov_token]), self.num_words - 1)
                for word in text.split()
            ]
            sequences.append(seq)
        return sequences


vocab = Vocabulary(num_words=vocab_size)
vocab.fit_on_texts(train_sentences)

train_sequences = vocab.texts_to_sequences(train_sentences)
test_sequences = vocab.texts_to_sequences(test_sentences)

train_padded = nn.utils.rnn.pad_sequence(
    [torch.tensor(seq[:max_length]) for seq in train_sequences], batch_first=True, padding_value=0
)
test_padded = nn.utils.rnn.pad_sequence(
    [torch.tensor(seq[:max_length]) for seq in test_sequences], batch_first=True, padding_value=0
)


# Convert data to tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Dataset and DataLoader
class SentimentDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = SentimentDataset(train_padded, train_labels)
test_dataset = SentimentDataset(test_padded, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [3]:


# Model Definition
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden = lstm_out[:, -1, :]
        output = self.fc(hidden)
        return torch.sigmoid(output)

# Initialize model
hidden_dim = 64
output_dim = 1
model = SentimentRNN(vocab_size, embedding_dim, hidden_dim, output_dim)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
bce_loss_fn = torch.nn.BCELoss()

In [4]:

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            labels = labels.float().unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs >= 0.5).int()
            all_preds.extend(preds.squeeze().tolist())
    return all_preds

In [5]:



# Train the model
num_epochs = 5
train_model(model, train_loader, criterion, optimizer, num_epochs)

# Evaluate the model
predictions = evaluate_model(model, test_loader)
predictions = torch.tensor(predictions).cpu().numpy()
test_labels = test_labels.cpu().numpy()
print("Accuracy:", accuracy_score(test_labels, predictions))


# Predictions on new sentences
sentences = ["The movie was very good and emotional",
             "I have never seen a bad movie like this"]

sequences = vocab.texts_to_sequences(sentences)
padded = nn.utils.rnn.pad_sequence([torch.tensor(seq[:max_length]) for seq in sequences], batch_first=True, padding_value=0)

with torch.no_grad():
    outputs = model(padded)
    preds = (outputs >= 0.5).int()
    for i, sentence in enumerate(sentences):
        print(sentence)
        print("Predicted sentiment:", "Positive" if preds[i].item() == 1 else "Negative")

Epoch 1/5, Loss: 0.6928
Epoch 2/5, Loss: 0.6908
Epoch 3/5, Loss: 0.6825
Epoch 4/5, Loss: 0.5516
Epoch 5/5, Loss: 0.3497
Accuracy: 0.8611
The movie was very good and emotional
Predicted sentiment: Positive
I have never seen a bad movie like this
Predicted sentiment: Negative


In [6]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import torch
all_predictions = np.vstack(predictions)  # Stack into a single NumPy array
all_labels = np.vstack(test_labels)

# Convert probabilities to discrete labels (binary classification)
binary_predictions = (all_predictions > 0.5).astype(int)
all_labels = all_labels.astype(int)

# Calculate BCE
bce_loss = bce_loss_fn(
    torch.tensor(all_predictions, dtype=torch.float32), 
    torch.tensor(all_labels, dtype=torch.float32)
).item()

# Calculate other metrics
accuracy = accuracy_score(all_labels, binary_predictions)
f1 = f1_score(all_labels, binary_predictions, average='macro')
precision = precision_score(all_labels, binary_predictions, average='macro')
recall = recall_score(all_labels, binary_predictions, average='macro')

print(f'Binary Cross Entropy: {bce_loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Binary Cross Entropy: 13.8900
Accuracy: 0.8611
F1 Score: 0.8609
Precision: 0.8628
Recall: 0.8611
