In [145]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
from torch.optim.lr_scheduler import StepLR
import re


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreeyapandey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [132]:
#Load the dataset
data = pd.read_csv('imdb_data.csv')

#Data Pr-processing
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.lower()

data['review'] = data['review'].apply(clean_text)

# Tokenization and removing stopwords
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

data['tokens'] = data['review'].apply(tokenize_and_remove_stopwords)

# Stemming
stemmer = PorterStemmer()
data['stemmed_tokens'] = data['tokens'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

# Convert text to sequences
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(data['stemmed_tokens'].apply(lambda tokens: ' '.join(tokens)))
sequences = tokenizer.texts_to_sequences(data['stemmed_tokens'].apply(lambda tokens: ' '.join(tokens)))
text_lengths = [min(len(seq), 200) for seq in sequences]  # Ensure all sequences are <= 200
sequences = [seq[:200] for seq in sequences]  # Truncate sequences

# Pad sequences
X = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

# Splitting the dataset
X_train, X_test, y_train, y_test, lengths_train, lengths_test = train_test_split(X, y, text_lengths, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train, y_train, lengths_train = map(torch.tensor, (X_train, y_train, lengths_train))
X_test, y_test, lengths_test = map(torch.tensor, (X_test, y_test, lengths_test))

# Create TensorDatasets and DataLoaders
train_data = TensorDataset(X_train, y_train, lengths_train)
test_data = TensorDataset(X_test, y_test, lengths_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


pre


In [147]:
#Model Design
class SentimentBiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional=True, drop_prob=0.5):
        super(SentimentBiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Consider using pre-trained embeddings here
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, text_lengths):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, _) = self.lstm(packed_embedded)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        hidden = self.dropout(hidden)
        return self.sig(self.fc(hidden))

# Initialize model, optimizer, and loss function
model = SentimentBiLSTM(8000, 400, 256, 1, 2, True, 0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Added L2 regularization
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

In [148]:
#Training of the model
def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, lengths_batch).squeeze()
        y_batch = y_batch.float()  
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()  # Update the learning rate
    return total_loss / len(train_loader)

num_epochs = 15  # Increased epochs
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, scheduler)
    print(f'Epoch {epoch+1}, Loss: {train_loss:.4f}')

Epoch 1, Loss: 0.4717
Epoch 2, Loss: 0.3022
Epoch 3, Loss: 0.2486
Epoch 4, Loss: 0.2126
Epoch 5, Loss: 0.1772
Epoch 6, Loss: 0.0997
Epoch 7, Loss: 0.0729
Epoch 8, Loss: 0.0557
Epoch 9, Loss: 0.0419
Epoch 10, Loss: 0.0320
Epoch 11, Loss: 0.0230
Epoch 12, Loss: 0.0211
Epoch 13, Loss: 0.0195
Epoch 14, Loss: 0.0184
Epoch 15, Loss: 0.0174


In [149]:
#Model Evaluation
def evaluate(model, test_loader):
    model.eval()  # Set model to evaluation mode
    true_labels = []
    predictions = []
    predicted_probs = []

    with torch.no_grad():
        for X_batch, y_batch, lengths_batch in test_loader:
            outputs = model(X_batch, lengths_batch).squeeze()
            probs = outputs.numpy()
            preds = (probs > 0.5).astype(int)
            true_labels.extend(y_batch.numpy())
            predictions.extend(preds)
            predicted_probs.extend(probs)  # Store probabilities for AUC calculation

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    conf_matrix = confusion_matrix(true_labels, predictions)
    auc_score = roc_auc_score(true_labels, predicted_probs)  # Corrected use of predicted probabilities

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"AUC Score: {auc_score:.4f}")

# Evaluate the model
evaluate(model, test_loader)

Accuracy: 0.8759
Precision: 0.8706
Recall: 0.8853
F1 Score: 0.8779
Confusion Matrix:
[[4298  663]
 [ 578 4461]]
AUC Score: 0.9310
