In [24]:

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim

In [25]:
# Step 1: Load and preprocess the data
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        data: Table exposing a ``review`` column (the text) and a
            ``sentiment`` column. A row is labelled 1 when its sentiment
            equals the string ``'positive'`` and 0 for any other value.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            max_length=..., truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors carrying a leading batch axis of size 1.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length):
        self.texts = data['review'].values
        # Vectorised comparison instead of a per-row Python lambda; the
        # explicit int64 cast keeps the label dtype platform-independent.
        self.labels = data['sentiment'].eq('positive').astype('int64').values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx``.

        Returns:
            A tuple ``(input_ids, attention_mask, label)`` where the first two
            are the tokenizer outputs with the leading batch axis removed and
            ``label`` is a 0-d integer tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt'
        )
        # Remove only the leading batch axis. A bare squeeze() would also
        # drop the sequence axis whenever it happens to have length 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)


In [26]:
# Read the reviews table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

In [27]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Instantiate the pretrained tokenizer published under the
# 'bert-base-uncased' checkpoint name
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [29]:
# Wrap each split in an IMDBDataset; both share the tokenizer and the same
# 128-token pad/truncate length
train_dataset, test_dataset = (
    IMDBDataset(split, tokenizer, max_length=128)
    for split in (train_data, test_data)
)

In [30]:
# Batch both splits in groups of 16; only the training loader is shuffled
train_loader, test_loader = (
    DataLoader(train_dataset, batch_size=16, shuffle=True),
    DataLoader(test_dataset, batch_size=16),
)

In [31]:
# Step 2: Define the Transformer model for sentiment analysis
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-unit sigmoid head.

    Args:
        dropout: Drop probability applied to the encoder's pooled output
            before the linear head.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.drop = nn.Dropout(p=dropout)
        # Input width of the head is read from the encoder's own config.
        self.out = nn.Linear(in_features=encoder.config.hidden_size, out_features=1)

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the head applied to the encoder's pooled output.

        Both arguments are forwarded to the encoder by keyword. The result
        holds one value in (0, 1) per pooled vector, with a trailing axis of
        size 1 produced by the linear head.
        """
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        scores = self.out(self.drop(pooled))
        return scores.sigmoid()

In [32]:
# Build the classifier with its default dropout, pair it with binary
# cross-entropy on probabilities (the model's forward already applies the
# sigmoid), and optimise every parameter with Adam at a 1e-5 learning rate
model = SentimentClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [33]:
# Step 3: Train the model
# Use the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Module.to moves the parameters in place and returns the module itself
model = model.to(device)

def train_epoch(model, data_loader, criterion, optimizer, device=None):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Sized iterable yielding ``(input_ids, attention_mask,
            labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``
            with both arguments flattened to one value per sample.
        optimizer: Optimizer whose gradients are reset before, and stepped
            after, every batch.
        device: Device the batches are moved to. When omitted it is taken
            from the model's first parameter, falling back to the CPU for a
            parameter-free model.

    Returns:
        The sum of per-batch loss values divided by ``len(data_loader)``.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in data_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # Targets are cast to float to match the model's floating-point output.
        labels = labels.float().to(device)

        optimizer.zero_grad()
        # reshape(-1) flattens to one value per sample and keeps the batch
        # axis for a batch of size 1; a bare squeeze() would yield a 0-d
        # tensor there and no longer match the labels' shape.
        outputs = model(input_ids, attention_mask).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def eval_model(model, data_loader, device=None):
    """Compute classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one score per sample; scores strictly above 0.5 are
            counted as class 1, everything else as class 0.
        data_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.
        device: Device the inputs are moved to. When omitted it is taken
            from the model's first parameter, falling back to the CPU for a
            parameter-free model.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)`` over all
        samples seen.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # reshape(-1) keeps a 1-d result even for a batch of size 1; a
            # bare squeeze() would give a 0-d value that cannot be iterated.
            scores = model(input_ids, attention_mask).reshape(-1)
            predictions.extend((scores > 0.5).long().cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # be moved to the device.
            true_labels.extend(labels.reshape(-1).tolist())
    return accuracy_score(true_labels, predictions)

# Alternate one training pass with one evaluation pass for a fixed number of
# epochs, reporting the mean training loss and the test accuracy each time
epochs = 3
for epoch in range(epochs):
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
    )
    test_accuracy = eval_model(model=model, data_loader=test_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

print("Training complete.")

Epoch 1/3, Train Loss: 0.3209, Test Accuracy: 0.8895
Epoch 2/3, Train Loss: 0.2057, Test Accuracy: 0.8952
Epoch 3/3, Train Loss: 0.1242, Test Accuracy: 0.8931
Training complete.
