In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

In [8]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one row of text per item.

    Rows are addressed by *position* (0 .. len - 1), which is what a
    ``DataLoader`` sampler supplies. The frame's index labels are never
    consulted, so a frame whose index has gaps (for example after ``dropna``)
    works without being re-indexed first.

    Args:
        dataframe: frame with a ``text`` column and a ``sentiment`` column;
            the sentiment value is converted to a ``torch.long`` tensor.
        tokenizer: object exposing ``encode_plus`` whose result provides
            ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded tensors and target for the row at position ``index``."""
        # .iloc is positional; plain ``series[index]`` would be a label lookup
        # and fail (or pick the wrong row) when the index is not 0..n-1.
        text = str(self.data['text'].iloc[index])
        target = self.data['sentiment'].iloc[index]

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the frame at construction time."""
        return self.len

In [9]:
# Directory that holds train.csv and test.csv. Set the SENTIMENT_DATA_DIR
# environment variable to point somewhere else instead of editing this path.
DATA_DIR = os.environ.get(
    "SENTIMENT_DATA_DIR",
    "/home/sayantika/Desktop/DS2sem4/Applied Machine Leaning/appliedml_assignment-5/sentiment_analysis",
)

# Read both splits with the same codec. 'unicode_escape' decodes as Latin-1 but
# additionally interprets backslash sequences inside the text, which would make
# the test split differ from the training split for any row containing "\".
CSV_ENCODING = 'latin1'

test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), encoding=CSV_ENCODING)
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), encoding=CSV_ENCODING)

In [10]:
# String label -> integer class id used as the model target.
sentiment_category_mapping = dict(negative=0, neutral=1, positive=2)

# Replace the string labels in place on both splits. Values that are not keys
# of the mapping come out as NaN, so running this cell a second time on the
# already-mapped frames would blank the column.
for _split in (train_df, test_df):
    _split['sentiment'] = _split['sentiment'].map(sentiment_category_mapping)

In [11]:
# Drop rows with any missing value, then renumber the index 0..n-1.
# Without the reset the surviving rows keep their old labels, and code that
# looks rows up by integer label (as a DataLoader-driven dataset may) would
# hit missing labels. Rebinding instead of inplace=True also keeps the cell
# safe to re-run.
test_df = test_df.dropna().reset_index(drop=True)


In [12]:
# Token length each text is padded/truncated to (passed to SentimentDataset as max_len).
MAX_LEN = 256
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 32
# Batch size of the test DataLoader used for evaluation.
VALID_BATCH_SIZE = 32
# Default file used by save_checkpoint / load_checkpoint (relative to the working directory).
CHECKPOINT_PATH = 'bert_model_checkpoint.pth'

In [14]:
# Device: use CUDA when torch can see a GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer for the same pretrained checkpoint the classifier is built from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Datasets and loaders: training batches are shuffled, test batches keep file order.
train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(test_df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

# Model: pretrained BERT with a 3-way classification head, moved to the device
# before the optimizer is created over its parameters.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
).to(device)

# Optimizer
# NOTE(review): transformers' own AdamW is reported as deprecated in favour of
# torch.optim.AdamW — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
def train(epoch):
    """Run one optimisation pass over ``train_loader`` and save a checkpoint.

    Operates on the notebook-level ``model``, ``optimizer``, ``train_loader``
    and ``device``.

    Args:
        epoch: zero-based epoch index. Every user-facing message (progress
            bar, loss summary, checkpoint notice) shows it as ``epoch + 1``.
    """
    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)

    # Hand the loader itself to tqdm (not an enumerate wrapper) so the bar
    # knows how many batches to expect and can show progress and ETA.
    for data in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            labels=targets,
        )
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # An empty loader has no meaningful mean; report NaN rather than crash.
    average_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

    # Save checkpoint
    save_checkpoint(epoch, model, optimizer)

# Evaluation function
def evaluate():
    """Predict a class for every batch in ``test_loader``.

    Uses the notebook-level ``model``, ``test_loader`` and ``device`` and runs
    with gradient tracking disabled.

    Returns:
        ``(targets, predictions)``: two flat lists holding, per example, the
        true label and the index of the highest logit.
    """
    model.eval()
    true_labels, predicted_labels = [], []
    batch_keys = ('ids', 'mask', 'token_type_ids', 'targets')

    with torch.no_grad():
        for data in tqdm(test_loader, desc="Evaluation"):
            batch = {key: data[key].to(device) for key in batch_keys}

            outputs = model(batch['ids'], batch['mask'], batch['token_type_ids'])
            # Index of the largest logit along the class dimension.
            predicted = outputs.logits.max(dim=1).indices

            true_labels.extend(batch['targets'].cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

    return true_labels, predicted_labels

# Save and load checkpoint
def save_checkpoint(epoch, model, optimizer, filename=CHECKPOINT_PATH):
    """Write the epoch index and the model/optimizer state dicts to ``filename``.

    Args:
        epoch: epoch index stored in the file; the notice prints ``epoch + 1``.
        model: object whose ``state_dict()`` is saved.
        optimizer: object whose ``state_dict()`` is saved.
        filename: destination path for ``torch.save``.
    """
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
        ),
        filename,
    )
    print(f"Checkpoint saved at epoch {epoch + 1}")

def load_checkpoint(model, optimizer, filename=CHECKPOINT_PATH):
    """Restore model/optimizer state from ``filename`` if that file exists.

    Epoch indices are zero-based throughout (messages display ``index + 1``),
    matching how ``train`` and ``save_checkpoint`` report them.

    Args:
        model: module that receives the saved ``model_state_dict``.
        optimizer: optimizer that receives the saved ``optimizer_state_dict``.
        filename: checkpoint path written by ``save_checkpoint``.

    Returns:
        Zero-based index of the next epoch to run: ``0`` when there is no
        checkpoint, otherwise the stored epoch plus one.
    """
    if not os.path.isfile(filename):
        print("No checkpoint found, starting from epoch 1")
        # Index 0 is the first epoch; returning 1 here would skip it.
        return 0

    # Remap tensors onto the model's current device so a checkpoint written on
    # a GPU machine can still be loaded on a CPU-only one.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None

    # torch.load unpickles the file; only load checkpoints you created yourself.
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # The stored value is the last *completed* epoch, so resume at the next one.
    next_epoch = checkpoint['epoch'] + 1
    print(f"Model loaded, Resuming training from epoch {next_epoch + 1}")
    return next_epoch

# Load model and optimizer state if exists
start_epoch = load_checkpoint(model, optimizer)

# Training loop
# Epoch indices run from start_epoch up to (but not including) NUM_EPOCHS, so
# a resumed run only performs the epochs that are still outstanding.
NUM_EPOCHS = 3
for epoch in tqdm(range(start_epoch, NUM_EPOCHS)):
    train(epoch)

# Evaluation
targets, outputs = evaluate()

# Class names listed in label-id order (negative=0, neutral=1, positive=2).
target_names = ['negative', 'neutral', 'positive']
# Pass the label ids explicitly so the matrix is always 3x3 and the report
# always has three rows, even if a class never occurs in targets/predictions.
label_ids = list(range(len(target_names)))

# Confusion Matrix and Classification Report
conf_matrix = confusion_matrix(targets, outputs, labels=label_ids)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

# Printed once, after the figure (the report was previously emitted twice).
print("Classification Report:")
print(classification_report(targets, outputs, labels=label_ids, target_names=target_names))

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/68d45e234eb4a928074dfd868cead0219ab85354cc53d20e772753c6bb9169d3?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1713534658&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMzUzNDY1OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iZXJ0LWJhc2UtdW5jYXNlZC82OGQ0NWUyMzRlYjRhOTI4MDc0ZGZkODY4Y2VhZDAyMTlhYjg1MzU0Y2M1M2QyMGU3NzI3NTNjNmJiOTE2OWQzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=XHIMMJfA1WV9wOqEcnG1-gfjo~l4CFmthnOzIaG2vQozcV2jZCvXD4UbnqMmniYZ-dpZT9fITpXX4helvpmpLx6WJlOyhALWCa3XDlvHQkoVq82RUPKNUYLpeXQG4jVWiCCqRz1DgYUXlPAaw6PbwSCaj3DKRuOv07vsIrRDeyWgoP6iGy2PAyT80yFUQTF5tUJwYfc~5jbTeVtYEV3w5lpgePN5lg5XZX0vzPbORE3oBvXcSktWNWr-Oo8dYwO88RORsRZytzT5aYExL-1BZyDFZEwqPaRoMLZBgfz5OPafUhTE7oslPEd2~Z30NjfbWLdca~FVOXJhzhPmL0sUNQ__&Key-Pair-Id=KVTP0A1DKRTAX (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x710e36961db0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))