<a href="https://colab.research.google.com/github/riyaa14/BrainDead_Hackathon/blob/main/BERT_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np


FMS = pd.read_csv("/content/FHM_final.csv")
FMS = FMS.drop_duplicates()

# Concatenate 'gen_caption' and 'text' columns into 'textNdesc' column
FMS['textNdesc'] = 'In the picture ' + FMS.gen_caption + ' And the text says: ' + FMS.text


train_data, test_data = train_test_split(FMS, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [13]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [15]:
# Parameters
MAX_LEN = 128
BATCH_SIZE = 16
NUM_EPOCHS = 5  # Changed to 5 epochs
LEARNING_RATE = 2e-5  # Fine-tune as needed


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming binary classification

train_dataset = CustomDataset(train_data['textNdesc'], train_data['label'], tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_data['textNdesc'], val_data['label'], tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_data['textNdesc'], test_data['label'], tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

best_val_accuracy = 0.0
for epoch in range(NUM_EPOCHS):
    # Training loop
    model.train()
    train_preds = []
    train_true = []
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_true.extend(labels.cpu().numpy())
        train_preds.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy())


    train_accuracy = accuracy_score(train_true, train_preds)
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Train Accuracy: {train_accuracy:.4f}')


    model.eval()
    val_preds = []
    val_true = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].numpy()

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        val_preds.extend(preds)
        val_true.extend(labels)

    val_accuracy = accuracy_score(val_true, val_preds)
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Validation Accuracy: {val_accuracy}')

    # Check if current model is the best
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pth')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Train Accuracy: 0.5128
Epoch 1/5, Validation Accuracy: 0.4358974358974359
Epoch 2/5, Train Accuracy: 0.5214
Epoch 2/5, Validation Accuracy: 0.48717948717948717
Epoch 3/5, Train Accuracy: 0.5926
Epoch 3/5, Validation Accuracy: 0.4358974358974359
Epoch 4/5, Train Accuracy: 0.5812
Epoch 4/5, Validation Accuracy: 0.5897435897435898
Epoch 5/5, Train Accuracy: 0.7322
Epoch 5/5, Validation Accuracy: 0.48717948717948717


In [18]:
# Load the best model and evaluate on test set
model.load_state_dict(torch.load('best_model.pth'))
model.eval()
test_preds = []
test_true = []
test_probs = []  # Initializing test_probs list
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].numpy()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)

    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_preds.extend(preds)
    test_true.extend(labels)
    test_probs.extend(probs.detach().cpu().numpy()[:, 1])

In [19]:
roc_auc = roc_auc_score(test_true, test_preds)
print(f'ROC-AUC Score: {roc_auc}')

ROC-AUC Score: 0.47041666666666665
