In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.nn.functional import cross_entropy
from tqdm import tqdm
import json

# Define custom dataset class
class CustomDataset(Dataset):
    """Text-classification dataset backed by a JSON Lines file.

    Every non-blank line of the file must be a JSON object that has a
    'text' key and a 'label' key. Records are loaded eagerly in
    ``__init__``; tokenization happens lazily, one example per
    ``__getitem__`` call.
    """

    def __init__(self, file_path, tokenizer, max_length=128):
        """Read all examples from ``file_path``.

        Args:
            file_path: Path to a JSON Lines file (one JSON object per line).
            tokenizer: Callable invoked as
                ``tokenizer(text, truncation=True, padding='max_length',
                max_length=..., return_tensors='pt')`` whose result is
                indexable by 'input_ids' and 'attention_mask'.
            max_length: Length every example is padded/truncated to.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the 'text' or 'label' key.
        """
        self.data = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. an extra newline at end of file),
                # which json.loads would otherwise reject.
                if not line.strip():
                    continue
                example = json.loads(line)
                self.data.append({'text': example['text'], 'label': example['label']})
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch axis removed) and 'label' (the stored
            label wrapped in a tensor).
        """
        record = self.data[idx]
        label = torch.tensor(record['label'])

        encoding = self.tokenizer(
            record['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) drops only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and break batching.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BERT tokenizer and classifier (only the model is placed on `device`)
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# The model is moved to `device` right here, so no separate model.to(device) call is needed later
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)  # Assuming binary classification

# Load your dataset
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine
train_dataset = CustomDataset('/home/pratyush/Desktop/PreCog/hateful_memes/train.jsonl', tokenizer)

# Define training parameters
epochs = 3
learning_rate = 2e-5
batch_size = 8  # also read by the evaluation cells further down

# Create DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Set up optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation
# documented as its defaults (TODO confirm against the installed version),
# because torch's own defaults differ (eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# for epoch in range(epochs):
#     model.train()
#     total_loss = 0.0
#     for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)

#         # print(input_ids)

#         optimizer.zero_grad()

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         total_loss += loss.item()

#         loss.backward()
#         optimizer.step()

#     average_loss = total_loss / len(train_loader)
#     print(f'Epoch {epoch + 1}, Average Loss: {average_loss}')

# model.save_pretrained("fine_tuned_bert_model")
# tokenizer.save_pretrained("fine_tuned_bert_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Evaluate the fine-tuned classifier on the "seen" test split.
# Relies on names from the first cell: the imports, CustomDataset and batch_size.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'fine_tuned_bert_model'  # Change this to the directory where you saved your fine-tuned model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)  # Assuming binary classification

# Load your test dataset
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine
test_dataset = CustomDataset('/home/pratyush/Desktop/PreCog/hateful_memes/test_seen.jsonl', tokenizer)

# Create DataLoader for testing data (no shuffling: order is irrelevant for accuracy)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluation loop
model.eval()  # inference mode for layers such as dropout
total_correct = 0
total_samples = 0

with torch.no_grad():  # gradients are not needed for evaluation
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit along the class axis
        predictions = outputs.logits.argmax(dim=1)

        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

# Fail with a clear message instead of a bare ZeroDivisionError on an empty split
if total_samples == 0:
    raise ValueError('Test dataset is empty; accuracy is undefined.')

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Testing: 100%|██████████| 125/125 [00:05<00:00, 21.70it/s]

Test Accuracy: 57.10%





In [31]:
# Evaluate the fine-tuned classifier on the "unseen" test split.
# Relies on names from the first cell: the imports, CustomDataset and batch_size.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'fine_tuned_bert_model'  # Change this to the directory where you saved your fine-tuned model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)  # Assuming binary classification

# Load your test dataset
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine
test_dataset = CustomDataset('/home/pratyush/Desktop/PreCog/hateful_memes/test_unseen.jsonl', tokenizer)

# Create DataLoader for testing data (no shuffling: order is irrelevant for accuracy)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluation loop
model.eval()  # inference mode for layers such as dropout
total_correct = 0
total_samples = 0

with torch.no_grad():  # gradients are not needed for evaluation
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit along the class axis
        predictions = outputs.logits.argmax(dim=1)

        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

# Fail with a clear message instead of a bare ZeroDivisionError on an empty split
if total_samples == 0:
    raise ValueError('Test dataset is empty; accuracy is undefined.')

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Testing: 100%|██████████| 250/250 [00:11<00:00, 22.21it/s]

Test Accuracy: 63.35%



