In [None]:
# Download the dataset archive from Kaggle (uncomment the curl line below to fetch it)
#!/bin/bash
# !curl -L -o facebook-hateful-meme-dataset.zip "https://www.kaggle.com/api/v1/datasets/download/parthplc/facebook-hateful-meme-dataset"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 3434M  100 3434M    0     0  30.8M      0  0:01:51  0:01:51 --:--:-- 31.3M


In [6]:
# Extract the dataset
# -q suppresses unzip's per-file listing; -d names the directory to extract into.
!unzip -q facebook-hateful-meme-dataset.zip -d hateful-memes
# List the extraction directory as a quick check that the archive unpacked.
!ls -la hateful-memes

total 0
drwxr-xr-x  3 michaelosmolovskiy  staff   96 Mar 27 13:30 .
drwxr-xr-x  6 michaelosmolovskiy  staff  192 Mar 27 13:30 ..
drwxr-xr-x  8 michaelosmolovskiy  staff  256 Mar 27 13:31 data


In [7]:
# Install required packages
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases
# than the ones recorded in this cell's output.
%pip install torch torchvision transformers scikit-learn matplotlib pandas numpy pillow

Collecting torch
  Using cached torch-2.6.0-cp313-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.21.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting transformers
  Downloading transformers-4.50.2-py3-none-any.whl.metadata (39 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting pillow
  Downloading pillow-11.1.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting

In [8]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer, BertModel

# Use a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [14]:
# Load the dataset
def load_jsonl(data_path):
    """
    Load a JSONL file (JSON Lines format) where each line is a valid JSON object.

    Lines are stripped of surrounding whitespace before parsing. Blank lines and
    lines that start with ``//`` are skipped.

    Args:
        data_path: Path of the JSONL file to read.

    Returns:
        A list with one parsed object per non-skipped line, in file order.

    Raises:
        json.JSONDecodeError: If a non-skipped line is not valid JSON. The start
            of the offending line and the parser message are printed first.
    """
    data = []
    # JSON Lines files are UTF-8; do not depend on the platform's default encoding,
    # which would mis-decode non-ASCII text on some systems.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip empty lines and comment lines
            if line and not line.startswith('//'):
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Show only the first 100 characters so a huge line does not flood the output.
                    print(f"Error parsing line: {line[:100]}...")
                    print(f"Error message: {str(e)}")
                    raise
    return data

# Read the three dataset splits, in train / dev / test order.
train_data, dev_data, test_data = map(load_jsonl, (
    'hateful-memes/data/train.jsonl',
    'hateful-memes/data/dev.jsonl',
    'hateful-memes/data/test.jsonl',
))

# Report how many records each split holds.
print(f"Train samples: {len(train_data)}")
print(f"Dev samples: {len(dev_data)}")
print(f"Test samples: {len(test_data)}")

# Show the first training record as an example of the schema.
first_record = train_data[0]
print("\nExample data point:")
print(first_record)

Train samples: 8500
Dev samples: 500
Test samples: 1000

Example data point:
{'id': 42953, 'img': 'img/42953.png', 'label': 0, 'text': 'its their character not their color that matters'}


In [21]:
# Create dataset class
class HatefulMemesDataset(Dataset):
    """
    Dataset that pairs each meme image with its (optionally tokenized) caption.

    Every record in `data` is a dict with an 'img' path, joined onto `img_dir`,
    and a 'text' caption. A 'label' key is optional.

    Args:
        data: Indexable collection of record dicts.
        img_dir: Directory that each record's 'img' path is joined onto.
        transform: Optional callable applied to the RGB PIL image.
        tokenizer: Optional callable invoked as
            ``tokenizer(text, padding='max_length', max_length=..., truncation=True,
            return_tensors='pt')``; its result is indexed with 'input_ids' and
            'attention_mask'.
        max_length: Length captions are padded/truncated to by the tokenizer.
            Defaults to 64, the value this class previously hard-coded.
    """

    def __init__(self, data, img_dir='hateful-memes/data', transform=None, tokenizer=None, max_length=64):
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the sample for record `idx`.

        Returns:
            A dict with keys 'image', 'input_ids', 'attention_mask', 'text' and
            'label'. 'label' is ``torch.tensor(-1)`` when the record has no
            'label' key. Without a tokenizer, 'input_ids' and 'attention_mask'
            are the placeholder ``torch.tensor([0])``.
        """
        item = self.data[idx]

        # Load image. convert() returns a new, fully loaded image, so the file
        # handle can be closed as soon as the conversion is done.
        img_path = os.path.join(self.img_dir, item['img'])
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Process text
        text = item['text']
        if self.tokenizer:
            encoded_text = self.tokenizer(
                text,
                padding='max_length',
                max_length=self.max_length,
                truncation=True,
                return_tensors='pt',
            )
            # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
            input_ids = encoded_text['input_ids'].squeeze(0)
            attention_mask = encoded_text['attention_mask'].squeeze(0)
        else:
            input_ids = torch.tensor([0])
            attention_mask = torch.tensor([0])

        # Get label if available; -1 marks an unlabeled record.
        label = torch.tensor(item['label']) if 'label' in item else torch.tensor(-1)

        return {
            'image': image,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'text': text,
            'label': label
        }

In [22]:
# Prepare data loaders
# Image preprocessing: resize to 224x224, convert to a tensor, then normalise each
# channel with fixed mean/std constants (the commonly used ImageNet values).
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
image_transforms = transforms.Compose(preprocessing_steps)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# All three splits share the same image transform and tokenizer.
shared_dataset_kwargs = {'transform': image_transforms, 'tokenizer': tokenizer}
train_dataset = HatefulMemesDataset(train_data, **shared_dataset_kwargs)
dev_dataset = HatefulMemesDataset(dev_data, **shared_dataset_kwargs)
test_dataset = HatefulMemesDataset(test_data, **shared_dataset_kwargs)

# Only the training loader shuffles; dev and test keep file order.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [23]:
# Define the baseline model
class BaselineHatefulMemesModel(nn.Module):
    """
    Baseline classifier that fuses frozen image and text features.

    A ResNet18 (final FC layer removed) and a BERT encoder act as fixed feature
    extractors: their parameters have ``requires_grad = False`` and they are kept
    in evaluation mode at all times. Their outputs are concatenated and passed
    through a small trainable MLP that produces one logit per sample.
    """

    def __init__(self):
        super().__init__()

        # Image encoder - ResNet18 pretrained.
        # The `weights=` enum replaces the deprecated `pretrained=True` flag and
        # selects the same ImageNet checkpoint.
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        modules = list(resnet.children())[:-1]  # Remove the last FC layer
        self.image_encoder = nn.Sequential(*modules)
        for param in self.image_encoder.parameters():
            param.requires_grad = False

        # Text encoder - BERT
        self.text_encoder = BertModel.from_pretrained('bert-base-uncased')
        for param in self.text_encoder.parameters():
            param.requires_grad = False

        # Frozen encoders must also be in eval mode from the start (see train()).
        self.image_encoder.eval()
        self.text_encoder.eval()

        # Combined classifier
        self.classifier = nn.Sequential(
            nn.Linear(512 + 768, 256),  # ResNet18 features + BERT embeddings
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )

    def train(self, mode=True):
        """
        Set the training mode of the classifier head only.

        `requires_grad = False` stops weight updates but does not stop BatchNorm
        layers from updating their running statistics, nor does it disable
        dropout. Both encoders are therefore forced back to eval mode so they
        stay deterministic, fixed feature extractors during training.

        Args:
            mode: True for training mode, False for evaluation mode.

        Returns:
            self, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.image_encoder.eval()
        self.text_encoder.eval()
        return self

    def forward(self, image, input_ids, attention_mask):
        """
        Compute one logit per sample.

        Args:
            image: Batch of image tensors fed to the ResNet encoder.
            input_ids: Token ids fed to BERT.
            attention_mask: Attention mask fed to BERT.

        Returns:
            Tensor of raw logits with the trailing size-1 dimension squeezed away.
        """
        # Extract image features
        image_features = self.image_encoder(image)
        image_features = image_features.view(image_features.size(0), -1)  # Flatten

        # Extract text features
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output

        # Combine features
        combined_features = torch.cat((image_features, text_features), dim=1)

        # Classify
        output = self.classifier(combined_features)
        return output.squeeze(-1)

In [24]:
# Initialize model
# Build the network and place it on the selected device.
model = BaselineHatefulMemesModel()
model = model.to(device)

# The model emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)



In [25]:
# Training function
def train_epoch(model, dataloader, optimizer, criterion):
    """
    Run one optimisation pass over `dataloader`.

    Args:
        model: Called as ``model(images, input_ids, attention_mask)``; returns logits.
        dataloader: Yields dict batches with 'image', 'input_ids',
            'attention_mask' and 'label' entries.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1)`` where loss is the mean of the per-batch
        losses and the metrics use predictions thresholded at 0.5 after a sigmoid.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_labels = [], []

    for batch in dataloader:
        # Move the batch onto the same device as the model.
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].float().to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(images, input_ids, attention_mask)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # Hard predictions for the epoch-level metrics.
        hard_preds = torch.sigmoid(logits) >= 0.5
        epoch_preds.extend(hard_preds.cpu().numpy())
        epoch_labels.extend(labels.cpu().numpy())

    epoch_accuracy = accuracy_score(epoch_labels, epoch_preds)
    epoch_f1 = f1_score(epoch_labels, epoch_preds)
    mean_loss = running_loss / len(dataloader)

    return mean_loss, epoch_accuracy, epoch_f1

# Evaluation function
def evaluate(model, dataloader, criterion):
    """
    Score `model` on `dataloader` without updating its weights.

    Samples with a negative label are treated as unlabeled (HatefulMemesDataset
    emits -1 for records without a 'label' key) and are excluded from the loss
    and from every metric, so they cannot feed invalid targets into the loss or
    break the sklearn metrics.

    Args:
        model: Called as ``model(images, input_ids, attention_mask)``; returns logits.
        dataloader: Yields dict batches with 'image', 'input_ids',
            'attention_mask' and 'label' entries.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1, auc)``. Loss is the mean of per-batch losses
        over batches that contained at least one labeled sample. All four values
        are NaN when no labeled sample was seen; auc alone is NaN when the
        labeled samples contain a single class (ROC AUC is undefined there).
    """
    model.eval()
    total_loss = 0
    scored_batches = 0
    all_preds = []
    all_probs = []
    all_labels = []
    nan = float('nan')

    with torch.no_grad():
        for batch in dataloader:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].float().to(device)

            outputs = model(images, input_ids, attention_mask)

            # Keep only samples that carry a real (non-negative) label.
            labeled = labels >= 0
            if not labeled.any():
                continue
            outputs = outputs[labeled]
            labels = labels[labeled]

            loss = criterion(outputs, labels)

            total_loss += loss.item()
            scored_batches += 1

            probs = torch.sigmoid(outputs)
            preds = probs >= 0.5
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Nothing to score: empty loader or a fully unlabeled split.
    if scored_batches == 0:
        return nan, nan, nan, nan

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    # roc_auc_score raises ValueError when only one class is present.
    distinct_labels = {float(value) for value in all_labels}
    auc = roc_auc_score(all_labels, all_probs) if len(distinct_labels) > 1 else nan

    return total_loss / scored_batches, accuracy, f1, auc

In [None]:
# Train the model
num_epochs = 5
train_losses, val_losses = [], []  # per-epoch loss history
best_auc = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # One optimisation pass over the training split.
    train_loss, train_acc, train_f1 = train_epoch(model, train_loader, optimizer, criterion)
    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")

    # Score the updated model on the dev split.
    val_loss, val_acc, val_f1, val_auc = evaluate(model, dev_loader, criterion)
    print(f"Val Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f}")

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Checkpoint only when the dev AUC strictly improves on the best seen so far.
    is_new_best = val_auc > best_auc
    if is_new_best:
        best_auc = val_auc
        torch.save(model.state_dict(), 'best_baseline_model.pt')
        print("Saved best model!")

    print()

Epoch 1/5


In [None]:
# Load best model and evaluate on test set
# Restore the best checkpoint. map_location lets a checkpoint saved on one device
# (e.g. a GPU) load onto whichever device this session is using.
model.load_state_dict(torch.load('best_baseline_model.pt', map_location=device))

# Metrics need ground-truth labels. Records without a 'label' key are given -1 by
# HatefulMemesDataset, which is not a valid target for the loss or the metrics.
test_has_labels = bool(test_data) and all('label' in item for item in test_data)

if test_has_labels:
    test_loss, test_acc, test_f1, test_auc = evaluate(model, test_loader, criterion)
    report_name = "Test"
    report_loss, report_acc, report_f1, report_auc = test_loss, test_acc, test_f1, test_auc
else:
    # Fall back to the labeled dev split so the cell still reports something useful.
    print("Test split has no labeled samples; reporting the best checkpoint on the dev split instead.")
    report_name = "Dev"
    report_loss, report_acc, report_f1, report_auc = evaluate(model, dev_loader, criterion)

print(f"{report_name} Results:")
print(f"Loss: {report_loss:.4f}")
print(f"Accuracy: {report_acc:.4f}")
print(f"F1 Score: {report_f1:.4f}")
print(f"AUC-ROC: {report_auc:.4f}")

In [None]:
# Plot training and validation loss
plt.figure(figsize=(10, 5))

# Draw one curve per loss history; the x-axis is the epoch index.
loss_curves = (
    (train_losses, 'Training Loss'),
    (val_losses, 'Validation Loss'),
)
for loss_history, curve_label in loss_curves:
    plt.plot(loss_history, label=curve_label)

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Load the data using our new JSONL loader
# Re-read the three splits, in train / dev / test order.
split_paths = (
    'hateful-memes/data/train.jsonl',
    'hateful-memes/data/dev.jsonl',
    'hateful-memes/data/test.jsonl',
)
train_data, dev_data, test_data = [load_jsonl(split_path) for split_path in split_paths]

print(f"Loaded {len(train_data)} training examples")
print(f"Loaded {len(dev_data)} validation examples")
print(f"Loaded {len(test_data)} test examples")

In [None]:
# Convert to dataframes for easier manipulation
# One DataFrame per split, built from the list of record dicts.
train_df, dev_df, test_df = (
    pd.DataFrame(split_records) for split_records in (train_data, dev_data, test_data)
)

# Display the first few rows of each dataset
previews = (
    ("Training data:", train_df),
    ("\nValidation data:", dev_df),
    ("\nTest data:", test_df),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

In [None]:
# Simple text-based baseline using a bag-of-words model.
# CountVectorizer and LogisticRegression are imported in the notebook's import cell.

# Create a text vectorizer: keep the 5000 most frequent terms, drop English stop words.
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

# Fit the vocabulary on the training captions only, then vectorize them.
X_train = vectorizer.fit_transform(train_df['text'])
y_train = train_df['label']

# Transform the validation data with the vocabulary learned from the training split.
X_dev = vectorizer.transform(dev_df['text'])
y_dev = dev_df['label']

# Train a simple logistic regression model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Make predictions on validation data.
# Column 1 of predict_proba is taken as the score for the second class in
# classifier.classes_ (label 1 when the labels are 0/1).
y_pred = classifier.predict(X_dev)
y_pred_proba = classifier.predict_proba(X_dev)[:, 1]

# Evaluate the model: accuracy uses hard predictions, AUC uses the probabilities.
accuracy = accuracy_score(y_dev, y_pred)
auc = roc_auc_score(y_dev, y_pred_proba)

print("Text-only baseline results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

In [None]:
# For test set predictions, assuming we want to make predictions
# Note that test.jsonl doesn't have labels so we can only generate predictions
# Vectorize the test captions with the already-fitted vocabulary.
X_test = vectorizer.transform(test_df['text'])

# Keep only column 1 of the class-probability matrix as the submission score.
test_class_probabilities = classifier.predict_proba(X_test)
test_predictions = test_class_probabilities[:, 1]

# Create submission dataframe
submission_columns = {
    'id': test_df['id'],
    'proba': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

submission_df.head()