In [1]:
!pip install rouge-score nltk bert-score datasets

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_

In [2]:
!pip install nltk bert-score datasets



In [3]:
import nltk
# Fetch the WordNet corpus used by the METEOR metric computed later in the notebook.
nltk.download('wordnet')  # Required for METEOR
# Fetch the 'omw-1.4' resource as well; presumably needed by WordNet lookups in recent NLTK releases — TODO confirm.
# As the last expression of the cell, this call's return value is what the cell displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [None]:
import pickle

# Define file paths
t_d_pkl_path = "/kaggle/input/more-plus/MORE-PLUS-DATASET/D_train.pkl"
t_o_pkl_path = "/kaggle/input/more-plus/MORE-PLUS-DATASET/O_train.pkl"


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(security): pickle.load can execute arbitrary code; only use it on trusted dataset files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _preview_pickle(name, obj):
    """Print the type of ``obj`` and a one-entry preview without assuming it is non-empty.

    Args:
        name: Label used in the printed messages (e.g. "D_train").
        obj: The unpickled object to inspect.
    """
    print(f"{name} type:", type(obj))
    if isinstance(obj, dict):
        if obj:
            first_key = next(iter(obj))
            # Show the value stored under the first key (the labels used to claim "first 5 keys").
            print(f"{name} first entry (key {first_key!r}):", obj[first_key])
        else:
            print(f"{name} is an empty dict")
    elif isinstance(obj, list):
        if obj:
            print(f"First element of {name}:", obj[0])
        else:
            print(f"{name} is an empty list")
    else:
        print(f"{name} content preview:", obj)


# Load both training pickles
d_train = _load_pickle(t_d_pkl_path)
o_train = _load_pickle(t_o_pkl_path)

# Display basic information about d_train
_preview_pickle("D_train", d_train)

print("\n" + "="*40 + "\n")

# Display basic information about o_train
_preview_pickle("O_train", o_train)


# **Data Preparation**

In [None]:
import pickle
import pandas as pd
from PIL import Image
import torch
from torchvision.models import vit_b_16, ViT_B_16_Weights
from torchvision import transforms

import pickle
import pandas as pd
from PIL import Image
import torch
from torchvision.models import vit_b_16, ViT_B_16_Weights
from torchvision import transforms

class ImageFeatureExtractor:
    """Turn an image plus its object-detection metadata into one flat feature vector.

    The vector is the mean-pooled output of a pretrained torchvision ViT-B/16 encoder
    concatenated with a 512-wide block holding the mean object confidence score.
    """

    def __init__(self):
        self.model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
        self.model.eval()
        self.transform = ViT_B_16_Weights.IMAGENET1K_V1.transforms()

    @staticmethod
    def _object_features(object_data, dim=512):
        """Return a ``[1, dim]`` float tensor filled with the mean confidence score.

        Args:
            object_data: Mapping that may contain a ``'confidence_scores'`` sequence
                (list, tuple, numpy array or tensor); ``None`` is treated as "no objects".
            dim: Width of the returned feature block.

        Returns:
            A ``[1, dim]`` tensor; all zeros when there are no scores.
        """
        scores = object_data.get('confidence_scores', []) if object_data is not None else []
        # Use len() rather than truthiness: `if scores` raises for multi-element numpy arrays.
        if scores is None or len(scores) == 0:
            return torch.zeros(1, dim)
        # Force float32 so integer scores can be averaged.
        mean_score = torch.as_tensor(scores, dtype=torch.float32).mean()
        return mean_score.reshape(1, 1).expand(1, dim)

    def extract_features(self, image_path, object_data):
        """Compute the combined image/object feature vector for one image.

        Args:
            image_path: Path of the image file to open.
            object_data: Mapping with an optional ``'confidence_scores'`` entry.

        Returns:
            A 1-D numpy array: 768 ViT values followed by 512 object-score values.
        """
        # Close the underlying file as soon as the pixel data has been converted.
        with Image.open(image_path) as raw_image:
            rgb_image = raw_image.convert('RGB')
        image = self.transform(rgb_image).unsqueeze(0)  # [1, 3, 224, 224]

        with torch.no_grad():
            x = self.model._process_input(image)  # [1, 196, 768]
            batch_class_token = self.model.class_token.expand(1, -1, -1)  # [1, 1, 768]
            x = torch.cat([batch_class_token, x], dim=1)  # [1, 197, 768]
            x = self.model.encoder(x)  # Encoder handles positional embeddings
            img_features = x.mean(dim=1)  # [1, 768] — mean over all tokens, class token included

            # Process object features (using confidence scores)
            obj_features = self._object_features(object_data)  # [1, 512]

            combined_features = torch.cat([img_features, obj_features], dim=1)  # [1, 1280]

        return combined_features.squeeze(0).cpu().numpy()

In [None]:
# Define the MUSEDataset class (as provided)
class MUSEDataset():
    """Index-addressable view over a tab-separated split file plus per-image side data.

    Each item bundles the row's text fields with the feature vector produced by
    ``ImageFeatureExtractor`` for the image named after the row's ``pid``.
    """

    def __init__(self, df_path, D_pkl, O_pkl, image_dir):
        """Read the TSV and both pickles, and create the image feature extractor.

        Args:
            df_path: Path of the tab-separated file holding the examples.
            D_pkl: Path of the pickle loaded into ``self.image_descs``.
            O_pkl: Path of the pickle loaded into ``self.objects``.
            image_dir: Directory containing ``<pid>.jpg`` images.
        """
        self.df = pd.read_csv(df_path, sep='\t')
        with open(D_pkl, 'rb') as desc_file:
            self.image_descs = pickle.load(desc_file)
        with open(O_pkl, 'rb') as obj_file:
            self.objects = pickle.load(obj_file)
        self.image_dir = image_dir
        self.image_extractor = ImageFeatureExtractor()

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample dict for the row at position ``idx``."""
        record = self.df.iloc[idx]
        pid = str(record['pid'])
        # Rows without an entry in the objects mapping get an empty score list.
        detections = self.objects.get(pid, {'confidence_scores': []})

        sample = {}
        sample['text'] = record['text']
        sample['target'] = record['target_of_sarcasm']
        sample['image_features'] = self.image_extractor.extract_features(
            f"{self.image_dir}/{pid}.jpg",
            detections,
        )
        sample['explanation'] = record['explanation']
        return sample

# **Core Architecture**

In [None]:
from torch import nn
from transformers import BartModel, BartForConditionalGeneration

class SharedFusion(nn.Module):
    """Fuse per-token text embeddings with a single image vector.

    Both inputs are linearly projected, the image projection is repeated for every
    token position, and a final linear layer maps the concatenation back to ``d_model``.
    """

    def __init__(self, d_model=768):
        super().__init__()
        self.text_proj = nn.Linear(d_model, d_model)
        self.image_proj = nn.Linear(d_model, d_model)
        self.fusion = nn.Linear(d_model * 2, d_model)

    def forward(self, text_emb, image_features):
        """Return fused embeddings with the same shape as ``text_emb``.

        Args:
            text_emb: ``[batch_size, seq_len, d_model]`` token embeddings.
            image_features: ``[batch_size, d_model]`` image vector per example.
        """
        seq_len = text_emb.size(1)
        # Repeat the image vector along a new sequence axis: [batch_size, seq_len, d_model]
        tiled_image = torch.unsqueeze(image_features, 1).expand(-1, seq_len, -1)

        projected_pair = (self.text_proj(text_emb), self.image_proj(tiled_image))
        # [batch_size, seq_len, 2 * d_model] -> [batch_size, seq_len, d_model]
        return self.fusion(torch.cat(projected_pair, dim=-1))

In [None]:
from transformers.modeling_outputs import BaseModelOutput
from transformers import BartModel, BartConfig


class TURBO(nn.Module):
    """BART-based generator whose encoder output is fused with image features.

    The text is encoded by BART's encoder, every token state is fused with the
    projected image vector by ``SharedFusion``, and the fused states are handed to
    BART as ``encoder_outputs``. Training (``forward``) and inference (``generate``)
    share the same encoding path so the decoder sees the same representation in both.
    """

    def __init__(self):
        super().__init__()
        self.bart = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.image_proj = nn.Linear(768 + 512, 768)  # 768 (ViT) + 512 (objects)
        self.fusion = SharedFusion()

    def _encode(self, input_ids, attention_mask, image_features):
        """Encode the text with BART's encoder and fuse each token state with the image vector."""
        image_emb = self.image_proj(image_features)  # [batch, 768]
        text_emb = self.bart.model.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )[0]  # last hidden state
        fused_features = self.fusion(text_emb, image_emb)
        # Mimic BART's encoder_outputs structure
        return BaseModelOutput(last_hidden_state=fused_features)

    def forward(self, input_ids, attention_mask, image_features, labels=None):
        """Run a teacher-forced pass and return BART's seq2seq output.

        Args:
            input_ids: Token ids of the textual prompt.
            attention_mask: Mask marking the non-padding prompt positions.
            image_features: ``[batch, 1280]`` image/object feature vectors.
            labels: Target token ids used for the loss.
                NOTE(review): with ``labels=None`` BART presumably has nothing to derive
                decoder inputs from and will raise — confirm before calling without labels.

        Returns:
            The output object of ``BartForConditionalGeneration``.
        """
        encoder_outputs = self._encode(input_ids, attention_mask, image_features)
        # The prompt mask is passed through so cross-attention can skip padded positions.
        return self.bart(
            encoder_outputs=encoder_outputs,
            attention_mask=attention_mask,
            labels=labels,
        )

    def generate(self, input_ids, attention_mask, image_features, **generate_kwargs):
        """Generate token ids conditioned on the fused text/image encoder states.

        Extra keyword arguments are forwarded to ``BartForConditionalGeneration.generate``.
        """
        encoder_outputs = self._encode(input_ids, attention_mask, image_features)
        return self.bart.generate(
            encoder_outputs=encoder_outputs,
            attention_mask=attention_mask,  # mask padded prompt positions in cross-attention
            **generate_kwargs
        )



# **Training Pipeline**

In [None]:
from transformers import BartTokenizer
from datasets import Dataset as HFDataset
# Initialize tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
# Register the prompt markers that preprocess_function inserts into every input string.
# NOTE(review): "</s>" is presumably already BART's end-of-sequence token, so listing it here looks redundant — confirm.
# Any model used with this tokenizer needs its embeddings resized to len(tokenizer) (done in the training cell).
tokenizer.add_tokens(["[Text]", "</s>", "[Image]"], special_tokens=True)
# Corrected preprocess_function
def preprocess_function(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with the columns ``'text'``, ``'target'``,
            ``'explanation'`` and ``'image_features'``.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens) extended
        with ``'labels'`` (128 token ids per example, padding replaced by -100) and the
        unchanged ``'image_features'`` column.
    """
    inputs = [
        f"[Text] {text} </s> {target} [Image]"
        for text, target in zip(examples['text'], examples['target'])
    ]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True
    )
    label_ids = tokenizer(
        examples['explanation'],
        max_length=128,
        truncation=True,
        padding='max_length'
    )['input_ids']

    # Replace padding ids with -100 so the loss ignores padded label positions;
    # otherwise most of each 128-token label row would train the model to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in label_ids
    ]
    model_inputs['image_features'] = examples['image_features']
    return model_inputs
    
# Locations of the training split
t_df_path, t_d_pkl_path, t_o_pkl_path, t_image_dir = (
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/train_df.tsv",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/D_train.pkl",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/O_train.pkl",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/images",
)

# Build the training MUSEDataset, then materialise every sample (image features included)
# into a Hugging Face dataset bound to the same name.
train_dataset = MUSEDataset(t_df_path, t_d_pkl_path, t_o_pkl_path, t_image_dir)
train_dataset = HFDataset.from_list(
    list(map(train_dataset.__getitem__, range(len(train_dataset))))
)


# Locations of the validation split
v_df_path, v_d_pkl_path, v_o_pkl_path, v_image_dir = (
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/val_df.tsv",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/D_val.pkl",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/O_val.pkl",
    "/kaggle/input/more-plus/MORE-PLUS-DATASET/images",
)

# Same two-step construction for the validation split
val_dataset = MUSEDataset(v_df_path, v_d_pkl_path, v_o_pkl_path, v_image_dir)
val_dataset = HFDataset.from_list(
    list(map(val_dataset.__getitem__, range(len(val_dataset))))
)


Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:01<00:00, 192MB/s] 


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Uses the notebook-level ``tokenizer``. This cell redefines a function of the same
    name from an earlier cell; the definition executed last is the one in effect.

    Args:
        examples: Batch mapping with the columns ``'text'``, ``'target'``,
            ``'explanation'`` and ``'image_features'``.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens) extended
        with ``'labels'`` (128 token ids per example, padding replaced by -100) and the
        unchanged ``'image_features'`` column.
    """
    inputs = [
        f"[Text] {text} </s> {target} [Image]"
        for text, target in zip(examples['text'], examples['target'])
    ]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True
    )
    label_ids = tokenizer(
        examples['explanation'],
        max_length=128,
        truncation=True,
        padding='max_length'
    )['input_ids']

    # Replace padding ids with -100 so the loss ignores padded label positions;
    # otherwise most of each 128-token label row would train the model to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in label_ids
    ]
    model_inputs['image_features'] = examples['image_features']
    return model_inputs

In [None]:
# Tokenize both splits batch-wise; each name is rebound to its mapped dataset.
train_dataset, val_dataset = (
    train_dataset.map(preprocess_function, batched=True),
    val_dataset.map(preprocess_function, batched=True),
)

In [None]:
def data_collator(features):
    """Assemble a list of per-example dicts into one batch of tensors.

    Sequence fields are right-padded to the longest example in the batch; image
    feature vectors are stacked as float tensors.

    Args:
        features: Examples providing ``'input_ids'``, ``'attention_mask'``,
            ``'image_features'`` and ``'labels'``.

    Returns:
        Dict with the same four keys mapped to batched tensors.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    def _padded(key, fill_value):
        # Convert one field of every example to a tensor and pad to a common length.
        rows = [torch.tensor(example[key]) for example in features]
        return pad_sequence(rows, batch_first=True, padding_value=fill_value)

    batch = {}
    batch['input_ids'] = _padded('input_ids', 1)  # BART's pad_token_id=1
    batch['attention_mask'] = _padded('attention_mask', 0)
    batch['image_features'] = torch.stack(
        [torch.tensor(example['image_features']).float() for example in features]
    )
    batch['labels'] = _padded('labels', -100)  # Standard ignore_index
    return batch

In [None]:
# Sanity check: print the Python types of two columns of the first preprocessed sample.
sample = train_dataset[0]
print(type(sample['input_ids']))  # Should be list, not tensor
# NOTE(review): a Hugging Face dataset presumably hands back plain Python lists unless a format
# is set, so the "numpy array" expectation below may not hold — confirm.
print(type(sample['image_features']))  # Should be numpy array

In [None]:
def save_model(model, optimizer, epoch, loss, path):
    """
    Persist a training checkpoint to disk.

    The checkpoint bundles the epoch number, the loss value and the state
    dictionaries of both the model and the optimizer.

    Args:
        model (torch.nn.Module): Model whose parameters are stored.
        optimizer (torch.optim.Optimizer): Optimizer whose state is stored.
        epoch (int): Epoch number recorded in the checkpoint.
        loss (float): Loss value recorded in the checkpoint.
        path (str): Destination file of the checkpoint.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(checkpoint, path)
    print(f"Model checkpoint saved to {path}")

def load_model(model, optimizer, path, device):
    """
    Restore model and optimizer state from a checkpoint file.

    Args:
        model (torch.nn.Module): Model that receives the stored parameters.
        optimizer (torch.optim.Optimizer): Optimizer that receives the stored state.
        path (str): Checkpoint file to read.
        device (torch.device): Device used for ``map_location`` and to which the
            model is moved afterwards (e.g., "cuda" or "cpu").

    Returns:
        int: The epoch number stored in the checkpoint.
        float: The loss value stored in the checkpoint.
    """
    state = torch.load(path, map_location=device)

    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    restored = (state['epoch'], state['loss'])

    # Place the restored model on the requested device
    model.to(device)

    print(f"Model checkpoint loaded from {path}")
    return restored

In [None]:
import os

import torch
import tqdm
# transformers' own AdamW is deprecated and no longer importable from recent releases;
# the PyTorch implementation is bound to the same name instead.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

# Create results directory
os.makedirs("./results", exist_ok=True)

# Define hyperparameters
batch_size = 16
num_epochs = 15
learning_rate = 5e-5
max_length = 128
warmup_step = 500
logging_steps = 100

# Initialize model, tokenizer, dataset, etc.
model = TURBO()
# Resize the embeddings for the tokens added to the tokenizer before moving the model,
# so every parameter that ends up on the device already has its final shape.
model.bart.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator
)

# eps=1e-6 presumably matches the default of the removed transformers.AdamW — TODO confirm.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01, eps=1e-6)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=len(train_dataloader) * num_epochs
)

# Start training from scratch
start_epoch = 0
print("Starting training from scratch.")

# Training loop: one optimizer and scheduler step per batch, one checkpoint per epoch
total_steps = 0
for epoch in range(start_epoch, num_epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    for batch in progress_bar:
        # The collator returns only tensors, so every value can be moved to the device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_steps += 1
        epoch_loss += loss.item()

        if total_steps % logging_steps == 0:
            print(f"Step {total_steps} - Loss: {loss.item():.4f}")

        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} completed - Average Loss: {avg_epoch_loss:.4f}")

    # Save checkpoint
    save_model(
        model=model,
        optimizer=optimizer,
        epoch=epoch + 1,
        loss=avg_epoch_loss,
        path=f"./results/checkpoint_epoch_{epoch + 1}.pt"
    )

print("Training completed!")


In [None]:
import os

# Directory holding the per-epoch checkpoints
results_dir = "./results"

# Collect the checkpoint files, ordered by the epoch number embedded in each filename
checkpoint_files = sorted(
    (
        name for name in os.listdir(results_dir)
        if name.startswith("checkpoint_epoch_") and name.endswith(".pt")
    ),
    key=lambda name: int(name.split("_")[2].split(".")[0]),
)

# Restore the model weights from the highest-numbered checkpoint, if there is one
if not checkpoint_files:
    print("No checkpoint found.")
else:
    latest_checkpoint = os.path.join(results_dir, checkpoint_files[-1])
    checkpoint = torch.load(latest_checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    print(f"Model loaded from {latest_checkpoint}")

# **Evaluation System**

In [None]:
def generate_explanations(model, tokenizer, test_df_path, d_pkl_path, o_pkl_path, image_dir, output_file="explanations.txt"):
    """Generate one explanation per test example and write them to a text file.

    Args:
        model: Model exposing ``generate(input_ids, attention_mask, image_features, **kwargs)``.
        tokenizer: Tokenizer used to decode the generated ids. Note that the inputs are
            tokenized by ``preprocess_function``, which uses the notebook-level tokenizer.
        test_df_path: Path of the tab-separated test split.
        d_pkl_path: Path of the descriptions pickle passed to ``MUSEDataset``.
        o_pkl_path: Path of the objects pickle passed to ``MUSEDataset``.
        image_dir: Directory containing the test images.
        output_file: File that receives one explanation per line.

    Returns:
        list[str]: The decoded explanations, in dataset order.
    """
    # Load test dataset
    test_dataset = MUSEDataset(test_df_path, d_pkl_path, o_pkl_path, image_dir)
    test_dataset = HFDataset.from_list([test_dataset[i] for i in range(len(test_dataset))])
    test_dataset = test_dataset.map(preprocess_function, batched=True)

    # Run on the device the model lives on instead of depending on a notebook-global `device`.
    model_device = next(model.parameters()).device

    # Set model to evaluation mode
    model.eval()
    explanations = []

    with torch.no_grad():
        for item in test_dataset:
            # Add a batch dimension of 1 to every input
            input_ids = torch.tensor(item['input_ids']).unsqueeze(0).to(model_device)
            attention_mask = torch.tensor(item['attention_mask']).unsqueeze(0).to(model_device)
            image_features = torch.tensor(item['image_features']).float().unsqueeze(0).to(model_device)

            # Generate explanation
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                image_features=image_features,
                max_length=128,
                num_beams=5,            # Increased beam width
                length_penalty=2.0,     # Added length penalty
                early_stopping=True,
                no_repeat_ngram_size=3  # Prevent repetition
            )
            explanation = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            explanations.append(explanation)

    # Save explanations; an explicit encoding keeps the output independent of the platform default
    with open(output_file, 'w', encoding='utf-8') as f:
        for exp in explanations:
            f.write(exp + "\n")

    return explanations

# Example usage (adjust paths for demo)
# test_df_path = "path/to/test.tsv"
# d_test_pkl = "path/to/D_test.pkl"
# o_test_pkl = "path/to/O_test.pkl"
# explanations = generate_explanations(model, tokenizer, test_df_path, d_test_pkl, o_test_pkl, t_image_dir)

In [None]:
!pip install --upgrade nltk

In [None]:
import nltk
# Download the same two NLTK resources again, this time into an explicit directory.
nltk.download('wordnet', download_dir='/usr/share/nltk_data')  # Explicitly specify path
nltk.download('omw-1.4', download_dir='/usr/share/nltk_data')
# Each execution of this cell appends the directory again, so re-running it leaves duplicate entries in nltk.data.path.
nltk.data.path.append('/usr/share/nltk_data')  # Ensure NLTK checks this directory

In [None]:
import torch
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score
from sacrebleu import corpus_bleu

def evaluate_model(model, tokenizer, eval_dataset, device):
    """Generate explanations for ``eval_dataset`` and score them against the references.

    Args:
        model: Model exposing ``generate(input_ids, attention_mask, image_features, **kwargs)``.
        tokenizer: Tokenizer used to decode the generated ids.
        eval_dataset: Iterable of preprocessed items providing ``'input_ids'``,
            ``'attention_mask'``, ``'image_features'`` and the reference ``'explanation'``.
        device: Device on which the input tensors are placed.

    Returns:
        dict: ROUGE-1/2/L F-measures, BLEU-1..4, METEOR and BERTScore precision/recall/F1,
        each averaged over the dataset.

    Raises:
        ValueError: If ``eval_dataset`` yields no items.
    """
    # Bind NLTK's BLEU under a private alias: the notebook-level name `corpus_bleu` is rebound
    # by the later `from sacrebleu import corpus_bleu`, which has a different call signature
    # (hypotheses first, no `weights` argument) than the calls below rely on.
    from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu

    # Set the model to evaluation mode
    model.eval()

    # Lists to store generated and ground truth explanations
    generated_explanations = []
    ground_truth_explanations = []

    # Disable gradient computation for inference
    with torch.no_grad():
        for item in eval_dataset:
            # Prepare input tensors, adding batch dimension [1, seq_len] or [1, feature_dim]
            input_ids = torch.tensor(item['input_ids']).unsqueeze(0).to(device)
            attention_mask = torch.tensor(item['attention_mask']).unsqueeze(0).to(device)
            # .float() keeps the dtype consistent with generate_explanations and the collator
            image_features = torch.tensor(item['image_features']).float().unsqueeze(0).to(device)

            # Generate explanation using the model's generate method
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                image_features=image_features,
                max_length=128,  # Maximum length of generated sequence
                num_beams=5,     # Beam search for better generation quality
                length_penalty=2.0,
                early_stopping=True
            )

            # Decode generated token IDs to text, removing special tokens
            generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            generated_explanations.append(generated)

            # Store the ground truth explanation (original string)
            ground_truth_explanations.append(item['explanation'])

    num_samples = len(ground_truth_explanations)
    if num_samples == 0:
        # Previously this surfaced as a ZeroDivisionError while averaging ROUGE.
        raise ValueError("eval_dataset is empty; nothing to evaluate")

    # Compute ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L), averaged F-measure
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    for ref, hyp in zip(ground_truth_explanations, generated_explanations):
        scores = rouge.score(ref, hyp)
        for key in rouge_scores:
            rouge_scores[key] += scores[key].fmeasure
    for key in rouge_scores:
        rouge_scores[key] /= num_samples

    # Compute BLEU scores (BLEU-1 to BLEU-4) on whitespace tokens
    references = [[ref.split()] for ref in ground_truth_explanations]  # List of list of tokens
    hypotheses = [hyp.split() for hyp in generated_explanations]      # List of tokens
    bleu1 = nltk_corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu2 = nltk_corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    # Exact thirds so the BLEU-3 weights sum to 1 (0.33 * 3 does not).
    bleu3 = nltk_corpus_bleu(references, hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0))
    bleu4 = nltk_corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

    # Compute METEOR score (averaged over all samples)
    meteor = float(np.mean([
        meteor_score([ref.split()], hyp.split())
        for ref, hyp in zip(ground_truth_explanations, generated_explanations)
    ]))

    # Compute BERTScore (batch computation for efficiency)
    P, R, F1 = score(generated_explanations, ground_truth_explanations, lang='en', verbose=False)
    bertscore_p = P.mean().item()
    bertscore_r = R.mean().item()
    bertscore_f1 = F1.mean().item()  # Average F1 score across all samples

    # Compile all metrics into a dictionary
    metrics = {
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'bleu1': bleu1,
        'bleu2': bleu2,
        'bleu3': bleu3,
        'bleu4': bleu4,
        'meteor': meteor,
        'bertscore_f1': bertscore_f1,
        'bertscore_p': bertscore_p,
        'bertscore_r': bertscore_r,
    }

    return metrics

In [None]:
# Score the current model on the validation split; this re-runs generation for every item each time the cell executes.
metrics = evaluate_model(model, tokenizer, val_dataset, device)
print("Evaluation Metrics:")
# Print each metric name with its value rounded to four decimals.
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")