## Dataset Download

In [1]:
# Download the dataset archive from Google Drive by file id using gdown
# (the recorded run saved it as depression_dataset_complete.zip).
!gdown 1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB

Downloading...
From (original): https://drive.google.com/uc?id=1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB
From (redirected): https://drive.google.com/uc?id=1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB&confirm=t&uuid=fc605b59-c18d-41e9-84a5-1fee37d6dc9f
To: /kaggle/working/depression_dataset_complete.zip
100%|████████████████████████████████████████| 652M/652M [00:11<00:00, 58.2MB/s]


In [2]:
# Extract the downloaded archive into the working directory; unzip's stdout
# file listing is discarded to keep the cell output short.
!unzip depression_dataset_complete.zip > /dev/null

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel, CLIPImageProcessor, CLIPVisionModel
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from tqdm import tqdm
from PIL import Image
import json
import math

# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16
LR = 2e-5
# Token budget per sample; texts are padded/truncated to this length.
MAX_LEN = 512
# Width of the fused text/vision representation. 1024 is the width the
# recorded training run was actually built with, so the constant is kept in
# sync with it (it previously read 768).
FUSION_DIM = 1024
# Depressive-symptom categories; list order defines the column order of the
# multi-hot label vector and of the per-class metrics.
LABELS = ["Lack of Interest", "Feeling Down", "Eating Disorder",
          "Sleeping Disorder", "Low Self-Esteem", "Concentration Problem", "Self-Harm"]
LABEL_MAP = {label: i for i, label in enumerate(LABELS)}
NUM_CLASSES = len(LABELS)

## Custom Dataset

In [4]:
class MultimodalDepressionDataset(Dataset):
    """Map-style dataset pairing each sample's text with its image and labels.

    Each record in ``data`` is indexed with the keys ``"ocr_text"``,
    ``"figurative_reasoning"``, ``"sample_id"`` and
    ``"meme_depressive_categories"``. The image is read from
    ``<image_path>/<sample_id>.jpeg``.

    Args:
        data: Indexable collection of sample records (see keys above).
        image_path: Directory containing the ``.jpeg`` files for this split.
        tokenizer: Callable tokenizer invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors="pt"``.
        image_processor: Callable image processor invoked with
            ``return_tensors="pt"``; its output is returned unchanged under
            the ``"image"`` key.
        max_len: Length every text is padded/truncated to.
    """

    def __init__(self, data, image_path, tokenizer, image_processor, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_len = max_len
        self.img_path = image_path
        # NOTE: this torchvision pipeline is kept only so the attribute still
        # exists; __getitem__ never applies it (images go through
        # `image_processor` instead).
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the model inputs and the multi-hot label vector for one sample.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (leading batch
            dimension squeezed away), ``"image"`` (raw image-processor output)
            and ``"label"`` (float tensor of length ``NUM_CLASSES``).

        Raises:
            KeyError: If a required key is missing from the record or a
                category is not present in ``LABEL_MAP``.
        """
        sample = self.data[idx]

        # A null text field is treated as empty text instead of crashing the
        # string concatenation below.
        ocr_text = sample["ocr_text"] or ""
        figurative_reasoning = sample["figurative_reasoning"] or ""
        # NOTE(review): " [SEP] " is a literal string here; it only acts as a
        # separator token if the tokenizer's vocabulary defines "[SEP]" as a
        # special token -- confirm for the tokenizer in use.
        combined_text = ocr_text + " [SEP] " + figurative_reasoning

        encoding = self.tokenizer(
            combined_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        image_path = os.path.join(self.img_path, sample["sample_id"] + ".jpeg")
        # Image.open is lazy and keeps the file handle open; convert() forces
        # the pixel data to load, and the context manager then releases the
        # handle right away instead of leaving it to garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = self.image_processor(image, return_tensors="pt")

        # Multi-hot encoding: one slot per label, 1.0 for every category present.
        labels = torch.zeros(NUM_CLASSES)
        for category in sample["meme_depressive_categories"]:
            labels[LABEL_MAP[category]] = 1.0

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "image": image_tensor,
            "label": labels
        }

def custom_collate_fn(batch):
    """Merge a list of per-sample dicts into one batched dict.

    ``input_ids``, ``attention_mask`` and ``label`` are stacked along a new
    leading dimension. For the nested ``image`` mapping, every entry that is a
    tensor in the first sample is squeezed on dim 0 per sample and then
    stacked; non-tensor entries are dropped.

    Args:
        batch: List of dicts with keys ``'input_ids'``, ``'attention_mask'``,
            ``'image'`` and ``'label'``.

    Returns:
        dict with the same four keys holding the batched values.
    """
    def _stack_field(field):
        # Stack one top-level tensor field across all samples.
        return torch.stack([sample[field] for sample in batch])

    stacked_ids = _stack_field('input_ids')
    stacked_mask = _stack_field('attention_mask')
    stacked_labels = _stack_field('label')

    # The first sample decides which image entries are tensors worth batching.
    reference_image = batch[0]['image']
    stacked_images = {
        name: torch.stack([sample['image'][name].squeeze(0) for sample in batch])
        for name in reference_image.keys()
        if isinstance(reference_image[name], torch.Tensor)
    }

    return {
        'input_ids': stacked_ids,
        'attention_mask': stacked_mask,
        'image': stacked_images,
        'label': stacked_labels,
    }

## Model Definition

In [5]:
class CrossAttentionLayer(nn.Module):
    """Multi-head cross-attention block followed by a feed-forward block.

    Queries come from ``x``; keys and values come from ``context``. Each
    sub-block is wrapped as ``LayerNorm(sublayer(input) + input)``. No
    attention mask is applied.

    Args:
        hidden_size: Feature width of ``x``, ``context`` and the output.
        num_attention_heads: Number of attention heads. The per-head width is
            ``hidden_size // num_attention_heads``.
        dropout: Dropout probability used on the attention weights, the
            attention output and inside the feed-forward block.
    """

    def __init__(self, hidden_size, num_attention_heads=8, dropout=0.1):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        # Equals hidden_size only when hidden_size is divisible by the head count.
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        # The merged heads have all_head_size features, so the output
        # projection must accept that width (it used to assume hidden_size,
        # which broke for hidden sizes not divisible by the head count).
        self.output = nn.Linear(self.all_head_size, hidden_size)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_size)

        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size),
            nn.Dropout(dropout)
        )
        self.ffn_layer_norm = nn.LayerNorm(hidden_size)

    def transpose_for_scores(self, x):
        """Split the last dim into heads: (B, S, H*D) -> (B, H, S, D)."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, x, context):
        """Attend from ``x`` to ``context``.

        Args:
            x: Query-side tensor of shape (batch, query_len, hidden_size).
            context: Key/value-side tensor of shape
                (batch, context_len, hidden_size).

        Returns:
            Tensor of shape (batch, query_len, hidden_size).
        """
        query_layer = self.transpose_for_scores(self.query(x))
        key_layer = self.transpose_for_scores(self.key(context))
        value_layer = self.transpose_for_scores(self.value(context))

        # Scaled dot-product scores: (B, H, query_len, context_len).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        # Weighted sum of values, then merge heads back: (B, query_len, H*D).
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        # Residual connection around the attention sub-block.
        attention_output = self.output(context_layer)
        attention_output = self.dropout(attention_output)
        attention_output = self.layer_norm(attention_output + x)

        # Residual connection around the feed-forward sub-block.
        ffn_output = self.ffn(attention_output)
        output = self.ffn_layer_norm(ffn_output + attention_output)

        return output

class MultimodalAttentionModel(nn.Module):
    """Text + image multi-label classifier with cross-attention fusion and a soft MoE head.

    Pipeline:
      1. Take the first-token hidden state from each of the last three hidden
         layers of the text encoder and of the CLIP vision encoder.
      2. Project each to ``fusion_dim`` with its own linear layer.
      3. Fuse each text/vision pair with a ``CrossAttentionLayer`` (text is
         the query, vision is the context) and average the three results.
      4. Mix four expert networks with softmax gate weights.
      5. Classify the mixture into ``num_classes`` logits.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        fusion_dim: Width of the fused representation. The argument is
            honoured as given (it used to be overwritten with 1024 inside the
            constructor).
    """

    def __init__(self, text_model_name="bert-base-uncased", num_classes=7, fusion_dim=768):
        super().__init__()

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size

        self.vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_dim = self.vision_encoder.config.hidden_size

        # One projection per tapped encoder layer (last three layers).
        self.text_projections = nn.ModuleList([
            nn.Linear(self.text_dim, fusion_dim) for _ in range(3)
        ])

        self.vision_projections = nn.ModuleList([
            nn.Linear(self.vision_dim, fusion_dim) for _ in range(3)
        ])

        self.co_attention_layers = nn.ModuleList([
            CrossAttentionLayer(fusion_dim) for _ in range(3)
        ])

        self.expert_nets = nn.ModuleList([
            nn.Sequential(
                nn.Linear(fusion_dim, fusion_dim),
                nn.LayerNorm(fusion_dim),
                nn.GELU(),
                nn.Dropout(0.2)
            ) for _ in range(4)
        ])

        # Produces one gate logit per expert.
        self.moe_gate = nn.Linear(fusion_dim, 4)

        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask, image_features):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the text encoder.
            attention_mask: Attention mask forwarded to the text encoder.
            image_features: Mapping of keyword arguments forwarded to the
                vision encoder (unpacked with ``**``).

        Returns:
            Raw (pre-sigmoid) logits, one row per sample and one column per class.
        """
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # hidden_states[-1], [-2], [-3]: first-token vector of the last three layers.
        text_features = [
            text_outputs.hidden_states[-i][:, 0] for i in range(1, 4)
        ]
        text_features = [
            proj(feat) for proj, feat in zip(self.text_projections, text_features)
        ]

        vision_outputs = self.vision_encoder(**image_features, output_hidden_states=True)
        vision_features = [
            vision_outputs.hidden_states[-i][:, 0] for i in range(1, 4)
        ]
        vision_features = [
            proj(feat) for proj, feat in zip(self.vision_projections, vision_features)
        ]

        # NOTE: both inputs are unsqueezed to length-1 sequences, so each
        # attention softmax runs over a single key position.
        fused_features = []
        for text_feat, vision_feat, co_attn in zip(text_features, vision_features, self.co_attention_layers):
            fused_feat = co_attn(text_feat.unsqueeze(1), vision_feat.unsqueeze(1))
            fused_features.append(fused_feat.squeeze(1))

        # Average the three layer-wise fused vectors.
        combined_feature = sum(fused_features) / len(fused_features)

        # Soft mixture of experts: every expert runs, outputs are gate-weighted.
        expert_outputs = [expert(combined_feature) for expert in self.expert_nets]
        expert_gates = torch.nn.functional.softmax(self.moe_gate(combined_feature), dim=1)

        moe_output = torch.zeros_like(expert_outputs[0])
        for i, expert_out in enumerate(expert_outputs):
            moe_output += expert_out * expert_gates[:, i].unsqueeze(1)

        logits = self.classifier(moe_output)

        return logits

## Training Functions

In [None]:
def train_multimodal_model(model, train_data, val_data, img_path, epochs, model_save_name,
                           tokenizer_name="mental/mental-roberta-base"):
    """Train ``model`` and checkpoint it whenever validation F1 improves.

    The model is moved to ``DEVICE`` and wrapped in ``nn.DataParallel``. After
    each epoch it is evaluated on the validation split. The learning rate is
    halved when validation macro-F1 stops improving for more than 2 epochs. A
    checkpoint is written to ``f"{model_save_name}_depression.pth"`` whenever
    the harmonic mean of validation macro-F1 and weighted-F1 reaches a new best.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image_features)``
            and returning logits.
        train_data: Training records for ``MultimodalDepressionDataset``.
        val_data: Validation records for ``MultimodalDepressionDataset``.
        img_path: Directory containing ``train`` and ``val`` image sub-directories.
        epochs: Number of passes over the training set.
        model_save_name: Filename prefix for the saved checkpoint.
        tokenizer_name: Tokenizer checkpoint to load; should match the model's
            text encoder. Defaults to the checkpoint that was previously
            hard-coded here.

    Returns:
        The ``nn.DataParallel``-wrapped model in its final-epoch state (not
        necessarily the best checkpoint).
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    train_dataset = MultimodalDepressionDataset(train_data, os.path.join(img_path, "train"), tokenizer, image_processor, max_len=MAX_LEN)
    val_dataset = MultimodalDepressionDataset(val_data, os.path.join(img_path, "val"), tokenizer, image_processor, max_len=MAX_LEN)

    print("Train Set Size:", len(train_dataset))
    print("Validation Set Size:", len(val_dataset))

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=custom_collate_fn
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    optimizer = optim.AdamW(model.parameters(), lr=LR)
    # mode='max' because the monitored quantity (macro-F1) should increase.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2, verbose=True
    )
    # Multi-label objective: an independent sigmoid/BCE per class.
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(DEVICE)
    model = nn.DataParallel(model)

    best_f1 = 0
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        model.train()
        train_loss = 0
        all_train_preds, all_train_labels = [], []

        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            image_features = {k: v.to(DEVICE) for k, v in batch["image"].items()}

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, image_features)
            loss = criterion(logits, labels)
            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item()
            with torch.no_grad():
                # A label is predicted when its sigmoid probability exceeds 0.5.
                predictions = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
                all_train_preds.extend(predictions)
                all_train_labels.extend(labels.cpu().numpy())

        train_loss = train_loss / len(train_loader)
        train_metrics = compute_multilabel_metrics(np.array(all_train_labels), np.array(all_train_preds))

        print(f"Train Loss: {train_loss:.4f}")
        print(f"Train Macro-F1: {train_metrics['macro_f1']:.4f}, Weighted-F1: {train_metrics['weighted_f1']:.4f}")

        val_loss, val_metrics = evaluate_multimodal_model(
            model, val_loader, criterion
        )

        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Macro-F1: {val_metrics['macro_f1']:.4f}, Weighted-F1: {val_metrics['weighted_f1']:.4f}")

        scheduler.step(val_metrics['macro_f1'])

        # Model selection uses the harmonic mean of macro- and weighted-F1.
        # When both are 0 (e.g. nothing crosses the 0.5 threshold yet) the
        # mean is defined as 0 instead of dividing by zero.
        macro_f1 = val_metrics["macro_f1"]
        weighted_f1 = val_metrics["weighted_f1"]
        f1_total = macro_f1 + weighted_f1
        f1_hm = 2 * macro_f1 * weighted_f1 / f1_total if f1_total > 0 else 0.0
        if f1_hm > best_f1:
            best_f1 = f1_hm
            # Saved from the DataParallel wrapper, so the state-dict keys carry
            # the wrapper's "module." prefix.
            torch.save(model.state_dict(), f"{model_save_name}_depression.pth")
            print("Best model saved!")

    return model

def compute_multilabel_metrics(y_true, y_pred):
    """Compute macro, weighted and per-class precision/recall/F1.

    Args:
        y_true: Ground-truth label indicator array.
        y_pred: Predicted label indicator array.

    Returns:
        dict with ``macro_*``, ``weighted_*`` and ``per_class_*`` entries for
        ``precision``, ``recall`` and ``f1``. The ``per_class_*`` values are
        whatever ``precision_recall_fscore_support(average=None)`` returns.
    """
    # Each call returns (precision, recall, fscore, support); support is unused.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    class_scores = precision_recall_fscore_support(y_true, y_pred, average=None)

    return {
        'macro_precision': macro_scores[0],
        'macro_recall': macro_scores[1],
        'macro_f1': macro_scores[2],
        'weighted_precision': weighted_scores[0],
        'weighted_recall': weighted_scores[1],
        'weighted_f1': weighted_scores[2],
        'per_class_precision': class_scores[0],
        'per_class_recall': class_scores[1],
        'per_class_f1': class_scores[2],
    }

def evaluate_multimodal_model(model, loader, criterion):
    """Run ``model`` over ``loader`` without gradients and report metrics.

    Puts the model in eval mode, accumulates the per-batch loss, thresholds
    sigmoid probabilities at 0.5 to obtain predictions, and prints per-class
    F1/precision/recall for every entry of ``LABELS``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image_features)``.
        loader: Iterable of batches with keys ``"input_ids"``,
            ``"attention_mask"``, ``"label"`` and ``"image"``.
        criterion: Loss callable taking ``(logits, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, metrics_dict)``.
    """
    model.eval()
    running_loss = 0
    pred_rows = []
    label_rows = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            token_ids = batch["input_ids"].to(DEVICE)
            token_mask = batch["attention_mask"].to(DEVICE)
            targets = batch["label"].to(DEVICE)
            image_inputs = {}
            for name, tensor in batch["image"].items():
                image_inputs[name] = tensor.to(DEVICE)

            logits = model(token_ids, token_mask, image_inputs)
            running_loss += criterion(logits, targets).item()

            # Binarise: a label is predicted when its probability exceeds 0.5.
            probabilities = torch.sigmoid(logits)
            pred_rows.extend((probabilities > 0.5).float().cpu().numpy())
            label_rows.extend(targets.cpu().numpy())

    # Average of per-batch losses (not weighted by batch size).
    val_loss = running_loss / len(loader)
    val_metrics = compute_multilabel_metrics(np.array(label_rows), np.array(pred_rows))

    print("\nPer-class metrics:")
    for i, label in enumerate(LABELS):
        class_f1 = val_metrics['per_class_f1'][i]
        class_precision = val_metrics['per_class_precision'][i]
        class_recall = val_metrics['per_class_recall'][i]
        print(f"{label}: F1={class_f1:.4f}, "
              f"Precision={class_precision:.4f}, "
              f"Recall={class_recall:.4f}")

    return val_loss, val_metrics

## Model Training

In [10]:
def _read_json(path):
    """Parse a JSON file and close its handle as soon as parsing finishes."""
    with open(path, "r") as handle:
        return json.load(handle)


# Dataset splits: each file holds the list of sample records for one split.
train_data = _read_json("depression_train_llava_dataset.json")
val_data = _read_json("depression_val_llava_dataset.json")
test_data = _read_json("depression_test_llava_dataset.json")

# Root directory holding one image sub-directory per split.
img_path = "depressive_image"

tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Held-out test loader; order is kept fixed (shuffle=False).
test_dataset = MultimodalDepressionDataset(test_data, os.path.join(img_path, "test"), tokenizer, image_processor, max_len=MAX_LEN)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn
)

In [None]:
# Build the multimodal classifier on top of the mental-roberta text encoder.
model = MultimodalAttentionModel(text_model_name="mental/mental-roberta-base", num_classes=NUM_CLASSES, fusion_dim=FUSION_DIM)

# Train for 30 epochs; improving checkpoints are written with the
# "no_contrastive" filename prefix.
trained_model = train_multimodal_model(
    model=model,
    train_data=train_data,
    val_data=val_data,
    img_path=img_path,
    epochs=30,
    model_save_name="no_contrastive",
)

Train Set Size: 8722
Validation Set Size: 359


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/30


Training: 100%|██████████| 546/546 [13:24<00:00,  1.47s/it]


Train Loss: 0.2677
Train Macro-F1: 0.5511, Weighted-F1: 0.5836


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.62it/s]



Per-class metrics:
Lack of Interest: F1=0.3636, Precision=1.0000, Recall=0.2222
Feeling Down: F1=0.6513, Precision=0.7434, Recall=0.5795
Eating Disorder: F1=0.7955, Precision=0.8974, Recall=0.7143
Sleeping Disorder: F1=0.7907, Precision=0.8293, Recall=0.7556
Low Self-Esteem: F1=0.2330, Precision=0.6667, Recall=0.1412
Concentration Problem: F1=0.7324, Precision=0.8966, Recall=0.6190
Self-Harm: F1=0.4783, Precision=0.7097, Recall=0.3607
Validation Loss: 0.3595
Validation Macro-F1: 0.5778, Weighted-F1: 0.5702
Best model saved!



Epoch 2/30


Training: 100%|██████████| 546/546 [13:23<00:00,  1.47s/it]


Train Loss: 0.1824
Train Macro-F1: 0.7290, Weighted-F1: 0.7466


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.59it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6569, Precision=0.7671, Recall=0.5744
Eating Disorder: F1=0.7273, Precision=0.8205, Recall=0.6531
Sleeping Disorder: F1=0.6842, Precision=0.8387, Recall=0.5778
Low Self-Esteem: F1=0.0667, Precision=0.6000, Recall=0.0353
Concentration Problem: F1=0.6857, Precision=0.8571, Recall=0.5714
Self-Harm: F1=0.4792, Precision=0.6571, Recall=0.3770
Validation Loss: 0.4033
Validation Macro-F1: 0.5190, Weighted-F1: 0.5234



Epoch 3/30


Training: 100%|██████████| 546/546 [13:25<00:00,  1.47s/it]


Train Loss: 0.1417
Train Macro-F1: 0.8003, Weighted-F1: 0.8165


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.65it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6412, Precision=0.7517, Recall=0.5590
Eating Disorder: F1=0.7959, Precision=0.7959, Recall=0.7959
Sleeping Disorder: F1=0.5747, Precision=0.5952, Recall=0.5556
Low Self-Esteem: F1=0.0860, Precision=0.5000, Recall=0.0471
Concentration Problem: F1=0.7059, Precision=0.9231, Recall=0.5714
Self-Harm: F1=0.5470, Precision=0.5714, Recall=0.5246
Validation Loss: 0.4368
Validation Macro-F1: 0.5288, Weighted-F1: 0.5287



Epoch 4/30


Training: 100%|██████████| 546/546 [13:23<00:00,  1.47s/it]


Train Loss: 0.1082
Train Macro-F1: 0.8529, Weighted-F1: 0.8668


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3667, Precision=0.7333, Recall=0.2444
Feeling Down: F1=0.5849, Precision=0.7561, Recall=0.4769
Eating Disorder: F1=0.8000, Precision=0.8261, Recall=0.7755
Sleeping Disorder: F1=0.7765, Precision=0.8250, Recall=0.7333
Low Self-Esteem: F1=0.3231, Precision=0.4667, Recall=0.2471
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5763, Precision=0.5965, Recall=0.5574
Validation Loss: 0.4573
Validation Macro-F1: 0.5902, Weighted-F1: 0.5688
Best model saved!



Epoch 5/30


Training: 100%|██████████| 546/546 [13:15<00:00,  1.46s/it]


Train Loss: 0.0825
Train Macro-F1: 0.8974, Weighted-F1: 0.9056


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.69it/s]



Per-class metrics:
Lack of Interest: F1=0.3492, Precision=0.6111, Recall=0.2444
Feeling Down: F1=0.6350, Precision=0.7535, Recall=0.5487
Eating Disorder: F1=0.7423, Precision=0.7500, Recall=0.7347
Sleeping Disorder: F1=0.6988, Precision=0.7632, Recall=0.6444
Low Self-Esteem: F1=0.1887, Precision=0.4762, Recall=0.1176
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5357, Precision=0.5882, Recall=0.4918
Validation Loss: 0.5326
Validation Macro-F1: 0.5556, Weighted-F1: 0.5501



Epoch 6/30


Training: 100%|██████████| 546/546 [13:16<00:00,  1.46s/it]


Train Loss: 0.0652
Train Macro-F1: 0.9206, Weighted-F1: 0.9284


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3438, Precision=0.5789, Recall=0.2444
Feeling Down: F1=0.6319, Precision=0.7863, Recall=0.5282
Eating Disorder: F1=0.7358, Precision=0.6842, Recall=0.7959
Sleeping Disorder: F1=0.6535, Precision=0.5893, Recall=0.7333
Low Self-Esteem: F1=0.1782, Precision=0.5625, Recall=0.1059
Concentration Problem: F1=0.7179, Precision=0.7778, Recall=0.6667
Self-Harm: F1=0.5051, Precision=0.6579, Recall=0.4098
Validation Loss: 0.6097
Validation Macro-F1: 0.5380, Weighted-F1: 0.5369



Epoch 7/30


Training: 100%|██████████| 546/546 [13:16<00:00,  1.46s/it]


Train Loss: 0.0505
Train Macro-F1: 0.9427, Weighted-F1: 0.9458


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.69it/s]



Per-class metrics:
Lack of Interest: F1=0.3390, Precision=0.7143, Recall=0.2222
Feeling Down: F1=0.5962, Precision=0.7949, Recall=0.4769
Eating Disorder: F1=0.7647, Precision=0.7358, Recall=0.7959
Sleeping Disorder: F1=0.6600, Precision=0.6000, Recall=0.7333
Low Self-Esteem: F1=0.3065, Precision=0.4872, Recall=0.2235
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5854, Precision=0.5806, Recall=0.5902
Validation Loss: 0.6343
Validation Macro-F1: 0.5702, Weighted-F1: 0.5584



Epoch 8/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0352
Train Macro-F1: 0.9626, Weighted-F1: 0.9631


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.63it/s]



Per-class metrics:
Lack of Interest: F1=0.3125, Precision=0.5263, Recall=0.2222
Feeling Down: F1=0.6310, Precision=0.7518, Recall=0.5436
Eating Disorder: F1=0.7184, Precision=0.6852, Recall=0.7551
Sleeping Disorder: F1=0.6897, Precision=0.7143, Recall=0.6667
Low Self-Esteem: F1=0.3662, Precision=0.4561, Recall=0.3059
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6062
Validation Macro-F1: 0.5714, Weighted-F1: 0.5724



Epoch 9/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0264
Train Macro-F1: 0.9744, Weighted-F1: 0.9748


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3390, Precision=0.7143, Recall=0.2222
Feeling Down: F1=0.6724, Precision=0.7564, Recall=0.6051
Eating Disorder: F1=0.7708, Precision=0.7872, Recall=0.7551
Sleeping Disorder: F1=0.6966, Precision=0.7045, Recall=0.6889
Low Self-Esteem: F1=0.2393, Precision=0.4375, Recall=0.1647
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5500, Precision=0.5593, Recall=0.5410
Validation Loss: 0.6385
Validation Macro-F1: 0.5726, Weighted-F1: 0.5756



Epoch 10/30


Training: 100%|██████████| 546/546 [13:18<00:00,  1.46s/it]


Train Loss: 0.0213
Train Macro-F1: 0.9782, Weighted-F1: 0.9784


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.65it/s]



Per-class metrics:
Lack of Interest: F1=0.3226, Precision=0.5882, Recall=0.2222
Feeling Down: F1=0.6724, Precision=0.7647, Recall=0.6000
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7381, Precision=0.7949, Recall=0.6889
Low Self-Esteem: F1=0.2975, Precision=0.5000, Recall=0.2118
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5455, Precision=0.6122, Recall=0.4918
Validation Loss: 0.6774
Validation Macro-F1: 0.5808, Weighted-F1: 0.5847



Epoch 11/30


Training: 100%|██████████| 546/546 [13:15<00:00,  1.46s/it]


Train Loss: 0.0156
Train Macro-F1: 0.9862, Weighted-F1: 0.9851


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.69it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6569, Precision=0.7671, Recall=0.5744
Eating Disorder: F1=0.7800, Precision=0.7647, Recall=0.7959
Sleeping Disorder: F1=0.7229, Precision=0.7895, Recall=0.6667
Low Self-Esteem: F1=0.3051, Precision=0.5455, Recall=0.2118
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5440, Precision=0.5312, Recall=0.5574
Validation Loss: 0.7045
Validation Macro-F1: 0.5831, Weighted-F1: 0.5824



Epoch 12/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0118
Train Macro-F1: 0.9889, Weighted-F1: 0.9888


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.66it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6627, Precision=0.7832, Recall=0.5744
Eating Disorder: F1=0.7879, Precision=0.7800, Recall=0.7959
Sleeping Disorder: F1=0.7312, Precision=0.7083, Recall=0.7556
Low Self-Esteem: F1=0.2969, Precision=0.4419, Recall=0.2235
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5546, Precision=0.5690, Recall=0.5410
Validation Loss: 0.7264
Validation Macro-F1: 0.5844, Weighted-F1: 0.5847



Epoch 13/30


Training: 100%|██████████| 546/546 [13:13<00:00,  1.45s/it]


Train Loss: 0.0099
Train Macro-F1: 0.9908, Weighted-F1: 0.9906


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.64it/s]



Per-class metrics:
Lack of Interest: F1=0.3226, Precision=0.5882, Recall=0.2222
Feeling Down: F1=0.6686, Precision=0.7632, Recall=0.5949
Eating Disorder: F1=0.7647, Precision=0.7358, Recall=0.7959
Sleeping Disorder: F1=0.7126, Precision=0.7381, Recall=0.6889
Low Self-Esteem: F1=0.3115, Precision=0.5135, Recall=0.2235
Concentration Problem: F1=0.7222, Precision=0.8667, Recall=0.6190
Self-Harm: F1=0.5357, Precision=0.5882, Recall=0.4918
Validation Loss: 0.7351
Validation Macro-F1: 0.5768, Weighted-F1: 0.5822



Epoch 14/30


Training: 100%|██████████| 546/546 [13:17<00:00,  1.46s/it]


Train Loss: 0.0077
Train Macro-F1: 0.9929, Weighted-F1: 0.9928


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3175, Precision=0.5556, Recall=0.2222
Feeling Down: F1=0.6528, Precision=0.7746, Recall=0.5641
Eating Disorder: F1=0.7723, Precision=0.7500, Recall=0.7959
Sleeping Disorder: F1=0.7333, Precision=0.7333, Recall=0.7333
Low Self-Esteem: F1=0.2881, Precision=0.5152, Recall=0.2000
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5333, Precision=0.5424, Recall=0.5246
Validation Loss: 0.7703
Validation Macro-F1: 0.5753, Weighted-F1: 0.5749



Epoch 15/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0073
Train Macro-F1: 0.9923, Weighted-F1: 0.9921


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.61it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6608, Precision=0.7687, Recall=0.5795
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7209, Precision=0.7561, Recall=0.6889
Low Self-Esteem: F1=0.3089, Precision=0.5000, Recall=0.2235
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5920, Precision=0.5781, Recall=0.6066
Validation Loss: 0.7634
Validation Macro-F1: 0.5843, Weighted-F1: 0.5859



Epoch 16/30


Training: 100%|██████████| 546/546 [13:16<00:00,  1.46s/it]


Train Loss: 0.0060
Train Macro-F1: 0.9933, Weighted-F1: 0.9932


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.67it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6629, Precision=0.7484, Recall=0.5949
Eating Disorder: F1=0.7573, Precision=0.7222, Recall=0.7959
Sleeping Disorder: F1=0.7059, Precision=0.7500, Recall=0.6667
Low Self-Esteem: F1=0.3636, Precision=0.5106, Recall=0.2824
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5517, Precision=0.5818, Recall=0.5246
Validation Loss: 0.7594
Validation Macro-F1: 0.5870, Weighted-F1: 0.5910



Epoch 17/30


Training: 100%|██████████| 546/546 [13:17<00:00,  1.46s/it]


Train Loss: 0.0050
Train Macro-F1: 0.9942, Weighted-F1: 0.9943


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.64it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6706, Precision=0.7770, Recall=0.5897
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7273, Precision=0.7442, Recall=0.7111
Low Self-Esteem: F1=0.3065, Precision=0.4872, Recall=0.2235
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5517, Precision=0.5818, Recall=0.5246
Validation Loss: 0.7773
Validation Macro-F1: 0.5819, Weighted-F1: 0.5857



Epoch 18/30


Training: 100%|██████████| 546/546 [13:16<00:00,  1.46s/it]


Train Loss: 0.0051
Train Macro-F1: 0.9946, Weighted-F1: 0.9945


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.69it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6686, Precision=0.7718, Recall=0.5897
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7391, Precision=0.7234, Recall=0.7556
Low Self-Esteem: F1=0.3089, Precision=0.5000, Recall=0.2235
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5614, Precision=0.6038, Recall=0.5246
Validation Loss: 0.7784
Validation Macro-F1: 0.5859, Weighted-F1: 0.5881



Epoch 19/30


Training: 100%|██████████| 546/546 [13:17<00:00,  1.46s/it]


Train Loss: 0.0048
Train Macro-F1: 0.9954, Weighted-F1: 0.9952


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6529, Precision=0.7655, Recall=0.5692
Eating Disorder: F1=0.7429, Precision=0.6964, Recall=0.7959
Sleeping Disorder: F1=0.7253, Precision=0.7174, Recall=0.7333
Low Self-Esteem: F1=0.3256, Precision=0.4773, Recall=0.2471
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.7914
Validation Macro-F1: 0.5775, Weighted-F1: 0.5792



Epoch 20/30


Training: 100%|██████████| 546/546 [13:25<00:00,  1.47s/it]


Train Loss: 0.0042
Train Macro-F1: 0.9958, Weighted-F1: 0.9956


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.65it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6608, Precision=0.7778, Recall=0.5744
Eating Disorder: F1=0.7647, Precision=0.7358, Recall=0.7959
Sleeping Disorder: F1=0.7191, Precision=0.7273, Recall=0.7111
Low Self-Esteem: F1=0.3175, Precision=0.4878, Recall=0.2353
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5667, Precision=0.5763, Recall=0.5574
Validation Loss: 0.7914
Validation Macro-F1: 0.5845, Weighted-F1: 0.5860



Epoch 21/30


Training: 100%|██████████| 546/546 [13:18<00:00,  1.46s/it]


Train Loss: 0.0043
Train Macro-F1: 0.9951, Weighted-F1: 0.9951


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3333, Precision=0.6667, Recall=0.2222
Feeling Down: F1=0.6686, Precision=0.7718, Recall=0.5897
Eating Disorder: F1=0.7573, Precision=0.7222, Recall=0.7959
Sleeping Disorder: F1=0.7191, Precision=0.7273, Recall=0.7111
Low Self-Esteem: F1=0.3150, Precision=0.4762, Recall=0.2353
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.7937
Validation Macro-F1: 0.5808, Weighted-F1: 0.5850



Epoch 22/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0039
Train Macro-F1: 0.9951, Weighted-F1: 0.9951


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.64it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6725, Precision=0.7733, Recall=0.5949
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7253, Precision=0.7174, Recall=0.7333
Low Self-Esteem: F1=0.3040, Precision=0.4750, Recall=0.2235
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.8015
Validation Macro-F1: 0.5788, Weighted-F1: 0.5840



Epoch 23/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0038
Train Macro-F1: 0.9965, Weighted-F1: 0.9961


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.66it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6686, Precision=0.7718, Recall=0.5897
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7033, Precision=0.6957, Recall=0.7111
Low Self-Esteem: F1=0.3040, Precision=0.4750, Recall=0.2235
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5470, Precision=0.5714, Recall=0.5246
Validation Loss: 0.8062
Validation Macro-F1: 0.5744, Weighted-F1: 0.5804



Epoch 24/30


Training: 100%|██████████| 546/546 [13:15<00:00,  1.46s/it]


Train Loss: 0.0040
Train Macro-F1: 0.9956, Weighted-F1: 0.9951


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.66it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6628, Precision=0.7740, Recall=0.5795
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7033, Precision=0.6957, Recall=0.7111
Low Self-Esteem: F1=0.3016, Precision=0.4634, Recall=0.2235
Concentration Problem: F1=0.7297, Precision=0.8438, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.8084
Validation Macro-F1: 0.5739, Weighted-F1: 0.5781



Epoch 25/30


Training: 100%|██████████| 546/546 [13:16<00:00,  1.46s/it]


Train Loss: 0.0038
Train Macro-F1: 0.9958, Weighted-F1: 0.9955


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6725, Precision=0.7733, Recall=0.5949
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7174, Precision=0.7021, Recall=0.7333
Low Self-Esteem: F1=0.2927, Precision=0.4737, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5470, Precision=0.5714, Recall=0.5246
Validation Loss: 0.8091
Validation Macro-F1: 0.5753, Weighted-F1: 0.5812



Epoch 26/30


Training: 100%|██████████| 546/546 [13:21<00:00,  1.47s/it]


Train Loss: 0.0036
Train Macro-F1: 0.9958, Weighted-F1: 0.9959


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.66it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6647, Precision=0.7703, Recall=0.5846
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7174, Precision=0.7021, Recall=0.7333
Low Self-Esteem: F1=0.2975, Precision=0.5000, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5470, Precision=0.5714, Recall=0.5246
Validation Loss: 0.8149
Validation Macro-F1: 0.5749, Weighted-F1: 0.5791



Epoch 27/30


Training: 100%|██████████| 546/546 [13:20<00:00,  1.47s/it]


Train Loss: 0.0034
Train Macro-F1: 0.9963, Weighted-F1: 0.9962


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.66it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6647, Precision=0.7793, Recall=0.5795
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7174, Precision=0.7021, Recall=0.7333
Low Self-Esteem: F1=0.2927, Precision=0.4737, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5470, Precision=0.5714, Recall=0.5246
Validation Loss: 0.8175
Validation Macro-F1: 0.5742, Weighted-F1: 0.5783



Epoch 28/30


Training: 100%|██████████| 546/546 [13:15<00:00,  1.46s/it]


Train Loss: 0.0038
Train Macro-F1: 0.9958, Weighted-F1: 0.9958


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6686, Precision=0.7808, Recall=0.5846
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7174, Precision=0.7021, Recall=0.7333
Low Self-Esteem: F1=0.2951, Precision=0.4865, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.8181
Validation Macro-F1: 0.5745, Weighted-F1: 0.5796



Epoch 29/30


Training: 100%|██████████| 546/546 [13:13<00:00,  1.45s/it]


Train Loss: 0.0035
Train Macro-F1: 0.9961, Weighted-F1: 0.9962


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6667, Precision=0.7755, Recall=0.5846
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7033, Precision=0.6957, Recall=0.7111
Low Self-Esteem: F1=0.2903, Precision=0.4615, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.8156
Validation Macro-F1: 0.5715, Weighted-F1: 0.5769



Epoch 30/30


Training: 100%|██████████| 546/546 [13:13<00:00,  1.45s/it]


Train Loss: 0.0035
Train Macro-F1: 0.9963, Weighted-F1: 0.9960


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.68it/s]



Per-class metrics:
Lack of Interest: F1=0.3279, Precision=0.6250, Recall=0.2222
Feeling Down: F1=0.6667, Precision=0.7755, Recall=0.5846
Eating Disorder: F1=0.7500, Precision=0.7091, Recall=0.7959
Sleeping Disorder: F1=0.7174, Precision=0.7021, Recall=0.7333
Low Self-Esteem: F1=0.2903, Precision=0.4615, Recall=0.2118
Concentration Problem: F1=0.7200, Precision=0.8182, Recall=0.6429
Self-Harm: F1=0.5424, Precision=0.5614, Recall=0.5246
Validation Loss: 0.8175
Validation Macro-F1: 0.5735, Weighted-F1: 0.5781


## Inference

In [11]:
# Download the trained checkpoint from Google Drive by its file ID.
!gdown 1bjnRsr_3dh-8RNJ2CvunWTxuzZMVZH5c

Downloading...
From (original): https://drive.google.com/uc?id=1bjnRsr_3dh-8RNJ2CvunWTxuzZMVZH5c
From (redirected): https://drive.google.com/uc?id=1bjnRsr_3dh-8RNJ2CvunWTxuzZMVZH5c&confirm=t&uuid=77b628fa-aaa5-4db5-a617-7c8e858cd626
To: /kaggle/working/no_contrastive_depression.pth
100%|██████████████████████████████████████| 1.04G/1.04G [00:16<00:00, 63.8MB/s]


In [14]:
def _strip_wrapper_segments(key):
    """Remove ``module`` path components from a state-dict key.

    Only whole dot-separated components equal to ``module`` are dropped, so a
    name that merely ends in "module" (e.g. ``fusion_module.weight``) is left
    intact. The final component (the parameter/buffer name) is always kept.
    """
    *parents, leaf = key.split(".")
    return ".".join([part for part in parents if part != "module"] + [leaf])


def inference(test_data, model_path):
    """Evaluate a saved checkpoint on the test data and print its metrics.

    Builds a fresh ``MultimodalAttentionModel``, loads the state dict stored
    at ``model_path`` into it, wraps ``test_data`` in a
    ``MultimodalDepressionDataset`` and runs ``evaluate_multimodal_model``
    over it with ``nn.BCEWithLogitsLoss``.

    Args:
        test_data: Test samples, passed unchanged to
            ``MultimodalDepressionDataset``.
        model_path: Path to a state-dict checkpoint. ``module`` components in
            the keys (as added by parallel wrappers such as
            ``nn.DataParallel``) are stripped before loading.

    Returns:
        None. The test loss and macro/weighted F1 are printed.

    Note:
        Images are read from ``os.path.join(img_path, "test")``; ``img_path``
        is a notebook-level global that must be defined by an earlier cell.
    """
    model = MultimodalAttentionModel(
        text_model_name="mental/mental-roberta-base",
        num_classes=NUM_CLASSES,
        fusion_dim=FUSION_DIM
    )
    model = model.to(DEVICE)

    # weights_only=True restricts unpickling to tensors/primitive containers.
    weights = torch.load(model_path, map_location=DEVICE, weights_only=True)
    # Strip wrapper components without touching names that merely contain
    # the substring "module." (a plain str.replace would corrupt those).
    weights_single = {_strip_wrapper_segments(k): v for k, v in weights.items()}

    model.load_state_dict(weights_single)
    # Make sure dropout is disabled for scoring; harmless if the evaluation
    # helper switches the mode itself.
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")
    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    test_dataset = MultimodalDepressionDataset(
        test_data,
        os.path.join(img_path, "test"),
        tokenizer,
        image_processor,
        max_len=MAX_LEN,
    )
    # shuffle=False keeps evaluation order deterministic.
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    loss, metrics = evaluate_multimodal_model(
        model, test_loader, nn.BCEWithLogitsLoss()
    )

    print(f"Test Loss: {loss:.4f}")
    print(f"Test Macro-F1: {metrics['macro_f1']:.4f}, Weighted-F1: {metrics['weighted_f1']:.4f}")

In [15]:
# Score the downloaded checkpoint; `test_data` must already be defined by an earlier cell.
inference(test_data, "no_contrastive_depression.pth")

Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 33/33 [00:22<00:00,  1.44it/s]


Per-class metrics:
Lack of Interest: F1=0.4800, Precision=0.8276, Recall=0.3380
Feeling Down: F1=0.6667, Precision=0.8451, Recall=0.5505
Eating Disorder: F1=0.8000, Precision=0.8182, Recall=0.7826
Sleeping Disorder: F1=0.7586, Precision=0.8333, Recall=0.6962
Low Self-Esteem: F1=0.3699, Precision=0.5424, Recall=0.2807
Concentration Problem: F1=0.8036, Precision=0.9783, Recall=0.6818
Self-Harm: F1=0.6391, Precision=0.6136, Recall=0.6667
Test Loss: 0.3889
Test Macro-F1: 0.6454, Weighted-F1: 0.6379



