## Dataset Download

In [1]:
!gdown 1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB

Downloading...
From (original): https://drive.google.com/uc?id=1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB
From (redirected): https://drive.google.com/uc?id=1LQoyq_ZjHJih7hxJ2yJ_OrZhOIE_BZwB&confirm=t&uuid=62dc275f-0edc-430f-ba07-5578c1056838
To: /kaggle/working/depression_dataset_complete.zip
100%|████████████████████████████████████████| 652M/652M [00:08<00:00, 74.9MB/s]


In [2]:
!unzip depression_dataset_complete.zip > /dev/null

In [16]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel, CLIPImageProcessor, CLIPVisionModel
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from tqdm import tqdm
from PIL import Image
import json
import math

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16
LR = 2e-5
MAX_LEN = 512
FUSION_DIM = 768
LABELS = ["Lack of Interest", "Feeling Down", "Eating Disorder",
          "Sleeping Disorder", "Low Self-Esteem", "Concentration Problem", "Self-Harm"]
LABEL_MAP = {label: i for i, label in enumerate(LABELS)}
NUM_CLASSES = len(LABELS)

## Custom Dataset

In [17]:
class MultimodalDepressionDataset(Dataset):
    def __init__(self, data, image_path, tokenizer, image_processor, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_len = max_len
        self.img_path = image_path
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        ocr_text = sample["ocr_text"]
        figurative_reasoning = sample["figurative_reasoning"]
        combined_text = ocr_text + " [SEP] " + figurative_reasoning

        encoding = self.tokenizer(
            combined_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        image_path = os.path.join(self.img_path, sample["sample_id"] + ".jpeg")
        image = Image.open(image_path).convert("RGB")
        image_tensor = self.image_processor(image, return_tensors="pt")

        labels = torch.zeros(NUM_CLASSES)
        for category in sample["meme_depressive_categories"]:
            labels[LABEL_MAP[category]] = 1.0

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "image": image_tensor,
            "label": labels
        }

def custom_collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])

    images = {}
    for key in batch[0]['image'].keys():
        if isinstance(batch[0]['image'][key], torch.Tensor):
            images[key] = torch.stack([item['image'][key].squeeze(0) for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'image': images,
        'label': labels,
    }

## Model Definition

In [None]:
class SimpleFusionModel(nn.Module):
    def __init__(self, text_model_name="bert-base-uncased", num_classes=7, fusion_dim=768):
        super(SimpleFusionModel, self).__init__()

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size

        self.vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_dim = self.vision_encoder.config.hidden_size

        self.text_projection = nn.Linear(self.text_dim, fusion_dim)
        self.vision_projection = nn.Linear(self.vision_dim, fusion_dim)

        self.fusion_dim = fusion_dim
        self.fusion_layer = nn.Sequential(
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.2)
        )

        self.expert_nets = nn.ModuleList([
            nn.Sequential(
                nn.Linear(fusion_dim, fusion_dim),
                nn.LayerNorm(fusion_dim),
                nn.GELU(),
                nn.Dropout(0.2)
            ) for _ in range(4)
        ])

        self.moe_gate = nn.Linear(fusion_dim, 4)

        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim, num_classes)
        )

        contrastive_dim = 256
        self.contrastive_projection = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.LayerNorm(fusion_dim // 2),
            nn.GELU(),
            nn.Linear(fusion_dim // 2, contrastive_dim)
        )

        self.text_contrastive_proj = nn.Sequential(
            nn.Linear(self.text_dim, contrastive_dim),
            nn.LayerNorm(contrastive_dim),
            nn.GELU()
        )

        self.image_contrastive_proj = nn.Sequential(
            nn.Linear(self.vision_dim, contrastive_dim),
            nn.LayerNorm(contrastive_dim),
            nn.GELU()
        )

    def forward(self, input_ids, attention_mask, image_features, get_embeddings=False):
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_cls = text_outputs.last_hidden_state[:, 0]

        vision_outputs = self.vision_encoder(**image_features)
        image_cls = vision_outputs.pooler_output

        text_projected = self.text_projection(text_cls)
        image_projected = self.vision_projection(image_cls)

        concat_features = torch.cat([text_projected, image_projected], dim=1)
        fused_features = self.fusion_layer(concat_features)

        expert_outputs = [expert(fused_features) for expert in self.expert_nets]
        expert_gates = torch.nn.functional.softmax(self.moe_gate(fused_features), dim=1)

        moe_output = torch.zeros_like(expert_outputs[0])
        for i, expert_out in enumerate(expert_outputs):
            moe_output += expert_out * expert_gates[:, i].unsqueeze(1)

        logits = self.classifier(moe_output)

        if get_embeddings:
            multimodal_contrastive = self.contrastive_projection(moe_output)
            text_contrastive = self.text_contrastive_proj(text_cls)
            image_contrastive = self.image_contrastive_proj(image_cls)

            multimodal_contrastive = torch.nn.functional.normalize(multimodal_contrastive, p=2, dim=1)
            text_contrastive = torch.nn.functional.normalize(text_contrastive, p=2, dim=1)
            image_contrastive = torch.nn.functional.normalize(image_contrastive, p=2, dim=1)

            return {
                "logits": logits,
                "multimodal_embedding": multimodal_contrastive,
                "text_embedding": text_contrastive,
                "image_embedding": image_contrastive
            }

        return logits

class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.5):
        super().__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss(reduction="sum")

    def forward(self, modal1, modal2):
        batch_size = modal1.shape[0]

        features = torch.cat([modal1, modal2], dim=0)
        similarity_matrix = torch.matmul(features, features.T)

        mask = (~torch.eye(2 * batch_size, dtype=bool, device=features.device)).float()

        similarity_matrix = similarity_matrix * mask
        similarity_matrix = similarity_matrix / self.temperature

        labels = torch.arange(batch_size, device=features.device, dtype=torch.long)
        labels = torch.cat([labels + batch_size, labels], dim=0)

        loss = self.criterion(similarity_matrix, labels)
        loss = loss / (2 * batch_size)

        return loss

## Training Functions

In [19]:
def train_multimodal_model(model, train_data, val_data, img_path, epochs, model_save_name):
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")
    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    train_dataset = MultimodalDepressionDataset(train_data, os.path.join(img_path, "train"), tokenizer, image_processor, max_len=MAX_LEN)
    val_dataset = MultimodalDepressionDataset(val_data, os.path.join(img_path, "val"), tokenizer, image_processor, max_len=MAX_LEN)

    print("Train Set Size:", len(train_dataset))
    print("Validation Set Size:", len(val_dataset))

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=custom_collate_fn
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    optimizer = optim.AdamW(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2, verbose=True
    )
    criterion = nn.BCEWithLogitsLoss()
    contrastive_criterion = ContrastiveLoss(temperature=0.07)
    contrastive_weight = 0.3

    model = model.to(DEVICE)
    model = nn.DataParallel(model)

    best_f1 = 0
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        model.train()
        train_loss = 0
        all_train_preds, all_train_labels = [], []

        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            image_features = {k: v.to(DEVICE) for k, v in batch["image"].items()}

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, image_features, get_embeddings=True)
            logits = outputs["logits"]
            classification_loss = criterion(logits, labels)

            multimodal_text_loss = contrastive_criterion(
                outputs["multimodal_embedding"],
                outputs["text_embedding"]
            )

            multimodal_image_loss = contrastive_criterion(
                outputs["multimodal_embedding"],
                outputs["image_embedding"]
            )

            text_image_loss = contrastive_criterion(
                outputs["text_embedding"],
                outputs["image_embedding"]
            )

            contrastive_loss = (multimodal_text_loss + multimodal_image_loss + text_image_loss) / 3
            loss = (1 - contrastive_weight) * classification_loss + contrastive_weight * contrastive_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item()
            with torch.no_grad():
                predictions = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
                all_train_preds.extend(predictions)
                all_train_labels.extend(labels.cpu().numpy())

        train_loss = train_loss / len(train_loader)
        train_metrics = compute_multilabel_metrics(np.array(all_train_labels), np.array(all_train_preds))

        print(f"Train Loss: {train_loss:.4f}")
        print(f"Train Macro-F1: {train_metrics['macro_f1']:.4f}, Weighted-F1: {train_metrics['weighted_f1']:.4f}")

        val_loss, val_metrics = evaluate_multimodal_model(
            model, val_loader, criterion
        )

        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Macro-F1: {val_metrics['macro_f1']:.4f}, Weighted-F1: {val_metrics['weighted_f1']:.4f}")

        scheduler.step(val_metrics['macro_f1'])
        
        f1_hm = 2 * val_metrics["macro_f1"] * val_metrics["weighted_f1"] / (val_metrics["macro_f1"] + val_metrics["weighted_f1"])
        if f1_hm > best_f1:
            best_f1 = f1_hm
            torch.save(model.state_dict(), f"{model_save_name}_depression.pth")
            print("Best model saved!")
        
    return model

def compute_multilabel_metrics(y_true, y_pred):
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    metrics = {
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'weighted_f1': weighted_f1,
        'per_class_precision': per_class_precision,
        'per_class_recall': per_class_recall,
        'per_class_f1': per_class_f1
    }
    
    return metrics

def evaluate_multimodal_model(model, loader, criterion):
    model.eval()
    val_loss = 0
    all_val_preds, all_val_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            image_features = {k: v.to(DEVICE) for k, v in batch["image"].items()}
            
            logits = model(input_ids, attention_mask, image_features)
            loss = criterion(logits, labels)
            
            val_loss += loss.item()
            predictions = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
            all_val_preds.extend(predictions)
            all_val_labels.extend(labels.cpu().numpy())
    
    val_loss = val_loss / len(loader)
    val_metrics = compute_multilabel_metrics(np.array(all_val_labels), np.array(all_val_preds))
    
    print("\nPer-class metrics:")
    for i, label in enumerate(LABELS):
        print(f"{label}: F1={val_metrics['per_class_f1'][i]:.4f}, "
              f"Precision={val_metrics['per_class_precision'][i]:.4f}, "
              f"Recall={val_metrics['per_class_recall'][i]:.4f}")
    
    return val_loss, val_metrics

## Model Training

In [20]:
train_data = json.load(open("depression_train_llava_dataset.json", "r"))
val_data = json.load(open("depression_val_llava_dataset.json", "r"))
test_data = json.load(open("depression_test_llava_dataset.json", "r"))

img_path = "depressive_image"

tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

test_dataset = MultimodalDepressionDataset(test_data, os.path.join(img_path, "test"), tokenizer, image_processor, max_len=MAX_LEN)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn
)

In [None]:
model = SimpleFusionModel(
    text_model_name="mental/mental-roberta-base",
    num_classes=NUM_CLASSES,
    fusion_dim=FUSION_DIM
)

trained_model = train_multimodal_model(
    model,
    train_data,
    val_data,
    img_path,
    epochs=30,
    model_save_name="no_fusion"
)

Train Set Size: 8722
Validation Set Size: 359


Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/30


Training: 100%|██████████| 546/546 [12:20<00:00,  1.36s/it]


Train Loss: 0.8549
Train Macro-F1: 0.1953, Weighted-F1: 0.2651


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.0000, Precision=0.0000, Recall=0.0000
Feeling Down: F1=0.3175, Precision=0.7018, Recall=0.2051
Eating Disorder: F1=0.7400, Precision=0.7255, Recall=0.7551
Sleeping Disorder: F1=0.8090, Precision=0.8182, Recall=0.8000
Low Self-Esteem: F1=0.0000, Precision=0.0000, Recall=0.0000
Concentration Problem: F1=0.0465, Precision=1.0000, Recall=0.0238
Self-Harm: F1=0.2857, Precision=0.6875, Recall=0.1803
Validation Loss: 0.4138
Validation Macro-F1: 0.3141, Weighted-F1: 0.2949
Best model saved!



Epoch 2/30


Training: 100%|██████████| 546/546 [12:20<00:00,  1.36s/it]


Train Loss: 0.3889
Train Macro-F1: 0.4385, Weighted-F1: 0.5488


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]



Per-class metrics:
Lack of Interest: F1=0.2000, Precision=1.0000, Recall=0.1111
Feeling Down: F1=0.4710, Precision=0.8025, Recall=0.3333
Eating Disorder: F1=0.7525, Precision=0.7308, Recall=0.7755
Sleeping Disorder: F1=0.6923, Precision=0.8182, Recall=0.6000
Low Self-Esteem: F1=0.0233, Precision=1.0000, Recall=0.0118
Concentration Problem: F1=0.5902, Precision=0.9474, Recall=0.4286
Self-Harm: F1=0.4882, Precision=0.4697, Recall=0.5082
Validation Loss: 0.4000
Validation Macro-F1: 0.4596, Weighted-F1: 0.4318
Best model saved!



Epoch 3/30


Training: 100%|██████████| 546/546 [12:27<00:00,  1.37s/it]


Train Loss: 0.2661
Train Macro-F1: 0.6485, Weighted-F1: 0.6979


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]



Per-class metrics:
Lack of Interest: F1=0.3636, Precision=1.0000, Recall=0.2222
Feeling Down: F1=0.6472, Precision=0.7500, Recall=0.5692
Eating Disorder: F1=0.7835, Precision=0.7917, Recall=0.7755
Sleeping Disorder: F1=0.7529, Precision=0.8000, Recall=0.7111
Low Self-Esteem: F1=0.1263, Precision=0.6000, Recall=0.0706
Concentration Problem: F1=0.6765, Precision=0.8846, Recall=0.5476
Self-Harm: F1=0.4494, Precision=0.7143, Recall=0.3279
Validation Loss: 0.3893
Validation Macro-F1: 0.5428, Weighted-F1: 0.5391
Best model saved!



Epoch 4/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.2014
Train Macro-F1: 0.7570, Weighted-F1: 0.7871


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.79it/s]



Per-class metrics:
Lack of Interest: F1=0.3636, Precision=1.0000, Recall=0.2222
Feeling Down: F1=0.6472, Precision=0.7500, Recall=0.5692
Eating Disorder: F1=0.8298, Precision=0.8667, Recall=0.7959
Sleeping Disorder: F1=0.7500, Precision=0.8571, Recall=0.6667
Low Self-Esteem: F1=0.1277, Precision=0.6667, Recall=0.0706
Concentration Problem: F1=0.6667, Precision=0.9167, Recall=0.5238
Self-Harm: F1=0.5664, Precision=0.6154, Recall=0.5246
Validation Loss: 0.3870
Validation Macro-F1: 0.5645, Weighted-F1: 0.5563
Best model saved!



Epoch 5/30


Training: 100%|██████████| 546/546 [12:18<00:00,  1.35s/it]


Train Loss: 0.1515
Train Macro-F1: 0.8273, Weighted-F1: 0.8512


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.75it/s]



Per-class metrics:
Lack of Interest: F1=0.3929, Precision=1.0000, Recall=0.2444
Feeling Down: F1=0.6331, Precision=0.7483, Recall=0.5487
Eating Disorder: F1=0.7921, Precision=0.7692, Recall=0.8163
Sleeping Disorder: F1=0.7333, Precision=0.7333, Recall=0.7333
Low Self-Esteem: F1=0.3065, Precision=0.4872, Recall=0.2235
Concentration Problem: F1=0.7059, Precision=0.9231, Recall=0.5714
Self-Harm: F1=0.5000, Precision=0.7419, Recall=0.3770
Validation Loss: 0.4146
Validation Macro-F1: 0.5805, Weighted-F1: 0.5731
Best model saved!



Epoch 6/30


Training: 100%|██████████| 546/546 [12:26<00:00,  1.37s/it]


Train Loss: 0.1170
Train Macro-F1: 0.8802, Weighted-F1: 0.8973


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3729, Precision=0.7857, Recall=0.2444
Feeling Down: F1=0.5658, Precision=0.7890, Recall=0.4410
Eating Disorder: F1=0.7879, Precision=0.7800, Recall=0.7959
Sleeping Disorder: F1=0.8261, Precision=0.8085, Recall=0.8444
Low Self-Esteem: F1=0.3817, Precision=0.5435, Recall=0.2941
Concentration Problem: F1=0.7143, Precision=0.8929, Recall=0.5952
Self-Harm: F1=0.5556, Precision=0.6383, Recall=0.4918
Validation Loss: 0.4437
Validation Macro-F1: 0.6006, Weighted-F1: 0.5732
Best model saved!



Epoch 7/30


Training: 100%|██████████| 546/546 [12:31<00:00,  1.38s/it]


Train Loss: 0.0918
Train Macro-F1: 0.9207, Weighted-F1: 0.9299


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.76it/s]



Per-class metrics:
Lack of Interest: F1=0.3636, Precision=1.0000, Recall=0.2222
Feeling Down: F1=0.6781, Precision=0.7628, Recall=0.6103
Eating Disorder: F1=0.7368, Precision=0.7609, Recall=0.7143
Sleeping Disorder: F1=0.6341, Precision=0.7027, Recall=0.5778
Low Self-Esteem: F1=0.3359, Precision=0.4783, Recall=0.2588
Concentration Problem: F1=0.7246, Precision=0.9259, Recall=0.5952
Self-Harm: F1=0.5437, Precision=0.6667, Recall=0.4590
Validation Loss: 0.4754
Validation Macro-F1: 0.5738, Weighted-F1: 0.5850



Epoch 8/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.0759
Train Macro-F1: 0.9408, Weighted-F1: 0.9466


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.4068, Precision=0.8571, Recall=0.2667
Feeling Down: F1=0.6149, Precision=0.7795, Recall=0.5077
Eating Disorder: F1=0.7478, Precision=0.6515, Recall=0.8776
Sleeping Disorder: F1=0.7470, Precision=0.8158, Recall=0.6889
Low Self-Esteem: F1=0.3382, Precision=0.4510, Recall=0.2706
Concentration Problem: F1=0.7397, Precision=0.8710, Recall=0.6429
Self-Harm: F1=0.5567, Precision=0.7500, Recall=0.4426
Validation Loss: 0.5196
Validation Macro-F1: 0.5930, Weighted-F1: 0.5790



Epoch 9/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.0635
Train Macro-F1: 0.9567, Weighted-F1: 0.9597


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3571, Precision=0.9091, Recall=0.2222
Feeling Down: F1=0.6386, Precision=0.7737, Recall=0.5436
Eating Disorder: F1=0.7843, Precision=0.7547, Recall=0.8163
Sleeping Disorder: F1=0.6889, Precision=0.6889, Recall=0.6889
Low Self-Esteem: F1=0.1961, Precision=0.5882, Recall=0.1176
Concentration Problem: F1=0.7324, Precision=0.8966, Recall=0.6190
Self-Harm: F1=0.5333, Precision=0.4865, Recall=0.5902
Validation Loss: 0.5707
Validation Macro-F1: 0.5615, Weighted-F1: 0.5555



Epoch 10/30


Training: 100%|██████████| 546/546 [12:18<00:00,  1.35s/it]


Train Loss: 0.0413
Train Macro-F1: 0.9740, Weighted-F1: 0.9749


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3636, Precision=1.0000, Recall=0.2222
Feeling Down: F1=0.6412, Precision=0.7517, Recall=0.5590
Eating Disorder: F1=0.8041, Precision=0.8125, Recall=0.7959
Sleeping Disorder: F1=0.7111, Precision=0.7111, Recall=0.7111
Low Self-Esteem: F1=0.3741, Precision=0.4815, Recall=0.3059
Concentration Problem: F1=0.6957, Precision=0.8889, Recall=0.5714
Self-Harm: F1=0.6018, Precision=0.6538, Recall=0.5574
Validation Loss: 0.5644
Validation Macro-F1: 0.5988, Weighted-F1: 0.5949



Epoch 11/30


Training: 100%|██████████| 546/546 [12:23<00:00,  1.36s/it]


Train Loss: 0.0333
Train Macro-F1: 0.9822, Weighted-F1: 0.9825


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.75it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6516, Precision=0.7278, Recall=0.5897
Eating Disorder: F1=0.7547, Precision=0.7018, Recall=0.8163
Sleeping Disorder: F1=0.6087, Precision=0.8750, Recall=0.4667
Low Self-Esteem: F1=0.3051, Precision=0.5455, Recall=0.2118
Concentration Problem: F1=0.7429, Precision=0.9286, Recall=0.6190
Self-Harm: F1=0.6370, Precision=0.5811, Recall=0.7049
Validation Loss: 0.6144
Validation Macro-F1: 0.5787, Weighted-F1: 0.5809



Epoch 12/30


Training: 100%|██████████| 546/546 [12:23<00:00,  1.36s/it]


Train Loss: 0.0282
Train Macro-F1: 0.9851, Weighted-F1: 0.9848


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.75it/s]



Per-class metrics:
Lack of Interest: F1=0.3571, Precision=0.9091, Recall=0.2222
Feeling Down: F1=0.6814, Precision=0.7410, Recall=0.6308
Eating Disorder: F1=0.7477, Precision=0.6897, Recall=0.8163
Sleeping Disorder: F1=0.7143, Precision=0.7692, Recall=0.6667
Low Self-Esteem: F1=0.2832, Precision=0.5714, Recall=0.1882
Concentration Problem: F1=0.7143, Precision=0.8929, Recall=0.5952
Self-Harm: F1=0.5607, Precision=0.6522, Recall=0.4918
Validation Loss: 0.6091
Validation Macro-F1: 0.5798, Weighted-F1: 0.5862



Epoch 13/30


Training: 100%|██████████| 546/546 [12:21<00:00,  1.36s/it]


Train Loss: 0.0240
Train Macro-F1: 0.9887, Weighted-F1: 0.9881


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3793, Precision=0.8462, Recall=0.2444
Feeling Down: F1=0.6313, Precision=0.7431, Recall=0.5487
Eating Disorder: F1=0.7767, Precision=0.7407, Recall=0.8163
Sleeping Disorder: F1=0.6889, Precision=0.6889, Recall=0.6889
Low Self-Esteem: F1=0.2906, Precision=0.5312, Recall=0.2000
Concentration Problem: F1=0.6944, Precision=0.8333, Recall=0.5952
Self-Harm: F1=0.5536, Precision=0.6078, Recall=0.5082
Validation Loss: 0.6365
Validation Macro-F1: 0.5735, Weighted-F1: 0.5687



Epoch 14/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.0201
Train Macro-F1: 0.9923, Weighted-F1: 0.9920


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.78it/s]



Per-class metrics:
Lack of Interest: F1=0.3793, Precision=0.8462, Recall=0.2444
Feeling Down: F1=0.6414, Precision=0.7432, Recall=0.5641
Eating Disorder: F1=0.7619, Precision=0.7143, Recall=0.8163
Sleeping Disorder: F1=0.7045, Precision=0.7209, Recall=0.6889
Low Self-Esteem: F1=0.3721, Precision=0.5455, Recall=0.2824
Concentration Problem: F1=0.7324, Precision=0.8966, Recall=0.6190
Self-Harm: F1=0.5714, Precision=0.6275, Recall=0.5246
Validation Loss: 0.6451
Validation Macro-F1: 0.5947, Weighted-F1: 0.5909



Epoch 15/30


Training: 100%|██████████| 546/546 [12:20<00:00,  1.36s/it]


Train Loss: 0.0182
Train Macro-F1: 0.9930, Weighted-F1: 0.9927


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.76it/s]



Per-class metrics:
Lack of Interest: F1=0.3793, Precision=0.8462, Recall=0.2444
Feeling Down: F1=0.6353, Precision=0.7448, Recall=0.5538
Eating Disorder: F1=0.7455, Precision=0.6721, Recall=0.8367
Sleeping Disorder: F1=0.6988, Precision=0.7632, Recall=0.6444
Low Self-Esteem: F1=0.3906, Precision=0.5814, Recall=0.2941
Concentration Problem: F1=0.7027, Precision=0.8125, Recall=0.6190
Self-Harm: F1=0.5283, Precision=0.6222, Recall=0.4590
Validation Loss: 0.6658
Validation Macro-F1: 0.5829, Weighted-F1: 0.5821



Epoch 16/30


Training: 100%|██████████| 546/546 [12:17<00:00,  1.35s/it]


Train Loss: 0.0157
Train Macro-F1: 0.9938, Weighted-F1: 0.9934


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.78it/s]



Per-class metrics:
Lack of Interest: F1=0.3729, Precision=0.7857, Recall=0.2444
Feeling Down: F1=0.6477, Precision=0.7261, Recall=0.5846
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.6988, Precision=0.7632, Recall=0.6444
Low Self-Esteem: F1=0.3361, Precision=0.5882, Recall=0.2353
Concentration Problem: F1=0.7123, Precision=0.8387, Recall=0.6190
Self-Harm: F1=0.5636, Precision=0.6327, Recall=0.5082
Validation Loss: 0.6662
Validation Macro-F1: 0.5854, Weighted-F1: 0.5842



Epoch 17/30


Training: 100%|██████████| 546/546 [12:16<00:00,  1.35s/it]


Train Loss: 0.0157
Train Macro-F1: 0.9947, Weighted-F1: 0.9943


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]



Per-class metrics:
Lack of Interest: F1=0.3793, Precision=0.8462, Recall=0.2444
Feeling Down: F1=0.6573, Precision=0.7267, Recall=0.6000
Eating Disorder: F1=0.7593, Precision=0.6949, Recall=0.8367
Sleeping Disorder: F1=0.7273, Precision=0.7442, Recall=0.7111
Low Self-Esteem: F1=0.3252, Precision=0.5263, Recall=0.2353
Concentration Problem: F1=0.7143, Precision=0.8929, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6677
Validation Macro-F1: 0.5879, Weighted-F1: 0.5872



Epoch 18/30


Training: 100%|██████████| 546/546 [12:27<00:00,  1.37s/it]


Train Loss: 0.0150
Train Macro-F1: 0.9941, Weighted-F1: 0.9938


Evaluating: 100%|██████████| 23/23 [00:14<00:00,  1.62it/s]



Per-class metrics:
Lack of Interest: F1=0.3860, Precision=0.9167, Recall=0.2444
Feeling Down: F1=0.6353, Precision=0.7448, Recall=0.5538
Eating Disorder: F1=0.7387, Precision=0.6613, Recall=0.8367
Sleeping Disorder: F1=0.6747, Precision=0.7368, Recall=0.6222
Low Self-Esteem: F1=0.3906, Precision=0.5814, Recall=0.2941
Concentration Problem: F1=0.7123, Precision=0.8387, Recall=0.6190
Self-Harm: F1=0.5818, Precision=0.6531, Recall=0.5246
Validation Loss: 0.6918
Validation Macro-F1: 0.5885, Weighted-F1: 0.5870



Epoch 19/30


Training: 100%|██████████| 546/546 [12:33<00:00,  1.38s/it]


Train Loss: 0.0133
Train Macro-F1: 0.9951, Weighted-F1: 0.9950


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.78it/s]



Per-class metrics:
Lack of Interest: F1=0.3793, Precision=0.8462, Recall=0.2444
Feeling Down: F1=0.6630, Precision=0.7256, Recall=0.6103
Eating Disorder: F1=0.7810, Precision=0.7321, Recall=0.8367
Sleeping Disorder: F1=0.6667, Precision=0.7179, Recall=0.6222
Low Self-Esteem: F1=0.3548, Precision=0.5641, Recall=0.2588
Concentration Problem: F1=0.7027, Precision=0.8125, Recall=0.6190
Self-Harm: F1=0.5283, Precision=0.6222, Recall=0.4590
Validation Loss: 0.6690
Validation Macro-F1: 0.5822, Weighted-F1: 0.5872



Epoch 20/30


Training: 100%|██████████| 546/546 [12:18<00:00,  1.35s/it]


Train Loss: 0.0126
Train Macro-F1: 0.9957, Weighted-F1: 0.9951


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.70it/s]



Per-class metrics:
Lack of Interest: F1=0.3860, Precision=0.9167, Recall=0.2444
Feeling Down: F1=0.6741, Precision=0.7378, Recall=0.6205
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.7209, Precision=0.7561, Recall=0.6889
Low Self-Esteem: F1=0.3750, Precision=0.5581, Recall=0.2824
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6668
Validation Macro-F1: 0.5970, Weighted-F1: 0.6015



Epoch 21/30


Training: 100%|██████████| 546/546 [12:28<00:00,  1.37s/it]


Train Loss: 0.0126
Train Macro-F1: 0.9958, Weighted-F1: 0.9956


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.67it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6648, Precision=0.7452, Recall=0.6000
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.7126, Precision=0.7381, Recall=0.6889
Low Self-Esteem: F1=0.3443, Precision=0.5676, Recall=0.2471
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5586, Precision=0.6200, Recall=0.5082
Validation Loss: 0.6837
Validation Macro-F1: 0.5860, Weighted-F1: 0.5899



Epoch 22/30


Training: 100%|██████████| 546/546 [12:21<00:00,  1.36s/it]


Train Loss: 0.0119
Train Macro-F1: 0.9956, Weighted-F1: 0.9952


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6667, Precision=0.7421, Recall=0.6051
Eating Disorder: F1=0.7593, Precision=0.6949, Recall=0.8367
Sleeping Disorder: F1=0.7143, Precision=0.7692, Recall=0.6667
Low Self-Esteem: F1=0.3651, Precision=0.5610, Recall=0.2706
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5660, Precision=0.6667, Recall=0.4918
Validation Loss: 0.6805
Validation Macro-F1: 0.5895, Weighted-F1: 0.5944



Epoch 23/30


Training: 100%|██████████| 546/546 [12:17<00:00,  1.35s/it]


Train Loss: 0.0114
Train Macro-F1: 0.9960, Weighted-F1: 0.9958


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.73it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6760, Precision=0.7423, Recall=0.6205
Eating Disorder: F1=0.7736, Precision=0.7193, Recall=0.8367
Sleeping Disorder: F1=0.7160, Precision=0.8056, Recall=0.6444
Low Self-Esteem: F1=0.3937, Precision=0.5952, Recall=0.2941
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5243, Precision=0.6429, Recall=0.4426
Validation Loss: 0.6771
Validation Macro-F1: 0.5912, Weighted-F1: 0.5991



Epoch 24/30


Training: 100%|██████████| 546/546 [12:22<00:00,  1.36s/it]


Train Loss: 0.0118
Train Macro-F1: 0.9955, Weighted-F1: 0.9953


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.72it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6778, Precision=0.7394, Recall=0.6256
Eating Disorder: F1=0.7593, Precision=0.6949, Recall=0.8367
Sleeping Disorder: F1=0.6977, Precision=0.7317, Recall=0.6667
Low Self-Esteem: F1=0.3193, Precision=0.5588, Recall=0.2235
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6840
Validation Macro-F1: 0.5802, Weighted-F1: 0.5881



Epoch 25/30


Training: 100%|██████████| 546/546 [12:26<00:00,  1.37s/it]


Train Loss: 0.0118
Train Macro-F1: 0.9968, Weighted-F1: 0.9967


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.77it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6797, Precision=0.7439, Recall=0.6256
Eating Disorder: F1=0.7523, Precision=0.6833, Recall=0.8367
Sleeping Disorder: F1=0.6747, Precision=0.7368, Recall=0.6222
Low Self-Esteem: F1=0.3306, Precision=0.5556, Recall=0.2353
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5607, Precision=0.6522, Recall=0.4918
Validation Loss: 0.6831
Validation Macro-F1: 0.5790, Weighted-F1: 0.5889



Epoch 26/30


Training: 100%|██████████| 546/546 [12:20<00:00,  1.36s/it]


Train Loss: 0.0112
Train Macro-F1: 0.9959, Weighted-F1: 0.9958


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6629, Precision=0.7405, Recall=0.6000
Eating Disorder: F1=0.7593, Precision=0.6949, Recall=0.8367
Sleeping Disorder: F1=0.6897, Precision=0.7143, Recall=0.6667
Low Self-Esteem: F1=0.2906, Precision=0.5312, Recall=0.2000
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5556, Precision=0.6383, Recall=0.4918
Validation Loss: 0.6910
Validation Macro-F1: 0.5733, Weighted-F1: 0.5775



Epoch 27/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.0115
Train Macro-F1: 0.9963, Weighted-F1: 0.9959


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.76it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6797, Precision=0.7439, Recall=0.6256
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.6905, Precision=0.7436, Recall=0.6444
Low Self-Esteem: F1=0.2906, Precision=0.5312, Recall=0.2000
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6863
Validation Macro-F1: 0.5764, Weighted-F1: 0.5841



Epoch 28/30


Training: 100%|██████████| 546/546 [12:19<00:00,  1.35s/it]


Train Loss: 0.0112
Train Macro-F1: 0.9967, Weighted-F1: 0.9965


Evaluating: 100%|██████████| 23/23 [00:12<00:00,  1.79it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6704, Precision=0.7438, Recall=0.6103
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.6905, Precision=0.7436, Recall=0.6444
Low Self-Esteem: F1=0.2906, Precision=0.5312, Recall=0.2000
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6891
Validation Macro-F1: 0.5750, Weighted-F1: 0.5807



Epoch 29/30


Training: 100%|██████████| 546/546 [12:17<00:00,  1.35s/it]


Train Loss: 0.0117
Train Macro-F1: 0.9963, Weighted-F1: 0.9961


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6704, Precision=0.7438, Recall=0.6103
Eating Disorder: F1=0.7664, Precision=0.7069, Recall=0.8367
Sleeping Disorder: F1=0.6905, Precision=0.7436, Recall=0.6444
Low Self-Esteem: F1=0.3051, Precision=0.5455, Recall=0.2118
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6900
Validation Macro-F1: 0.5771, Weighted-F1: 0.5830



Epoch 30/30


Training: 100%|██████████| 546/546 [12:20<00:00,  1.36s/it]


Train Loss: 0.0106
Train Macro-F1: 0.9960, Weighted-F1: 0.9957


Evaluating: 100%|██████████| 23/23 [00:13<00:00,  1.76it/s]



Per-class metrics:
Lack of Interest: F1=0.3509, Precision=0.8333, Recall=0.2222
Feeling Down: F1=0.6723, Precision=0.7407, Recall=0.6154
Eating Disorder: F1=0.7523, Precision=0.6833, Recall=0.8367
Sleeping Disorder: F1=0.6988, Precision=0.7632, Recall=0.6444
Low Self-Esteem: F1=0.3051, Precision=0.5455, Recall=0.2118
Concentration Problem: F1=0.7042, Precision=0.8621, Recall=0.5952
Self-Harm: F1=0.5524, Precision=0.6591, Recall=0.4754
Validation Loss: 0.6868
Validation Macro-F1: 0.5766, Weighted-F1: 0.5831


## Inference

In [21]:
!gdown 1WOD-b9DB0FihbpEsdT8-P-ADJ5mKk_vn

Downloading...
From (original): https://drive.google.com/uc?id=1WOD-b9DB0FihbpEsdT8-P-ADJ5mKk_vn
From (redirected): https://drive.google.com/uc?id=1WOD-b9DB0FihbpEsdT8-P-ADJ5mKk_vn&confirm=t&uuid=3b727fd2-66fb-4333-b2ed-e7ebe87b1675
To: /kaggle/working/no_fusion_depression.pth
100%|████████████████████████████████████████| 873M/873M [00:11<00:00, 78.5MB/s]


In [22]:
def inference(test_data, model_path):
    model = SimpleFusionModel(
        text_model_name="mental/mental-roberta-base",
        num_classes=NUM_CLASSES,
        fusion_dim=FUSION_DIM
    )
    
    model = model.to(DEVICE)

    weights = torch.load(model_path, map_location=DEVICE, weights_only=True)
    weights_single = {k.replace("module.", ""): v for k, v in weights.items()}

    model.load_state_dict(weights_single)
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")
    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    test_dataset = MultimodalDepressionDataset(test_data, os.path.join(img_path, "test"), tokenizer, image_processor, max_len=MAX_LEN)
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    loss, metrics = evaluate_multimodal_model(
        model, test_loader, nn.BCEWithLogitsLoss()
    )

    print(f"Test Loss: {loss:.4f}")
    print(f"Test Macro-F1: {metrics['macro_f1']:.4f}, Weighted-F1: {metrics['weighted_f1']:.4f}")

In [23]:
inference(test_data, "no_fusion_depression.pth")

Some weights of RobertaModel were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 33/33 [00:25<00:00,  1.30it/s]


Per-class metrics:
Lack of Interest: F1=0.4510, Precision=0.7419, Recall=0.3239
Feeling Down: F1=0.6232, Precision=0.8148, Recall=0.5046
Eating Disorder: F1=0.8315, Precision=0.8605, Recall=0.8043
Sleeping Disorder: F1=0.8212, Precision=0.8611, Recall=0.7848
Low Self-Esteem: F1=0.3902, Precision=0.6400, Recall=0.2807
Concentration Problem: F1=0.7931, Precision=0.9200, Recall=0.6970
Self-Harm: F1=0.6424, Precision=0.6310, Recall=0.6543
Test Loss: 0.3947
Test Macro-F1: 0.6504, Weighted-F1: 0.6354



