# Importing necessary libraries

In [3]:
import modal
import torch
import numpy as np
import pandas as pd
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
import os
import json
from tqdm import tqdm
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from torch.utils.data import DataLoader

In [4]:
app = modal.App("xlm-finetune")

# Container image for the GPU training function: the full fine-tuning stack.
image1 = modal.Image.debian_slim().pip_install(
    "pandas",
    "torch",
    "transformers",
    "peft",
    "tqdm",
    "numpy",
    "datasets",
    "scikit-learn",
    "sentencepiece",
)

# Lightweight image for the TSV loader: pandas only, plus the two local
# dataset files made available under /root inside the container.
image2 = (
    modal.Image.debian_slim()
    .pip_install("pandas")
    .add_local_file("datasets/train.tsv", remote_path="/root/train.tsv")
    .add_local_file("datasets/test.tsv", remote_path="/root/test.tsv")
)
         

         

In [5]:
@app.function(gpu='A100-80GB', image=image1, timeout=18000)
def train_and_evaluate(df_train, df_test):
    """Fine-tune XLM-RoBERTa with LoRA for multi-label emotion classification and score it.

    The training frame is split 85/15 per language into train/validation
    parts, a LoRA-wrapped ``xlm-roberta-base`` classifier is trained with the
    Hugging Face ``Trainer``, and the resulting model is evaluated on
    ``df_test`` with micro/macro/weighted F1, overall and per language.

    Args:
        df_train: DataFrame with at least the columns ``text``, ``emotion``
            and ``language``. ``emotion`` holds either a single numeric class
            id or a comma-separated string of ids.
        df_test: DataFrame with the same columns, used only for evaluation.

    Returns:
        dict with two keys:
            ``overall``: ``{'micro_f1', 'macro_f1', 'weighted_f1'}`` as floats.
            ``by_language``: ``{language: {'micro_f1', 'macro_f1',
            'weighted_f1', 'examples'}}`` for every test language that has
            at least 5 examples.
    """
    MAX_LENGTH = 128       # token budget per example (pad / truncate)
    EVAL_BATCH_SIZE = 16   # batch size of the manual prediction loop
    SPLIT_SEED = 42        # seed of the per-language shuffle
    TRAIN_FRACTION = 0.85  # share of each language that goes to training

    # Work on re-indexed copies: the caller's frames are left untouched and
    # index labels are guaranteed to equal row positions, which the split and
    # the per-language scoring below rely on.
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    def process_emotions(emotion_str):
        """Parse one raw ``emotion`` cell into a list of integer class ids."""
        if isinstance(emotion_str, str):
            # Skip empty fragments so "" or a trailing comma does not crash int().
            return [int(e.strip()) for e in emotion_str.split(',') if e.strip()]
        if pd.isna(emotion_str):
            # A missing cell means "no emotion", not an extra NaN class.
            return []
        return [int(emotion_str)]

    df_train['emotion_list'] = df_train['emotion'].apply(process_emotions)

    # Fit the binarizer once on the training labels; it defines both the number
    # of classes and the column order used for train, validation and test.
    mlb = MultiLabelBinarizer()
    mlb.fit(df_train['emotion_list'])
    num_labels = len(mlb.classes_)
    print(f"Number of emotion classes: {num_labels}")

    def encode_labels(frame):
        """Return one multi-hot float row per example, ordered like ``mlb.classes_``."""
        rows = mlb.transform(frame['emotion_list']).astype(float).tolist()
        return pd.Series(rows, index=frame.index, dtype=object)

    # Split data: stratified by language, 85% train / 15% validation
    train_indices = []
    val_indices = []

    for language, group_indices in df_train.groupby('language').groups.items():
        indices = list(group_indices)
        # A private RandomState yields the same permutation as seeding the
        # global generator, without clobbering global RNG state.
        np.random.RandomState(SPLIT_SEED).shuffle(indices)
        split_idx = int(len(indices) * TRAIN_FRACTION)
        train_indices.extend(indices[:split_idx])
        val_indices.extend(indices[split_idx:])

    train_df = df_train.iloc[train_indices].reset_index(drop=True)
    val_df = df_train.iloc[val_indices].reset_index(drop=True)

    print(f"Total examples: {len(df_train)}")
    print(f"Training examples: {len(train_df)}")
    print(f"Validation examples: {len(val_df)}")

    # Load model and tokenizer
    model = XLMRobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base",
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

    # Prepare datasets. Labels come from the fitted binarizer so that train,
    # validation and test share one column layout even when the class ids are
    # not exactly 0..num_labels-1.
    train_df["labels"] = encode_labels(train_df)
    val_df["labels"] = encode_labels(val_df)

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    def tokenize_function(examples):
        """Tokenize a batch of examples to fixed-length inputs."""
        texts = [str(t) for t in examples["text"]]
        return tokenizer(texts, padding="max_length", truncation=True, max_length=MAX_LENGTH)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Configure LoRA on the attention projections
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=["query", "key", "value"]
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Custom data collator
    class MultiLabelDataCollator(DataCollatorWithPadding):
        """Padding collator that guarantees float32 labels for the multi-label loss."""

        def __call__(self, features):
            batch = super().__call__(features)
            if "labels" in batch:
                # as_tensor converts lists and re-types existing tensors without
                # the copy-construct warning torch.tensor() emits on a tensor.
                batch["labels"] = torch.as_tensor(batch["labels"], dtype=torch.float32)
            return batch

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        fp16=False,  # Disable mixed precision to avoid serialization issues
        logging_strategy="epoch",
        logging_dir="./logs",
        report_to="none",
        disable_tqdm=True,
        logging_steps=1000,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=MultiLabelDataCollator(tokenizer=tokenizer),
    )

    # Train the model
    print("Training model...")
    trainer.train()

    # EVALUATION PART
    print("\n===== EVALUATING MODEL =====")

    # Load test data
    print("Loading test data...")
    df_test['emotion_list'] = df_test['emotion'].apply(process_emotions)

    # Use same MultiLabelBinarizer for consistency
    df_test["labels"] = encode_labels(df_test)

    # The prediction loop tokenizes the raw text per batch, so the test set is
    # not pre-tokenized here (that output would never be read).
    test_dataset = Dataset.from_pandas(df_test)

    def get_predictions(dataset):
        """Run the model over ``dataset`` and return ``(predictions, labels)`` arrays.

        Predictions are thresholded sigmoid outputs (probability > 0.5 -> 1.0),
        returned in dataset order.
        """
        all_preds = []
        all_labels = []

        def collate_fn(batch):
            """Tokenize the raw text of a batch and attach float32 labels."""
            # Extract text and ensure it's a string
            texts = [str(item['text']) for item in batch]
            # Tokenize with explicit padding and truncation
            inputs = tokenizer(
                texts,
                padding='max_length',
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors='pt'
            )
            # Add labels
            if 'labels' in batch[0]:
                inputs['labels'] = torch.tensor(
                    [item['labels'] for item in batch], dtype=torch.float32
                )
            return inputs

        # No shuffling: output rows must stay aligned with the rows of df_test.
        dataloader = DataLoader(
            dataset,
            batch_size=EVAL_BATCH_SIZE,
            collate_fn=collate_fn
        )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()

        with torch.no_grad():
            for batch in dataloader:
                # Only the model inputs need to live on the device.
                model_inputs = {
                    key: batch[key].to(device) for key in ('input_ids', 'attention_mask')
                }
                logits = model(**model_inputs).logits
                preds = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
                labels = batch["labels"].cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels)

        return np.array(all_preds), np.array(all_labels)

    def f1_summary(y_true, y_pred):
        """Return micro/macro/weighted F1 as plain Python floats."""
        # zero_division=0 yields the same scores as the default, without the
        # warning for labels that have no true or predicted samples.
        return {
            f'{average}_f1': float(
                f1_score(y_true, y_pred, average=average, zero_division=0)
            )
            for average in ('micro', 'macro', 'weighted')
        }

    # Get predictions
    print("Running predictions...")
    predictions, true_labels = get_predictions(test_dataset)

    # Calculate overall F1 scores
    overall = f1_summary(true_labels, predictions)

    print("\n===== OVERALL F1 SCORES =====")
    print(f"Micro F1: {overall['micro_f1']:.4f}")
    print(f"Macro F1: {overall['macro_f1']:.4f}")
    print(f"Weighted F1: {overall['weighted_f1']:.4f}")

    # Calculate F1 scores by language
    languages = df_test['language'].unique()
    print("\n===== F1 SCORES BY LANGUAGE =====")

    language_results = {}
    for lang in languages:
        # Row positions (not index labels) of this language's examples.
        lang_indices = np.flatnonzero((df_test['language'] == lang).to_numpy())

        # Too few examples give unstable scores; skip the language.
        if len(lang_indices) < 5:
            continue

        lang_preds = predictions[lang_indices]
        lang_true = true_labels[lang_indices]

        try:
            lang_scores = f1_summary(lang_true, lang_preds)

            print(f"\nLanguage: {lang} (Examples: {len(lang_indices)})")
            print(f"  Micro F1: {lang_scores['micro_f1']:.4f}")
            print(f"  Macro F1: {lang_scores['macro_f1']:.4f}")
            print(f"  Weighted F1: {lang_scores['weighted_f1']:.4f}")

            lang_scores['examples'] = len(lang_indices)
            language_results[lang] = lang_scores
        except Exception as e:
            # Best effort: one failing language must not discard the others.
            print(f"Error calculating F1 for language {lang}: {e}")

    # Return results
    results = {
        'overall': overall,
        'by_language': language_results
    }

    return results

In [6]:
# Define the function to read the dataset
@app.function(image=image2)
def tsv_to_df(path):
    """Read the tab-separated file at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path, sep='\t')



In [7]:
# Stream Modal's output while the app runs: load both TSV files remotely,
# then train and evaluate on the GPU worker and print the returned scores.
with modal.enable_output(), app.run():
    df_train = tsv_to_df.remote("/root/train.tsv")
    df_test = tsv_to_df.remote("/root/test.tsv")
    print(train_and_evaluate.remote(df_train, df_test))
    