In [None]:
import numpy as np 
import pandas as pd 


import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)


In [None]:
import pandas as pd

# Quick look at the training split: column names first, then the leading rows.
train_df = pd.read_csv(
    "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_train.csv"
)
print(train_df.columns)
train_df.head()

In [None]:
# !pip install -q transformers accelerate datasets peft safetensors scikit-learn

import os
import math
import random
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Any, Optional

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    PreTrainedTokenizerBase,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import f1_score

In [None]:
# Where LoRA adapter checkpoints and the tokenizer are written; created up front.
OUTPUT_DIR = "./gemma_lora_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Base model weights.
MODEL_DIR = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"

# Competition data files.
TRAIN_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_train.csv"
VAL_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_val.csv"
TEST_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_test.csv"
SAMPLE_SUB = "/kaggle/input/emoti-code-multi-script-emotion-assignment/sample_submission.csv"

In [None]:
# Closed label set; list position doubles as the numeric class id.
EMOTIONS = ["fear", "happy", "surprise", "sad", "anger", "disgust"]

# Forward and reverse lookups between label strings and class ids.
label2id = dict(zip(EMOTIONS, range(len(EMOTIONS))))
id2label = dict(enumerate(EMOTIONS))

def build_prompt(sentence: str, language: str) -> str:
    """Return the classification prompt for one sentence.

    The prompt lists the sentence, its language, and a question naming the
    allowed emotion words (taken from ``EMOTIONS``), and ends with ``Answer:``
    so that the model's continuation is the label itself.
    """
    segments = [
        f"Sentence: {sentence}\n",
        f"Language: {language}\n",
        f"Question: What is the emotion expressed in the sentence? Answer with one word from [{', '.join(EMOTIONS)}].\n",
        f"Answer:",
    ]
    return "".join(segments)

def build_prompt_and_target(sentence: str, language: str, emotion: str) -> tuple[str, str]:
    """Return ``(prompt, target)`` for one labelled training example.

    Args:
        sentence: Text to classify.
        language: Language tag inserted into the prompt.
        emotion: Gold label string.

    Returns:
        A 2-tuple: the prompt produced by ``build_prompt`` and the target text,
        which is the emotion preceded by a single space.
    """
    prompt = build_prompt(sentence, language)
    # Leading space so the label is tokenized as a continuation of "Answer:".
    target = " " + emotion
    return prompt, target

In [None]:
class EmotionCausalDataset(Dataset):
    """Torch dataset that turns dataframe rows into causal-LM examples.

    With ``is_train=True`` each item holds prompt + target token ids and a
    ``labels`` tensor in which prompt positions are -100, so only the target
    tokens contribute to the loss. With ``is_train=False`` the item holds the
    prompt ids only and has no ``labels`` key.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: PreTrainedTokenizerBase, max_length: int = 256, is_train: bool = True):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` without special tokens and return the id list."""
        return self.tokenizer(text, add_special_tokens=False)["input_ids"]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        text = str(record["Sentence"])
        # The language column is optional; fall back to an empty tag.
        lang = str(record["language"]) if "language" in record else ""

        if not self.is_train:
            # Prompt only: the model is expected to generate the answer.
            token_ids = self._encode(build_prompt(text, lang))
            label_ids = None
        else:
            gold = str(record["emotion"])
            prompt_text, answer_text = build_prompt_and_target(text, lang, gold)
            prompt_ids = self._encode(prompt_text)
            answer_ids = self._encode(answer_text)
            token_ids = prompt_ids + answer_ids
            # Mask the prompt so the loss is computed on the answer tokens only.
            label_ids = [-100] * len(prompt_ids) + answer_ids

        if len(token_ids) > self.max_length:
            # Keep the tail: drops tokens from the start of the prompt and
            # preserves the end of the sequence (where the target sits).
            token_ids = token_ids[-self.max_length:]
            if label_ids is not None:
                label_ids = label_ids[-self.max_length:]

        example = {"input_ids": torch.tensor(token_ids, dtype=torch.long)}
        if label_ids is not None:
            example["labels"] = torch.tensor(label_ids, dtype=torch.long)
        return example

@dataclass
class DataCollatorForCausalLMWithPadding:
    """Pad a list of examples into one batch for causal-LM training or eval.

    Each example is a dict with ``input_ids`` and, optionally, ``labels``.
    ``input_ids`` are padded with the tokenizer's pad id and ``labels`` with
    -100, on the side named by ``padding_side`` and to a common width, so the
    two stay aligned position by position. Label padding is written directly
    as -100, so a genuine label token that happens to equal the pad id is
    never masked.

    Raises:
        ValueError: if the tokenizer has no pad token id, or ``padding_side``
            is neither ``"left"`` nor ``"right"``.
    """

    tokenizer: PreTrainedTokenizerBase
    # Side on which padding is added ("right" or "left").
    padding_side: str = "right"
    # When set, the padded width is rounded up to a multiple of this value.
    pad_to_multiple_of: Optional[int] = None

    @staticmethod
    def _pad_rows(rows: List[torch.Tensor], fill_value: int, width: int, pad_left: bool) -> torch.Tensor:
        """Stack 1-D tensors into a (len(rows), width) tensor padded with ``fill_value``."""
        out = torch.full((len(rows), width), fill_value, dtype=torch.long)
        for r, row in enumerate(rows):
            n = row.numel()
            if n == 0:
                continue
            if pad_left:
                out[r, width - n:] = row
            else:
                out[r, :n] = row
        return out

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError("Cannot pad: the tokenizer has no pad_token_id.")
        if self.padding_side not in ("left", "right"):
            raise ValueError(f"padding_side must be 'left' or 'right', got {self.padding_side!r}")
        pad_left = self.padding_side == "left"

        input_rows = [torch.as_tensor(b["input_ids"], dtype=torch.long) for b in batch]
        raw_labels = [b.get("labels", None) for b in batch]
        has_labels = any(l is not None for l in raw_labels)

        width = max((row.numel() for row in input_rows), default=0)
        label_rows: List[torch.Tensor] = []
        if has_labels:
            # Examples without labels contribute an all -100 (fully masked) row.
            label_rows = [
                torch.as_tensor(l, dtype=torch.long) if l is not None else torch.empty(0, dtype=torch.long)
                for l in raw_labels
            ]
            width = max(width, max(row.numel() for row in label_rows))
        if self.pad_to_multiple_of:
            step = self.pad_to_multiple_of
            width = -(-width // step) * step  # ceil to the next multiple

        batch_out = {
            "input_ids": self._pad_rows(input_rows, pad_id, width, pad_left),
            # 1 marks real tokens, 0 marks padding.
            "attention_mask": self._pad_rows([torch.ones_like(r) for r in input_rows], 0, width, pad_left),
        }
        if has_labels:
            batch_out["labels"] = self._pad_rows(label_rows, -100, width, pad_left)
        return batch_out

In [None]:
# Read the three competition splits in one pass.
train_df, val_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV))

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))


tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
# Padding requires a pad token; register one when the tokenizer ships without it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# Use the GPU when one is visible, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load the base checkpoint as a causal LM: half precision on GPU, full precision on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    trust_remote_code=False,
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True,
)

# Keep the embedding matrix in step with the tokenizer's vocabulary size
# (matters when a pad token was added to the tokenizer).
model.resize_token_embeddings(len(tokenizer))

# Prepare for LoRA (PEFT)
lora_r = 8
lora_alpha = 32
lora_dropout = 0.1

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    # LoRA adapters are attached to the attention projection layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)

# Best-effort preparation step. A failure is reported instead of being
# silently swallowed, because a skipped preparation changes the state the
# model is trained in and would otherwise be very hard to diagnose.
try:
    model = prepare_model_for_kbit_training(model)
except Exception as exc:
    print(f"WARNING: prepare_model_for_kbit_training failed ({exc!r}); continuing with the model as loaded.")

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # verify LoRA params are trainable

if device == "cuda":
    model = model.to("cuda")

# The model is deliberately NOT wrapped in torch.nn.DataParallel here.
# It is handed to transformers.Trainer below, which takes care of multi-GPU
# execution itself (see the Trainer documentation), and a DataParallel wrapper
# exposes neither `.device` nor `.generate`, both of which evaluate_macro_f1
# calls on the model it receives.
if torch.cuda.device_count() > 1:
    print("Detected", torch.cuda.device_count(), "GPUs; multi-GPU handling is left to Trainer")

In [None]:
MAX_LEN = 256

# Train and validation rows both carry gold labels, so both are built in
# training mode; the test split is prompt-only.
train_dataset, val_dataset = (
    EmotionCausalDataset(frame, tokenizer, max_length=MAX_LEN, is_train=True)
    for frame in (train_df, val_df)
)
test_dataset = EmotionCausalDataset(test_df, tokenizer, max_length=MAX_LEN, is_train=False)

data_collator = DataCollatorForCausalLMWithPadding(tokenizer=tokenizer)

In [None]:
def predict_emotion_from_generation(pred_text: str) -> str:
    """Map generated text to one of ``EMOTIONS`` or ``"unknown"``.

    Only the first non-empty line of the (stripped) text is considered.
    Matching is tried in this order: the first whitespace-delimited word with
    surrounding punctuation removed, then a label as a prefix of the line,
    then a label anywhere in the line.

    Args:
        pred_text: Decoded model continuation; may be empty or ``None``.

    Returns:
        The matched emotion label, or ``"unknown"`` when nothing matches or the
        text is empty/whitespace-only.
    """
    lines = (pred_text or "").strip().splitlines()
    if not lines:
        # Nothing was generated; previously this raised IndexError.
        return "unknown"
    gen = lines[0].strip()
    gen_lower = gen.lower()

    words = gen.split()
    first_chunk = words[0] if words else gen
    first_chunk = first_chunk.strip().strip('.,;:?"\'').lower()
    # Exact match on the first word.
    if first_chunk in EMOTIONS:
        return first_chunk
    # Label as a prefix of the line (e.g. "sadness").
    for lab in EMOTIONS:
        if gen_lower.startswith(lab):
            return lab
    # Label mentioned anywhere in the line.
    for lab in EMOTIONS:
        if lab in gen_lower:
            return lab
    # No label found -> counted as a wrong prediction by the caller.
    return "unknown"

def compute_metrics_for_trainer(eval_preds) -> Dict[str, float]:
    """Placeholder metrics hook: ignores ``eval_preds`` and returns an empty dict.

    Macro F1 is computed by ``evaluate_macro_f1`` via generation instead.
    """
    return dict()

# Eval function we can call to produce Macro F1
def evaluate_macro_f1(model, dataset, batch_size=8, max_new_tokens=8):
    """Generate a label for every example in ``dataset`` and score macro F1.

    Each item is expected to be a dict with ``input_ids`` and, optionally,
    ``labels`` where prompt positions are -100 and target positions hold the
    gold token ids. The gold label is decoded from the target positions, and
    those positions are REMOVED from the prompt before generation so the model
    never sees the answer it is asked to predict. Prompts are left-padded so
    that generation starts right after the last prompt token of every row.

    Args:
        model: Causal LM exposing ``generate``; a ``torch.nn.DataParallel``
            wrapper is unwrapped first.
        dataset: Indexable collection of example dicts (see above).
        batch_size: Number of prompts generated per call.
        max_new_tokens: Generation budget per prompt (greedy decoding).

    Returns:
        Dict with ``"macro_f1"``, one F1 entry per label in ``EMOTIONS`` and
        ``"n_examples"``. Examples without a recognisable gold label, and
        generations without a recognisable label, count as class -1, i.e. as
        errors for every real class.
    """
    # generate() and the parameters live on the wrapped module under DataParallel.
    gen_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    gen_model.eval()
    device = next(gen_model.parameters()).device
    # Padded positions are masked out, so any id works if no pad token exists.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def _label_from_text(text):
        """First word if it is a known emotion, else any emotion mentioned, else 'unknown'."""
        lowered = text.strip().lower()
        words = lowered.split()
        if words and words[0] in EMOTIONS:
            return words[0]
        return next((lab for lab in EMOTIONS if lab in lowered), "unknown")

    # 1) Split every item into a prompt (no answer tokens) and its gold label.
    prompts, labels = [], []
    for i in range(len(dataset)):
        item = dataset[i]
        ids = torch.as_tensor(item["input_ids"], dtype=torch.long)
        if "labels" not in item or item["labels"] is None:
            prompts.append(ids)
            labels.append("unknown")
            continue
        lab = torch.as_tensor(item["labels"], dtype=torch.long)
        is_target = lab != -100
        # Only strip when labels line up with input_ids position by position.
        prompts.append(ids[~is_target] if lab.shape == ids.shape else ids)
        target_ids = lab[is_target].tolist()
        if target_ids:
            labels.append(_label_from_text(tokenizer.decode(target_ids, skip_special_tokens=True)))
        else:
            labels.append("unknown")

    # 2) Greedy generation over left-padded prompt batches.
    preds = []
    for start in range(0, len(prompts), batch_size):
        chunk = prompts[start:start + batch_size]
        width = max(p.numel() for p in chunk)
        input_ids = torch.full((len(chunk), width), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((len(chunk), width), dtype=torch.long)
        for r, p in enumerate(chunk):
            n = p.numel()
            if n:
                input_ids[r, width - n:] = p
                attention_mask[r, width - n:] = 1
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        with torch.no_grad():
            generated = gen_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
        for gen_ids in generated:
            # Everything after the padded prompt width is newly generated.
            gen_text = tokenizer.decode(gen_ids[width:].tolist(), skip_special_tokens=True)
            # Guard the empty case here so an empty generation is simply wrong.
            preds.append(predict_emotion_from_generation(gen_text) if gen_text.strip() else "unknown")

    n_examples = len(labels)
    if n_examples == 0:
        return {"macro_f1": 0.0, **{lab: 0.0 for lab in EMOTIONS}, "n_examples": 0}

    # 3) One-vs-rest F1 per class, averaged over the classes in EMOTIONS.
    y_true = [label2id.get(l, -1) for l in labels]
    y_pred = [label2id.get(l, -1) for l in preds]
    per_class = {}
    for lab in EMOTIONS:
        lab_id = label2id[lab]
        y_t = [1 if y == lab_id else 0 for y in y_true]
        y_p = [1 if y == lab_id else 0 for y in y_pred]
        per_class[lab] = float(f1_score(y_t, y_p, zero_division=0))
    macro_f1 = float(np.mean(list(per_class.values())))
    return {"macro_f1": macro_f1, **per_class, "n_examples": n_examples}

In [None]:
# fp16 mixed precision is only requested when CUDA is available, matching the
# CPU fallback already used when the device and model dtype are chosen.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,  # lower if OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # to simulate larger batch size
    num_train_epochs=20,
    learning_rate=2e-4,

    fp16=use_fp16,
    fp16_full_eval=use_fp16,

    logging_steps=50,
    eval_strategy="no",  # evaluation is run manually via evaluate_macro_f1
    save_strategy="epoch",
    save_total_limit=3,
    # The custom dataset yields tensors rather than named columns, so column pruning is disabled.
    remove_unused_columns=False,
    report_to="none",
    optim="adamw_torch",
)

# No eval_dataset: validation is generation-based and handled outside Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    data_collator=data_collator,
)

In [None]:
print("=== STARTING TRAINING ===")
trainer.train()
print("=== TRAINING FINISHED ===")


# Unwrap a DataParallel-style wrapper (anything exposing `.module`) so that
# save_pretrained is called on the underlying model. The former duplicate
# assignment to `omodel` was never read anywhere and has been dropped.
model = model.module if hasattr(model, "module") else model

# Save peft adapters & tokenizer
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

In [None]:
# Same path settings as the configuration cell near the top of the notebook,
# declared again ahead of the prediction section.
OUTPUT_DIR = "./gemma_lora_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Base model weights.
MODEL_DIR = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"

# Competition data files.
TRAIN_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_train.csv"
VAL_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_val.csv"
TEST_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_test.csv"
SAMPLE_SUB = "/kaggle/input/emoti-code-multi-script-emotion-assignment/sample_submission.csv"

In [None]:
# Prediction

import os
import torch
import pandas as pd
from tqdm.auto import tqdm
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
from typing import Dict, List, Any, Optional
from transformers import PreTrainedTokenizerBase


# Inference settings.
BATCH_SIZE = 4
MAX_LEN = 256

# Base weights and the directory holding the trained LoRA adapter + tokenizer.
BASE_MODEL_DIR = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"
ADAPTER_DIR = "./gemma_lora_output"

# Competition files used for prediction.
TEST_CSV = "/kaggle/input/emoti-code-multi-script-emotion-assignment/competition_test.csv"
SAMPLE_SUB = "/kaggle/input/emoti-code-multi-script-emotion-assignment/sample_submission.csv"

# Prefer the GPU when one is visible.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


# Closed label set the model is asked to choose from.
EMOTIONS = ["fear", "happy", "surprise", "sad", "anger", "disgust"]

def build_prompt(sentence: str, language: str) -> str:
    """Return the inference prompt: sentence, language, question, then ``Answer:``."""
    context = f"Sentence: {sentence}\n" f"Language: {language}\n"
    question = f"Question: What is the emotion expressed in the sentence? Answer with one word from [{', '.join(EMOTIONS)}].\n"
    return context + question + f"Answer:"

def predict_emotion_from_generation(pred_text: str) -> str:
    """Map generated text to one of ``EMOTIONS``, defaulting to ``"happy"``.

    Only the first line of the stripped text is inspected. Matching order:
    first word (punctuation removed), label as line prefix, label anywhere in
    the line. Because the result goes into the submission file, a valid label
    is always returned: ``"happy"`` is the fallback when nothing matches or
    when the text is empty/whitespace-only.

    Args:
        pred_text: Decoded model continuation; may be empty or ``None``.
    """
    lines = (pred_text or "").strip().splitlines()
    if not lines:
        # Empty generation; previously this raised IndexError mid-inference.
        return "happy"
    gen = lines[0].strip()
    gen_lower = gen.lower()

    words = gen.split()
    first_chunk = words[0] if words else gen
    first_chunk = first_chunk.strip().strip('.,;:?"\'').lower()

    if first_chunk in EMOTIONS:
        return first_chunk
    for lab in EMOTIONS:
        if gen_lower.startswith(lab):
            return lab
    for lab in EMOTIONS:
        if lab in gen_lower:
            return lab
    return "happy"


# Load Model and Tokenizer
print("Loading base model...")
inference_dtype = torch.float16 if DEVICE == "cuda" else torch.float32
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_DIR,
    trust_remote_code=False,
    torch_dtype=inference_dtype,
    low_cpu_mem_usage=True,
)

# The tokenizer is read from the adapter directory, where it was saved after training.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)

print("Resizing model token embeddings to match tokenizer...")
base_model.resize_token_embeddings(len(tokenizer))

# Make sure the model config knows which id is used for padding.
if base_model.config.pad_token_id is None:
    base_model.config.pad_token_id = tokenizer.pad_token_id

# Attach the LoRA adapter, then fold it into the base weights.
print("Loading PEFT model and merging adapters...")
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR).merge_and_unload()
model.to(DEVICE)
model.eval()
print("Model loaded and ready for inference.")



# Dataset and Dataloader for Inference
class EmotionInferenceDataset(Dataset):
    """Prompt-only dataset for inference.

    Each item is ``{"input_ids": LongTensor}`` holding the tokenized prompt for
    one row. Prompts longer than ``max_length`` tokens are truncated from the
    LEFT, i.e. the tail is kept. The tail contains the question and the final
    ``Answer:`` cue the model continues from, and keeping it matches how
    ``EmotionCausalDataset`` truncates during training.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: PreTrainedTokenizerBase, max_length: int):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sentence = str(row["Sentence"])
        # The language column is optional.
        language = str(row.get("language", ""))
        prompt = build_prompt(sentence, language)

        token_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
        if len(token_ids) > self.max_length:
            # Keep the last max_length tokens so the "Answer:" suffix survives.
            token_ids = token_ids[-self.max_length:]
        return {"input_ids": torch.tensor(token_ids, dtype=torch.long)}

@dataclass
class InferenceDataCollator:
    """Batch prompt-only examples by delegating padding to the tokenizer.

    On every call the tokenizer's ``padding_side`` attribute is set to this
    collator's ``padding_side`` (default ``"left"``) before ``tokenizer.pad``
    is invoked, and whatever ``pad`` returns is handed back unchanged.
    """

    tokenizer: PreTrainedTokenizerBase
    padding_side: str = "left"

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Note: this changes the padding side on the shared tokenizer object.
        setattr(self.tokenizer, "padding_side", self.padding_side)
        features = {"input_ids": [example["input_ids"] for example in batch]}
        return self.tokenizer.pad(features, padding=True, return_tensors="pt")


test_df = pd.read_csv(TEST_CSV)

inference_dataset = EmotionInferenceDataset(df=test_df, tokenizer=tokenizer, max_length=MAX_LEN)
inference_collator = InferenceDataCollator(tokenizer)

# shuffle=False keeps predictions in the same order as the rows of test_df.
inference_dataloader = DataLoader(
    inference_dataset,
    shuffle=False,
    collate_fn=inference_collator,
    batch_size=BATCH_SIZE,
)


# Run Inference
predictions = []
gen_kwargs = {"max_new_tokens": 8, "do_sample": False}

print(f"Starting inference on {len(test_df)} examples...")
for batch in tqdm(inference_dataloader):
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs
        )

    # All rows of a padded batch share one prompt width; tokens past it are new.
    prompt_width = input_ids.shape[1]
    for sequence in generated_ids:
        continuation = sequence[prompt_width:]
        gen_text = tokenizer.decode(continuation, skip_special_tokens=True) if len(continuation) != 0 else ""
        predictions.append(predict_emotion_from_generation(gen_text))

print("Inference complete.")


# Submission File
# Start from the sample submission and fill its 'emotion' column with the
# predictions, which are assumed to be in the same row order.
submission_df = pd.read_csv(SAMPLE_SUB).assign(emotion=predictions)

submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been created successfully!")
print(submission_df.head())