In [None]:
# Update dependencies first
# NOTE(review): `-U` pulls whatever release is newest at run time, so a later re-run may
# install different versions than the ones in the captured log (e.g. trl 0.17.0,
# datasets 3.6.0, nlpaug 1.1.11). Pin versions if the run has to be reproducible.
!pip install -U bitsandbytes accelerate transformers peft trl datasets wandb nlpaug

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting wandb
  Downloading wandb-0.19.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-

In [None]:
# Import necessary libraries
from pathlib import Path
import os
import random
import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch.nn as nn
import gc
import time

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU and all CUDA devices). Also exported as ``PYTHONHASHSEED``.

    Side effects:
        Switches cuDNN to deterministic algorithms.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

In [None]:
# Function to free GPU memory
def free_gpu_memory(delay=2):
    """Release cached GPU memory held by this process.

    Runs the garbage collector so that unreachable tensors are destroyed, then
    asks PyTorch to empty its CUDA cache. Tensors that are still referenced
    somewhere (for example by notebook globals) cannot be released from here;
    drop those references before calling this function.

    Args:
        delay: Seconds to sleep after emptying the cache. Defaults to 2; pass
            0 to skip the wait.

    Returns:
        None. Success or failure is reported with a printed message; errors
        are not re-raised.
    """
    try:
        # Collect garbage first so tensors that just became unreachable are
        # actually destroyed before the cache is emptied.
        gc.collect()

        # Empty the CUDA cache to release GPU memory
        torch.cuda.empty_cache()

        # Optional short pause to let the release settle
        if delay and delay > 0:
            time.sleep(delay)

        print("✅ GPU memory has been freed.")
    except Exception as e:
        print(f"❌ Error while freeing GPU memory: {e}")

In [None]:
# Mount Google Drive
# Colab-only: google.colab is not available outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Persistent locations live on Drive (base_folder); the raw competition data is kept on the
# local Colab disk (data_folder).
base_folder = Path('/content/drive/MyDrive/data')
data_folder = Path('/content/inclass_kaggle_data')
kaggle_api = base_folder/'.kaggle'
model_folder = base_folder/'models/nlp_spring_2025/inclass_kaggle/Qwen3'
archive_folder = data_folder/'archive'

# Point the Kaggle CLI at the credentials folder on Drive and restrict the key file's permissions.
# NOTE(review): chmod runs before kaggle_api.mkdir below, so kaggle.json must already exist
# on Drive for this line to succeed.
os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_api)
!chmod 600 "{kaggle_api}/kaggle.json"

# Create every working folder; existing folders are left untouched.
data_folder.mkdir(exist_ok=True, parents=True)
kaggle_api.mkdir(exist_ok=True, parents=True)
model_folder.mkdir(exist_ok=True, parents=True)
archive_folder.mkdir(exist_ok=True, parents=True)

Mounted at /content/drive


In [None]:
# Download the competition archive into archive_folder with the Kaggle CLI.
!kaggle competitions download emotion-detection-spring-2025 -p {archive_folder}

import zipfile
# Unpack the archive into data_folder.
# Assumes the downloaded file is named after the competition slug — TODO confirm.
with zipfile.ZipFile(archive_folder / "emotion-detection-spring-2025.zip", 'r') as zip_ref:
    zip_ref.extractall(data_folder)

In [None]:
# Load the competition files extracted into data_folder.
train_df = pd.read_csv(data_folder / 'train.csv')
test_df = pd.read_csv(data_folder / 'test.csv')
sample_submission = pd.read_csv(data_folder / 'sample_submission.csv')

# Define emotion label columns
# These names are used both as DataFrame columns and as the label vocabulary in the prompt;
# their order fixes the order of the binary prediction vectors built later.
label_cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
              'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Text preprocessing function
def preprocess_text(df):
    """Return a copy of ``df`` whose ``Tweet`` column has been normalised.

    The input frame is not modified. Three regex substitutions are applied to
    ``Tweet`` in order: URLs are removed, user mentions are replaced by the
    placeholder ``@user``, and the ``#`` is stripped from hashtags while the
    hashtag text is kept.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+', ''),        # drop URLs
        (r'@\w+', '@user'),      # normalise mentions
        (r'#(\w+)', r'\1'),      # keep hashtag text, drop '#'
    )

    cleaned = df.copy()
    tweets = cleaned['Tweet']
    for pattern, replacement in substitutions:
        tweets = tweets.str.replace(pattern, replacement, regex=True)
    cleaned['Tweet'] = tweets
    return cleaned

# Apply preprocessing
train_df = preprocess_text(train_df)
test_df = preprocess_text(test_df)


In [None]:
# Smarter data augmentation for minority classes
def augment_data(df, target_labels, aug_multiplier=1.5, random_state=None):
    """Oversample minority emotion classes with synonym-replaced copies.

    A label counts as "minority" when its positive rate is below 75% of the
    median positive rate across ``target_labels``. For each such label, rows
    carrying it are sampled and their ``Tweet`` text is rewritten with
    nlpaug's WordNet synonym augmenter; rarer labels get a larger multiplier.

    Args:
        df: Frame with a ``Tweet`` column and one 0/1 column per label.
        target_labels: Names of the label columns to balance.
        aug_multiplier: Base growth factor for a minority class.
        random_state: Optional seed forwarded to ``DataFrame.sample``.

    Returns:
        ``df`` followed by the augmented rows (fresh index), or ``df`` itself
        when nlpaug is missing or nothing could be generated.
    """
    try:
        import nlpaug.augmenter.word as naw

        # Create the synonym replacement augmenter
        aug_synonym = naw.SynonymAug(aug_src='wordnet')
    except ImportError:
        print("nlpaug not available, skipping augmentation")
        return df

    # Calculate class distribution
    class_counts = df[target_labels].sum().sort_values()
    class_ratios = class_counts / len(df)
    median_ratio = class_ratios.median()

    minority_threshold = median_ratio * 0.75
    minority_labels = class_counts[class_ratios < minority_threshold].index.tolist()

    print(f"Augmenting for minority classes: {minority_labels}")

    augmented_rows = []
    failed = 0
    first_error = None

    for label in minority_labels:
        # Get samples that have this label
        positive_samples = df[df[label] == 1]

        # More severe augmentation for more under-represented classes
        label_ratio = class_ratios[label] / median_ratio
        label_multiplier = aug_multiplier * (1 + (1 - label_ratio))

        num_to_generate = int(len(positive_samples) * (label_multiplier - 1))
        if num_to_generate <= 0:
            continue

        print(f"  - {label}: Adding {num_to_generate} samples (multiplier: {label_multiplier:.2f})")

        # Draw exactly the announced number of rows; sample with replacement
        # only when more rows are requested than exist.
        samples_to_augment = positive_samples.sample(
            n=num_to_generate,
            replace=num_to_generate > len(positive_samples),
            random_state=random_state,
        )

        for _, row in samples_to_augment.iterrows():
            try:
                augmented = aug_synonym.augment(row['Tweet'])
            except Exception as e:
                # Keep going, but remember the failure so it can be reported.
                failed += 1
                if first_error is None:
                    first_error = e
                continue

            # Recent nlpaug releases return a list of strings; unwrap it so
            # the Tweet column keeps holding plain text.
            if isinstance(augmented, (list, tuple)):
                augmented = augmented[0] if augmented else None
            if not isinstance(augmented, str) or not augmented.strip():
                failed += 1
                continue

            new_row = row.copy()
            new_row['Tweet'] = augmented
            augmented_rows.append(new_row)

    if failed:
        print(f"Warning: {failed} augmentation attempts failed and were skipped"
              + (f" (first error: {first_error!r})" if first_error is not None else ""))

    # Combine original and augmented data
    if augmented_rows:
        augmented_df = pd.DataFrame(augmented_rows)
        return pd.concat([df, augmented_df], ignore_index=True)

    return df

In [None]:
# Convert multi-label format to comma-separated labels for generative approach
def format_emotion_labels(row):
    """Render a row's multi-hot emotion columns as a comma-separated string.

    Reads the columns named in the notebook-level ``label_cols`` and returns
    the names whose value equals 1, joined by ", " in ``label_cols`` order.
    Returns the literal string "none" when no emotion is set.
    """
    active = []
    for emotion, value in zip(label_cols, row[label_cols]):
        if value == 1:
            active.append(emotion)

    if not active:
        return "none"
    return ", ".join(active)

# Add text label column to the dataframes
train_df['emotion_text'] = train_df.apply(format_emotion_labels, axis=1)

# Split data into train and validation sets BEFORE augmenting.
# Augmenting first would put synonym-rewritten copies of training tweets into the
# validation set (and vice versa), which leaks data and inflates validation scores.
val_df = train_df.sample(frac=0.15, random_state=42)
train_df = train_df.drop(val_df.index)

# Apply augmentation to the training split only; validation stays untouched, real data.
train_df = augment_data(train_df, label_cols)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Augmenting for minority classes: ['surprise', 'trust', 'love', 'pessimism']
  - surprise: Adding 619 samples (multiplier: 2.56)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nlt

  - trust: Adding 623 samples (multiplier: 2.56)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

  - love: Adding 902 samples (multiplier: 2.08)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

  - pessimism: Adding 908 samples (multiplier: 2.02)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

Training samples: 6565
Validation samples: 1159
Test samples: 3259


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

In [None]:
# Initialize W&B
# Starts a run in the "emotion_detection_qwen3_generation" project. The run name gets a
# generated id suffix so repeated executions of this cell produce distinct run names.
# NOTE(review): as the captured output shows, this prompts for an API key interactively
# when no credentials are stored in the runtime.
wandb.init(
    project="emotion_detection_qwen3_generation",
    name=f"Qwen3-0.6B-Instruction-Label-Generation-{wandb.util.generate_id()}",
    reinit=True
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshreevershith[0m ([33mmy-wandb-account[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Use the instruction-tuned model
model_name = "Qwen/Qwen3-0.6B"
print(f"Using Instruction-tuned Model: {model_name}")

# Load the tokenizer once, outside the quantization fallbacks, so it exists no matter
# which of the loading paths below ends up succeeding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Only borrow EOS as the pad token when the tokenizer ships without one. If pad and EOS
# share an id, collators that mask pad tokens in the labels also mask the real EOS, and
# the model never learns where an answer ends.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # Left padding keeps the prompt adjacent to generated tokens

# Try different quantization approaches with proper error handling
try:
    # QLoRA 4-bit configuration - first attempt
    print("Attempting 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load base model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16
    )
    print("4-bit quantization successful!")

except Exception as e:
    print(f"4-bit quantization failed: {e}")
    try:
        # Fall back to 8-bit quantization.
        # Double quantization and the NF4 quant type are 4-bit-only options
        # (bnb_4bit_*), so the 8-bit config only needs load_in_8bit.
        print("Falling back to 8-bit quantization...")
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config,
            device_map="auto"
        )
        print("8-bit quantization successful!")

    except Exception as e2:
        print(f"8-bit quantization failed: {e2}")
        print("Falling back to FP16...")

        # Fall back to FP16 (no quantization)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.float16
        )
        print("Model loaded in FP16!")

Using Instruction-tuned Model: Qwen/Qwen3-0.6B
Attempting 4-bit quantization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/9.68k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

4-bit quantization successful!


In [None]:
# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Optimized LoRA configuration
lora_config = LoraConfig(
    r=8,  # Reduced rank for faster training
    lora_alpha=16,
    # Adapters on every attention and MLP projection of each transformer block
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,  # Light dropout on the adapter inputs
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA adapter to the model
model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints its own summary and returns None, so call it
# directly instead of interpolating its return value into another print.
model.print_trainable_parameters()

trainable params: 5,046,272 || all params: 601,096,192 || trainable%: 0.8395
Trainable parameters: None


In [None]:
# Define the classification template
# Jinja chat template that drops all role markers: a user message is rendered as its raw
# content, an assistant message as its content followed by eos_token. The `{%- ... -%}`
# markers strip the surrounding whitespace, so consecutive messages are concatenated directly.
# The add_generation_prompt branch is intentionally empty (nothing is appended).
# NOTE(review): because nothing separates the user text from the assistant text, the answer
# is glued straight onto the end of the tweet; consider an explicit delimiter.
classification_template = """
{%- for message in messages -%}
    {%- if message['role'] == 'user' -%}
        {{ message['content'] }}
    {%- elif message['role'] == 'assistant' -%}
        {{ message['content'] }}{{eos_token}}
    {%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{%- endif -%}
"""

# Apply the template to the tokenizer
tokenizer.chat_template = classification_template
print("✅ Tokenizer template is set.")


✅ Tokenizer template is set.


In [None]:
# # Create prompt templates based on model type
# def create_prompt_template(use_base_model):
#     if use_base_model:
#         # Simple prompt for base model
#         def prompt_fn(tweet, emotion_text=None):
#             if emotion_text is not None:
#                 return f"Tweet: {tweet}\nEmotions: {emotion_text}</s>"
#             else:
#                 return f"Tweet: {tweet}\nEmotions:"
#     else:
#         # Instruction format for instruction-tuned model
#         def prompt_fn(tweet, emotion_text=None):
#             if emotion_text is not None:
#                 return f"<|im_start|>user\nIdentify the emotions expressed in this tweet: {tweet}<|im_end|>\n<|im_start|>assistant\n{emotion_text}<|im_end|>"
#             else:
#                 return f"<|im_start|>user\nIdentify the emotions expressed in this tweet: {tweet}<|im_end|>\n<|im_start|>assistant\n"

#     return prompt_fn

# prompt_template = create_prompt_template(use_base_model)

#  Prompt template for instruction-tuned model
def create_prompt_template():
    """Build the function that renders prompts through the tokenizer's chat template.

    Returns:
        ``prompt_fn(tweet, emotion_text=None)``:
          * with ``emotion_text`` it returns the full training text (instruction,
            tweet, and the answer as the assistant turn);
          * without it, it returns only the instruction and tweet, i.e. exactly
            the prefix the model saw before the answer during training.

    Relies on the notebook-level ``tokenizer`` and ``label_cols``.
    """
    def prompt_fn(tweet, emotion_text=None):
        user_message = {
            "role": "user",
            "content": f"Please identify all the emotions expressed in this tweet. Choose from these emotions: {', '.join(label_cols)}.\n\nTweet: {tweet}",
        }

        if emotion_text is not None:
            # Training example: the labels become the assistant turn.
            messages = [user_message, {"role": "assistant", "content": emotion_text}]
            return tokenizer.apply_chat_template(
                messages, tokenize=False, eos_token=tokenizer.eos_token
            )

        # Inference: send the user turn only. Adding an empty assistant turn here
        # would make the template append the EOS token, so generation would start
        # *after* end-of-sequence — a context the model never saw in training.
        return tokenizer.apply_chat_template(
            [user_message],
            tokenize=False,
            add_generation_prompt=True,
            eos_token=tokenizer.eos_token,
        )

    return prompt_fn

# Initialize the prompt template
prompt_template = create_prompt_template()
print("✅ Prompt template is set.")

✅ Prompt template is set.


In [None]:
# Custom dataset with optimized memory usage
class EmotionLabelGenerationDataset(Dataset):
    """Torch dataset that turns tweet rows into padded causal-LM examples.

    Each item holds ``input_ids`` and ``attention_mask`` of length
    ``max_length``. In training mode it also holds ``labels`` in which only
    the answer tokens (the emotion text and whatever follows it, such as the
    EOS token) are supervised; padding and prompt positions are set to -100
    so the loss ignores them.

    Args:
        dataframe: Frame with a ``Tweet`` column and, for training, an
            ``emotion_text`` column.
        tokenizer: Tokenizer used to encode the rendered prompt.
        prompt_template: Callable ``(tweet, emotion_text=None) -> str``.
        max_length: Fixed sequence length (pad / truncate to this size).
        training: When False, no labels are produced and the prompt is
            rendered without an answer.
    """

    IGNORE_INDEX = -100  # label value skipped by the cross-entropy loss

    def __init__(self, dataframe, tokenizer, prompt_template, max_length=196, training=True):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.prompt_template = prompt_template
        self.max_length = max_length
        self.training = training

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        tweet = row['Tweet']

        if self.training:
            # For training data, include the emotion text in the prompt
            emotion_text = row['emotion_text']
            prompt = self.prompt_template(tweet, emotion_text)
        else:
            # For testing, just provide the input prompt
            prompt = self.prompt_template(tweet)

        # Character offsets are only available from fast tokenizers; they let
        # us find the answer tokens exactly, independent of padding side.
        want_offsets = self.training and bool(getattr(self.tokenizer, "is_fast", False))

        inputs = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_offsets_mapping=want_offsets,
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }

        if self.training:
            offsets = inputs['offset_mapping'].squeeze(0) if want_offsets else None
            item['labels'] = self._build_labels(
                prompt, str(emotion_text), input_ids, attention_mask, offsets
            )

        return item

    def _build_labels(self, prompt, answer, input_ids, attention_mask, offsets):
        """Return labels that supervise only the answer part of ``prompt``."""
        labels = input_ids.clone()

        # Padding never contributes to the loss, whichever side it is on.
        labels[attention_mask == 0] = self.IGNORE_INDEX

        # The answer is the last thing the template renders before EOS, so
        # its final occurrence marks where supervision should begin.
        answer_start = prompt.rfind(answer) if answer else -1
        if answer_start <= 0:
            # Answer not found (or no prompt before it): keep all real tokens.
            return labels

        if offsets is not None:
            # Mask every token that starts before the answer's first character.
            labels[offsets[:, 0] < answer_start] = self.IGNORE_INDEX
        else:
            # Slow tokenizer: estimate the prompt length in tokens and skip
            # that many positions after any left padding.
            prompt_len = len(self.tokenizer.encode(prompt[:answer_start], add_special_tokens=False))
            first_real = int(attention_mask.argmax())
            labels[first_real:first_real + prompt_len] = self.IGNORE_INDEX

        return labels

In [None]:
# Function to extract emotion labels from generated text
def extract_emotions(text, label_cols):
    """Extract the known emotion labels mentioned in generated text.

    Matching is case-insensitive and only accepts a label that stands on its
    own: it must not be directly preceded or followed by another letter. So
    "distrust" does not count as "trust", and fused output such as
    "optimismpessimism" yields nothing.

    Args:
        text: Generated text (``None`` or empty gives an empty result).
        label_cols: Iterable of valid label names.

    Returns:
        List of matched labels without duplicates, in ``label_cols`` order.
    """
    import re

    if not text:
        return []

    # Map lower-cased label -> original spelling, keeping the first occurrence.
    by_lower = {}
    for label in label_cols:
        if label:
            by_lower.setdefault(str(label).lower(), label)
    if not by_lower:
        return []

    pattern = (
        r'(?<![a-z])(?:'
        + '|'.join(re.escape(name) for name in by_lower)
        + r')(?![a-z])'
    )
    found = set(re.findall(pattern, str(text).lower()))

    return [original for lowered, original in by_lower.items() if lowered in found]

In [None]:
# Compute metrics function for generation evaluation
def compute_metrics(eval_preds):
    """Score the model by generating labels for a validation subset.

    ``eval_preds`` (the Trainer's logits and labels) is not used: metrics are
    computed from text generated for up to 100 validation tweets. The subset
    is drawn with a fixed seed, so every evaluation scores the same tweets and
    the results stay comparable across checkpoints.

    Uses the notebook-level ``model``, ``tokenizer``, ``prompt_template``,
    ``val_df``, ``val_dataset`` and ``label_cols``.

    Returns:
        Dict with ``f1_macro``, ``f1_micro`` and ``accuracy``; the same values
        plus weighted F1 are also logged to W&B.
    """
    print("Evaluating generation performance...")

    # Select a reasonable subset for fast evaluation
    eval_size = min(len(val_dataset), 100)
    if eval_size == 0:
        return {"f1_macro": 0.0, "f1_micro": 0.0, "accuracy": 0.0}

    # Private, fixed-seed RNG: same subset on every call, and the global
    # random state used elsewhere is left untouched.
    eval_indices = random.Random(42).sample(range(len(val_dataset)), eval_size)

    all_true_labels = []
    all_pred_labels = []

    # Generate in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        for idx in eval_indices:
            # Get original sample
            sample = val_df.iloc[idx]
            tweet = sample['Tweet']

            # Get true emotion labels
            all_true_labels.append([1 if sample[label] == 1 else 0 for label in label_cols])

            # Create prompt and generate text
            prompt = prompt_template(tweet)
            inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=32,
                    num_beams=3,
                    early_stopping=True
                )

            # Decode only the newly generated tokens. The prompt lists every
            # emotion name, so letting any of it reach the parser would mark
            # all labels as predicted.
            prompt_length = inputs['input_ids'].shape[1]
            generated_text = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            # Extract emotions from generated text and build the binary vector
            emotions = extract_emotions(generated_text, label_cols)
            all_pred_labels.append([1 if label in emotions else 0 for label in label_cols])
    finally:
        model.train(was_training)

    # Convert to numpy arrays
    all_true_labels = np.array(all_true_labels)
    all_pred_labels = np.array(all_pred_labels)

    # Calculate metrics
    f1_macro = f1_score(all_true_labels, all_pred_labels, average="macro", zero_division=0)
    f1_micro = f1_score(all_true_labels, all_pred_labels, average="micro", zero_division=0)
    f1_weighted = f1_score(all_true_labels, all_pred_labels, average="weighted", zero_division=0)
    acc = accuracy_score(all_true_labels, all_pred_labels)

    # Log metrics to wandb
    wandb.log({
        "eval/f1_macro": f1_macro,
        "eval/f1_micro": f1_micro,
        "eval/f1_weighted": f1_weighted,
        "eval/accuracy": acc
    })

    print(f"F1 Macro: {f1_macro:.4f}, F1 Micro: {f1_micro:.4f}, Accuracy: {acc:.4f}")

    return {
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "accuracy": acc
    }

In [None]:
# Create datasets with optimized sequence length
max_sequence_length = 196  # Shorter for efficiency but long enough for context
train_dataset = EmotionLabelGenerationDataset(train_df, tokenizer, prompt_template, max_length=max_sequence_length)
val_dataset = EmotionLabelGenerationDataset(val_df, tokenizer, prompt_template, max_length=max_sequence_length)


# Custom data collator
def data_collator(features):
    """Stack pre-padded dataset items into a batch, keeping their labels.

    The dataset already pads every example to a fixed length and supplies its
    own ``labels``. DataCollatorForLanguageModeling would replace those labels
    with a copy of ``input_ids`` and ignore every position holding the pad
    token id; when the pad token is the EOS token, that also hides the real
    end-of-answer token from the loss. Here padding is identified through the
    attention mask instead, so a genuine EOS stays supervised.

    Args:
        features: List of dicts of equally sized tensors, as returned by the
            dataset's ``__getitem__``.

    Returns:
        Dict of batched tensors with the same keys.
    """
    batch = {
        key: torch.stack([torch.as_tensor(example[key]) for example in features])
        for key in features[0]
    }
    if "labels" in batch:
        batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch

In [None]:
# Optimized training arguments
# Kept as a plain dict so the retry path below can rebuild TrainingArguments from the
# same keyword arguments with a few overrides.
training_kwargs = dict(
    output_dir="./qwen3_instruction_model_results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=5e-5,
    num_train_epochs=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate and checkpoint once per epoch. A fixed interval of 1000 steps is longer
    # than the whole run (the logged run ended at global step 820), so evaluation,
    # checkpointing and best-model selection never happened.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to=["wandb"],
    fp16=False,
    bf16=True,
    optim="adamw_torch_fused",
    remove_unused_columns=False,
    label_names=["labels"],
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    max_grad_norm=1.0,
)
training_args = TrainingArguments(**training_kwargs)


def keep_argmax_only(logits, labels):
    """Shrink eval logits to token ids before the Trainer accumulates them.

    compute_metrics ignores the Trainer's predictions, so there is no reason
    to keep a full-vocabulary logit tensor for every validation example.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Free memory before training
free_gpu_memory()

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=keep_argmax_only
)

free_gpu_memory()

# Train the model with error handling
try:
    print("Starting training...")
    trainer.train()
    print("Training completed successfully!")
except RuntimeError as e:
    print(f"Training error: {e}")
    print("Attempting to recover...")
    free_gpu_memory()

    # If training fails, try with even lower memory settings
    print("Retrying with more aggressive memory optimization...")

    # Save the current state of the model anyway
    trainer.save_model(model_folder / "qwen3_label_generation_instruction_partial")

    # Create a smaller subsample with shorter sequences
    debug_train_df = train_df.sample(frac=0.5, random_state=42)
    debug_train_dataset = EmotionLabelGenerationDataset(debug_train_df, tokenizer, prompt_template, max_length=128)

    # Rebuild the arguments from the original keyword arguments. Copying
    # training_args.__dict__ would also pass internal, non-constructor attributes
    # to TrainingArguments and fail.
    retry_args = TrainingArguments(
        **{**training_kwargs,
           "per_device_train_batch_size": 1,
           "gradient_accumulation_steps": 16,
           "max_steps": 500,
           "eval_strategy": "steps",
           "eval_steps": 100,
           "save_strategy": "steps",
           "save_steps": 100,
        })

    # Update trainer with smaller dataset and the retry settings
    trainer = Trainer(
        model=model,
        args=retry_args,
        train_dataset=debug_train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=keep_argmax_only
    )

    # Try training again
    trainer.train()

✅ GPU memory has been freed.
✅ GPU memory has been freed.
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


Training completed successfully!


In [None]:
# Save the trained model
# Written under model_folder (on the mounted Drive) so it outlives the Colab runtime.
trainer.save_model(model_folder / "qwen3_label_generation_instruction")

In [None]:
# Generate predictions for test data
model.eval()
predictions = []

print("Generating predictions...")
for i, sample in enumerate(test_df['Tweet']):
    if i % 100 == 0:
        print(f"Processing sample {i}/{len(test_df)}")

    # Create prompt
    prompt = prompt_template(sample)
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    # Beam search, matching how the model is evaluated.
    # - No `temperature`: it only applies when sampling is enabled.
    # - No `no_repeat_ngram_size`: the n-gram ban also covers the prompt, which already
    #   contains "<label>," pairs for every emotion, so the model would be forbidden from
    #   emitting the comma-separated format it was trained on (labels come out fused,
    #   e.g. "optimismpessimism").
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=32,
            num_beams=4,
            early_stopping=True
        )

    # Decode only the newly generated tokens; the prompt itself names every emotion and
    # must never reach the label parser.
    prompt_length = inputs['input_ids'].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Extract emotions
    emotions = extract_emotions(generated_text, label_cols)

    # Create binary vector
    binary_labels = [1 if label in emotions else 0 for label in label_cols]
    predictions.append(binary_labels)

    if i < 5:  # Print examples
        print(f"Sample: {sample}")
        print(f"Generated: {generated_text}")
        print(f"Extracted emotions: {emotions}")
        print(f"Binary labels: {binary_labels}")
        print("---")

Generating predictions...
Processing sample 0/3259
Sample: @user @user Dont worry Indian army is on its ways to dispatch all Terrorists to Hell
Extracted emotions: ['disgust', 'fear']
Binary labels: [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
---
Sample: Academy of Sciences, eschews the normally sober tone of scientific papers and calls the massive loss of wildlife a “biological annihilation
Extracted emotions: ['disgust', 'sadness', 'fear']
Binary labels: [0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]
---
Sample: I blew that opportunity -__- mad
Generated: fear, pity, sorrow, tragedy, sympathy, terror, distrust, despair, guilt, helplessness, loss of hope, hopelessness
optimism
Extracted emotions: ['fear', 'trust', 'optimism']
Binary labels: [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]
---
Sample: This time in 2 weeks I will be 30... 😥
Generated: fear, nostalgia, smilejoy, pride, positivity, optimistic, satisfaction, truismiledisgust, happiness, optimismpessimism
Extracted emotions: ['disgust', 'joy', 'pessimism', 'opt

In [None]:
# Assemble the submission: one 0/1 column per emotion, with the test IDs in front.
submission = pd.DataFrame(predictions, columns=label_cols)
submission.insert(0, 'ID', test_df['ID'])

# Rows where no emotion was predicted at all
empty_predictions = submission[label_cols].sum(axis=1).eq(0)
n_empty = empty_predictions.sum()
if n_empty > 0:
    print(f"Warning: {n_empty} samples have no predicted emotions")
    # Fallback: give those rows the emotion that is most frequent in the training data
    label_totals = train_df[label_cols].sum()
    most_common = label_totals.idxmax()
    submission.loc[empty_predictions, most_common] = 1

# Write the CSV next to the saved models
submission_path = model_folder / 'qwen3_instruction_generated_labels_submission.csv'
submission.to_csv(submission_path, index=False)
print('✅ Submission saved:', submission_path)

✅ Submission saved: /content/drive/MyDrive/data/models/nlp_spring_2025/inclass_kaggle/Qwen3/qwen3_instruction_generated_labels_submission.csv


In [None]:
# Summarise how often each emotion was predicted
class_totals = submission[label_cols].sum()
total_predictions = class_totals.sum()
print(f"Total positive predictions: {total_predictions}")
print("Predictions per class:")
n_rows = len(submission)
for col, class_count in class_totals.items():
    print(f" - {col}: {class_count} ({class_count/n_rows*100:.2f}%)")

Total positive predictions: 8768
Predictions per class:
 - anger: 340 (10.43%)
 - anticipation: 248 (7.61%)
 - disgust: 2391 (73.37%)
 - fear: 2118 (64.99%)
 - joy: 1011 (31.02%)
 - love: 176 (5.40%)
 - optimism: 352 (10.80%)
 - pessimism: 326 (10.00%)
 - sadness: 1170 (35.90%)
 - surprise: 209 (6.41%)
 - trust: 427 (13.10%)


In [None]:
# Submit to Kaggle
# Uploads the CSV at submission_path through the Kaggle CLI; needs the credentials
# configured via KAGGLE_CONFIG_DIR earlier in the notebook.
competition = "emotion-detection-spring-2025"
!kaggle competitions submit -c {competition} -f {submission_path} -m "Qwen3 instruction-tuned model with optimized label generation approach"

100% 105k/105k [00:00<00:00, 281kB/s]
Successfully submitted to Emotion Detection Spring2025

In [None]:
# Finalize wandb run
# Closes the run started by wandb.init above; the run summary is printed as this cell's output.
wandb.finish()

0,1
train/epoch,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
train/grad_norm,█▁▁▁▂▁▁▁▂▁▂▂▂▂▁▂
train/learning_rate,▅█▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,6868121605373952.0
train/epoch,1.9968
train/global_step,820.0
train/grad_norm,1.20928
train/learning_rate,0.0
train/loss,1.6091
train_loss,1.79389
train_runtime,7056.0028
train_samples_per_second,1.861
train_steps_per_second,0.116
