In [None]:
!pip install -q bitsandbytes accelerate transformers peft datasets evaluate wandb nltk

import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, Qwen2ForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import f1_score, precision_score, recall_score
import wandb
import re
import nltk
from nltk.corpus import stopwords
import random
from sklearn.model_selection import StratifiedKFold

# Fetch the NLTK resources this script requests; quiet=True suppresses the
# per-package download log.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# Text Preprocessing Functions
def clean_text(text):
    """Normalize a raw tweet into lowercase text without punctuation or digits.

    Steps, in order: lowercase; strip URLs; strip @mentions; keep hashtag
    words but drop the leading '#'; drop every character that is neither a
    word character nor whitespace; drop digits; collapse runs of whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas produces for
            an empty CSV cell) are treated as missing and yield ``''``; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove user mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags but keep the text
    text = re.sub(r'#(\w+)', r'\1', text)
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Text augmentation techniques
def synonym_replacement(text, n=1):
    """Replace up to ``n`` randomly chosen words with a WordNet synonym.

    Args:
        text: Whitespace-separated text.
        n: Maximum number of word positions to try to replace. Values of
            zero or below replace nothing.

    Returns:
        ``text`` unchanged when it has at most one word. Otherwise the words
        re-joined with single spaces, where each selected word has been
        swapped for a randomly chosen WordNet lemma that differs from it
        (positions for which WordNet offers no such lemma are kept as-is).
    """
    words = text.split()
    if len(words) <= 1:
        return text

    # Clamp so a zero/negative n is a no-op instead of a random.sample error.
    sample_size = max(0, min(n, len(words)))
    if sample_size == 0:
        return ' '.join(words)

    # Imported here so the trivial paths above never touch the corpus.
    from nltk.corpus import wordnet

    new_words = words.copy()
    for idx in random.sample(range(len(words)), sample_size):
        word = words[idx]
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                # Multi-word lemma names are joined with '_'; turn them back
                # into ordinary space-separated words.
                candidate = lemma.name().replace('_', ' ')
                # The queried word is listed among its own lemmas; picking it
                # would make the "replacement" a silent no-op.
                if candidate.lower() != word.lower():
                    synonyms.append(candidate)
        if synonyms:
            new_words[idx] = random.choice(synonyms)

    return ' '.join(new_words)

def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Text with at most one word is returned untouched. If every word ends up
    deleted, a single randomly chosen word from the input is returned so the
    result is never empty.
    """
    tokens = text.split()
    if len(tokens) <= 1:
        return text

    # One random draw per word, in order; a word survives when its draw
    # is strictly greater than p.
    survivors = [token for token in tokens if random.random() > p]
    if survivors:
        return ' '.join(survivors)

    # Everything was deleted: fall back to one random word.
    return random.choice(tokens)

def augment_text(text):
    """Return ``text`` after one randomly selected augmentation.

    One of three equally likely outcomes is drawn: synonym replacement,
    random word deletion, or the text handed back untouched.
    """
    selected = random.choice([1, 2, 3])

    if selected == 1:
        return synonym_replacement(text)
    if selected == 2:
        return random_deletion(text)
    return text  # third outcome: no augmentation

# Load the train/test splits from CSV files in the working directory.
train_path, test_path = 'train.csv', 'test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Names of the emotion label columns; this order is the label order used
# everywhere below.
emotion_labels = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]

# Add a normalized copy of the tweet text next to the raw 'Tweet' column.
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['Tweet'].apply(clean_text)

# Force every label column to integers; anything that does not parse as a
# number is coerced to NaN and then counted as 0.
for frame in (train_df, test_df):
    for column in emotion_labels:
        as_number = pd.to_numeric(frame[column], errors='coerce')
        frame[column] = as_number.fillna(0).astype(int)

# Data Augmentation for Minority Classes
# Rank the emotions by their number of positive training rows; the five with
# the fewest positives are the ones that get extra samples.
label_counts = train_df[emotion_labels].sum().sort_values()
minority_labels = list(label_counts.index[:5])

# Build augmented copies of every row that is positive for a minority label.
# A row positive for several minority labels is visited once per such label.
augmented_samples = []
for label in minority_labels:
    positive_samples = train_df.loc[train_df[label] == 1]

    for _, source_row in positive_samples.iterrows():
        source_text = source_row['cleaned_text']
        candidate_text = augment_text(source_text)
        if candidate_text == source_text:
            continue  # augmentation left the text as it was; nothing to add

        variant = source_row.copy()
        variant['cleaned_text'] = candidate_text
        augmented_samples.append(variant)

# Append the new rows (if any) after the original training rows.
if len(augmented_samples) > 0:
    augmented_df = pd.DataFrame(augmented_samples)
    train_df = pd.concat([train_df, augmented_df], ignore_index=True)

# Starting decision thresholds derived from how often each label occurs.
label_frequencies = train_df[emotion_labels].mean()
# A label with frequency f gets the threshold 0.5 - (0.5 - f) * 0.5, i.e. the
# midpoint between f and 0.5, so rarer labels start with a lower threshold.
initial_thresholds = 0.5 - (0.5 - label_frequencies) * 0.5
thresholds = dict(zip(emotion_labels, map(float, initial_thresholds)))

# Tokenizer and Model
# Imported here because only this step needs it.
from transformers import BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Use the sequence classification head for multi-label classification.
# 4-bit loading is requested through `quantization_config`: passing
# `load_in_4bit=True` straight to `from_pretrained` is deprecated and emits
# a warning asking for a BitsAndBytesConfig instead.
model = Qwen2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
    problem_type="multi_label_classification",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    trust_remote_code=True,
)

# The classification model needs the pad id in its config as well.
model.config.pad_token_id = tokenizer.pad_token_id

# QLoRA setup: LoRA adapters on the four attention projection matrices.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    r=16,             # adapter rank
    lora_alpha=32,    # LoRA alpha
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
peft_config = LoraConfig(**lora_settings)

# Prepare the quantized model for k-bit training, wrap it with the adapters,
# then report how many parameters are trainable.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
model.print_trainable_parameters()

# Custom Dataset with text cleaning
class EmotionDataset(Dataset):
    """Torch dataset pairing each cleaned tweet with its label vector.

    Texts come from the frame's 'cleaned_text' column; labels come from the
    columns named in the module-level ``emotion_labels`` list, stored as a
    float32 array. Tokenization happens per item, on access.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.labels = df[emotion_labels].values.astype(np.float32)
        self.texts = df['cleaned_text'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every item is truncated/padded to exactly max_length tokens.
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading dimension; squeeze it away.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Compute Class Weights - "effective number of samples" re-weighting
label_counts = train_df[emotion_labels].sum()
total_samples = len(train_df)
# Effective number of samples formula: (1 - beta) / (1 - beta ** count)
# NOTE(review): with beta = 0.9, beta ** count is ~0 once a label has more
# than a few dozen positives, so every weight comes out close to 1. If real
# re-weighting is intended, a beta much closer to 1 is probably needed -
# confirm against the actual label counts.
beta = 0.9
# A label with no positive rows would give effective_num == 0 and therefore
# an infinite weight (NaN after normalization); treat it as having one sample.
safe_counts = np.clip(label_counts.to_numpy(dtype=np.float64), 1.0, None)
effective_num = 1.0 - np.power(beta, safe_counts)
weights = (1.0 - beta) / effective_num
# Rescale so the weights sum to the number of labels (mean weight of 1).
weights = weights / np.sum(weights) * len(emotion_labels)
# Use the GPU when one is present instead of hard-coding 'cuda', so this
# step also runs on a CPU-only machine.
pos_weights = torch.tensor(weights, dtype=torch.float).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

# Focal Loss implementation for better handling of imbalanced data
class FocalLoss(torch.nn.Module):
    """Binary (multi-label) focal loss computed on logits.

    Per element the loss is ``alpha * (1 - p_t) ** gamma * bce``, where
    ``bce`` is the binary cross-entropy (scaled by ``pos_weight`` on positive
    targets when given) and ``p_t`` is the probability the model assigns to
    the true class.

    Args:
        alpha: Constant multiplier applied to every element.
        gamma: Focusing exponent; 0 reduces to (weighted) BCE.
        pos_weight: Optional tensor forwarded to ``BCEWithLogitsLoss`` as the
            weight of positive targets.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced element-wise loss.
    """

    def __init__(self, alpha=1, gamma=2, pos_weight=None, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.pos_weight = pos_weight
        self.reduction = reduction
        self.bce = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='none')

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` (logits) against ``targets``."""
        weighted_bce = self.bce(inputs, targets)

        # p_t must come from the *unweighted* cross-entropy: exp(-bce) equals
        # the true-class probability only when no pos_weight scaling has been
        # applied. Deriving it from the weighted loss would turn p into
        # p ** pos_weight for positives and distort the focusing term.
        if self.pos_weight is None:
            plain_bce = weighted_bce
        else:
            plain_bce = torch.nn.functional.binary_cross_entropy_with_logits(
                inputs, targets, reduction='none'
            )
        pt = torch.exp(-plain_bce)

        F_loss = self.alpha * (1 - pt) ** self.gamma * weighted_bce

        if self.reduction == 'mean':
            return torch.mean(F_loss)
        elif self.reduction == 'sum':
            return torch.sum(F_loss)
        else:
            return F_loss

# Custom Trainer with Focal Loss and F1 optimization
class MultiLabelTrainer(Trainer):
    """Trainer for multi-label emotion classification.

    Trains with ``FocalLoss`` (weighted by the module-level ``pos_weights``)
    and turns logits into 0/1 predictions using one threshold per label.

    Args:
        thresholds: Optional mapping from label name to decision threshold.
            Defaults to 0.5 for every entry of ``emotion_labels``.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, thresholds=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.thresholds = thresholds or {label: 0.5 for label in emotion_labels}

    @staticmethod
    def _class_logits(outputs):
        """Return the per-label logits of a model output as a 2-D tensor."""
        logits = outputs.logits
        if logits.ndim == 3:
            # Keep only the first position along the second dimension.
            logits = logits[:, 0, :]
        return logits

    def _predict_labels(self, logits):
        """Apply sigmoid and the per-label thresholds; return an int tensor."""
        probs = torch.sigmoid(logits)
        cutoffs = torch.tensor(
            [float(self.thresholds[label]) for label in emotion_labels],
            dtype=probs.dtype,
            device=probs.device,
        )
        # Broadcasts the threshold vector over the batch dimension.
        return (probs > cutoffs).int()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the focal loss for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass. Extra
        keyword arguments are accepted and ignored.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = self._class_logits(outputs)

        # Use Focal Loss instead of BCE
        loss_fct = FocalLoss(gamma=2, pos_weight=pos_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Return ``(None, thresholded_predictions, labels_or_None)`` for a batch."""
        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            outputs = model(**inputs)
            preds = self._predict_labels(self._class_logits(outputs))

        labels = inputs["labels"] if has_labels else None
        return None, preds, labels

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Compute overall and per-label F1/precision/recall on a dataset.

        Returns:
            A dict of metrics keyed ``{metric_key_prefix}_...``. It is empty
            when no batch carried labels. The dict is also passed to
            ``self.log``.
        """
        eval_dataloader = self.get_eval_dataloader(eval_dataset)

        # Put the model in eval mode so dropout is disabled while scoring;
        # nothing else in this method does it.
        self.model.eval()

        pred_chunks = []
        label_chunks = []
        for inputs in eval_dataloader:
            has_labels = "labels" in inputs
            inputs = self._prepare_inputs(inputs)

            with torch.no_grad():
                outputs = self.model(**inputs)
                preds = self._predict_labels(self._class_logits(outputs))

            if has_labels:
                pred_chunks.append(preds.detach().cpu())
                label_chunks.append(inputs["labels"].detach().cpu())

        metrics = {}
        if pred_chunks:
            # Concatenate once at the end rather than re-copying per batch.
            all_preds = torch.cat(pred_chunks, dim=0).numpy()
            all_labels = torch.cat(label_chunks, dim=0).numpy()

            # Overall metrics. zero_division=0 scores a label with no
            # positive predictions/targets as 0 instead of warning.
            for average in ("macro", "micro", "weighted"):
                metrics[f"{metric_key_prefix}_{average}_f1"] = f1_score(
                    all_labels, all_preds, average=average, zero_division=0
                )

            # Per-label metrics
            for i, label in enumerate(emotion_labels):
                y_true = all_labels[:, i]
                y_pred = all_preds[:, i]
                metrics[f"{metric_key_prefix}_{label}_f1"] = f1_score(
                    y_true, y_pred, average="binary", zero_division=0
                )
                metrics[f"{metric_key_prefix}_{label}_precision"] = precision_score(
                    y_true, y_pred, average="binary", zero_division=0
                )
                metrics[f"{metric_key_prefix}_{label}_recall"] = recall_score(
                    y_true, y_pred, average="binary", zero_division=0
                )

        # self.log hands the whole dict to the configured reporting
        # integrations, so no direct wandb.log call is made here. The earlier
        # `self.args.report_to == "wandb"` guard compared a list with a
        # string and therefore never fired.
        self.log(metrics)

        return metrics

# Threshold Optimization Function
def optimize_thresholds(trainer, dataset):
    """Find, per label, the decision threshold that maximizes binary F1.

    The model is put in eval mode, run over ``dataset`` once to collect
    sigmoid probabilities, and then each label's threshold is chosen from
    the grid 0.10, 0.15, ..., 0.85. A probability counts as positive when it
    is strictly greater than the threshold, which is the same rule
    ``MultiLabelTrainer`` applies at prediction time.

    Args:
        trainer: Object exposing ``model`` and ``get_eval_dataloader``.
        dataset: Labelled dataset to tune on; every batch must contain
            ``"labels"``.

    Returns:
        Dict mapping each name in ``emotion_labels`` to its best threshold as
        a plain ``float`` (0.5 when no grid value gives F1 above 0).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model = trainer.model
    model.eval()
    dataloader = trainer.get_eval_dataloader(dataset)

    prob_chunks = []
    label_chunks = []

    # Collect all predictions and labels
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(model.device) for k, v in batch.items()}
            labels = batch.pop("labels")
            logits = model(**batch).logits
            if logits.ndim == 3:
                logits = logits[:, 0, :]
            # Upcast before .numpy(): NumPy cannot represent bfloat16.
            prob_chunks.append(torch.sigmoid(logits).float().cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    if not prob_chunks:
        raise ValueError("optimize_thresholds received an empty dataset")

    all_probs = np.vstack(prob_chunks)
    all_labels = np.vstack(label_chunks)

    # Find optimal threshold for each label
    candidate_thresholds = np.arange(0.1, 0.9, 0.05)
    optimal_thresholds = {}
    for i, label in enumerate(emotion_labels):
        best_f1 = 0
        best_threshold = 0.5

        for threshold in candidate_thresholds:
            # Strict '>' to agree with the trainer's prediction rule.
            preds = (all_probs[:, i] > threshold).astype(int)
            f1 = f1_score(all_labels[:, i], preds, average='binary', zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                # Store a plain float rather than a NumPy scalar.
                best_threshold = float(threshold)

        optimal_thresholds[label] = best_threshold
        print(f"Optimal threshold for {label}: {best_threshold:.2f} (F1: {best_f1:.4f})")

    return optimal_thresholds

# Data Splitting - stratified on label combinations, without train/val leakage
# Encode each row's label vector as a string (e.g. '10010000000') so rows
# with the same combination of emotions can be stratified together.
train_df['label_combo'] = train_df[emotion_labels].apply(lambda x: ''.join(x.astype(str)), axis=1)

# Augmented rows are copies of original rows with only 'cleaned_text' changed,
# so they still carry the source 'Tweet'. Splitting the augmented frame row by
# row could put a tweet in validation and its augmented variant in training,
# which inflates validation scores and biases the thresholds tuned on them.
# Split over unique source tweets instead. The augmentation step appends its
# rows after the originals, so keep='first' selects the original row.
unique_tweets = train_df.drop_duplicates(subset='Tweet', keep='first')

# Use StratifiedKFold to create a stratified split (first of 5 folds = 20% val).
# train_indices / val_indices are positions within `unique_tweets`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_indices, val_indices = next(skf.split(unique_tweets, unique_tweets['label_combo']))

# Validation: original (non-augmented) rows only.
val_subset = unique_tweets.iloc[val_indices].reset_index(drop=True)
# Training: every remaining row, including augmented copies, whose source
# tweet is not held out for validation.
held_out = train_df['Tweet'].isin(val_subset['Tweet'])
train_subset = train_df[~held_out].reset_index(drop=True)

# Create datasets
train_dataset = EmotionDataset(train_subset, tokenizer)
val_dataset = EmotionDataset(val_subset, tokenizer)
test_dataset = EmotionDataset(test_df, tokenizer)

# Start the Weights & Biases run, recording the settings that define this
# experiment (model, LoRA hyper-parameters, labels, thresholds, loss).
run_config = dict(
    model=model_name,
    lora_r=peft_config.r,
    lora_alpha=peft_config.lora_alpha,
    lora_dropout=peft_config.lora_dropout,
    target_modules=peft_config.target_modules,
    emotion_labels=emotion_labels,
    initial_thresholds=thresholds,
    augmentation="synonym_replacement, random_deletion",
    loss="focal_loss",
)
wandb.init(project="qwen2-emotion-multilabel-improved", config=run_config)

# Training arguments, grouped by concern and merged into one call.
from transformers import TrainingArguments

# Batch of 2 per device with 4 accumulation steps.
batching_options = dict(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
)

# Optimisation schedule: 3 epochs, cosine decay after a 10% warm-up.
optimisation_options = dict(
    num_train_epochs=3,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
)

# Output locations, logging cadence and reporting target.
bookkeeping_options = dict(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="wandb",
    do_train=True,
    do_eval=True,
)

training_args = TrainingArguments(
    **bookkeeping_options,
    **batching_options,
    **optimisation_options,
)



# Build the trainer, starting from the frequency-based thresholds.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "thresholds": thresholds,
}
trainer = MultiLabelTrainer(**trainer_options)

# Fine-tune.
trainer.train()

# Re-tune the per-label thresholds on the validation set and install them
# on the trainer so later predictions use them.
print("Optimizing thresholds...")
optimal_thresholds = optimize_thresholds(trainer, val_dataset)
trainer.thresholds = optimal_thresholds

# Score the validation set again, now with the tuned thresholds.
print("Final evaluation with optimized thresholds:")
eval_metrics = trainer.evaluate(val_dataset)
print(f"Validation Macro F1 Score: {eval_metrics['eval_macro_f1']:.4f}")

# Predict on the test set; the trainer returns already-thresholded labels.
outputs = trainer.predict(test_dataset)
preds = outputs.predictions

# Build the submission: ID and raw tweet followed by one prediction column
# per emotion, in emotion_labels order.
prediction_columns = {
    label: preds[:, position] for position, label in enumerate(emotion_labels)
}
submission = test_df[['ID', 'Tweet']].assign(**prediction_columns)

submission.to_csv("qwen_predictions_improved.csv", index=False)
print("Submission saved.")

# Record the tuned thresholds with the run, then close it.
wandb.log({"final_thresholds": optimal_thresholds})
wandb.finish()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 10,131,968 || all params: 7,080,790,528 || trainable%: 0.1431




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwadekarritin6[0m ([33mwadekarritin6-onpoint-insights[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kw

Step,Training Loss
10,2.8116
20,2.6416
30,2.7675
40,2.693
50,2.5042
60,2.3373
70,2.0608
80,1.6468
90,1.2185
100,0.851


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Optimizing thresholds...
Optimal threshold for anger: 0.40 (F1: 0.7901)
Optimal threshold for anticipation: 0.40 (F1: 0.5223)
Optimal threshold for disgust: 0.45 (F1: 0.7833)
Optimal threshold for fear: 0.50 (F1: 0.7086)
Optimal threshold for joy: 0.45 (F1: 0.8344)
Optimal threshold for love: 0.45 (F1: 0.6157)
Optimal threshold for optimism: 0.45 (F1: 0.7407)
Optimal threshold for pessimism: 0.40 (F1: 0.5198)
Optimal threshold for sadness: 0.45 (F1: 0.6758)
Optimal threshold for surprise: 0.35 (F1: 0.3459)
Optimal threshold for trust: 0.35 (F1: 0.4069)
Final evaluation with optimized thresholds:
Validation Macro F1 Score: 0.6312


Submission saved.


0,1
eval/anger_f1,▁
eval/anger_precision,▁
eval/anger_recall,▁
eval/anticipation_f1,▁
eval/anticipation_precision,▁
eval/anticipation_recall,▁
eval/disgust_f1,▁
eval/disgust_precision,▁
eval/disgust_recall,▁
eval/fear_f1,▁

0,1
eval/anger_f1,0.7901
eval/anger_precision,0.74435
eval/anger_recall,0.84185
eval/anticipation_f1,0.52228
eval/anticipation_precision,0.45572
eval/anticipation_recall,0.61159
eval/disgust_f1,0.7833
eval/disgust_precision,0.73713
eval/disgust_recall,0.83564
eval/fear_f1,0.70855
