In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from huggingface_hub import login


# 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
from getpass import getpass
login(token=getpass("Enter your Hugging Face token: "))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Enter your Hugging Face token: ··········
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
#"/content/drive/MyDrive/train.csv"
#"/content/drive/MyDrive/test.csv"

In [3]:
pip install transformers



In [4]:
pip install datasets



In [5]:
!pip install -U scipy
!pip install -U sentencepiece
!pip install -U wandb
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q accelerate
!pip install -q trl



In [6]:
# Remove existing bitsandbytes
!pip uninstall -y bitsandbytes

# Install the latest version
!pip install -U bitsandbytes

Found existing installation: bitsandbytes 0.44.1
Uninstalling bitsandbytes-0.44.1:
  Successfully uninstalled bitsandbytes-0.44.1
Collecting bitsandbytes
  Using cached bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
#Model rank 253 on mteb/leaderboard
MODEL_NAME = "Jaume/gemma-2b-embeddings"
MAX_LENGTH = 256

train_data = pd.read_csv("/content/drive/MyDrive/train.csv")
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
                  'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Calculate class weights
class_weights = []
for emotion in emotion_columns:
    neg, pos = len(train_data[train_data[emotion] == 0]), len(train_data[train_data[emotion] == 1])
    weight = neg / pos if pos > 0 else 1.0
    class_weights.append(weight)
class_weights = torch.tensor(class_weights)

# QLoRA Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Initialize tokenizer
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


Initializing tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [9]:
# Tokenization function
def tokenize_and_format(examples):
    tokenized = tokenizer(
        examples['Tweet'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors=None
    )

    labels = torch.zeros((len(examples['Tweet']), len(emotion_columns)))
    for i, example in enumerate(examples['Tweet']):
        for j, emotion in enumerate(emotion_columns):
            labels[i][j] = float(examples[emotion][i])

    tokenized['labels'] = labels.tolist()
    return tokenized

tokenized_train = train_dataset.map(
    tokenize_and_format,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Processing train dataset"
)

tokenized_val = val_dataset.map(
    tokenize_and_format,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Processing validation dataset"
)


Processing train dataset:   0%|          | 0/6951 [00:00<?, ? examples/s]

Processing validation dataset:   0%|          | 0/773 [00:00<?, ? examples/s]

In [10]:
# Set format for pytorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Calculate class weights
train_labels = np.array([sample['labels'] for sample in tokenized_train])
label_counts = np.sum(train_labels, axis=0)
total_samples = len(tokenized_train)
class_weights = torch.tensor(total_samples / (len(emotion_columns) * label_counts), dtype=torch.float)


In [11]:
class GemmaMultiLabelClassification(nn.Module):
    def __init__(self, num_labels, class_weights):
        super().__init__()
        self.num_labels = num_labels
        self.class_weights = class_weights

        # QLoRA Configuration
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False
        )

        # LoRA Configuration
        self.lora_config = LoraConfig(
            r=64,
            lora_alpha=16,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQUENCE_CLASSIFICATION"
        )

        # Load base model with QLoRA
        self.model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_labels,
            quantization_config=self.bnb_config,
            problem_type="multi_label_classification"
        )

        # Prepare model for QLoRA
        self.model = prepare_model_for_kbit_training(self.model)
        self.model = get_peft_model(self.model, self.lora_config)


        self.loss_fct = nn.BCEWithLogitsLoss(
            pos_weight=self.class_weights,
            reduction='mean'
        )

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels.float())

        return {'loss': loss, 'logits': logits} if loss is not None else {'logits': logits}


In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(predictions)) > 0.5).float().numpy()

    # Calculate different F1 averages only
    macro_f1 = f1_score(labels, predictions, average='macro')
    micro_f1 = f1_score(labels, predictions, average='micro')
    weighted_f1 = f1_score(labels, predictions, average='weighted')

    return {
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
        'weighted_f1': weighted_f1
    }

In [13]:
training_args = TrainingArguments(
    output_dir='./results_gemma_embeddings',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    logging_steps=50,
    fp16=True,
    save_total_limit=2,
    report_to="wandb",
    optim="paged_adamw_32bit"
)




In [14]:
model = GemmaMultiLabelClassification(
    num_labels=len(emotion_columns),
    class_weights=class_weights
)

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at Jaume/gemma-2b-embeddings and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [16]:
trainer.train()
trainer.save_model("./final_gemma_embeddings_model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Macro F1,Micro F1,Weighted F1
0,0.2249,0.209722,0.326567,0.444998,0.421816
1,0.1798,0.191448,0.504908,0.564534,0.548372
2,0.1415,0.200347,0.547349,0.628065,0.615268
4,0.0529,0.267702,0.578145,0.660636,0.652176
5,0.0234,0.308402,0.591755,0.67938,0.673461
6,0.0106,0.336934,0.596405,0.691752,0.681371
8,0.0027,0.367537,0.590711,0.690132,0.678881
9,0.002,0.373381,0.586632,0.687575,0.675959


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['model.base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax', 'model.base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_map', 'model.base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.absmax', 'model.base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.quant_map', 'model.base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.absmax', 'model.base_model.model.model.layers.0.self_at

In [19]:
# Testing code (corrected version)
print("\nGenerating predictions for test set...")
test_data = pd.read_csv("/content/drive/MyDrive/test.csv")
test_dataset = Dataset.from_pandas(test_data)

def tokenize_test_data(examples):
    tokenized = tokenizer(
        examples['Tweet'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    return tokenized

tokenized_test = test_dataset.map(
    tokenize_test_data,
    batched=True,
    remove_columns=test_dataset.column_names,
    desc="Processing test dataset"
)

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Generate predictions
device = next(model.parameters()).device  # Get the device from the model
model.eval()
all_predictions = []

for batch in trainer.get_test_dataloader(tokenized_test):
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        logits = outputs['logits']
        predictions = torch.sigmoid(logits).cpu().numpy()
        all_predictions.append(predictions)


all_predictions = np.concatenate(all_predictions, axis=0)
binary_predictions = (all_predictions > 0.5).astype(int)

# Create submission DataFrame
submission = pd.DataFrame(binary_predictions, columns=emotion_columns)
submission['ID'] = test_data['ID']
submission = submission[['ID'] + emotion_columns]

# Save binary predictions
submission.to_csv('/content/drive/MyDrive/gemma_predictions.csv', index=False)
print("Binary predictions saved to: gemma_predictions.csv")

# Save raw probabilities
raw_predictions = pd.DataFrame(all_predictions, columns=emotion_columns)
raw_predictions['ID'] = test_data['ID']
raw_predictions.to_csv('/content/drive/MyDrive/gemma_raw_probabilities.csv', index=False)
print("\nRaw probabilities saved to: gemma_raw_probabilities.csv")

# Print prediction statistics
print("\nPrediction Statistics:")
for i, emotion in enumerate(emotion_columns):
    positive_preds = binary_predictions[:, i].sum()
    total_preds = len(binary_predictions)
    print(f"{emotion}: {positive_preds} positive predictions ({(positive_preds/total_preds)*100:.2f}%)")

# Print distribution summary
print("\nPrediction Distribution Summary:")
for emotion in emotion_columns:
    raw_preds = raw_predictions[emotion]
    print(f"\n{emotion}:")
    print(f"Mean probability: {raw_preds.mean():.4f}")
    print(f"Median probability: {raw_preds.median():.4f}")
    print(f"Std deviation: {raw_preds.std():.4f}")


Generating predictions for test set...


Processing test dataset:   0%|          | 0/3259 [00:00<?, ? examples/s]

Binary predictions saved to: gemma_predictions.csv

Raw probabilities saved to: gemma_raw_probabilities.csv

Prediction Statistics:
anger: 1094 positive predictions (33.57%)
anticipation: 268 positive predictions (8.22%)
disgust: 1057 positive predictions (32.43%)
fear: 410 positive predictions (12.58%)
joy: 1415 positive predictions (43.42%)
love: 487 positive predictions (14.94%)
optimism: 1137 positive predictions (34.89%)
pessimism: 278 positive predictions (8.53%)
sadness: 931 positive predictions (28.57%)
surprise: 85 positive predictions (2.61%)
trust: 69 positive predictions (2.12%)

Prediction Distribution Summary:

anger:
Mean probability: 0.3153
Median probability: 0.0097
Std deviation: 0.4296

anticipation:
Mean probability: 0.0746
Median probability: 0.0021
Std deviation: 0.1994

disgust:
Mean probability: 0.2955
Median probability: 0.0107
Std deviation: 0.4115

fear:
Mean probability: 0.1205
Median probability: 0.0005
Std deviation: 0.3029

joy:
Mean probability: 0.4123
M