In [20]:
# !pip install transformers evaluate sentencepiece accelerate
# !pip install -U datasets
# !pip install -q -U bitsandbytes

In [21]:
from collections import defaultdict, Counter
import json
import numpy as np
import torch

from matplotlib import pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

In [22]:
train_data = load_dataset('csv',data_files = "train.csv")
val_data = load_dataset('csv',data_files="validation.csv")

In [23]:
def df_preprocess(dataset):
    df = dataset['train'].to_pandas()
    df = df[['joke', 'rhetoric']]
    # merge same category between cased/uncased
    df['rhetoric'] = df['rhetoric'].str.lower()
    # select only jokes
    df = df[df['rhetoric'] != 'not a joke']
    # uncase jokes to use uncased pretrained model
    df['joke'] = df['joke'].str.lower()
    df = df.rename(columns={'joke': 'text', 'rhetoric': 'labels'})
    # Drop rows where 'labels' is None
    df = df.dropna(subset=['labels'])
    return df

train_df = df_preprocess(train_data)
val_df = df_preprocess(val_data)

In [24]:
# filter only on common labels for training / validation
train_labels = set(train_df['labels'].unique())
val_labels = set(val_df['labels'].unique())

common_labels = list(train_labels.intersection(val_labels))
train_df_filtered = train_df[train_df['labels'].isin(common_labels)].copy()
val_df = val_df[val_df['labels'].isin(common_labels)].copy()


# Undersample the filtered training data
sampling_threshold = 60

final_train_df = (
    train_df_filtered.groupby('labels', group_keys=False)
    .apply(lambda x: x.sample(n=min(len(x), sampling_threshold), random_state=42))
)
final_train_df = final_train_df.sample(frac=1, random_state=42).reset_index(drop=True)


# # --- FINAL VERIFICATION ---
# # Let's check the new label distribution to confirm it worked.
# # All counts should now be 30 or less.
# print("\n--- New Training Set Label Counts after Undersampling ---")
# print(final_train_df['labels'].value_counts())
# print(f"\nTotal size of the new, balanced training DataFrame: {len(final_train_df)}")

train_df = final_train_df

In [25]:
# categorical_cols = train_df.iloc[:,1:]
# for col in categorical_cols:
#     plt.figure(figsize=(6, 4))
#     sns.countplot(data=train_df, x=col, order=train_df[col].value_counts().index)
#     plt.title(f"Distribution of {col}")
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

In [26]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from huggingface_hub import login


import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load variables from .env
load_dotenv()

hf_token = os.getenv("HF_TOKEN")

if hf_token is None:
    raise ValueError("HF_TOKEN not found in .env file")

login(token=hf_token)


name = "google/gemma-2-2b"

# Define the 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4"
)



tokenizer = AutoTokenizer.from_pretrained(name)

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

token_lengths = [len(tokenizer.encode(text)) for text in train_dataset['text']]
int(np.percentile(token_lengths, 95))
#set maxlength as 40(multiple of 8)

40

In [29]:
# Now that our tokenizer has been properly loaded, we need to call the tokenizer
# for every example in the dataset. Here we use list comprehension with a
# lambda function ensure that.

tokenized_train_dataset = train_dataset.map(
    lambda example: tokenizer(example['text'], padding="max_length",
    truncation=True, max_length=40)
)

tokenized_val_dataset = val_dataset.map(
    lambda example: tokenizer(example['text'], padding="max_length",
    truncation=True, max_length=40)
)

# We need to remove these extra columns before the dataset can be sent to the
# dataloader and subsequently to the model. Also be sure to check that the
# output column is named labels or else rename if necessary
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text'])
tokenized_train_dataset.set_format("torch")

tokenized_val_dataset = tokenized_val_dataset.remove_columns(['text'])
tokenized_val_dataset.set_format("torch")

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/486 [00:00<?, ? examples/s]

In [30]:
import warnings
# This will ignore any warning that contains the specified text
import warnings
warnings.filterwarnings('ignore')

In [31]:
# --- IMPORTS ---
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import numpy as np
import os
import gc
from transformers import TrainingArguments, Trainer

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup,
    set_seed
)
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder

# Import PEFT libraries for LoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# --- 1. CONFIGURATION ---
# All your hyperparameters are now in one place for easy tuning.

MODEL_NAME = "google/gemma-2-2b"
MODEL_SAVE_PATH = "gemma2_model" # A descriptive name for the final model folder

# Training Hyperparameters
LEARNING_RATE = 5e-5  # LoRA can often use a higher learning rate
BATCH_SIZE = 2       # Batch size
NUM_EPOCHS = 3       # Number of epochs to train for
WEIGHT_DECAY = 0.05

# LoRA Specific Hyperparameters
LORA_R = 6            # LoRA rank (a key parameter to tune)
LORA_ALPHA = LORA_R * 2  # Standard practice is to set alpha to 2*r
LORA_DROPOUT = 0.1


# --- 2. SETUP ---
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)

# Label Encoding (assuming train_df is already loaded and preprocessed)
le = LabelEncoder()
le.fit(train_df['labels'])

# --- 3. DATA LOADING ---
# Create DataLoaders with the specified batch size
train_dataloader = DataLoader(
    tokenized_train_dataset, # Assumes this is pre-loaded
    batch_size=BATCH_SIZE,
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_val_dataset, # Assumes this is pre-loaded
    batch_size=BATCH_SIZE,
    shuffle=False
)

# --- 4. MODEL SETUP ---
# Define Quantization Config for 4-bit loading
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with quantization
config = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).config
config.num_labels = len(le.classes_)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config,
    quantization_config=quantization_config,
    device_map="auto"
)

# Prepare the quantized model for LoRA training
model = prepare_model_for_kbit_training(model)

# Define the LoRA configuration
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Specific to Gemma-2 model
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.SEQ_CLS # Specify the task
)

# Wrap the base model with LoRA adapters
model = get_peft_model(model, lora_config)

# Print a summary of the trainable parameters to verify LoRA is working
model.print_trainable_parameters()

# --- 5. OPTIMIZER, SCHEDULER, AND LOSS FUNCTION ---
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
loss_fct = nn.CrossEntropyLoss()

# --- 6. TRAINING & VALIDATION LOOP ---
best_val_loss = float("inf")
print("\n--- Starting Model Training ---")

for epoch in range(NUM_EPOCHS):
    # --- Training Phase ---
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} Training"):
        optimizer.zero_grad()

        # Data is already on the correct device due to device_map
        input_ids = batch['input_ids'].to(device) # Not needed with device_map
        attention_mask = batch['attention_mask'].to(device) # Not needed with device_map
        labels = torch.tensor(le.transform(batch['labels'])).to(device)

        # Forward pass
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = output.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # --- Validation Phase ---
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = torch.tensor(le.transform(batch['labels'])).to(device)
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_val_loss += output.loss.item()

    avg_val_loss = total_val_loss / len(eval_dataloader)

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print(f"  Average Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")

    # --- Save the best model ---
    if avg_val_loss < best_val_loss:
        print(f"  Validation loss improved from {best_val_loss:.4f} to {avg_val_loss:.4f}. Saving model...")
        best_val_loss = avg_val_loss
        # Use save_pretrained for LoRA models
        model.save_pretrained(MODEL_SAVE_PATH)
        # Also save the tokenizer for easy loading later
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

print("\n--- Training Complete ---")
print(f"Best validation loss achieved: {best_val_loss:.4f}")
print(f"Best model saved to: {MODEL_SAVE_PATH}")

# # --- 7. CLEANUP ---
del model, optimizer, lr_scheduler
gc.collect()
torch.cuda.empty_cache()

In [33]:
# # --- IMPORTS ---
# import pandas as pd
# from tqdm.notebook import tqdm
# import torch
# import torch.nn as nn
# import numpy as np
# import os
# import gc
# from transformers import TrainingArguments, Trainer
# import itertools
# import json
# from datetime import datetime

# from transformers import (
#     AutoModelForSequenceClassification,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     get_linear_schedule_with_warmup,
#     set_seed
# )
# from torch.utils.data import DataLoader
# from sklearn.preprocessing import LabelEncoder

# # Import PEFT libraries for LoRA
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# # --- 1. GRID SEARCH CONFIGURATION ---
# GRID_SEARCH_PARAMS = {
#     'LORA_R': [4, 6, 8],                    # LoRA rank values to test
#     'LEARNING_RATE': [5e-5, 1e-4],       # Learning rates to test
#     'BATCH_SIZE': [2, 4],                   # Batch sizes to test
#     'NUM_EPOCHS': [3],                   # Number of epochs to test
#     'WEIGHT_DECAY': [0.01, 0.05],        # Weight decay values
#     'LORA_DROPOUT': [0.05, 0.1]           # LoRA dropout values
# }

# # Fixed parameters
# FIXED_PARAMS = {
#     'MODEL_NAME': "google/gemma-2-2b",
#     'MODEL_SAVE_PATH': "gemma2_lora_grid_search_results"
# }

# # --- 2. GRID SEARCH FUNCTIONS ---
# def generate_param_combinations(param_grid):
#     """Generate all possible parameter combinations"""
#     keys = param_grid.keys()
#     values = param_grid.values()
#     combinations = []

#     for combination in itertools.product(*values):
#         param_dict = dict(zip(keys, combination))
#         combinations.append(param_dict)

#     return combinations

# def create_model_with_params(params):
#     """Create and configure model with specific parameters"""
#     # Define Quantization Config for 4-bit loading
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.bfloat16
#     )

#     # Load the base model with quantization
#     config = AutoModelForSequenceClassification.from_pretrained(FIXED_PARAMS['MODEL_NAME']).config
#     config.num_labels = len(le.classes_)

#     model = AutoModelForSequenceClassification.from_pretrained(
#         FIXED_PARAMS['MODEL_NAME'],
#         config=config,
#         quantization_config=quantization_config,
#         device_map="auto"
#     )

#     # Prepare the quantized model for LoRA training
#     model = prepare_model_for_kbit_training(model)

#     # Define the LoRA configuration with current parameters
#     lora_config = LoraConfig(
#         r=params['LORA_R'],
#         lora_alpha=params['LORA_R'] * 2,  # Standard practice: alpha = 2*r
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#         lora_dropout=params['LORA_DROPOUT'],
#         bias="none",
#         task_type=TaskType.SEQ_CLS
#     )

#     # Wrap the base model with LoRA adapters
#     model = get_peft_model(model, lora_config)

#     return model

# def train_with_params(params, train_dataloader, eval_dataloader, experiment_id):
#     """Train model with specific parameters and return results"""
#     print(f"\n=== Experiment {experiment_id} ===")
#     print(f"Parameters: {params}")

#     # Create model
#     model = create_model_with_params(params)

#     # Setup optimizer and scheduler
#     optimizer = torch.optim.AdamW(
#         model.parameters(),
#         lr=params['LEARNING_RATE'],
#         weight_decay=params['WEIGHT_DECAY']
#     )

#     num_training_steps = len(train_dataloader) * params['NUM_EPOCHS']
#     lr_scheduler = get_linear_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=0,
#         num_training_steps=num_training_steps
#     )

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     loss_fct = nn.CrossEntropyLoss()

#     # Training history
#     training_history = {
#         'train_losses': [],
#         'val_losses': [],
#         'epochs': []
#     }

#     best_val_loss = float("inf")
#     early_stop_patience = 2
#     no_improve_count = 0

#     # Training loop
#     for epoch in range(params['NUM_EPOCHS']):
#         # Training phase
#         model.train()
#         total_train_loss = 0

#         train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{params['NUM_EPOCHS']} Training")
#         for batch in train_progress:
#             optimizer.zero_grad()

#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = torch.tensor(le.transform(batch['labels'])).to(device)

#             output = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 labels=labels
#             )
#             loss = output.loss
#             total_train_loss += loss.item()

#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             lr_scheduler.step()

#             train_progress.set_postfix({'loss': f'{loss.item():.4f}'})

#         avg_train_loss = total_train_loss / len(train_dataloader)

#         # Validation phase
#         model.eval()
#         total_val_loss = 0

#         with torch.no_grad():
#             val_progress = tqdm(eval_dataloader, desc=f"Epoch {epoch+1}/{params['NUM_EPOCHS']} Validation")
#             for batch in val_progress:
#                 input_ids = batch['input_ids'].to(device)
#                 attention_mask = batch['attention_mask'].to(device)
#                 labels = torch.tensor(le.transform(batch['labels'])).to(device)

#                 output = model(
#                     input_ids=input_ids,
#                     attention_mask=attention_mask,
#                     labels=labels
#                 )
#                 total_val_loss += output.loss.item()

#                 val_progress.set_postfix({'val_loss': f'{output.loss.item():.4f}'})

#         avg_val_loss = total_val_loss / len(eval_dataloader)

#         # Record history
#         training_history['train_losses'].append(avg_train_loss)
#         training_history['val_losses'].append(avg_val_loss)
#         training_history['epochs'].append(epoch + 1)

#         print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

#         # Early stopping check
#         if avg_val_loss < best_val_loss:
#             best_val_loss = avg_val_loss
#             no_improve_count = 0
#         else:
#             no_improve_count += 1

#         if no_improve_count >= early_stop_patience:
#             print(f"Early stopping triggered after epoch {epoch+1}")
#             break

#     # Calculate overfitting metrics
#     final_train_loss = training_history['train_losses'][-1]
#     final_val_loss = training_history['val_losses'][-1]
#     overfitting_gap = final_val_loss - final_train_loss

#     # Cleanup
#     del model, optimizer, lr_scheduler
#     gc.collect()
#     torch.cuda.empty_cache()

#     return {
#         'experiment_id': experiment_id,
#         'params': params,
#         'best_val_loss': best_val_loss,
#         'final_train_loss': final_train_loss,
#         'final_val_loss': final_val_loss,
#         'overfitting_gap': overfitting_gap,
#         'training_history': training_history,
#         'completed_epochs': len(training_history['epochs'])
#     }

# # --- 3. MAIN GRID SEARCH EXECUTION ---
# def run_grid_search():
#     """Execute the complete grid search"""
#     # Setup
#     set_seed(42)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     print(f"Using device: {device}")

#     # Create results directory
#     results_dir = FIXED_PARAMS['MODEL_SAVE_PATH']
#     if not os.path.exists(results_dir):
#         os.makedirs(results_dir)

#     # Load your data (assuming these are already defined)
#     # Make sure train_df, tokenized_train_dataset, tokenized_val_dataset are available
#     global le
#     le = LabelEncoder()
#     le.fit(train_df['labels'])  # Make sure train_df is loaded

#     # Generate all parameter combinations
#     param_combinations = generate_param_combinations(GRID_SEARCH_PARAMS)
#     total_combinations = len(param_combinations)

#     print(f"\nStarting Grid Search with {total_combinations} parameter combinations")
#     print("=" * 60)

#     # Store all results
#     all_results = []

#     # Run grid search
#     for i, params in enumerate(param_combinations, 1):
#         try:
#             # Create data loaders with current batch size
#             train_dataloader = DataLoader(
#                 tokenized_train_dataset,
#                 batch_size=params['BATCH_SIZE'],
#                 shuffle=True
#             )
#             eval_dataloader = DataLoader(
#                 tokenized_val_dataset,
#                 batch_size=params['BATCH_SIZE'],
#                 shuffle=False
#             )

#             # Train with current parameters
#             result = train_with_params(params, train_dataloader, eval_dataloader, i)
#             all_results.append(result)

#             # Save intermediate results
#             with open(f"{results_dir}/intermediate_results.json", 'w') as f:
#                 json.dump(all_results, f, indent=2, default=str)

#             print(f"Completed {i}/{total_combinations} experiments")

#         except Exception as e:
#             print(f"Error in experiment {i}: {str(e)}")
#             continue

#     # Analyze and save final results
#     analyze_results(all_results, results_dir)
#     return all_results

# def analyze_results(results, results_dir):
#     """Analyze grid search results and save findings"""
#     if not results:
#         print("No results to analyze!")
#         return

#     # Sort by validation loss
#     results_sorted = sorted(results, key=lambda x: x['best_val_loss'])

#     print("\n" + "="*60)
#     print("GRID SEARCH RESULTS ANALYSIS")
#     print("="*60)

#     # Best performing configurations
#     print("\nTOP 5 BEST CONFIGURATIONS:")
#     print("-" * 40)
#     for i, result in enumerate(results_sorted[:5], 1):
#         params = result['params']
#         print(f"\n{i}. Best Val Loss: {result['best_val_loss']:.4f}")
#         print(f"   Overfitting Gap: {result['overfitting_gap']:.4f}")
#         print(f"   Epochs Completed: {result['completed_epochs']}")
#         print(f"   LoRA_R: {params['LORA_R']}, LR: {params['LEARNING_RATE']:.2e}")
#         print(f"   Batch Size: {params['BATCH_SIZE']}, Weight Decay: {params['WEIGHT_DECAY']}")
#         print(f"   LoRA Dropout: {params['LORA_DROPOUT']}")

#     # Configurations with least overfitting
#     results_by_overfitting = sorted(results, key=lambda x: x['overfitting_gap'])
#     print("\n\nTOP 5 LEAST OVERFITTING CONFIGURATIONS:")
#     print("-" * 40)
#     for i, result in enumerate(results_by_overfitting[:5], 1):
#         params = result['params']
#         print(f"\n{i}. Overfitting Gap: {result['overfitting_gap']:.4f}")
#         print(f"   Best Val Loss: {result['best_val_loss']:.4f}")
#         print(f"   LoRA_R: {params['LORA_R']}, LR: {params['LEARNING_RATE']:.2e}")
#         print(f"   Batch Size: {params['BATCH_SIZE']}, LoRA Dropout: {params['LORA_DROPOUT']}")

#     # Save detailed results
#     results_df = pd.DataFrame([
#         {
#             **result['params'],
#             'best_val_loss': result['best_val_loss'],
#             'final_train_loss': result['final_train_loss'],
#             'final_val_loss': result['final_val_loss'],
#             'overfitting_gap': result['overfitting_gap'],
#             'completed_epochs': result['completed_epochs']
#         }
#         for result in results
#     ])

#     results_df.to_csv(f"{results_dir}/grid_search_results.csv", index=False)

#     # Save complete results with training history
#     with open(f"{results_dir}/complete_results.json", 'w') as f:
#         json.dump(results, f, indent=2, default=str)

#     print(f"\nResults saved to {results_dir}/")
#     print("- grid_search_results.csv: Summary table")
#     print("- complete_results.json: Full results with training history")

#     return results_sorted[0]  # Return best configuration

# # --- 4. EXECUTION ---
# if __name__ == "__main__":
#     # Run the grid search
#     best_results = run_grid_search()

#     print("\nGrid Search Complete!")
#     print("Check the results directory for detailed analysis.")


Using device: cuda

Starting Grid Search with 48 parameter combinations

=== Experiment 1 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.8068, Val Loss: 2.7640


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.4511, Val Loss: 2.8106


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.9941, Val Loss: 2.6843
Completed 1/48 experiments

=== Experiment 2 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.1835, Val Loss: 2.5650


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.4365, Val Loss: 2.5394


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.0418, Val Loss: 2.5127
Completed 2/48 experiments

=== Experiment 3 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.6488, Val Loss: 2.7275


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.3961, Val Loss: 2.4464


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.0532, Val Loss: 2.4901
Completed 3/48 experiments

=== Experiment 4 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.8589, Val Loss: 2.9230


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.4302, Val Loss: 2.7133


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.9209, Val Loss: 2.5786
Completed 4/48 experiments

=== Experiment 5 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.8703, Val Loss: 2.9643


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.5678, Val Loss: 2.6255


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.1266, Val Loss: 2.6017
Completed 5/48 experiments

=== Experiment 6 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.7195, Val Loss: 3.3154


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.6412, Val Loss: 2.8127


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.1451, Val Loss: 2.7661
Completed 6/48 experiments

=== Experiment 7 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.1950, Val Loss: 2.9417


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.6991, Val Loss: 2.6217


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.2558, Val Loss: 2.5921
Completed 7/48 experiments

=== Experiment 8 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.2438, Val Loss: 2.9278


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.6474, Val Loss: 2.6601


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.2410, Val Loss: 2.5474
Completed 8/48 experiments

=== Experiment 9 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.7813, Val Loss: 2.8990


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.0696, Val Loss: 2.4710


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.2163, Val Loss: 2.6268
Completed 9/48 experiments

=== Experiment 10 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.5385, Val Loss: 2.3670


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.1367, Val Loss: 2.6759


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.4001, Val Loss: 2.5145
Early stopping triggered after epoch 3
Completed 10/48 experiments

=== Experiment 11 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.7263, Val Loss: 2.5028


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.1961, Val Loss: 2.5484


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.5702, Val Loss: 2.4050
Completed 11/48 experiments

=== Experiment 12 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.3635, Val Loss: 2.7535


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.1385, Val Loss: 2.4147


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.5159, Val Loss: 2.5351
Completed 12/48 experiments

=== Experiment 13 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.9639, Val Loss: 2.4290


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.2832, Val Loss: 2.7526


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.7365, Val Loss: 2.6314
Early stopping triggered after epoch 3
Completed 13/48 experiments

=== Experiment 14 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.5926, Val Loss: 2.5193


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.1992, Val Loss: 2.3809


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.7726, Val Loss: 2.4918
Completed 14/48 experiments

=== Experiment 15 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.3940, Val Loss: 2.9173


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.3095, Val Loss: 2.7636


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.6083, Val Loss: 2.6841
Completed 15/48 experiments

=== Experiment 16 ===
Parameters: {'LORA_R': 4, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.9157, Val Loss: 3.4492


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.3829, Val Loss: 2.9545


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.8005, Val Loss: 2.7241
Completed 16/48 experiments

=== Experiment 17 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.8066, Val Loss: 2.7436


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.2318, Val Loss: 2.6548


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.6524, Val Loss: 2.7687
Completed 17/48 experiments

=== Experiment 18 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.7661, Val Loss: 2.8657


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.3632, Val Loss: 2.6588


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.8145, Val Loss: 2.5906
Completed 18/48 experiments

=== Experiment 19 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.0819, Val Loss: 3.1006


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.4445, Val Loss: 2.7191


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.8795, Val Loss: 2.5465
Completed 19/48 experiments

=== Experiment 20 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 2, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.05, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.3578, Val Loss: 2.5478


Epoch 2/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.4044, Val Loss: 2.6180


Epoch 3/3 Training:   0%|          | 0/288 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/243 [00:00<?, ?it/s]

Epoch 3: Train Loss: 1.8745, Val Loss: 2.3737
Completed 20/48 experiments

=== Experiment 21 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.05}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 1: Train Loss: 4.2069, Val Loss: 3.0518


Epoch 2/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 2/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 2: Train Loss: 2.5388, Val Loss: 2.8829


Epoch 3/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 3/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

Epoch 3: Train Loss: 2.0869, Val Loss: 2.7754
Completed 21/48 experiments

=== Experiment 22 ===
Parameters: {'LORA_R': 6, 'LEARNING_RATE': 5e-05, 'BATCH_SIZE': 4, 'NUM_EPOCHS': 3, 'WEIGHT_DECAY': 0.01, 'LORA_DROPOUT': 0.1}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 Training:   0%|          | 0/144 [00:00<?, ?it/s]

Epoch 1/3 Validation:   0%|          | 0/122 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [39]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel  # Notice we are not using LoraConfig or get_peft_model here
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda")
    compute_dtype = torch.float16
else:
    device = torch.device("cpu")
    compute_dtype = torch.bfloat16
print(f"Using device: {device}")


print("--- Loading base model and attaching LoRA adapter ---")

# --- Step 2: Load the base model without quantization ---
model_name = "google/gemma-2-2b"
config = AutoModelForSequenceClassification.from_pretrained(model_name).config
config.num_labels = 12

# Load the base model in half-precision (float16) to save memory.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    torch_dtype=compute_dtype,
)

# --- Step 3: Load the LoRA adapter onto the base model ---
# This merges your saved adapter weights with the base model.
# Point this to the FOLDER you created with `save_pretrained`.
adapter_path = "/content/gemma2_lora_grid_finetuned_model"
try:
    model = PeftModel.from_pretrained(base_model, adapter_path)
    print(f"Successfully loaded the LoRA adapter from '{adapter_path}'.")
except Exception as e:
    print(f"Could not find or load adapter from folder '{adapter_path}'. Make sure the folder exists and is not corrupted. Error: {e}")
    # Stop execution if this fails

# --- Step 4: Move the final, merged model to the MPS device ---
model.to(device)


# --- Step 5: Run your evaluation loop ---
model.eval()
test_batch_logits = []
y_true = []

eval_dataloader = DataLoader(
    tokenized_val_dataset,
    batch_size=2,
    shuffle=False
)

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = torch.tensor(le.transform(batch['labels'])).to(device)
        y_true.extend(labels.cpu().numpy())

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        test_batch_logits.append(output.logits.cpu())

print("\n--- Evaluation Complete ---")
test_logits = torch.cat(test_batch_logits, dim=0)

Using device: cuda
--- Loading base model and attaching LoRA adapter ---


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Could not find or load adapter from folder '/content/gemma2_lora_grid_finetuned_model'. Make sure the folder exists and is not corrupted. Error: Can't find 'adapter_config.json' at '/content/gemma2_lora_grid_finetuned_model'


Evaluating:   0%|          | 0/243 [00:00<?, ?it/s]


--- Evaluation Complete ---


In [40]:
print(len(test_batch_logits),len(eval_dataloader))
test_logits = torch.cat(test_batch_logits, dim=0)

#sanity check -> dimension 0 of your logits tensor should be same as the size of the test dataset
print(test_logits.shape,len(tokenized_val_dataset),len(y_true))

243 243
torch.Size([486, 12]) 486 486


In [41]:
#Convert the logits to predicted labels
y_pred = torch.argmax(torch.cat(test_batch_logits, dim=0), dim=1).cpu().numpy()

print(y_true[:])
print(y_pred[:])

#sanity check: should have as many predictions as labels
assert len(y_pred)==len(y_true)

[np.int64(11), np.int64(8), np.int64(0), np.int64(11), np.int64(1), np.int64(11), np.int64(1), np.int64(1), np.int64(3), np.int64(10), np.int64(10), np.int64(1), np.int64(1), np.int64(10), np.int64(11), np.int64(8), np.int64(1), np.int64(11), np.int64(8), np.int64(11), np.int64(11), np.int64(11), np.int64(11), np.int64(1), np.int64(8), np.int64(8), np.int64(10), np.int64(11), np.int64(8), np.int64(11), np.int64(11), np.int64(1), np.int64(1), np.int64(11), np.int64(4), np.int64(11), np.int64(0), np.int64(8), np.int64(11), np.int64(1), np.int64(3), np.int64(10), np.int64(8), np.int64(1), np.int64(8), np.int64(7), np.int64(6), np.int64(1), np.int64(10), np.int64(10), np.int64(10), np.int64(6), np.int64(0), np.int64(11), np.int64(0), np.int64(2), np.int64(8), np.int64(11), np.int64(10), np.int64(11), np.int64(10), np.int64(11), np.int64(8), np.int64(3), np.int64(11), np.int64(8), np.int64(10), np.int64(10), np.int64(7), np.int64(4), np.int64(8), np.int64(10), np.int64(11), np.int64(11), np

In [42]:
from sklearn.metrics import accuracy_score, f1_score
# call the f1_score function
print('F1 Score:',f1_score(y_true,y_pred,average='macro'))

# call the accuracy_score function
print('Accuracy Score:',accuracy_score(y_true, y_pred))

from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=le.classes_))

class_names = [
    'cultural reference', 'dark humor', 'hyperbole', 'irony', 'none', 'other',
    'puns', 'sarcasm', 'satire', 'self-deprecation', 'vulgarity', 'wordplay'
]
report_dict = classification_report(y_true, y_pred,
                                    target_names=class_names,
                                    output_dict=True,
                                    zero_division=0 )

F1 Score: 0.09468462906883612
Accuracy Score: 0.205761316872428
                    precision    recall  f1-score   support

cultural reference       0.06      0.12      0.08        41
        dark humor       0.15      0.05      0.08        73
         hyperbole       0.05      0.14      0.07        14
             irony       0.00      0.00      0.00        18
              none       0.00      0.00      0.00         9
             other       0.00      0.00      0.00         1
              puns       0.00      0.00      0.00         8
           sarcasm       0.07      0.12      0.08        25
            satire       0.20      0.21      0.21        81
  self-deprecation       0.00      0.00      0.00         1
         vulgarity       0.39      0.13      0.20        52
          wordplay       0.46      0.38      0.42       163

          accuracy                           0.21       486
         macro avg       0.11      0.10      0.09       486
      weighted avg       0.26     

In [None]:
report_dict

In [None]:
df_report = pd.DataFrame(report_dict).transpose()
accuracy = report_dict['accuracy']

# --- Step 4: Prepare the DataFrame for plotting ---
df_plot = df_report.drop(['accuracy', 'macro avg', 'weighted avg'])
df_plot_scores = df_plot.drop('support', axis=1)

# Sort the classes by f1-score for a more insightful visualization
df_plot_scores = df_plot_scores.sort_values(by='f1-score', ascending=True)


sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)

# Create the horizontal bar plot from our prepared DataFrame
ax = df_plot_scores.plot(
    kind='barh',
    width=0.8,
    color=sns.color_palette('deep', n_colors=3) # A nice color palette
)

# sns.barplot(
#     data=df_plot_scores,
#     y='Class',
#     x='Score',
#     hue='model',
#     palette='muted'  # Choose your palette
# )

# Customize the plot
plt.title(f'Model Performance by Class', fontsize=18, pad=20)
plt.xlabel('Score', fontsize=15)
plt.ylabel('Class', fontsize=15)
plt.xlim(0, 1.0) # Standardize axis for metrics
plt.legend(title='Metric', fontsize=12, title_fontsize=13)
plt.yticks(fontsize=15)

# Add the support count to the y-axis labels for context
# Get the support counts from the original dataframe in the new sorted order
support_counts = df_plot.reindex(df_plot_scores.index)['support'].astype(int)
new_yticklabels = [f"{label}\n (n={count})" for label, count in support_counts.items()]
ax.set_yticklabels(new_yticklabels)

# Add data labels to each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', label_type='edge', padding=4, fontsize=14)

plt.tight_layout()
plt.show()

# (Optional) Print the generated DataFrame to see the structure
print("--- Classification Report as a Pandas DataFrame ---")
print(df_report)