In [1]:
!pip install -U torch torchvision transformers datasets peft accelerate bitsandbytes wandb matplotlib sentencepiece huggingface_hub dotenv nbformat optuna --no-cache-dir



In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing all the modules

In [None]:
import gc
import math
import os

import bitsandbytes as bnb
import huggingface_hub
import optuna
import pandas as pd
import torch
import wandb
from datasets import Dataset
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Read variables from a .env file into the process environment.
load_dotenv()

# Either lookup yields None when the variable is not defined.
hf_token = os.environ.get("huggingface_token")
wandb_key = os.environ.get("wandb_key")

# Authenticate with the Hugging Face Hub, then with Weights & Biases.
huggingface_hub.login(token=hf_token)
wandb.login(key=wandb_key)

## Importing Model and Tokenizer

In [6]:
# Checkpoint used for both the hyperparameter search and the final fine-tuning run.
model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token (tokenize() pads to max_length).
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
# The flag is `load_in_4bit` (singular) — a misspelled keyword is not applied.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Importing Dataset and tokenizing

In [7]:
# Load the JSON-lines file, then take rows 0-49 for training and rows 50-59
# for validation during the hyperparameter search.
df = pd.read_json("./train (1).jsonl", lines=True)
df_train, df_val = df[:50], df[50:60]

def tokenize(x):
    """Tokenize the ``"text"`` field of ``x`` with the notebook's tokenizer.

    Sequences are truncated and padded so that each one is exactly 1024 tokens long.
    Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,
    }
    return tokenizer(x["text"], **encode_options)

# Wrap the pandas slices as Hugging Face datasets.
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

# Tokenize in batches of 32. Each split drops *its own* original columns (as a plain
# list of names) so that only the tokenizer's outputs remain.
ds_train = ds_train.map(
    tokenize,
    batched = True,
    batch_size = 32,
    remove_columns = ds_train.column_names,
    desc = "Tokenizing train dataset"
)

ds_val = ds_val.map(
    tokenize,
    batched = True,
    batch_size = 32,
    remove_columns = ds_val.column_names,
    desc = "Tokenizing val dataset"
)

Tokenizing train dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Tokenizing val dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

## Setting Lora Configurations

In [8]:

# LoRA adapter settings: rank 8, alpha 16, dropout 0.1, applied to the modules
# named "q_proj" and "v_proj", with bias set to "none", for a causal-LM task.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["q_proj", "v_proj"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**lora_settings)

## Defining Training Arguments and Trainer for hyperparameter finetuning

### Training Arguments
> `TrainingArguments` is a configuration class from **Hugging Face Transformers** that holds the training hyperparameters and runtime settings for `Trainer`.

- `output_dir`: Directory where the checkpoints, logs and outputs are stored.
- `per_device_train_batch_size`: Batch size per GPU/CPU for training.
- `per_device_eval_batch_size`: Batch size per device during evaluation.
- `num_train_epochs`: How many epochs to train for.
- `learning_rate`: Learning rate for the optimizer.
- `bf16`: Enables bfloat16 mixed-precision training. bfloat16 has a wide exponent range but fewer mantissa bits; it reduces memory use and speeds up training.
- `logging_steps`: how often to log metrics, setting it to 'x' means it will log metrics every 'x' training steps.
- `eval_strategy`: When to run evaluation ("steps" or "epoch")
- `eval_steps`: How often to evaluate if using strategy "steps"
- `save_steps`: How often to save checkpoints; setting it to 'x' means a checkpoint is saved every 'x' training steps.
- `save_total_limit`: Maximum number of checkpoints to keep. The oldest checkpoints are deleted as new ones are saved.
- `report_to`: Where to send logs ("wandb")

### Trainer Arguments
> `Trainer` is a class from **HuggingFace Transformers** that handles the training loops for models

- `model`: The model to train.
- `args`: The `TrainingArguments` object containing all hyperparameter and runtime settings.
- `train_dataset`: The training dataset used.
- `eval_dataset`: The evaluation dataset used.
- `data_collator`: Batches samples together, it needs the tokenizer to get the pad token id, create attention masks and handle special tokens.

In [None]:

# Training configuration shared by every hyperparameter-search trial.
# learning_rate, per_device_train_batch_size and warmup_steps are overridden per trial
# by the search space; the remaining values stay fixed.
training_args = TrainingArguments(
    # Relative path into the mounted Drive folder ('./drive/...'), matching the
    # final-training output_dir; '.drive/...' would create a hidden local directory.
    output_dir = './drive/MyDrive/outputs/hp_search',
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 8,
    gradient_checkpointing = True,
    num_train_epochs = 3,
    learning_rate = 2e-4,
    bf16 = True,
    logging_steps = 10,
    eval_strategy = "steps",
    eval_steps = 100,
    save_steps = 100,
    save_total_limit = 2,
)

# Collator that batches the tokenized samples; mlm = False disables masked-language-modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False
)


class PerplexityCallback(TrainerCallback):
    """Trainer callback that derives perplexity from the evaluation loss.

    After an evaluation that produced ``eval_loss``, stores ``exp(eval_loss)`` in the
    metrics dict under ``"eval_perplexity"`` and, when a Weights & Biases run is
    active, logs the same value there.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Add ``eval_perplexity`` to ``metrics``; do nothing if ``eval_loss`` is absent."""
        if not metrics or "eval_loss" not in metrics:
            return
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp raises for very large losses; report infinity instead of
            # aborting the evaluation.
            perplexity = float("inf")
        metrics["eval_perplexity"] = perplexity
        # wandb.log needs an active run; skip it when none has been started
        # (this cell's TrainingArguments set no report_to).
        if wandb.run is not None:
            wandb.log({"eval_perplexity": perplexity})

# Load the base model once and cache it; model_init() re-wraps this same instance
# for every trial instead of reloading the checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )
# One-time preparation of the loaded model for k-bit training.
base_model = prepare_model_for_kbit_training(base_model)
model = base_model      # keeps the module-level name `model` bound, as before
initial_state = None    # unused placeholder retained from the original cell
_active_peft_model = None  # PEFT wrapper handed out by the most recent model_init() call

def model_init():
    """Return the cached base model wrapped with freshly initialised LoRA adapters.

    Takes no arguments. On every call after the first, the adapters created by the
    previous call are removed (without merging) before new ones are attached, so each
    trial starts from the same base weights.
    """
    # Declared global so the cached objects are read and updated here; assigning to an
    # undeclared name inside the function would make it local and fail on first read.
    global base_model, _active_peft_model
    if _active_peft_model is not None:
        base_model = _active_peft_model.unload()
        _active_peft_model = None
    gc.collect()
    torch.cuda.empty_cache()
    _active_peft_model = get_peft_model(base_model, lora_config)
    return _active_peft_model

# Trainer used for the hyperparameter search. It is given `model_init` rather than a
# model instance, together with the search datasets, the collator and the
# perplexity callback.
search_trainer_options = {
    "model_init": model_init,
    "args": training_args,
    "train_dataset": ds_train,
    "eval_dataset": ds_val,
    "data_collator": data_collator,
    "callbacks": [PerplexityCallback()],
}
trainer = Trainer(**search_trainer_options)


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

## HyperParameter Finetuning

Uses Optuna as the framework to find the best hyperparameters for our model.

In [None]:


def clear_gpu():
    """Run the garbage collector, empty PyTorch's CUDA cache, and print a confirmation."""
    # Order matters: collect unreachable Python objects first, then release the cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()
    print("GPU memory cleared.")

def hyperparameter_search(trainer, n_trials=2):
    """Run an Optuna hyperparameter search through ``trainer`` and return the best run.

    Args:
        trainer: Object exposing ``hyperparameter_search(...)``; the notebook passes a
            Hugging Face ``Trainer`` built with ``model_init``.
        n_trials: Number of Optuna trials to run. Defaults to 2.

    Returns:
        The value returned by ``trainer.hyperparameter_search`` (the best run).
    """

    def hp_space(trial):
        # Every suggestion is registered under the same name as the dict key it fills,
        # so the parameters Optuna records for a trial use the TrainingArguments names.
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
            "per_device_train_batch_size": trial.suggest_categorical(
                "per_device_train_batch_size", [2, 4]
            ),
            "warmup_steps": trial.suggest_int("warmup_steps", 2, 3),
        }

    return trainer.hyperparameter_search(
        backend="optuna",
        direction="minimize",
        hp_space=hp_space,
        n_trials=n_trials,
        # The objective to minimise is the evaluation loss.
        compute_objective=lambda metrics: metrics["eval_loss"],
        # Garbage-collect after each trial. The optuna backend accepts `gc_after_trial`
        # (along with `timeout` and `n_jobs`); it does not take a `callbacks` argument.
        gc_after_trial=True,
    )

# Launch the search on the Trainer built above and show the winning configuration.
best_hyperparams = hyperparameter_search(trainer=trainer)
print("Best hyperparameters:", best_hyperparams)


[I 2026-02-01 10:51:16,516] A new study created in memory with name: no-name-d70e0aba-eef7-4b32-80c9-7b73487b8af8


Step,Training Loss,Validation Loss


## Actual Training

### Dataset Object preparation

In [None]:
# Larger subsets for the final run: rows 0-99 for training, rows 100-109 for validation.
df_train = df[:100]
df_val = df[100:110]

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

# Tokenize in batches of 32. Each split drops *its own* original columns (as a plain
# list of names) so that only the tokenizer's outputs remain.
ds_train = ds_train.map(
    tokenize,
    batched = True,
    batch_size = 32,
    remove_columns = ds_train.column_names,
    desc = "Tokenizing train dataset"
)

ds_val = ds_val.map(
    tokenize,
    batched = True,
    batch_size = 32,
    remove_columns = ds_val.column_names,
    desc = "Tokenizing val dataset"
)

In [None]:
# Release cached CUDA memory before loading a fresh copy of the base model.
torch.cuda.empty_cache()

final_load_options = {"quantization_config": bnb_config, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **final_load_options)

# Prepare the loaded model for k-bit training, attach the LoRA adapters, then
# switch on gradient checkpointing on the wrapped model.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
model.gradient_checkpointing_enable()

### Training arguments and trainer

In [None]:
# Resolve the tuned values first, so a lookup problem surfaces before a W&B run is opened.
best_params = best_hyperparams.hyperparameters

best_learning_rate = best_params["learning_rate"]
best_warmup = best_params["warmup_steps"]

# The batch size may be recorded under the TrainingArguments name or under the shorter
# "batch_size" label used when the suggestion was registered; accept either and fall
# back to the search cell's fixed value of 2.
if "per_device_train_batch_size" in best_params:
    best_batch_size = best_params["per_device_train_batch_size"]
else:
    best_batch_size = best_params.get("batch_size", 2)

# num_train_epochs is not part of the search space; the search cell trains for 3 epochs.
best_epochs = best_params.get("num_train_epochs", 3)

wandb.init(project = "llama-finetune", name="final-training")

# Final-run configuration: tuned learning rate, batch size and warmup; logs go to W&B.
training_args = TrainingArguments(
    output_dir = './drive/MyDrive/outputs',
    per_device_train_batch_size = best_batch_size,
    per_device_eval_batch_size = best_batch_size,
    gradient_accumulation_steps = 8,
    gradient_checkpointing = True,
    num_train_epochs = best_epochs,
    learning_rate = best_learning_rate,
    warmup_steps = best_warmup,
    bf16 = True,
    logging_steps = 10,
    eval_strategy = "steps",
    eval_steps = 100,
    save_steps = 100,
    save_total_limit = 2,
    report_to = "wandb"
)

# Collator that batches the tokenized samples; mlm=False disables masked-language-modeling.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


class PerplexityCallback(TrainerCallback):
    """Trainer callback that derives perplexity from the evaluation loss.

    After an evaluation that produced ``eval_loss``, stores ``exp(eval_loss)`` in the
    metrics dict under ``"eval_perplexity"`` and, when a Weights & Biases run is
    active, logs the same value there.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Add ``eval_perplexity`` to ``metrics``; do nothing if ``eval_loss`` is absent."""
        if not metrics or "eval_loss" not in metrics:
            return
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp raises for very large losses; report infinity instead of
            # aborting the evaluation.
            perplexity = float("inf")
        metrics["eval_perplexity"] = perplexity
        # wandb.log needs an active run; skip it if the run has already been finished
        # or was never started.
        if wandb.run is not None:
            wandb.log({"eval_perplexity": perplexity})

# Trainer for the final run: operates on the already-wrapped `model` with the tuned
# arguments, the final datasets, the collator and the perplexity callback.
perplexity_callback = PerplexityCallback()
trainer = Trainer(
    callbacks = [perplexity_callback],
    data_collator = data_collator,
    eval_dataset = ds_val,
    train_dataset = ds_train,
    args = training_args,
    model = model,
)

### Start Training
> Setting `resume_from_checkpoint` to `True` resumes training from the last checkpoint in the `output_dir`.

In [None]:
# Resume from the newest checkpoint in output_dir when one exists; otherwise start
# from scratch. (resume_from_checkpoint=True raises if no checkpoint is found.)
checkpoint_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

try:
    trainer.train(resume_from_checkpoint = last_checkpoint)
    # Save the trained model and the tokenizer side by side.
    trainer.save_model("./outputs/final")
    tokenizer.save_pretrained("./outputs/final")
finally:
    # Close the W&B run even if training or saving fails.
    wandb.finish()