In [1]:
# !pip install -U torch torchvision transformers datasets peft accelerate bitsandbytes wandb matplotlib sentencepiece huggingface_hub dotenv nbformat optuna --no-cache-dir

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# Importing all the modules

In [3]:
import gc
import math
import os

import bitsandbytes as bnb
import huggingface_hub
import optuna
import pandas as pd
import torch
import transformers
import wandb
from datasets import Dataset
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from transformers.trainer_utils import get_last_checkpoint

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load variables from a .env file (if one is found) into the process
# environment, so the credentials never have to be written in the notebook.
load_dotenv()
hf_token = os.environ.get("huggingface_token")
wandb_key = os.environ.get("wandb_key")

# Authenticate against the Hugging Face Hub and Weights & Biases.
huggingface_hub.login(token=hf_token)
wandb.login(key=wandb_key)

[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/sanja/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msanjayashrestha777[0m ([33msanjayashrestha777-thapathali-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Importing Model and Tokenizer

In [5]:
# Checkpoint shared by the tokenizer and every model load in this notebook.
model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    # The keyword must be spelled `load_in_4bit` for 4-bit loading to be
    # requested; any other spelling does not set the option.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

## Importing Dataset and tokenizing

In [6]:
# Read the JSON-lines file once, then carve out a small search subset:
# rows 0-49 for training and rows 50-59 for validation.
df = pd.read_json("../Datasets/train (1).jsonl", lines=True)
df_train = df.iloc[:50]
df_val = df.iloc[50:60]

def tokenize(x, max_length=1024):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Args:
        x: Mapping (e.g. a batch handed over by ``Dataset.map``) whose
            ``"text"`` entry holds the text to encode.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 1024, the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        x["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

# After tokenization only the tokenizer outputs should remain. Each split
# drops its *own* columns (a plain list of names) instead of borrowing the
# pandas Index of the training frame, so any column `from_pandas` adds is
# removed as well.
ds_train = ds_train.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=ds_train.column_names,
    desc="Tokenizing train dataset",
)

ds_val = ds_val.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=ds_val.column_names,
    desc="Tokenizing val dataset",
)

Tokenizing train dataset: 100%|██████████| 50/50 [00:00<00:00, 1105.61 examples/s]
Tokenizing val dataset: 100%|██████████| 10/10 [00:00<00:00, 1027.84 examples/s]


## Setting Lora Configurations

In [7]:

# LoRA adapter settings reused for every model built in this notebook:
# rank-8 updates on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

## Defining Training Arguments and Trainer for hyperparameter finetuning

### Training Arguments
> `TrainingArguments` is a configuration class from **Hugging Face Transformers** that holds the training hyperparameters and runtime settings for `Trainer`.

- `output_dir`: Directory where the checkpoints, logs and outputs are stored.
- `per_device_train_batch_size`: Batch size per GPU/CPU for training.
- `per_device_eval_batch_size`: Batch size per device during evaluation.
- `num_train_epochs`: How many epochs to train for.
- `learning_rate`: Learning rate for the optimizer.
- `bf16`: Mixed-precision mode using bfloat16, which has a wide exponent range but fewer mantissa bits. It reduces memory use and speeds up training.
- `logging_steps`: How often to log metrics; setting it to 'x' means metrics are logged every 'x' training steps.
- `eval_strategy`: When to run evaluation ("steps" or "epoch")
- `eval_steps`: How often to evaluate if using strategy "steps"
- `save_steps`: How often to save checkpoints; setting it to 'x' means a checkpoint is saved every 'x' training steps.
- `save_total_limit`: Maximum number of checkpoints to keep; the oldest checkpoints are deleted as new ones are saved.
- `report_to`: Where to send logs ("wandb")

### Trainer Arguments
> `Trainer` is a class from **Hugging Face Transformers** that handles the training loop for models.

- `model`: The model to train.
- `args`: The `TrainingArguments` object containing all hyperparameters and runtime settings.
- `train_dataset`: The training dataset used.
- `eval_dataset`: The evaluation dataset used.
- `data_collator`: Batches samples together, it needs the tokenizer to get the pad token id, create attention masks and handle special tokens.

In [8]:
# Verbose logging for hyperparameter search.
# `transformers` and `optuna` are imported once in the notebook's import cell,
# so this cell only adjusts their log levels.
transformers.utils.logging.set_verbosity_info()
optuna.logging.set_verbosity(optuna.logging.INFO)


In [None]:

# Settings shared by every hyperparameter-search trial; the learning rate is
# overridden per trial by the search itself.
# NOTE(review): with 50 training rows and an effective batch of 2 x 8 = 16 per
# device, a 2-epoch trial has fewer than 10 optimizer steps, so the 10-step
# logging/eval/save intervals below may never trigger - confirm these values.
training_args = TrainingArguments(
    # Same `./drive/MyDrive/outputs` root as the final training run; the
    # previous `.drive/...` spelling pointed at a hidden local directory.
    output_dir="./drive/MyDrive/outputs/hp_search",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    warmup_steps=2,
    bf16=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
)

# Causal-LM collation (mlm=False): no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


class PerplexityCallback(TrainerCallback):
    """Add ``eval_perplexity`` (``exp(eval_loss)``) to the evaluation metrics."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store the perplexity derived from ``metrics["eval_loss"]``.

        Does nothing when ``metrics`` is empty/``None`` or has no
        ``"eval_loss"`` entry. The ``metrics`` dict is updated in place.
        """
        if not metrics or "eval_loss" not in metrics:
            return
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp raises for large finite losses; report an infinite
            # perplexity instead of aborting the run.
            perplexity = float("inf")
        metrics["eval_perplexity"] = perplexity

# Load the checkpoint once with the bitsandbytes quantization config, prepare
# it for k-bit training, and wrap it with the LoRA adapter. `model_init`
# reuses this module-level object for every trial.
base_model = get_peft_model(
    prepare_model_for_kbit_training(
        AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
        )
    ),
    lora_config,
)

# The base model is loaded once above; each trial only swaps the adapter.
def model_init():
    """Return the shared base model wrapped in a new LoRA adapter.

    The checkpoint is NOT reloaded: the module-level ``base_model`` is reused.
    Its current LoRA layers are removed with ``unload()`` and a new adapter
    built from ``lora_config`` is attached, so each call hands the ``Trainer``
    a model with newly created adapter weights.

    Returns:
        The PEFT model produced by ``get_peft_model``.
    """
    # Release memory left over from a previous trial before building the next model.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        torch.cuda.ipc_collect()
    return get_peft_model(base_model.unload(), lora_config)



# Search trainer: it receives `model_init` rather than a model instance, so
# the hyperparameter search can obtain a model for each trial.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    callbacks=[PerplexityCallback()],
)


PyTorch: setting up devices
loading configuration file config.json from cache at /home/sanja/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": null,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_parameters": {
    "rope_theta": 10000.0,
    "rope_type": "default"
  },
  "tie_word_embeddings": false,
  "transformers_version": "5.0.0",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safeten

## HyperParameter Finetuning

Uses Optuna as the framework to find the best hyperparameters for our model.

In [None]:

def _learning_rate_space(trial):
    """Search space: a learning rate in [1e-5, 1e-3], sampled with ``log=True``."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
    }


def _eval_loss_objective(metrics):
    """Objective handed to the search: the evaluation loss of a trial."""
    return metrics["eval_loss"]


# Run the Optuna-backed search; lower evaluation loss is better.
best_hyperparams = trainer.hyperparameter_search(
    backend="optuna",
    direction="minimize",
    hp_space=_learning_rate_space,
    n_trials=1,  # Number of search trials
    compute_objective=_eval_loss_objective,
)
print("Best hyperparameters:", best_hyperparams)


[32m[I 2026-02-01 19:08:57,669][0m A new study created in memory with name: no-name-2583a306-f3a4-4070-a05f-e4b99c014364[0m
Trial: {'learning_rate': 0.0008405936741321961, 'batch_size': 2, 'num_train_epochs': 3, 'warmup_steps': 3}
***** Running training *****
  Num examples = 50
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 12
  Number of trainable parameters = 4,194,304
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


[33m[W 2026-02-01 19:12:21,155][0m Trial 0 failed with parameters: {'learning_rate': 0.0008405936741321961, 'batch_size': 2, 'num_train_epochs': 3, 'warmup_steps': 3} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/sanja/.pyenv/versions/3.12.10/lib/python3.12/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/sanja/.pyenv/versions/3.12.10/lib/python3.12/site-packages/transformers/integrations/integration_utils.py", line 253, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/sanja/.pyenv/versions/3.12.10/lib/python3.12/site-packages/transformers/trainer.py", line 2174, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/sanja/.pyenv/versions/3.12.10/lib/python3.12/site-packages/transformers/trainer.py", line 2536, in _inner_training_loop
    tr_loss_step 

KeyboardInterrupt: 

## Actual Training

### Dataset Object preparation

In [None]:
# Final run: rows 0-99 for training, rows 100-109 for validation.
df_train = df[:100]
df_val = df[100:110]

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)

# After tokenization only the tokenizer outputs should remain. Each split
# drops its *own* columns (a plain list of names) instead of borrowing the
# pandas Index of the training frame, so any column `from_pandas` adds is
# removed as well.
ds_train = ds_train.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=ds_train.column_names,
    desc="Tokenizing train dataset",
)

ds_val = ds_val.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=ds_val.column_names,
    desc="Tokenizing val dataset",
)

Tokenizing train dataset: 100%|██████████| 100/100 [00:00<00:00, 1042.72 examples/s]
Tokenizing val dataset: 100%|██████████| 10/10 [00:00<00:00, 946.88 examples/s]


In [None]:
# Build the model for the final run with the same `model_init` factory that
# was handed to the hyperparameter-search Trainer.
train_model = model_init()

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

### Training arguments and trainer

In [None]:
# Read the tuned learning rate first: if the search cell has not produced
# `best_hyperparams`, this fails here rather than after a W&B run was opened.
best_learning_rate = best_hyperparams.hyperparameters["learning_rate"]

wandb.init(project="llama-finetune", name="final-training")

# NOTE(review): 100 training rows at an effective batch of 2 x 8 = 16 per
# device give only a few optimizer steps per epoch, so with 2 epochs the
# 100-step eval/save interval may never be reached - confirm these values.
training_args = TrainingArguments(
    output_dir="./drive/MyDrive/outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=best_learning_rate,
    warmup_steps=2,
    bf16=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    report_to="wandb",
)

# Causal-LM collation (mlm=False): no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


class PerplexityCallback(TrainerCallback):
    """Add ``eval_perplexity`` to the evaluation metrics and log it to W&B.

    Redefines (and therefore shadows) the class of the same name from the
    hyperparameter-search cell; this version also calls ``wandb.log``.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store ``exp(eval_loss)`` in ``metrics`` and send it to W&B.

        Does nothing when ``metrics`` is empty/``None`` or has no
        ``"eval_loss"`` entry. The ``metrics`` dict is updated in place.
        """
        if not metrics or "eval_loss" not in metrics:
            return
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp raises for large finite losses; report an infinite
            # perplexity instead of aborting the run.
            perplexity = float("inf")
        metrics["eval_perplexity"] = perplexity
        wandb.log({"eval_perplexity": perplexity})

# Final-run trainer: trains the already-built `train_model` directly instead
# of going through a `model_init` factory.
trainer = Trainer(
    args=training_args,
    model=train_model,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    callbacks=[PerplexityCallback()],
)

NameError: name 'best_hyperparams' is not defined

### Start Training
> Setting `resume_from_checkpoint` to `True` resumes training from the last checkpoint in the `output_dir`.

In [None]:
# Resume only when the output directory already holds a checkpoint.
# `resume_from_checkpoint=True` raises when none is found, which made a first
# run (or a run after clearing the directory) fail immediately.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

# NOTE(review): the final model is saved under ./outputs, not under the
# Trainer's output_dir - confirm this location is intended.
trainer.save_model("./outputs/final")
tokenizer.save_pretrained("./outputs/final")
wandb.finish()