In [None]:
#!pip install -U torch torchvision transformers datasets peft accelerate bitsandbytes wandb matplotlib sentencepiece huggingface_hub dotenv nbformat optuna --no-cache-dir

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install bitsandbytes

# Importing all the modules

In [None]:
import gc
import math
import os

import bitsandbytes as bnb
import huggingface_hub
import optuna
import pandas as pd
import torch
import wandb
from datasets import Dataset
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both tokenizer and model.
model_name = "meta-llama/Llama-2-7b-hf"
# JSON-lines file read with `pd.read_json(..., lines=True)` in the dataset cell.
data = "/kaggle/input/needed-files/train (1).jsonl"
# Directory handed to `TrainingArguments(output_dir=...)`.
# NOTE(review): the leading ".drive" makes this a hidden directory relative to the
# current working directory — possibly a typo for "./drive/MyDrive/..."; confirm.
output_dir = '.drive/MyDrive/outputs/hp_search'

In [None]:
# Logging in from Kaggle (reads the tokens from Kaggle secrets)
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

# hf_key = user_secrets.get_secret("huggingface_token")
# wandb_key = user_secrets.get_secret("wandb_key")

# huggingface_hub.login(token = hf_key)
# wandb.login(key = wandb_key)

In [None]:

# Pull credentials from a local .env file (if any) into the process environment.
load_dotenv()

hf_token = os.getenv("huggingface_token")
wandb_key = os.getenv("wandb_key")

# The Hub login is always attempted: the model download below needs it if the
# checkpoint is access-restricted.
huggingface_hub.login(token=hf_token)

# Weights & Biases is optional for this notebook (reporting is switched off for
# the hyperparameter search), so only log in when a key was actually configured
# instead of triggering a blocking interactive prompt.
if wandb_key:
    wandb.login(key=wandb_key)
else:
    print("wandb_key not set - skipping Weights & Biases login.")

## Importing Model and Tokenizer

In [None]:
# Tokenizer for the configured checkpoint; the EOS token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Start from the CPU settings and switch to 4-bit quantized loading only when a
# CUDA device is visible.
bnb_config, device_map = None, {"": "cpu"}
if torch.cuda.is_available():
    device_map = "auto"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

## Importing Dataset and tokenizing

In [None]:
# Row counts for the small train/validation slices used during the search.
TRAIN_ROWS = 100
VAL_ROWS = 10

df = pd.read_json(data, lines=True)
if "text" not in df.columns:
    raise KeyError(f"Expected a 'text' column in {data!r}, found {list(df.columns)}")

df_train = df.iloc[:TRAIN_ROWS]
df_val = df.iloc[TRAIN_ROWS:TRAIN_ROWS + VAL_ROWS]
if df_train.empty or df_val.empty:
    # An empty split would only surface later as an obscure Trainer error.
    raise ValueError(
        f"Need more than {TRAIN_ROWS} rows to build both splits, got {len(df)}"
    )


def tokenize(x):
    """Tokenize a batch of examples.

    Args:
        x: Mapping with a "text" entry holding the raw strings of the batch.

    Returns:
        The tokenizer output, truncated/padded to exactly 512 tokens per example.
    """
    return tokenizer(
        x["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


def _tokenized_dataset(frame, desc):
    """Convert a DataFrame slice to a Dataset holding only the tokenizer outputs."""
    # preserve_index=False keeps the pandas index from ever becoming a column.
    raw = Dataset.from_pandas(frame, preserve_index=False)
    return raw.map(
        tokenize,
        batched=True,
        batch_size=32,
        # Drop this split's own raw columns, passed as a plain list of names.
        remove_columns=raw.column_names,
        desc=desc,
    )


ds_train = _tokenized_dataset(df_train, "Tokenizing train dataset")
ds_val = _tokenized_dataset(df_val, "Tokenizing val dataset")

## Setting Lora Configurations

In [None]:

# Baseline LoRA settings at module level. The `objective` function of the search
# cell builds its own local `lora_config`, which shadows this one inside it.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

## Defining Training Arguments and Trainer for hyperparameter finetuning

### Training Arguments
> `TrainingArguments` is a configuration class from **Hugging Face Transformers** that holds the training hyperparameters and runtime settings for `Trainer`.

- `output_dir`: Directory where the checkpoints, logs and outputs are stored.
- `per_device_train_batch_size`: Batch size per GPU/CPU for training.
- `per_device_eval_batch_size`: Batch size per device during evaluation.
- `num_train_epochs`: How many epochs to train for.
- `learning_rate`: Learning rate for the optimizer.
- `bf16`: Mixed-precision mode (bfloat16). It has a wide exponent range but fewer mantissa bits; it reduces memory use and speeds up training. (This notebook passes `fp16` instead.)
- `logging_steps`: how often to log metrics, setting it to 'x' means it will log metrics every 'x' training steps.
- `eval_strategy`: When to run evaluation ("steps" or "epoch")
- `eval_steps`: How often to evaluate if using strategy "steps"
- `save_steps`: How often to save checkpoints; setting it to 'x' means a checkpoint is saved every 'x' training steps.
- `save_total_limit`: Maximum number of checkpoints to keep. The oldest checkpoints are deleted as new ones are saved.
- `report_to`: Where to send logs ("wandb")

### Trainer Arguments
> `Trainer` is a class from **HuggingFace Transformers** that handles the training loops for models

- `model`: The model to train.
- `args`: The `TrainingArguments` object containing all hyperparameters and runtime settings.
- `train_dataset`: The training dataset used.
- `eval_dataset`: The evaluation dataset used.
- `data_collator`: Batches samples together, it needs the tokenizer to get the pad token id, create attention masks and handle special tokens.

## HyperParameter Finetuning

Uses Optuna as the framework to find the best hyperparameters for our model.

In [None]:
# Legacy switch that keeps the W&B integration off; `report_to="none"` in the
# training arguments below is the explicit equivalent on the Trainer side.
os.environ["WANDB_DISABLED"] = "true"

# Optuna categorical choices should be primitives (str/int/float/bool/None), so
# each LoRA target-module set is encoded as a comma-separated string and decoded
# inside the trial.
TARGET_MODULE_CHOICES = ["q_proj,v_proj", "q_proj,v_proj,k_proj"]


def _free_accelerator_memory():
    """Collect unreachable Python objects, then release cached CUDA memory."""
    # Collect first: cached blocks can only be returned once their tensors are gone.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def _safe_perplexity(loss):
    """Return exp(loss), or infinity when the exponent overflows a float."""
    try:
        return math.exp(loss)
    except OverflowError:
        return float("inf")


# Run Optuna hyperparameter search directly
def objective(trial):
    """Train one LoRA configuration sampled by Optuna and return its eval loss.

    Args:
        trial: The `optuna.Trial` used to sample batch size, warmup steps,
            learning rate, LoRA rank, LoRA alpha and the LoRA target modules.

    Returns:
        The `eval_loss` reported by `Trainer.evaluate()` after training. The
        matching perplexity is stored on the trial as the `eval_perplexity`
        user attribute.
    """
    trial_args = TrainingArguments(
        # One directory per trial so checkpoint rotation never mixes trials.
        output_dir=os.path.join(output_dir, f"trial_{trial.number}"),
        per_device_train_batch_size=trial.suggest_int("per_device_train_batch_size", 2, 8),
        num_train_epochs=2,
        warmup_steps=trial.suggest_int("warmup_steps", 0, 5),
        per_device_eval_batch_size=2,
        fp16=torch.cuda.is_available(),
        learning_rate=trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=10,
        save_steps=10,
        save_total_limit=2,
        # The string "none" is the documented way to disable every reporting integration.
        report_to="none",
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    lora_config = LoraConfig(
        r=trial.suggest_categorical("r", [8, 16, 32, 64]),
        lora_alpha=trial.suggest_categorical("lora_alpha", [8, 16, 32, 64]),
        target_modules=trial.suggest_categorical(
            "target_modules", TARGET_MODULE_CHOICES
        ).split(","),
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    def model_init():
        """Load a fresh base model and wrap it with this trial's LoRA adapters."""
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            # Use the placement chosen in the config cell so the CPU fallback applies.
            device_map=device_map,
        )
        return get_peft_model(prepare_model_for_kbit_training(base_model), lora_config)

    model = None
    trial_trainer = None
    try:
        model = model_init()
        trial_trainer = Trainer(
            model=model,
            args=trial_args,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            data_collator=data_collator,
        )
        trial_trainer.train()
        metrics = trial_trainer.evaluate()
    finally:
        # Drop the references even when the trial fails, so the next trial does
        # not start with this one's model still resident.
        trial_trainer = None
        model = None
        _free_accelerator_memory()

    eval_loss = metrics["eval_loss"]
    trial.set_user_attr("eval_perplexity", _safe_perplexity(eval_loss))
    return eval_loss

# TPE is Optuna's default sampler; seeding it makes the sequence of proposed
# hyperparameters repeatable across kernel restarts (given the same trial results).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20, gc_after_trial=True)

best_hyperparams = study.best_trial
print("Best hyperparameters:", best_hyperparams.params)
print("Best eval_loss:", best_hyperparams.value)
