In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install -q transformers bitsandbytes datasets peft accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!nvidia-smi

Wed Aug 20 20:44:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import json
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback

from typing import List, Dict, Optional
from peft import LoraConfig, get_peft_model, TaskType

In [6]:
class LoRATrainer:
    def __init__(self, data_path: str, model_name: str):
        self.data_path = data_path
        self.model_name = model_name

        self.lora_config = None
        self.bnb_config = None
        self.model = None

        self.tokenizer = None
        self.train_tokenized_data = None
        self.validation_tokenized_data = None
        self.test_tokenized_data = None

        self.train_dataset = None
        self.validation_dataset = None
        self.test_dataset = None

        if self.model_name is None or not self.model_name.strip():
            raise ValueError("Model name cannot be none or empty")

    def set_lora_conf(self, lora_config: Optional[Dict] = None):
        if lora_config is None or not lora_config:
            lora_config = {
                "r":8,
                "lora_alpha":16,
                "target_modules":['q_proj', 'v_proj'],
                "lora_dropout":0.05,
                "bias":"none",
                "task_type": TaskType.CAUSAL_LM
            }

        self.lora_config = LoraConfig(**lora_config)
        return self.lora_config

    def set_bnb_config(self, bnb_config: Optional[Dict] = None):
        if not bnb_config or bnb_config is None:
            bnb_config = {
                "load_in_4bit":True,
                "bnb_4bit_quant_type":'nf4',
                "bnb_4bit_compute_dtype":torch.bfloat16
            }

        self.bnb_config = BitsAndBytesConfig(**bnb_config)
        return self.bnb_config

    def load_model(self):
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config = self.bnb_config,
            device_map = "auto",
            trust_remote_code = True
        )
        self.model = get_peft_model(self.model, self.lora_config)
        return self.model

    def preprocess_data(self, batch: Dict):
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True)

        template = "### Instruction:\n{}### Response:\n{}"
        texts = [template.format(inst, resp) for inst, resp in zip(batch["instruction"], batch["response"])]

        tokens = self.tokenizer(
            texts,
            padding='max_length',
            max_length=256,
            truncation=True,
            return_tensors='pt'
        )

        tokens['labels'] =  tokens['input_ids'].clone()
        return tokens

    def set_tokenized_data(self, data_type: str = 'train'):
        if data_type == "train":
            self.train_tokenized_data = self.train_dataset.map(self.preprocess_data, batched=True, remove_columns=self.train_dataset.column_names)
            return self.train_tokenized_data
        elif data_type == "validation":
            self.validation_tokenized_data = self.train_dataset.map(self.preprocess_data, batched=True, remove_columns=self.validation_dataset.column_names)
            return self.validation_tokenized_data
        elif data_type == "test":
            self.test_tokenized_data = self.train_dataset.map(self.preprocess_data, batched=True, remove_columns=self.test_dataset.column_names)
            return self.test_tokenized_data

    def train(self, training_args: Optional[Dict]=None, adapted_model_path: str='./tinyllama-lora-ftuned-adapted'):
        if not training_args or training_args is None:
            training_args = {
                "output_dir":'./tinyllama-lora-ftuned-ml',
                'per_device_train_batch_size':4,
                'gradient_accumulation_steps':4,
                'learning_rate':1e-3,
                'num_train_epochs':50,
                'fp16':True,
                'report_to':'none',
                'logging_steps':20,
                'save_strategy':'epoch',
                'eval_strategy': 'epoch',
                'save_total_limit': 3,
                'load_best_model_at_end': True,
                'metric_for_best_model': 'eval_loss',
                'greater_is_better': False,
                'remove_unused_columns': False,
                'label_names': ['labels']
            }

        self.training_args = TrainingArguments(**training_args)

        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01
        )

        trainer = Trainer(
            model = self.model,
            args = self.training_args,
            train_dataset = self.train_tokenized_data,
            eval_dataset = self.validation_tokenized_data,
            processing_class = self.tokenizer,
            callbacks=[early_stopping]
        )

        print("Training the model")
        trainer.train()

        print("Saving the new weights")
        self.model.save_pretrained(adapted_model_path)
        self.tokenizer.save_pretrained(adapted_model_path)

    def prepare_and_load_data(self):
            preprocessed_data = []
            data = None

            try:
                with open(self.data_path) as f:
                    data = json.load(f)
            except FileNotFoundError:
                print(f"Error: File not found -> {self.data_path}")
                return []
            except json.JSONDecodeError as e:
                print(f"Error: Invalid JSON format in {self.data_path} - {e}")
            except Exception as e:
                print(f"Unexpected error while loading file: {e}")
                return []

            questions = data.get("questions", [])
            if not isinstance(questions, list):
                print("Error: 'questions' should be a list in the dataset")
                return []

            for idx, q in enumerate(questions, start=1):
                try:
                    preprocessed_data.append({
                        "instruction": f"[{q['category']} - {q['difficulty']}] {q['instruction']}",
                        "input": "",
                        "response": q["response"]
                    })
                except KeyError as e:
                    print(f"Skipping question {idx}: Missing key {e}")
                except Exception as e:
                    print(f"Skipping question {idx}: due to unexpected error - {e}")

            with open("lora_dataset.json", "w") as f:
                json.dump(preprocessed_data, f, indent=2)

            dataset = load_dataset("json", data_files="lora_dataset.json")


            dataset = dataset['train'].train_test_split(test_size=0.3, seed=42)
            train_dataset = dataset['train']
            temp_dataset = dataset['test']

            temp_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
            validation_dataset = temp_split['train']  # 15% of total
            test_dataset = temp_split['test']         # 15% of total

            self.train_dataset = train_dataset        # 70%
            self.validation_dataset = validation_dataset  # 15%
            self.test_dataset = test_dataset          # 15%

            print(f"Train samples: {len(self.train_dataset)}")
            print(f"Validation samples: {len(self.validation_dataset)}")
            print(f"Test samples: {len(self.test_dataset)}")

            return self.train_dataset, self.validation_dataset, self.test_dataset

In [7]:
data_path = "/content/drive/MyDrive/interview-questions/ml_interview_dataset.json"
model_name = 'TinyLLama/TinyLlama-1.1B-Chat-v1.0'

In [8]:
lora_trainer_obj = LoRATrainer(data_path=data_path, model_name=model_name)

In [9]:
lora_trainer_obj.prepare_and_load_data()

Generating train split: 0 examples [00:00, ? examples/s]

Train samples: 1843
Validation samples: 395
Test samples: 395


(Dataset({
     features: ['instruction', 'input', 'response'],
     num_rows: 1843
 }),
 Dataset({
     features: ['instruction', 'input', 'response'],
     num_rows: 395
 }),
 Dataset({
     features: ['instruction', 'input', 'response'],
     num_rows: 395
 }))

In [10]:
lora_trainer_obj.set_lora_conf()

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)

In [11]:
lora_trainer_obj.set_bnb_config()

BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [12]:
lora_trainer_obj.load_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [13]:
lora_trainer_obj.set_tokenized_data("train")
lora_trainer_obj.set_tokenized_data("validation")

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1843
})

In [14]:
lora_trainer_obj.train()

Training the model


Epoch,Training Loss,Validation Loss
1,0.1581,0.152233
2,0.1556,0.130579
3,0.1254,0.113734
4,0.1256,0.101821
5,0.0922,0.091262
6,0.0954,0.079208
7,0.0774,0.073059
8,0.0804,0.069218
9,0.0718,0.060351


Saving the new weights
