# The notebook was run in Google Colab

# Load Packages



In [None]:
# Install the fine-tuning stack for this Colab session.
# transformers, peft, accelerate and trl are installed from their GitHub repositories
# rather than from released versions.
# NOTE(review): none of these installs are version-pinned, so a re-run may pick up different code — consider pinning.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/trl.git
!pip install flash-attn --no-build-isolation
!pip install quanto


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies .

# Load Curated Training and Validation Datasets and Tokenize Them for Training

In [None]:
# Mount Google Drive at /content/drive (prompts for authentication);
# the dataset, token and output paths used later in this notebook all live under this mount point
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Read the curated training and validation JSONL files from Drive as Hugging Face datasets
from datasets import load_dataset

train_df = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/reddit_dataset_train.jsonl',
    split='train',
)
validation_df = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/reddit_dataset_val.jsonl',
    split='train',  # the validation file is also requested through the 'train' split
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Report how many examples each split contains
for size_message in (
    f"Number of training examples: {len(train_df)}",
    f"Number of validation examples: {len(validation_df)}",
):
    print(size_message)

Number of training examples: 38400
Number of validation examples: 7781


In [None]:
# Peek at the first formatted prompt/response pair of each split.
# Index the row first and the column second: dataset[0]["text"] fetches a single
# example, whereas dataset["text"][0] goes through the whole "text" column first.
print(train_df[0]["text"])

print(validation_df[0]["text"])


<s>[INST] You are a friendly parenting companion who gives helpful advice like a fellow parent would. You sound warm and practical — not robotic or formal. User's Prompt: The million dollar question - Apparently, my 6yrs old just found out about the word 'sex' and came asking. First I tried to wrapped my head around a suitable response while trying to find out where she heard it from, she quickly screamed 'school' cutting short my thinking time. Personally, I thought this was not a topic you say 'let's talk about this later, so I posed more questions to buy time; who mentioned it in school and how did they talk about it, what was the explanation you got? She tried mumbling some stuff, right then I had to say 'give me a moment we will talk about this later. What's the best way to approach this? [/INST] There is absolutely nothing wrong with saying “I’m not sure how to answer that, let me think about it and get back to you.” The key is you HAVE to follow up. Basic simple explanation. Don

In [None]:
## Read the Hugging Face access token from a text file on Drive (keeps the secret out of the notebook itself)
with open("/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/hftoken.txt") as f:
    HF_TOKEN = f.read().strip()  # drop surrounding whitespace such as a trailing newline


In [None]:
# Hub id of the base model; reused later when the model weights are loaded
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

from transformers import AutoTokenizer

# Tokenizer for the base model, padding on the left. Automatic BOS/EOS insertion is
# switched off because the dataset text already carries those markers.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    padding_side="left",
    add_eos_token=False, # we already include this in the dataset
    add_bos_token=False, # we already include this in the dataset
    token = HF_TOKEN
)
# Reuse the EOS token as the padding token.
# NOTE(review): the training cell uses DataCollatorForLanguageModeling(mlm=False), which is documented to
# ignore padding-token positions in the loss; with pad == EOS that presumably also masks the real EOS — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

max_length = 620 # Truncation limit in tokens. Set this to roughly what you considered while choosing samples during dataset curation

def tokenize_sample(data):
  """Tokenize the "text" field of `data` and attach labels for causal-LM training.

  The text is truncated to `max_length` tokens. Returns the tokenizer output
  with an additional "labels" entry that is a copy of "input_ids".
  """
  tokenizer_kwargs = dict(truncation=True, max_length=max_length, padding=True)
  encoded = tokenizer(data["text"], **tokenizer_kwargs)
  # Labels mirror the inputs so the model learns next-token prediction at every position;
  # Hugging Face needs them in order to compute the loss
  encoded["labels"] = encoded["input_ids"].copy()
  return encoded

# Tokenize both splits. map() is called without batched=True, so tokenize_sample is applied to the examples individually.
tokenized_train_df = train_df.map(tokenize_sample)
tokenized_validation_df = validation_df.map(tokenize_sample)



tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/38400 [00:00<?, ? examples/s]

Map:   0%|          | 0/7781 [00:00<?, ? examples/s]

In [None]:
# Columns after tokenization: the original dataset fields plus input_ids, attention_mask and labels
tokenized_train_df.column_names

['post_id',
 'comment_id',
 'text',
 'num_tokens',
 'input_ids',
 'attention_mask',
 'labels']

In [None]:
# Sanity check: tokenization should not have changed the number of examples
for size_message in (
    f"Number of training examples: {len(tokenized_train_df)}",
    f"Number of validation examples: {len(tokenized_validation_df)}",
):
    print(size_message)

Number of training examples: 38400
Number of validation examples: 7781


# Set Up Instruction Tuning of Mistral-7B-Instruct-v0.3 using QLoRA

**LoRA (Low-Rank Adaptation): Freeze the model, add small trainable low-rank adapters.** LoRA is a parameter-efficient fine-tuning method that adds a small number of trainable low-rank matrices into the large model, while freezing the original model weights.

**QLoRA: Do the same, but run the frozen model in 4-bit quantization so you can fine-tune very large models on smaller hardware.** QLoRA builds on LoRA by loading the base model in 4-bit quantization (using bitsandbytes / NF4), massively reducing memory. The LoRA adapters are still trained in 16-bit precision to preserve accuracy.

In [None]:
# LOAD THE BASE MODEL IN 4-BIT PRECISION WITH DOUBLE QUANTIZATION

import torch
# BitsAndBytesConfig has to be imported explicitly: it is used below and is not
# otherwise in scope on a fresh kernel (NameError)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Allow TF32 for float32 matmuls
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")


# 4-bit NF4 quantization settings for the frozen base weights
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # loads base model in 4-bit precision
    bnb_4bit_use_double_quant=True, # double quantization saves VRAM
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Download (if needed) and load the quantized base model
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2", # FA2 is fastest on A100
    token=HF_TOKEN # login to hugging face
)


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
from peft import prepare_model_for_kbit_training
# Turn on gradient checkpointing, then let PEFT prepare the quantized model for training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model) # preps a quantized model for fine tuning


In [None]:
from peft import LoraConfig, get_peft_model

# r is the dimensionality (rank) of the low-rank decomposition.
# In LoRA, instead of learning a full dW (say 4096×4096), we factorize it into two skinny matrices: B (4096×r) and A (r×4096), so that the product BA has the same shape as dW.

# alpha: a scaling factor applied to the LoRA update. The LoRA update is dW = (alpha / r) * BA.
# alpha acts like a multiplier for the LoRA updates - a way to control the effective learning rate of the adapter layers.
# With r=16 and lora_alpha=64 below, the scale alpha / r is 4.

config = LoraConfig(
    r=16,
    lora_alpha=64,
    # Names of the modules that receive LoRA adapters
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the LoRA adapters (rebinds `model`)
model = get_peft_model(model, config)

def print_train_params(model):
    """Print how many of the model's parameter elements are trainable.

    Walks `model.named_parameters()`, adds up element counts, and prints the
    trainable count, the total count and the trainable percentage on one line.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# NOTE(review): the reported total (~3.8B) is well below the nominal 7B size — presumably because the 4-bit
# weights are stored packed, so the true trainable share may be lower than printed; confirm
print_train_params(model) # ~1.12% of the counted params are gonna be trained (small number)


trainable params: 42532864 || all params: 3800895488 || trainable%: 1.1190221918566996


In [None]:
# When more than one GPU is visible, flag the model as parallelizable / model-parallel
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    for parallel_flag in ("is_parallelizable", "model_parallel"):
        setattr(model, parallel_flag, True)



# Instruction Fine-Tune Mistral-7B-Instruct-v0.3 using QLoRA

In [None]:
# Fine Tuning Set up

import transformers
from datetime import datetime  # NOTE(review): datetime is not used in the visible cells

run_name = "parentpalai"
# Checkpoints, logs and the final model are written under this Drive folder
output_dir = "/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/" + run_name

# One optimizer step consumes per_device_train_batch_size × gradient_accumulation_steps × n_gpus examples

# How is it set up?
# Run for 1 epoch i.e. 1 pass over training data --> can increase later if loss keeps improving
# Log: prints training loss every 100 steps (logging_steps)
# Eval: runs evaluation every 400 steps (eval_steps)
# Save: saves a checkpoint every 100 steps (save_steps), keeping at most 5 (save_total_limit)


args = transformers.TrainingArguments(
    output_dir=output_dir,
    # ---- batches & schedule ----
    per_device_train_batch_size=6,      # examples per device per forward/backward pass. NOTE(review): originally annotated "T4-safe", but bf16 and flash-attention are enabled in this notebook — confirm the target GPU
    gradient_accumulation_steps=4,      # Accumulate gradients over 4 batches then update weights. So, 6*4 = 24 examples per optimizer step per device
    per_device_eval_batch_size=2,       # batch size during evaluation
    num_train_epochs=1,                 # train for 1 full pass over the dataset (38400/24 = 1600 steps in one epoch on a single GPU)
    warmup_ratio=0.03,                  # first 3% of training steps are a warmup phase where learning rate ramps from 0 → 1e-4. Prevents instability at the start.
    learning_rate=1e-4,
    lr_scheduler_type="cosine",         # Learning rate follows a cosine decay curve after warmup
    # ---- precision / optimizer ----
    bf16=True,
    fp16=False,
    optim="paged_adamw_8bit",           # QLoRA-friendly
    # ---- logging / eval / save ----
    logging_steps=100,
    logging_dir=output_dir+"/logs",
    eval_strategy="steps",
    eval_steps=400,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=5,
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss",       # checkpoints are compared on the validation loss
    greater_is_better=False,                 # a lower eval_loss is better
    # ---- misc ----
    gradient_checkpointing=True,
    report_to=["tensorboard"],
)

# Causal-LM collator (mlm=False) that pads each batch to a multiple of 8
d_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer, mlm=False, pad_to_multiple_of=8
)

# Trainer wiring together the LoRA-wrapped model, the tokenized splits and the arguments above
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_validation_df,
    args=args,
    data_collator=d_collator,
)

# Turn off the model's generation cache for training
model.config.use_cache = False


In [None]:
# Start fine tuning

import traceback

from google.colab import runtime
try:
  # Train
  trainer.train()
  # Get best model checkpoint
  print(trainer.state.best_model_checkpoint)
  # Save pretrained tokenizer
  tokenizer.save_pretrained(output_dir)
except Exception:
  # Show the full traceback (not only the message) so the failure can still be
  # diagnosed from the cell output after the session is gone
  traceback.print_exc()
  # If there's an error, terminate colab session to save compute units
  runtime.unassign()



In [None]:
# Display the checkpoint path the trainer currently records as best (ranked by metric_for_best_model, i.e. eval_loss)
trainer.state.best_model_checkpoint

In [None]:
# Val loss is still reducing --> worth training for more epochs

# Extend the existing trainer's run to 2 epochs and ask it to reload the best checkpoint (by eval_loss) at the end.
# NOTE(review): these fields are mutated after TrainingArguments was constructed, so its constructor-time
# validation does not re-run. The docs say load_best_model_at_end expects save_steps to be a round multiple
# of eval_steps, while this notebook uses save_steps=100 / eval_steps=400 — confirm the best checkpoint is tracked as intended.
trainer.args.num_train_epochs = 2
trainer.args.metric_for_best_model = "eval_loss"
trainer.args.load_best_model_at_end = True




In [None]:
import traceback

from google.colab import runtime

# resumes from the last checkpoint
try:
  # Train
  trainer.train(resume_from_checkpoint=True)
  # Get best model checkpoint
  print (trainer.state.best_model_checkpoint)
  # Save the model and tokenizer etc - will save the best model as load_best_model_at_end = True now
  trainer.save_state()
  trainer.save_model(output_dir)
  tokenizer.save_pretrained(output_dir)
except Exception:
  print ("ERROR!")
  # Show the full traceback (not only the message) so the failure can still be
  # diagnosed from the cell output after the session is gone
  traceback.print_exc()
  # If there's an error, terminate colab session to save compute units
  runtime.unassign()




Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss
2000,1.7578,1.941954
2400,1.7656,1.94094
2800,1.7687,1.939376
3200,1.7579,1.939546




/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/parentpalai/checkpoint-1600




In [None]:
## Once everything runs, terminate session to save compute units
# `runtime` comes from the `from google.colab import runtime` import in the training cells
runtime.unassign()


In Epoch 1,

| Step | Training Loss | Validation Loss |
|------|----------------|-----------------|
| 400  | 1.909600       | 1.947681        |
| 800  | 1.936500       | 1.924183        |
| 1200 | 1.931200       | 1.912943        |
| 1600 | 1.926500       | 1.910231        |

In Epoch 2,

| Step | Training Loss | Validation Loss |
|------|----------------|-----------------|
| 2000 | 1.757800       | 1.941954        |
| 2400 | 1.765600       | 1.940940        |
| 2800 | 1.768700       | 1.939376        |
| 3200 | 1.757900       | 1.939546        |

Training loss decreased in epoch 2, but validation loss increased --> looks like over-fitting.

Checkpoint 1600 is the best model.