In [1]:
!pip install -U bitsandbytes
!pip install trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.0-py3-none-any.whl (462 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.0


In [2]:
import torch
import torch.nn as nn
from typing import Any
import pandas as pd
import tokenizers
from torch.utils.data import DataLoader, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training,  AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from trl import SFTTrainer
from datasets import Dataset as HFDataset # Import Hugging Face Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# HYPERPARAMS
LORA_RANK=32
BATCH_SIZE = 2
GRADIENT_ACCUMULATION=16
DATA_MAX_LEN = 768
EPOCHS = 3
DROPOUT = 0.1
WEIGHT_DECAY = 0.01
TEMPERATURE = 0.8
GRAD_CLIP = 1 #0.3
WARMUP_RATIO = 0.01
LEARNING_RATE = 0.0002

In [4]:
def retrieve_data(path, tokenizer, batch_size=BATCH_SIZE):
    data = pd.read_csv(path).dropna()
    texts = ['Subject: ' + row['subject'] + '\n\n' + row['body']
             for _, row in data.iterrows()]

    full_dataset = HFDataset.from_dict({'text': texts})
    split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

    return split_dataset['train'], split_dataset['test']

In [5]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
    )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    torch_dtype = torch.bfloat16,
    quantization_config = bnb_config,
    trust_remote_code=True
  )
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [6]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_RANK * 2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
        ],
    lora_dropout=DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"

)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 25,231,360 || all params: 1,125,279,744 || trainable%: 2.2422


In [7]:
train_dataset, test_dataset = retrieve_data('gmail_data.csv', tokenizer)

training_args = TrainingArguments(
    output_dir="email-finetuned-model",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=GRAD_CLIP,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    optim="paged_adamw_8bit",
    warmup_ratio=WARMUP_RATIO,
    report_to="none",
    bf16=True, # Enable bfloat16 training
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
)
trainer.train()



Adding EOS to train dataset:   0%|          | 0/1164 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1164 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2657 > 2048). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/1164 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/292 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/292 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/292 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.7054,1.561143,1.574822,487321.0,0.650456
2,1.473,1.440273,1.436223,974642.0,0.673443
3,1.3352,1.408044,1.331827,1461963.0,0.682451


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=111, training_loss=1.5432924144976847, metrics={'train_runtime': 842.0927, 'train_samples_per_second': 4.147, 'train_steps_per_second': 0.132, 'total_flos': 1.4030786743861248e+16, 'train_loss': 1.5432924144976847, 'epoch': 3.0})

In [9]:
model_dir = training_args.output_dir + '/checkpoint-' + str(EPOCHS * 37)
# model_dir = "email-finetuned-model"  # Your checkpoint
# Load fine-tuned model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_dir)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create pipeline - handles dtype automatically
pipe = pipeline(
    "text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Use pipeline
result = pipe(
    "Subject: Generate an email to tell my employer about myself",
    max_new_tokens=256,
    do_sample=True,
    temperature=TEMPERATURE,
    top_p=0.95,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

print(result[0]['generated_text'])

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


Subject: Generate an email to tell my employer about myself

Hi John,  My name is Kaustubh Sonawane and I am an intern at Zendesk from the University of California, Berkeley. My major is Computer Science & Business and my minor is Computer Science with a focus on Artificial Intelligence. I recently started the process of applying for summer internships and I wanted to get a better feel about Zendesk before I apply in a few weeks. Would you be able to spare a few minutes of your time on a call to discuss the future opportunities at Zendesk? I have attached my resume to this email.  --  Thank you Kaustubh Sonawane 


In [11]:
import os
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

def save_as_gguf(model_dir, output_base_dir, quantize=True):
    """
    Merge LoRA weights and convert to GGUF format.

    Args:
        model_dir: Path to checkpoint with LoRA adapters
        output_base_dir: Base output directory (e.g., training_args.output_dir)
        quantize: Whether to create Q4 quantized version (recommended)
    """
    temp_path = os.path.join(output_base_dir, 'merged_temp')
    gguf_path = os.path.join(output_base_dir, 'gguf')

    # Step 1: Merge
    print("="*60)
    print("MERGING LORA WEIGHTS")
    print("="*60)

    model = AutoPeftModelForCausalLM.from_pretrained(
        model_dir,
        device_map="cpu",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    merged = model.merge_and_unload()
    merged.save_pretrained(temp_path, safe_serialization=True, max_shard_size="2GB")

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokenizer.save_pretrained(temp_path)

    print(f"✓ Merged: {temp_path}")

    # Step 2: Install llama.cpp if needed
    if not os.path.exists("llama.cpp/convert-hf-to-gguf.py"):
        print("\n" + "="*60)
        print("INSTALLING LLAMA.CPP")
        print("="*60)
        os.system("git clone https://github.com/ggerganov/llama.cpp")
        os.system("cd llama.cpp && make")

    # Step 3: Convert
    os.makedirs(gguf_path, exist_ok=True)

    print("\n" + "="*60)
    print("CONVERTING TO GGUF")
    print("="*60)

    fp16_file = os.path.join(gguf_path, 'model-fp16.gguf')
    os.system(f"python llama.cpp/convert_hf_to_gguf.py {temp_path} --outtype f16 --outfile {fp16_file}")

    if quantize and os.path.exists(fp16_file):
        print("\n" + "="*60)
        print("QUANTIZING TO Q4")
        print("="*60)

        q4_file = os.path.join(gguf_path, 'model-q4.gguf')
        os.system(f"./llama.cpp/quantize {fp16_file} {q4_file} Q4_K_M")

        if os.path.exists(q4_file):
            q4_size = os.path.getsize(q4_file) / (1024**3)
            print(f"\n✓ GGUF Q4: {q4_file} ({q4_size:.2f} GB)")
            return q4_file

    return fp16_file

# Use it:
gguf_model = save_as_gguf(
    model_dir=model_dir,
    output_base_dir=training_args.output_dir,
    quantize=True
)

print(f"\n✓ Model saved as GGUF: {gguf_model}")

MERGING LORA WEIGHTS
✓ Merged: email-finetuned-model/merged_temp

INSTALLING LLAMA.CPP

CONVERTING TO GGUF

QUANTIZING TO Q4

✓ Model saved as GGUF: email-finetuned-model/gguf/model-fp16.gguf


In [12]:
!zip -r /content/email-finetuned.zip /content/email-finetuned-model/gguf/
from google.colab import files
files.download('/content/email-finetuned.zip')

  adding: content/email-finetuned-model/gguf/ (stored 0%)
  adding: content/email-finetuned-model/gguf/model-fp16.gguf (deflated 10%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
model = AutoPeftModelForCausalLM.from_pretrained(
    model_dir,
    device_map="cpu",  # Use "cpu" instead of leaving it auto
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Merge LoRA weights with base model
merged_model = model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(
    training_args.output_dir + '/merged_model',
    safe_serialization=True,
    max_shard_size="2GB"
)

# Also save the tokenizer
tokenizer.save_pretrained(training_args.output_dir + '/merged_model')

print("Model merged and saved successfully!")