Colab URL: https://colab.research.google.com/drive/1Fek9aow_9hJJ6Me3muKWJVsMaRA2Cg4O?usp=sharing

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.6: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from datasets import load_dataset

# Load your dataset
dataset = load_dataset("serpentilec137/gita-verse-qna-dataset", split="train")

README.md:   0%|          | 0.00/515 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/383k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/46.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1138 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/127 [00:00<?, ? examples/s]

In [None]:
dataset[5]

{'question': "What happens quickly to one engaged in devotional service even after committing abominable acts, and what declaration should Arjuna boldly make about Krsna's devotee, as per Bhagavad Gita 9.31?",
 'answer': "Gita 9.31 states: 'He quickly becomes righteous (ksipram bhavati dharmatma) and attains lasting peace (sasvac-chantim nigacchati). O son of Kunti (kaunteya), declare it boldly (pratijanihi) that My devotee never perishes (na me bhaktah pranasyati).'",
 'source_chapter': '9',
 'source_verse': '31'}

In [None]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("serpentilec137/gita-verse-qna-dataset", split="train")

# Inspect the first entry to verify field names
print(dataset[0])

# Format into ShareGPT-style 'conversations' for chatml template
def format_to_conversations(example):
    return {
        "conversations": [
            {"from": "human", "value": example["question"]},
            {"from": "gpt", "value": example["answer"]},
        ]
    }

dataset = dataset.map(format_to_conversations)

# Now apply tokenizer's chat template
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    map_eos_token = True,
)

def formatting_prompts_func(examples):
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in examples["conversations"]]
    return { "text": texts }

# Apply formatting in batched mode
dataset = dataset.map(formatting_prompts_func, batched=True)

# Optional sanity check
print(dataset[0]["text"])


{'question': 'What does Krsna state He is in relation to death and generation, and what feminine qualities does He embody in Bhagavad Gita 10.34?', 'answer': "Krsna states in Gita 10.34: 'I am all-devouring death (mrtyuh sarva-haras caham), and I am the generator of all things yet to be (udbhavas ca bhavisyatam). Among women (narinam) I am fame (kirtih), fortune (srir), speech (vak ca), memory (smrtir), intelligence (medha), faithfulness (dhrtih) and patience (ksama).'", 'source_chapter': '10', 'source_verse': '34'}


Map:   0%|          | 0/1138 [00:00<?, ? examples/s]

Unsloth: Will map <|im_end|> to EOS = </s>.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Map:   0%|          | 0/1138 [00:00<?, ? examples/s]

<|im_start|>user
What does Krsna state He is in relation to death and generation, and what feminine qualities does He embody in Bhagavad Gita 10.34?<|im_end|>
<|im_start|>assistant
Krsna states in Gita 10.34: 'I am all-devouring death (mrtyuh sarva-haras caham), and I am the generator of all things yet to be (udbhavas ca bhavisyatam). Among women (narinam) I am fame (kirtih), fortune (srir), speech (vak ca), memory (smrtir), intelligence (medha), faithfulness (dhrtih) and patience (ksama).'<|im_end|>



In [None]:
dataset[5]["conversations"]

[{'from': 'human',
  'value': "What happens quickly to one engaged in devotional service even after committing abominable acts, and what declaration should Arjuna boldly make about Krsna's devotee, as per Bhagavad Gita 9.31?"},
 {'from': 'gpt',
  'value': "Gita 9.31 states: 'He quickly becomes righteous (ksipram bhavati dharmatma) and attains lasting peace (sasvac-chantim nigacchati). O son of Kunti (kaunteya), declare it boldly (pratijanihi) that My devotee never perishes (na me bhaktah pranasyati).'"}]

In [None]:
print(dataset[5]["text"])

<|im_start|>user
What happens quickly to one engaged in devotional service even after committing abominable acts, and what declaration should Arjuna boldly make about Krsna's devotee, as per Bhagavad Gita 9.31?<|im_end|>
<|im_start|>assistant
Gita 9.31 states: 'He quickly becomes righteous (ksipram bhavati dharmatma) and attains lasting peace (sasvac-chantim nigacchati). O son of Kunti (kaunteya), declare it boldly (pratijanihi) that My devotee never perishes (na me bhaktah pranasyati).'<|im_end|>



In [None]:
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant/therapist to the user which aims to guide the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
unsloth_eos_token = "eos_token"

if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1138 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
14.576 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,138 | Num Epochs = 3 | Total steps = 426
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)


KeyboardInterrupt: 

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2689.3553 seconds used for training.
44.82 minutes used for training.
Peak reserved memory = 14.576 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 98.881 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
model.save_pretrained("gita-therapist-llm")  # Local saving
tokenizer.save_pretrained("gita-therapist-llm")

('gita-therapist-llm/tokenizer_config.json',
 'gita-therapist-llm/special_tokens_map.json',
 'gita-therapist-llm/tokenizer.json')

In [None]:
model.push_to_hub("serpentilec137/gita-therapist-llm") # Online saving
tokenizer.push_to_hub("serpentilec137/gita-therapist-llm") # Online saving

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/serpentilec137/gita-therapist-llm


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

In [None]:
model.push_to_hub_merged(new_model_name, tokenizer, save_method = "merged_16bit")

Unsloth: You are pushing to hub, but you passed your HF username = serpentilec137.
We shall truncate serpentilec137/gita-therapist-llm to gita-therapist-llm
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.18 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  0%|          | 0/32 [00:00<?, ?it/s]
We will save to Disk and not RAM now.
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 32/32 [04:36<00:00,  8.65s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gita-therapist-llm/pytorch_model-00001-of-00003.bin...
Unsloth: Saving gita-therapist-llm/pytorch_model-00002-of-00003.bin...
Unsloth: Saving gita-therapist-llm/pytorch_model-00003-of-00003.bin...
Done.
Saved merged model to https://huggingface.co/serpentilec137/gita-therapist-llm


In [None]:
model.push_to_hub_gguf(new_model_name, tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.76 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 32/32 [05:00<00:00,  9.41s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving serpentilec137/gita-therapist-llm/pytorch_model-00001-of-00003.bin...
Unsloth: Saving serpentilec137/gita-therapist-llm/pytorch_model-00002-of-00003.bin...
Unsloth: Saving serpentilec137/gita-therapist-llm/pytorch_model-00003-of-00003.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at serpentilec137/gita-therapist-llm into f16 GGUF format.
The output location will be /content/serpentilec137/gita-therapist-llm/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gita-therapist-llm
INFO:hf-to-gguf:Model architecture: MistralForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading m

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "gita-therapist-llm", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "How can I overcome fear?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

<|im_start|>user
How can I overcome fear?<|im_end|>
<|im_start|>assistant
Krsna advises in Bhagavad Gita 2.19: 'Be steadfast in yoga (yoga-sthah kuru karmani), O Arjuna, and perform your duty (karma-samah sarvasah) without seeking to obtain any result (sangam tyaktva). Such evenness of mind (samatvam) is said to be the highest form of religion (yoga ucyate).'<|im_end|>


In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoModelForPeftCausalLM
    from transformers import AutoTokenizer

    model = AutoModelForPeftCausalLM.from_pretrained(
        "gita-therapist-llm",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora")

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

In [45]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096, padding_idx=770)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [None]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

# Load the model from Hugging Face Hub
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "serpentilec137/gita-therapist-llm",
    dtype = None,
    load_in_4bit = True,
)

# Set up the chat template again
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    map_eos_token = True,
)

# Chat function
def chat_with_model(prompt):
    messages = [{"from": "human", "value": prompt}]
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response[len(prompt_text):].strip()


==((====))==  Unsloth 2025.5.6: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

In [None]:
# Try a sample question
response = chat_with_model("How can I overcome fear?")
print("Assistant:", response)

Assistant: ays in Bhagavad Gita 16.2: 'Be always in My protection (mam asritya), with mind and intelligence fixed on Me (tatah satatam kalpate). Thus you will pass over all obstacles of life (anekasheshu yogesvara) and come to Me (prasiddhim sam-agram mayi).'


trying the model