In [None]:
%%capture
# Dependency setup. The %%capture magic on the first line hides the installer output.
import os, re
# Environment detection: the first branch runs when no environment variable
# name contains "COLAB_"; the else branch is the Colab-specific install.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the leading run of digits and dots of the torch version string
    # (for example "2.8.0" out of "2.8.0+cu126").
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # xformers pin: 0.0.32.post2 when torch is 2.8.0, otherwise 0.0.29.post3.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps installs only the named packages; presumably this keeps the
    # torch stack already present on Colab untouched (TODO confirm).
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The pins below run in both branches, after the installs above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install wandb --upgrade

In [None]:
from unsloth import FastModel
import torch

# Reference list only: checkpoints that could be substituted for
# `base_checkpoint` below. Nothing in this cell reads the list.
fourbit_models = [
    # 4bit dynamic quants of Gemma 3 (lower memory use)
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular checkpoints
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
]  # Full catalogue: https://huggingface.co/unsloth

# Checkpoint to fine-tune and the sequence length handed to the loader.
base_checkpoint = "unsloth/gemma-3-4b-it"
context_window = 2048  # Any value may be chosen for long context

load_options = dict(
    model_name = base_checkpoint,
    max_seq_length = context_window,
    load_in_4bit = True,      # 4 bit quantization to reduce memory
    load_in_8bit = False,     # A bit more accurate, uses 2x memory
    full_finetuning = False,  # Full finetuning is available but not used here
    # token = "hf_...",       # Needed only for gated models
)

# Returns the model together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(**load_options)

In [None]:
# Wrap the loaded model with LoRA adapters: rank 8, alpha 16, dropout 0.05,
# no bias terms, and a fixed random_state of 3407.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Off: this run is text-only
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Adapt the attention modules
    finetune_mlp_modules       = True,  # Should leave on always! (see the note on target_modules)

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least; here alpha is 2 * r
    lora_dropout = 0.05,
    bias = "none",
    random_state = 3407,
    # NOTE(review): an explicit module list is passed together with the
    # finetune_* switches above. Presumably the explicit list takes precedence,
    # in which case only these four attention projections receive adapters and
    # the finetune_* switches (including finetune_mlp_modules = True) have no
    # effect - confirm against the installed Unsloth version.
    target_modules=['q_proj','k_proj','v_proj','o_proj']
)



## Data Preparation

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the Gemma-3 chat template to the tokenizer before the dataset is
# rendered. The template is selected with the "gemma-3" key, the same spelling
# the inference cell further down uses, so training and generation request the
# template by one consistent name.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [None]:
from datasets import load_dataset
# Load the "train" split of the franco334578/doric-conversations dataset.
dataset = load_dataset("franco334578/doric-conversations", split = "train")

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Pass the dataset through Unsloth's standardize_data_formats helper.
# NOTE(review): presumably this normalizes each row to role/content style
# "conversations", which the formatting function below reads - confirm.
dataset = standardize_data_formats(dataset)

In [None]:
# Display the example at index 100 as a quick sanity check.
dataset[100]

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Args:
        examples: Batch mapping with a "conversations" entry that holds one
            conversation per row.

    Returns:
        A dict with the single key "text", listing the rendered strings in
        the same order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Drop a leading "<bos>" if present; presumably the token is added
        # again when the text is tokenized for training (TODO confirm).
        rendered.append(prompt.removeprefix('<bos>'))
    return {"text": rendered}

# Run the formatter over the dataset in batches; its "text" output becomes a column.
dataset = dataset.map(formatting_prompts_func, batched = True)

In [None]:
# Display the rendered training text of the example at index 100.
dataset[100]["text"]

In [None]:
# Colab-only cell: google.colab.userdata is not importable elsewhere.
from google.colab import userdata
import os


# Copy the WANDB_API_KEY Colab secret into the process environment, then run
# the wandb command-line login.
os.environ['WANDB_API_KEY'] = userdata.get("WANDB_API_KEY")
!wandb login

In [None]:
from trl import SFTTrainer, SFTConfig
from datetime import datetime

# Timestamped run label (YYYYMMDD_HHMMSS) so separate launches get distinct names.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name_with_time = f"doric_v4_{current_time}"

# Training hyperparameters, grouped by concern.
training_args = SFTConfig(
    # Data
    dataset_text_field = "text",

    # Batching: 4 examples per device, accumulated over 8 steps
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,

    # Duration
    num_train_epochs = 3,
    # max_steps = 30,

    # Learning-rate schedule (rate changed from 2e-4)
    learning_rate = 5e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 0,
    warmup_ratio = 0.075,

    # Optimizer
    optim = "adamw_8bit",
    weight_decay = 0.01,

    # Reproducibility and reporting
    seed = 3407,
    logging_steps = 5,
    report_to = "wandb",
    run_name = run_name_with_time,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,  # No evaluation set is configured
    args = training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Wrap the trainer with train_on_responses_only, passing the strings that open
# a user turn and a model turn in the rendered text.
# NOTE(review): presumably this masks the loss on user turns so that only the
# model replies are learned - confirm against the Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the Gemma-3 chat template before building the prompt.
tokenizer = get_chat_template(tokenizer, chat_template = "gemma-3")

# One user turn containing a single text part.
messages = [
    {
        "role": "user",
        "content": [
            {"type" : "text", "text" : "What existed before the big bang?"},
        ],
    },
]

# Render and tokenize the conversation into a dict of PyTorch tensors.
encode_options = dict(
    add_generation_prompt = True,  # Must add for generation
    tokenize = True,
    return_tensors = "pt",
    return_dict = True,
)
inputs = tokenizer.apply_chat_template(messages, **encode_options)

# Recommended Gemma-3 sampling settings.
sampling_settings = dict(temperature = 1.0, top_p = 0.95, top_k = 64)

outputs = model.generate(
    **inputs.to("cuda"),
    max_new_tokens = 2048,  # Increase for longer outputs!
    **sampling_settings,
)

# Last expression of the cell: the decoded sequences are displayed.
tokenizer.batch_decode(outputs)

In [None]:
# Save the model and the tokenizer into the same local directory, gemma-3-doric.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably writes only the LoRA adapter weights - confirm.
model.save_pretrained("gemma-3-doric")  # Local saving
tokenizer.save_pretrained("gemma-3-doric")

In [None]:
# if False:
#     from unsloth import FastModel
#     model, tokenizer = FastModel.from_pretrained(
#         model_name = "gemma-3-doric", # YOUR MODEL YOU USED FOR TRAINING
#         max_seq_length = 2048,
#         load_in_4bit = True,
#     )

# messages = [{
#     "role": "user",
#     "content": [{"type" : "text", "text" : "Explain game theory using Python",}]
# }]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize = True,
#     return_tensors = "pt",
#     return_dict = True,
# )

# from transformers import TextStreamer
# _ = model.generate(
#     **inputs.to("cuda"),
#     max_new_tokens = 1440, # Increase for longer outputs!
#     # Recommended Gemma-3 settings!
#     temperature = 1.0, top_p = 0.95, top_k = 64,
#     streamer = TextStreamer(tokenizer, skip_prompt = True),
# )

In [None]:
from google.colab import userdata

# Always keep this
tokenizer.save_pretrained("gemma-3-doric")

if False: # Change to True to save/upload GGUF
    from unsloth import FastModel
    model.save_pretrained("gemma-3-doric")  # Local saving
    model.push_to_hub_gguf(
        "franco334578/unsloth-gemma-3-4b-it-doric-v4-GGUF",
        tokenizer,
        quantization_method = ["q4_k_m",], # Only Q8_0, BF16, F16 supported
        token = userdata.get("HF_TOKEN"),
    )

if True: # Change to True to save/upload float16 for VLLM
  model.save_pretrained_merged("gemma-3-finetune", tokenizer)
  model.push_to_hub_merged("franco334578/unsloth-gemma-3-4b-it-doric-v4-f16", tokenizer, save_method = "merged_16bit", token = userdata.get("HF_TOKEN"),)

In [None]:
# # run in a Colab shell cell
# !apt-get update
# !apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
# !git clone https://github.com/ggerganov/llama.cpp
# !cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
# !cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
# !cp llama.cpp/build/bin/llama-* llama.cpp

## Run the local model in llama.cpp

In [None]:
# Alternatively, open a terminal and run:

# ./llama.cpp/llama-cli \
#   -m gemma-3-4b-it.Q8_0.gguf \
#   --jinja \
#   --threads -1 \
#   --ctx-size 40960 \
#   --n-gpu-layers 99 \
#   --temp 0.7 \
#   --top-p 0.95



In [None]:
# # or load from HF

# !./llama.cpp/llama-cli -hf franco334578/unsloth-gemma-3-4b-it-doric-v4-GGUF