In [1]:
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [2]:


# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.11 (you have 3.11.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B", #unsloth/Llama-3.2-3B-Instruct
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-27b-bnb-4bit", #unsloth/Llama-3.2-3B-Instruct
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.11 (you have 3.11.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.8.5: Fast Gemma2 patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/7.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/7.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.8.5 patched 46 layers with 46 QKV layers, 46 O layers and 46 MLP layers.


In [None]:
from huggingface_hub import login
login("****")  # Replace with your actual token


In [4]:

from datasets import load_dataset
dataset = load_dataset("ahmadtab/medical_qa_dataset_tr", split="train")

def convert_to_prompt_format(examples):
    instructions = ["Cevap verilecek bir sağlık sorusu var. Doktorun uzmanlığı: " + speciality
                    for speciality in examples["doctor_speciality"]]
    inputs = examples["question_content"]  # Kullanıcının sorduğu soru
    outputs = examples["question_answer"]  # Doktorun cevabı

    return {"instruction": instructions, "input": inputs, "output": outputs}

dataset = dataset.map(convert_to_prompt_format, batched=True)


README.md:   0%|          | 0.00/525 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/76.1M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/167604 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/37527 [00:00<?, ? examples/s]

Map:   0%|          | 0/167604 [00:00<?, ? examples/s]

In [9]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

#EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + "end"#EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/167604 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 200,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/167604 [00:00<?, ? examples/s]

In [11]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
19.07 GB of memory reserved.


In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 167,604 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 114,180,096 of 27,341,308,416 (0.42% trained)
AUTOTUNE bmm(64x184x128, 64x128x184)
  bmm 0.0276 ms 100.0% 
  triton_bmm_14 0.0287 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_6 0.0297 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_7 0.0307 ms 90.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  trit

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.6023
2,2.5439
3,2.6523
4,2.4814
5,2.4747
6,1.9882
7,2.112
8,2.1226
9,2.0908
10,1.9317


In [13]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

977.4387 seconds used for training.
16.29 minutes used for training.
Peak reserved memory = 19.07 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 48.209 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [14]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "kadin-hastaliklari-ve-dogum", # instruction
        "merhaba doktor bey, eşimle bebek sahibi olmak istiyoruz 2 aydır korunmuyoruz.bu arada vajinal kuruluğum olduğu için bebe yağı kullanıyorduk sanırım spermleri öldürüomuş bebe yağı pressed adında bi jeli gördüm netten önerir misiniz? ya da bu konuda ne önerir siniz? teşekkürler", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
kadin-hastaliklari-ve-dogum

### Input:
merhaba doktor bey, eşimle bebek sahibi olmak istiyoruz 2 aydır korunmuyoruz.bu arada vajinal kuruluğum olduğu için bebe yağı kullanıyorduk sanırım spermleri öldürüomuş bebe yağı pressed adında bi jeli gördüm netten önerir misiniz? ya da bu konuda ne önerir siniz? teşekkürler

### Response:


AUTOTUNE bmm(32x130x128, 32x128x130)
  triton_bmm_158 0.0154 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_157 0.0174 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_162 0.0174 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_165 0.0174 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_166 0.0174 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_161 0.

Merhaba, Bebe yağı kullanmayın. Vajinal kuruluk için vajinal jel kullanın.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adresine girin.endometriozis.com adres


In [15]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        " ", # instruction
        "mrb hocam ben panik atak hastasıyım hep vücudumu dinlemekle meşgulum midemde yanma geğirti var birde mide ve karın bölgemde kalp atışı gibi bişey var dışarıdan belli oluyor atışı özlellikle heyacanlandığımda sinirlendiğimde daha hızlı atıyor ne olabilir bu bir hastalık belirtisimi yoksa stresten mi oluyor ne yapacağımı bilmiyorum lütfen bana yardımcı olun", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
 

### Input:
mrb hocam ben panik atak hastasıyım hep vücudumu dinlemekle meşgulum midemde yanma geğirti var birde mide ve karın bölgemde kalp atışı gibi bişey var dışarıdan belli oluyor atışı özlellikle heyacanlandığımda sinirlendiğimde daha hızlı atıyor ne olabilir bu bir hastalık belirtisimi yoksa stresten mi oluyor ne yapacağımı bilmiyorum lütfen bana yardımcı olun

### Response:
Merhaba, panik atak hastalarında bu tür şikayetler sıklıkla görülür. Ancak bu şikayetlerinizin panik atakla ilgili olup olmadığını ancak bir psikiyatristin muayene etmesi ve değerlendirmesi sonucunda anlaşılabilir. Bu nedenle bir psikiyatristten randevu almanızda fayda var. Geçmiş olsun.endokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojiendokrinolojie

In [16]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        " ", # instruction
        "Ayşe Hocam merhaba, Şuan 23 aylık olan kızıma yaklaşık bir yıl önce epilepsi tedavisi uygulanmaya başlandı. Luminetten adlı ilacı kullanıyoruz. 14 aylıkken yeniden geçirdi ve size danışarak ilaç arttırımına gittik. dirençli bir enfeksiyon döneminde son nöbetinden 9 ay sonra yeniden nöbet geçirmeye başladı. Nöbetler uyku esnasında oluyor ve en uzunu 10 sn sürüyor. Kilosu10.5 şuan sabah 1.5 akşam 1.5 luminetten kullanıyoruz. Görmeniz gerekir mi? Kanındaki fenobarbital verildikten 4saat sonra 31.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
 

### Input:
Ayşe Hocam merhaba, Şuan 23 aylık olan kızıma yaklaşık bir yıl önce epilepsi tedavisi uygulanmaya başlandı. Luminetten adlı ilacı kullanıyoruz. 14 aylıkken yeniden geçirdi ve size danışarak ilaç arttırımına gittik. dirençli bir enfeksiyon döneminde son nöbetinden 9 ay sonra yeniden nöbet geçirmeye başladı. Nöbetler uyku esnasında oluyor ve en uzunu 10 sn sürüyor. Kilosu10.5 şuan sabah 1.5 akşam 1.5 luminetten kullanıyoruz. Görmeniz gerekir mi? Kanındaki fenobarbital verildikten 4saat sonra 31.

### Response:
Merhaba, Epilepsi tedavisi için çocuk nöroloji uzmanına başvurmanız gerekir.endokrinoloji uzmanı epilepsi tedavisi yapmaz.endokrinoloji uzmanı çocuklukta büyüme bozuklukları,ergenlik çağı sorunları,tiroid hastalıkları,diyabet,kalsiyum metabolizması bozuklukları,yüksek tansiyon,kalp hastalık

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
model.save_pretrained("/content/drive/MyDrive/turkishLLMCompetition/LastModel")  # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/turkishLLMCompetition/LastModel")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('/content/drive/MyDrive/turkishLLMCompetition/LastModel/tokenizer_config.json',
 '/content/drive/MyDrive/turkishLLMCompetition/LastModel/special_tokens_map.json',
 '/content/drive/MyDrive/turkishLLMCompetition/LastModel/tokenizer.model',
 '/content/drive/MyDrive/turkishLLMCompetition/LastModel/added_tokens.json',
 '/content/drive/MyDrive/turkishLLMCompetition/LastModel/tokenizer.json')

In [30]:
from google.colab import files
import os
import shutil

# Define the path to the saved model
model_path = "/content/Model2/lora_model"
zip_path = "/content/gemma-2-27b-bnb-4bit2222.zip"

# Create a zip file of the model directory
shutil.make_archive(zip_path.replace(".zip", ""), 'zip', model_path)

# Download the zip file
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files
import os
import shutil

# Define the path to the saved model
model_path = "/content/Model2/lora_model"
zip_path = "/content/gemma-2-27b-bnb-4bit2222.zip"

# Create a zip file of the model directory
shutil.make_archive(zip_path.replace(".zip", ""), 'zip', model_path)

# Download the zip file
files.download(zip_path)

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )

In [None]:
if True: model.push_to_hub_gguf("ahmadtab/3bModel", tokenizer, quantization_method = "q4_k_m", token = "****")


In [None]:
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")