In [1]:
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B", #unsloth/Llama-3.2-3B-Instruct
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from huggingface_hub import login
login("-***")  # Replace with your actual token

from datasets import load_dataset
dataset = load_dataset("alibayram/doktorsitesi", split="train")

def convert_to_prompt_format(examples):
    instructions = ["Cevap verilecek bir sağlık sorusu var. Doktorun uzmanlığı: " + speciality
                    for speciality in examples["doctor_speciality"]]
    inputs = examples["question_content"]  # Kullanıcının sorduğu soru
    outputs = examples["question_answer"]  # Doktorun cevabı

    return {"instruction": instructions, "input": inputs, "output": outputs}

dataset = dataset.map(convert_to_prompt_format, batched=True)


README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/84.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/187632 [00:00<?, ? examples/s]

Map:   0%|          | 0/187632 [00:00<?, ? examples/s]

In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

#EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + "end"#EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

#from datasets import load_dataset
#dataset = load_dataset("alibayram/doktorsitesi", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/187632 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/187632 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/187632 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/187632 [00:00<?, ? examples/s]

In [7]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.748 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 187,632 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.0068
2,3.1825
3,3.2808
4,3.1133
5,2.8352
6,2.8435
7,2.5903
8,2.6176
9,2.3805
10,2.3948


In [9]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

922.4141 seconds used for training.
15.37 minutes used for training.
Peak reserved memory = 7.637 GB.
Peak reserved memory for training = 1.889 GB.
Peak reserved memory % of max memory = 51.808 %.
Peak reserved memory for training % of max memory = 12.815 %.


In [10]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "kadin-hastaliklari-ve-dogum", # instruction
        "merhaba doktor bey, eşimle bebek sahibi olmak istiyoruz 2 aydır korunmuyoruz.bu arada vajinal kuruluğum olduğu için bebe yağı kullanıyorduk sanırım spermleri öldürüomuş bebe yağı pressed adında bi jeli gördüm netten önerir misiniz? ya da bu konuda ne önerir siniz? teşekkürler", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
kadin-hastaliklari-ve-dogum

### Input:
merhaba doktor bey, eşimle bebek sahibi olmak istiyoruz 2 aydır korunmuyoruz.bu arada vajinal kuruluğum olduğu için bebe yağı kullanıyorduk sanırım spermleri öldürüomuş bebe yağı pressed adında bi jeli gördüm netten önerir misiniz? ya da bu konuda ne önerir siniz? teşekkürler

### Response:
Merhaba, Bebe Yağı Pressed diye bir ürün yok. Bu ürünün adı Bebe Yağı'dir. Bebe yağı sperm öldürmez. Ancak korunma yöntemini kullanmamanız bebe yağı kullanmanızın da bebe yağı kullanmanızızn sperm öldürmesi arasında fark yoktur. Bu nedenle korunma yöntemini kullanmanız gerekir. Geçmiş olsunendan sonra korunma yöntemini kullanmanız gerekirendan sonra bu sorununuzu tekrar etmeyinendan sonra korunma yöntemini kullanmanız gerekir. Geçmiş olsunendan sonra


In [11]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        " ", # instruction
        "mrb hocam ben panik atak hastasıyım hep vücudumu dinlemekle meşgulum midemde yanma geğirti var birde mide ve karın bölgemde kalp atışı gibi bişey var dışarıdan belli oluyor atışı özlellikle heyacanlandığımda sinirlendiğimde daha hızlı atıyor ne olabilir bu bir hastalık belirtisimi yoksa stresten mi oluyor ne yapacağımı bilmiyorum lütfen bana yardımcı olun", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
 

### Input:
mrb hocam ben panik atak hastasıyım hep vücudumu dinlemekle meşgulum midemde yanma geğirti var birde mide ve karın bölgemde kalp atışı gibi bişey var dışarıdan belli oluyor atışı özlellikle heyacanlandığımda sinirlendiğimde daha hızlı atıyor ne olabilir bu bir hastalık belirtisimi yoksa stresten mi oluyor ne yapacağımı bilmiyorum lütfen bana yardımcı olun

### Response:
Bu durumda doktorunuza danışmanızda yarar var.endoskopik incelemeler ve diğer testler ile bu durumunuzu aydınlatmak gerekebilir.endoskopik incelemeler için endoskopist ile görüşmenizde yarar var.endoskopistiniz ile görüşüp teşhis ve tedavi olmanızda yarar var.endoskopistiniz ile görüşüp teşhis ve tedavi olmanızda yarar var.endoskopistiniz ile görüşüp teşhis ve tedavi olmanızda yarar var.endoskopistiniz ile görüşüp te

In [12]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        " ", # instruction
        "Ayşe Hocam merhaba, Şuan 23 aylık olan kızıma yaklaşık bir yıl önce epilepsi tedavisi uygulanmaya başlandı. Luminetten adlı ilacı kullanıyoruz. 14 aylıkken yeniden geçirdi ve size danışarak ilaç arttırımına gittik. dirençli bir enfeksiyon döneminde son nöbetinden 9 ay sonra yeniden nöbet geçirmeye başladı. Nöbetler uyku esnasında oluyor ve en uzunu 10 sn sürüyor. Kilosu10.5 şuan sabah 1.5 akşam 1.5 luminetten kullanıyoruz. Görmeniz gerekir mi? Kanındaki fenobarbital verildikten 4saat sonra 31.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
 

### Input:
Ayşe Hocam merhaba, Şuan 23 aylık olan kızıma yaklaşık bir yıl önce epilepsi tedavisi uygulanmaya başlandı. Luminetten adlı ilacı kullanıyoruz. 14 aylıkken yeniden geçirdi ve size danışarak ilaç arttırımına gittik. dirençli bir enfeksiyon döneminde son nöbetinden 9 ay sonra yeniden nöbet geçirmeye başladı. Nöbetler uyku esnasında oluyor ve en uzunu 10 sn sürüyor. Kilosu10.5 şuan sabah 1.5 akşam 1.5 luminetten kullanıyoruz. Görmeniz gerekir mi? Kanındaki fenobarbital verildikten 4saat sonra 31.

### Response:
Merhaba. Şuan nöbetleriniz bir sorun değil, normal bir süreç. Bu nedenle ilaçlarınızı azaltmanız gerekmiyor. Geçmiş olsun.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe etmeyin.endişe 

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is a famous tall tower in Paris?

### Input:


### Response:
One of the most famous and iconic tall towers in Paris is the Eiffel Tower. Standing at 324 meters (1,063 feet) tall, this wrought iron tower is a symbol of the city and a must-see attraction for tourists from all over the world.<|end_of_text|>


In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )

In [None]:
if True: model.push_to_hub_gguf("ahmadtab/3bModel", tokenizer, quantization_method = "q4_k_m", token = "****")


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.49 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 44%|████▍     | 14/32 [00:01<00:01, 11.80it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:42<00:00,  3.19s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving ahmadtab/3bModel/pytorch_model-00001-of-00004.bin...
Unsloth: Saving ahmadtab/3bModel/pytorch_model-00002-of-00004.bin...
Unsloth: Saving ahmadtab/3bModel/pytorch_model-00003-of-00004.bin...
Unsloth: Saving ahmadtab/3bModel/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at ahmadtab/3bModel into f16 GGUF format.
The output location will be /content/ahmadtab/3bModel/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: 3bModel
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

In [None]:
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")