<a href="https://colab.research.google.com/github/sammyzane2/images2/blob/main/story_finetunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ✅ One-shot install for Unsloth on Google Colab (T4 GPU compatible)
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.18.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.6.1-py3-none-any.whl.metadata (8.1 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.18.2-py3-none-any.whl (366 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
# file ipython-input-18-49973641
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, Dataset # Import Dataset
import json # Import json
from accelerate import Accelerator # Import Accelerator

# Initialize Accelerator explicitly
# This helps ensure that accelerate correctly detects the device
accelerator = Accelerator()

max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!

# Load the instruction/response pairs from a local JSON file and wrap them in
# a Hugging Face `Dataset`. The file is expected to hold a list of objects,
# each with the keys listed in REQUIRED_COLUMNS (these are the fields that
# `formatting_func` reads further down).
DATASET_PATH = "formatted_dataset.json"
REQUIRED_COLUMNS = ("instruction", "response")

load_error = None
try:
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(DATASET_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
except FileNotFoundError:
    load_error = f"Error: {DATASET_PATH} not found."
except json.JSONDecodeError:
    load_error = f"Error: Could not decode {DATASET_PATH}. Make sure it is valid JSON."

if load_error is None:
    dataset = Dataset.from_list(data)
    # Fail here with a readable message instead of a KeyError in the middle of
    # trainer setup when the JSON does not have the expected fields.
    missing_columns = [name for name in REQUIRED_COLUMNS if name not in dataset.column_names]
    if missing_columns:
        raise ValueError(
            f"{DATASET_PATH} is missing required field(s) {missing_columns}; "
            f"found columns {dataset.column_names}."
        )
else:
    print(load_error)
    # Deliberate best-effort: keep the rest of the notebook runnable with a
    # one-row placeholder, but say so loudly because any model trained on it
    # is meaningless.
    print("Falling back to a one-row dummy dataset; training results will not be meaningful.")
    dataset = Dataset.from_dict({
        'instruction': ["This is a dummy instruction."],
        'response': ["This is a dummy response."]
    })


# Catalogue of Unsloth checkpoints carried over from the notebook template
# (described there as pre-quantized 4-bit uploads). Nothing in this cell reads
# the list; it is kept as a menu of names that can be assigned to
# `model_name_to_load` below. More models: https://huggingface.co/unsloth
fourbit_models = [
    # --- Llama 3.1 ---
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # --- Mistral ---
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # --- Phi ---
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # --- Gemma 2 ---
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # --- Llama 3.2 ---
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    # --- Llama 3.3 ---
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
]

# Checkpoint actually fine-tuned in this notebook: the Llama 3.2 1B base
# model, picked because it is small enough to fit in memory.
model_name_to_load = "unsloth/Llama-3.2-1B-bnb-4bit"

# Report up front whether a CUDA device is visible; this is informational
# only and does not stop the cell.
if torch.cuda.is_available():
    print(f"CUDA is available. Using device: {torch.cuda.current_device()}")
else:
    print("CUDA is not available. Training will not be possible on GPU.")


# Load the selected checkpoint and its tokenizer in 4-bit mode. 8-bit loading
# and full fine-tuning are switched off explicitly. No torch dtype is passed;
# the original author left precision selection to Unsloth.
# For gated models, additionally pass `token = "hf_..."`.
model, tokenizer = FastModel.from_pretrained(
    model_name = model_name_to_load,
    max_seq_length = max_seq_length,
    full_finetuning = False,
    load_in_8bit = False,
    load_in_4bit = True,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Patch the loaded model and attach the LoRA weights (rank 16, alpha 16,
# no dropout, no bias terms), using Unsloth's gradient checkpointing mode.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",
    max_seq_length = max_seq_length,
    random_state = 3407,
)

# Show the dataset's column names and types so the fields consumed by the
# formatting function can be checked by eye.
print(dataset.features)

# Define the formatting function based on your dataset structure
def formatting_func(examples):
    """Render instruction/response pairs as training prompts.

    Args:
        examples: Mapping with 'instruction' and 'response' entries. Each
            entry is either a list of strings (a batch) or a single string
            (one example).

    Returns:
        A list with one formatted string per example. A single example
        yields a one-element list, so the return type is always a list.
    """
    instructions = examples['instruction']
    responses = examples['response']

    # A single, non-batched example holds plain strings. Without this guard
    # the strings would be walked character by character, producing one
    # "prompt" per character or an IndexError when the response is shorter.
    if isinstance(instructions, str):
        instructions, responses = [instructions], [responses]

    # Adjust the template below if the model expects a different layout
    # (for example special tokens such as <|user|> and <|assistant|>).
    return [
        f"### Instruction:\n{instruction}\n### Response:\n{response}"
        for instruction, response in zip(instructions, responses)
    ]

# Training hyperparameters, kept in their own object so they can be inspected
# or tweaked separately from the trainer.
# fp16/bf16 are intentionally left unset: the original author chose to let
# Unsloth/Trainer pick the precision based on the 4-bit load.
sft_args = SFTConfig(
    output_dir = "outputs",
    report_to = "none",
    seed = 3407,
    max_seq_length = max_seq_length,
    # One example per device per step, with 8 accumulation steps.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    optim = "adamw_8bit",
    warmup_steps = 10,
    max_steps = 60,
    logging_steps = 1,
)

# Supervised fine-tuning trainer over the loaded dataset; `formatting_func`
# turns each record into the prompt text that gets tokenized.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    formatting_func = formatting_func,
    args = sft_args,
)

# Run the fine-tuning; as the last expression of the cell, the returned
# training summary is displayed.
trainer.train()

CUDA is available. Using device: 0
==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients
{'instruction': Value(dtype='string', id=None), 'response': Value(dtype='string', id=None)}


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/108 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 108 | Num Epochs = 5 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.4695
2,2.2281
3,2.3641
4,2.4478
5,2.3189
6,2.3058
7,2.359
8,2.327
9,2.3562
10,2.2847


TrainOutput(global_step=60, training_loss=2.27881178855896, metrics={'train_runtime': 654.7834, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.092, 'total_flos': 5404675029823488.0, 'train_loss': 2.27881178855896})

In [40]:
from transformers import TextStreamer

# Run this after training.
# Put the Unsloth-patched model into inference mode before generating; the
# model is otherwise still configured for training (gradient checkpointing).
FastLanguageModel.for_inference(model)

# Stream generated tokens to the cell output as they are produced, without
# echoing the prompt or special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = "JAPANESE FAIRY TALES."

# Wrap the prompt in the same "### Instruction / ### Response" template that
# `formatting_func` used for the training texts, so generation starts from the
# layout the adapters were tuned on.
# NOTE(review): this treats `prompt` as the instruction text - confirm that
# matches how instructions are phrased in formatted_dataset.json.
prompt_text = f"### Instruction:\n{prompt}\n### Response:\n"

# Generate response
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
_ = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
)


 7 - The BabooBab1 is a0b the book of B3d for d5 in English E4 F ghyin G I7 8 In9 a9 that A12 out10 and B13 14 on line15210 out163a16 for y17 in be 20 a Fish144g in be19out18a at222 a Future20 Information23 an inside25 about World35 out of26 Head28306 New World32 M for Free40 About I1 7 For2 Out20 I5 about 14 New7 about world-0
Jubut 7about me8about the10711u find8f14at1ing7for9at the11onbehind5just4bycause21after4becauseof1while4ifmaytheounlessuntilwheneversounlessotherwisewhensoeverwhereyetthoughwhilstwhilewhilst while yet all though wherever whatever whenever when
