<a href="https://colab.research.google.com/github/srinath-96/STTMachineTranslation/blob/main/finetuningGemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth



In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from unsloth import FastModel
import torch

# The model to use
model_name = "unsloth/gemma-3-1b-it-bnb-4bit"

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.6.2: Fast Gemma3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


In [22]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [23]:
import json

problematic_lines = []
file_path = '/content/drive/MyDrive/gemma_simple_dataset.jsonl'

print(f"Analyzing file: {file_path}")

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line)
                # Check if the 'output' field is NOT a string
                if 'output' in data and not isinstance(data['output'], str):
                    problematic_lines.append({
                        "line_number": i + 1,
                        "type_found": str(type(data['output'])),
                        "content": line.strip()
                    })
            except json.JSONDecodeError:
                problematic_lines.append({
                    "line_number": i + 1,
                    "type_found": "Invalid JSON",
                    "content": line.strip()
                })

    if problematic_lines:
        print(f"\nFound {len(problematic_lines)} problematic lines.")
        for problem in problematic_lines[:10]: # Print the first 10 problems
            print(f"  - Line {problem['line_number']}: Type is {problem['type_found']}, Content: {problem['content']}")
    else:
        print("\nAnalysis complete. No data type issues were found in the 'output' field.")

except FileNotFoundError:
    print(f"ERROR: The file was not found at {file_path}")
    print("Please make sure the path is correct.")

Analyzing file: /content/drive/MyDrive/gemma_simple_dataset.jsonl

Analysis complete. No data type issues were found in the 'output' field.


In [24]:
from datasets import load_dataset

# Gemma's instruction-tuning chat template
prompt_template = """<start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""
# Load your dataset
dataset = load_dataset("json", data_files="/content/drive/MyDrive/gemma_simple_dataset.jsonl", split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [25]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    packing = False, # Can make training faster but not necessary
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # A good starting point
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"]:   0%|          | 0/4018 [00:00<?, ? examples/s]

In [None]:
# Start the training!
trainer_stats = trainer.train()

# You can check the stats after training
#

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,018 | Num Epochs = 3 | Total steps = 1,509
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)


Step,Training Loss
10,3.225
20,2.1693
30,2.0308
40,1.9558
50,1.7808
60,1.9414
70,1.7712
80,1.8104
90,1.6717
100,1.8456


In [None]:
trainer_stats.metrics

In [12]:
# To save the LoRA adapters
model.save_pretrained("lora_gemma_translator")

# To run inference
from transformers import TextStreamer

# Use the same prompt structure for inference
instruction = "Translate the following text from Korean to English. The provided grammar information can be used as a hint."
korean_text = "Grammar: N은/는 N이에요/예요? - It is the interrogative form of 'N은/는 N이에요/예요'.\n\n저는 학생이에요."

prompt = prompt_template.format(instruction + "\n" + korean_text, "") # Leave the output empty

# Tokenize the input
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate the translation
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<bos><start_of_turn>user
Translate the following text from Korean to English. The provided grammar information can be used as a hint.
Grammar: N은/는 N이에요/예요? - It is the interrogative form of 'N은/는 N이에요/예요'.

저는 학생이에요.<end_of_turn>
<start_of_turn>model
<end_of_turn><bos><start_of_turn>user
Translate the following text from Korean to English. The provided grammar information can be used as a hint.
Grammar: N은/는 N이에요/예요? - It is the interrogative form of 'N은/는 N이에요/예요'.

저는 

You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.


학생이에요.<end_of_turn>
<start_of_turn>model
<end_of_turn>model
<end_of_turn>


In [16]:
from peft import PeftModel

In [18]:
# Your fine-tuned model and tokenizer should be loaded
# model = ...
# tokenizer = ...

# 1. Define and format the prompt (same as before)
instruction = "Translate the following text from Korean to English. The provided grammar information can be used as a hint."
grammar_hint = "Grammar: N은/는 N이에요/예요. - The statement form."
korean_sentence = "저는 학생이에요."

prompt_template = """<start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""

full_instruction = f"{instruction}\n{grammar_hint}\n\n{korean_sentence}"
prompt = prompt_template.format(full_instruction, "")


# 2. Tokenize the input and get its length
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
prompt_length = inputs.input_ids.shape[1]


# 3. Generate the output tokens without a streamer
print("Generating response...")
outputs = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
print("Generation complete.")


# 4. Decode the newly generated tokens ONLY
# This is the most reliable way to get the clean output.
newly_generated_tokens = outputs[0][prompt_length:]
clean_translation = tokenizer.decode(newly_generated_tokens, skip_special_tokens=True)


# --- Display the results ---
print("\n✅ Model's Clean Translation:")
print(clean_translation)


# --- For Debugging (Optional) ---
# You can uncomment the lines below to see exactly what the model generated,
# including the prompt and all special tokens.
#
# print("\n--- Full Raw Output (for debugging) ---")
# full_decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
# print(full_decoded_text)
# print("--------------------------------------")

Generating response...
Generation complete.

✅ Model's Clean Translation:
model

