# Installation des dépendances

In [None]:
%%capture
import os

print(os.environ.keys())

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
# Install latest Hugging Face for Gemma-3!
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
!pip install -U ipywidgets

# Configuration du modèle

In [None]:
from unsloth import FastModel
import torch

# Sequence-length limit passed to the loader below.
max_seq_length = 2048

# Loader options, gathered in one place so they are easy to scan and tweak.
base_model_options = dict(
    model_name="unsloth/gemma-3-1b-pt-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,  # Choose any for long context!
    load_in_4bit=True,              # 4 bit quantization to reduce memory
    load_in_8bit=False,             # A bit more accurate, uses 2x memory
    full_finetuning=False,          # Full finetuning is available but not used here
    # token="hf_...",               # use one if using gated models
)

# Returns the model together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(**base_model_options)




In [None]:

# Attach LoRA adapters to the loaded model through FastModel.get_peft_model.
# NOTE(review): `model` is rebound to the wrapped result, so re-running this cell
# passes the already-wrapped model back in; re-run the loading cell first.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = False,  # Attention modules are NOT tuned here (the template notes they help for GRPO)
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

In [None]:
from datasets import load_dataset

# Load the CSV file into a Hugging Face dataset object (indexed by split name below).
dataset = load_dataset("csv", data_files="fr-train-dataset2.csv")

# Select the "train" split, which the following cells work on.
train_dataset = dataset["train"]

# Preview: print the first row, then show the dataset object as the cell output.
print(train_dataset[0])
train_dataset

In [None]:
# Show the "Address" value of the first row (the column later used as model input).
train_dataset[0]["Address"]

In [None]:
# Column whose value becomes the "question" in `transform`.
src_col = "Address"

# Columns kept out of the target; add any column to ignore here.
exclude_cols = ["Address", "Street_Name_old"]

# Every remaining column becomes a target field.
tgt_cols = [name for name in train_dataset.column_names if name not in exclude_cols]


# Transform
def transform(example):
    """Turn one dataset row into a question/answer pair.

    ``question`` is the row's ``src_col`` value; ``answer`` is a dict holding the
    row's ``tgt_cols`` values keyed by column name, in ``tgt_cols`` order.
    Both ``src_col`` and ``tgt_cols`` are read from the notebook's global scope.
    """
    question = example[src_col]
    answer = {}
    for field in tgt_cols:
        answer[field] = example[field]
    return {"question": question, "answer": answer}

# Apply `transform` to every row, producing the "question" and "answer" fields.
dataset_refined = train_dataset.map(transform)


# Preview the first transformed row.
print(dataset_refined[0])


# List the columns present after the transform.
print(dataset_refined.column_names)


In [None]:
# Inspect the target dict built by `transform` for the first row.
dataset_refined[0]["answer"]

In [None]:
# Inspect the input text built by `transform` for the first row.
dataset_refined[0]["question"]

# Configuration de l'entraînement

In [None]:
import json

def format_example(example):
    """Render one question/answer row as a single training string.

    The result has the form ``Parsing: <question> \\nChamps: <answer as JSON>``
    followed by the tokenizer's EOS token, returned under the ``"text"`` key.
    Non-ASCII characters are kept as-is in the JSON (``ensure_ascii=False``).
    """
    question = example["question"]
    fields_json = json.dumps(example["answer"], ensure_ascii=False)
    prompt_and_answer = f"Parsing: {question} \nChamps: {fields_json}"
    return {"text": prompt_and_answer + tokenizer.eos_token}

# Add the rendered "text" field to every row.
formatted_dataset = dataset_refined.map(format_example)

# Keep only "text": every other column is dropped.
extra_columns = [name for name in formatted_dataset.column_names if name != "text"]
formatted_dataset = formatted_dataset.remove_columns(extra_columns)

# Preview the first training string.
print(formatted_dataset[0])



In [None]:
from trl import SFTTrainer, SFTConfig

# Supervised fine-tuning trainer over the single-column "text" dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    args=SFTConfig(
        # Reuse the notebook-level limit instead of repeating the literal, so the
        # trainer cannot drift from the value the model was loaded with.
        max_seq_length=max_seq_length,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=30,  # training stops after this many optimizer steps
        learning_rate=2e-4,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        report_to="none",  # no external experiment tracker
    ),
)


In [None]:
# Sanity check: decode one tokenized training example back to text.
# Row 100 is used when it exists; a shorter dataset falls back to its last row
# instead of raising IndexError.
preview_index = min(100, len(trainer.train_dataset) - 1)
tokenizer.decode(trainer.train_dataset[preview_index]["input_ids"])

In [None]:
# @title Show current memory stats
# Snapshot of GPU 0 before training; `start_gpu_memory` and `max_memory` are
# reused by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
total_bytes = gpu_stats.total_memory

# Bytes -> GB (three successive divisions by 1024), rounded to 3 decimals.
start_gpu_memory = round(reserved_bytes / 1024 / 1024 / 1024, 3)
max_memory = round(total_bytes / 1024 / 1024 / 1024, 3)

print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning; the returned object's `metrics` are read by the final-stats cell.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory after training, in GB and relative to the totals
# captured by the "current memory stats" cell.
peak_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_bytes / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

# Save It

In [None]:
# Deliberate stop: raising here is meant to keep a "Run All" from continuing into
# the saving/export cells below. Run those cells manually when needed.
raise SystemExit("Execution stopped here on purpose.")

In [None]:
# Model and tokenizer are written to the same directory so it can be loaded by name later.
lora_dir = "gemma3-address-parser-lora"
model.save_pretrained(lora_dir, save_adapter=True)
tokenizer.save_pretrained(lora_dir)

# The model config alone is additionally written to a separate directory.
model.config.save_pretrained("gemma3-address-parser-base")

In [None]:
from unsloth import FastModel
# Reload the saved fine-tuned model directory for a quick inference check.
model, tokenizer = FastModel.from_pretrained(
    model_name = "gemma3-address-parser-lora", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    load_in_4bit = True,
)

address = "Leclerc 10 bis route Victor Hugo 92200 Neuilly-sur-Seine"

# Same template as the training text, cut right before the JSON answer.
prompt = f"Parsing: {address} \nChamps:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Greedy decoding (do_sample=False). The sampling knobs temperature/top_p/top_k
# are ignored in this mode, so they are no longer passed: the call now states
# plainly that the output is deterministic.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# The decoded text contains the prompt followed by the generated fields.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Save the model through `save_pretrained_merged`, together with the tokenizer,
# into "gemma3-address-parser-finetune". There is no flag to toggle in this cell;
# the GGUF export is a separate call.


model.save_pretrained_merged("gemma3-address-parser-finetune", tokenizer)

In [None]:
# Export to GGUF under "gemma3-address-parser-finetune" using F16 weights.
model.save_pretrained_gguf("gemma3-address-parser-finetune",
    quantization_type = "F16", # For now only Q8_0, BF16, F16 supported
)

In [None]:
from unsloth import FastLanguageModel

# Load the saved LoRA directory by name, the same way the other loading cells
# in this notebook do (no separate base-model argument is passed).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "gemma3-address-parser-lora",  # The LoRA repo (with adapter_config.json)
    load_in_4bit = False,
)

# Merge LoRA into the base weights and write the result, with the tokenizer,
# to disk in one step. `save_pretrained_merged` is the same export call used
# for "gemma3-address-parser-finetune" elsewhere in this notebook.
# NOTE: `model` in memory is left as loaded above; it is still usable for the
# inference cell that follows.
model.save_pretrained_merged(
    "gemma3-address-parser-merged",
    tokenizer,
    save_method = "merged_16bit",
)


# run it

In [None]:
address = "Leclerc 10 bis route Victor Hugo 92200 Neuilly-sur-Seine"

# Same template as the training text, cut right before the JSON answer.
prompt = f"Parsing: {address} \nChamps:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Greedy decoding (do_sample=False). The sampling knobs temperature/top_p/top_k
# are ignored in this mode, so they are no longer passed.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# The decoded text contains the prompt followed by the generated fields.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


# use unsloth to run model

In [None]:
# Deliberate stop: raising here is meant to keep a "Run All" from continuing into
# the optional cells below (they are marked "# skip").
raise SystemExit("Execution stopped here on purpose.")

In [None]:
# skip
from unsloth import FastLanguageModel
import torch

# Options for loading the fine-tuned model in float16 without 4-bit quantization.
inference_load_options = {
    "model_name": "gemma3-address-parser-lora",  # YOUR MODEL YOU USED FOR TRAINING
    "max_seq_length": 1024,
    "dtype": torch.float16,
    "load_in_4bit": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**inference_load_options)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

In [None]:
# skip
address = "Leclerc 10 bis avenue Victor Hugo 92200 Neuilly-sur-Seine"
# Same template as the training text, cut right before the JSON answer.
prompt = f"Parsing: {address} \nChamps:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# Greedy decoding (do_sample=False). The sampling knobs temperature/top_p/top_k
# are ignored in this mode, so they are no longer passed.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# The decoded text contains the prompt followed by the generated fields.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))