<a href="https://colab.research.google.com/github/pravinpardeshi/LLM_FineTUning/blob/main/FT_Working_30May2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unsloth trl

In [None]:
from unsloth import FastLanguageModel

In [None]:
# Load the base checkpoint and its tokenizer in 4-bit form.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B",
    max_seq_length=1024,  # same limit is passed to the trainer further down
    load_in_4bit=True,
    dtype=None,  # no explicit dtype requested; presumably auto-selected by the library
)

In [None]:
# Display the freshly loaded model as the cell output for a quick inspection.
model

In [None]:
# Wrap the base model with LoRA adapters on the listed projection modules.
# NOTE: this rebinds `model`, so the cell should be run once per loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,  # Suggested 8, 16, 32, 64, 128
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,  # rank stabilized LoRA
    loftq_config=None,
    use_gradient_checkpointing=True,
    random_state=3407,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
import json, yaml
import torch

In [None]:
from datasets import load_dataset
import pandas as pd

from datasets import Dataset, DatasetDict
# Split name -> JSON file holding that split's records.
data_files = dict(train="20_records.json", validation="eval.json")

print(data_files["validation"])

# Training split: JSON file -> pandas DataFrame -> Dataset.
train_df = pd.read_json(data_files["train"])
train_dataset = Dataset.from_pandas(train_df)

# Validation split: same two steps.
validation_df = pd.read_json(data_files["validation"])
validation_dataset = Dataset.from_pandas(validation_df)

# Bundle both splits so they can be processed and indexed together.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
    }
)

#dataset = load_dataset("json", data_files=data_files)

In [None]:
# Prompt template with three positional `{}` slots. They are filled, in order,
# with the instruction, the input context and the response (see the
# `prompt_format.format(...)` calls in this notebook). The text is used verbatim
# for both training and inference, so it must not be altered between the two.
prompt_format = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence string taken from the tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of records into full training prompts.

    Args:
        examples: Mapping holding parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with the single key "text" whose value is a list with one
        formatted prompt per record, each terminated by ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    rendered = [
        prompt_format.format(instruction, context, answer) + EOS_TOKEN
        for instruction, context, answer in records
    ]
    return {"text": rendered}

# Uploaded custom data json into colab's session
#with open('37_records.csv', 'r') as f:

#    json_f = yaml.safe_load(f.read())

#df = pd.DataFrame(json_f)
#print(df.columns)

#dataset = Dataset.from_pandas(df)
# Apply the prompt formatter batch-wise so each record gains a "text" column.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)


In [None]:
# Show the dataset's column names; after the map step above a "text" column is expected.
dataset.column_names

In [None]:
from unsloth import  is_bfloat16_supported


In [None]:
from trl import SFTTrainer, SFTConfig

# Hyper-parameters for the supervised fine-tuning run, grouped by concern.
train_args = SFTConfig(
    output_dir="outputs",
    seed=3407,
    # --- schedule ---
    num_train_epochs=5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    # --- optimizer ---
    optim="adamw_torch",
    learning_rate=2e-4,
    weight_decay=0.01,
    # --- batching ---
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,
    # --- precision: bf16 when supported, fp16 otherwise ---
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    # --- log, evaluate and checkpoint once per epoch ---
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # --- best-checkpoint selection is driven by eval_loss ---
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

In [None]:
# Assemble the trainer from the LoRA model, the tokenizer and both dataset splits.
trainer = SFTTrainer(
    model=model,
    args=train_args,
    processing_class=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",  # column produced by formatting_prompts_func
    dataset_num_proc=2,
    max_seq_length=1024,  # same limit used when the model was loaded
    packing=False,
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept for later inspection.
trainer_status = trainer.train()

In [None]:
# Test the model: build one prompt with an empty response slot and tokenize it.

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

test_prompt = prompt_format.format(
    "You are an expert at drawing inferences based on your knowledge.",  # instruction
    "Based on your knowledge, create the best answer for the question asked on the Go Getter Book. If you do not know then say, you cannot answer the question.",  # input
    "",  # output - leave this blank for generation!
)

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")

In [None]:
# Generate up to 500 new tokens for the prepared prompt.
outputs = model.generate(**inputs, max_new_tokens = 500, use_cache = True)

# skip_special_tokens strips markers such as the BOS/EOS tokens, so the printed
# answer is plain text instead of ending in the tokenizer's EOS string.
response = tokenizer.batch_decode(outputs, skip_special_tokens = True)

# The decoded sequence still contains the prompt; print only the part that
# follows the response header.
print(response[0].split("### Response:")[1].strip())

In [None]:
# STEP 8
# Save locally as sharded model files
# model.save_pretrained_merged("amrs_csv_gen_model", tokenizer, save_method = "merged_16bit", )

In [None]:
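## STEP 9
# Push to the Hugging Face Hub - replace the "amrs-tech" namespace with your own username or organisation.
# Read the access token from the environment (requires `import os`); never paste a token into the notebook.
#model.push_to_hub_merged("amrs-tech/csv_gen_model",
#							tokenizer, save_method = "merged_16bit",
#							token = os.environ["HF_TOKEN"]
#							)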