# 0. Introduction
Fine-tune an LLM with LoRA using the Hugging Face ecosystem (Transformers, Datasets, TRL) together with Unsloth.

# 1. Install
_Run once_

In [136]:
# !pip install --upgrade pip

# !pip install unsloth
# !pip install "torch >= 2.0.0"
# !pip install "bitsandbytes >= 0.40.0"
# !pip install "transformers >= 4.30.0"
# !pip install "accelerate >= 0.20.3"
# !pip install "datasets >= 2.12.0"
# !pip install "trl > 0.6.0"
# !pip install "unsloth >= 0.1.0"
# !pip install "peft >= 0.4.0"
# !pip install "evaluate"
# !pip install "tensorboard"

# !pip install colorama

# 2. Imports

In [164]:
import os
from datetime import datetime

import torch

import bitsandbytes as bnb

import accelerate

import trl

import datasets as ds
from datasets import load_dataset
from datasets import Dataset
from datasets import DatasetDict
from datasets import IterableDataset
from datasets import IterableDatasetDict

import transformers
from transformers import AutoTokenizer

import unsloth
from unsloth import FastLanguageModel

from typing import Any

from colorama import Fore, Back, Style

## 2.1. Check versions

In [165]:
# Report the versions of the libraries this notebook relies on, followed by the
# CUDA status, so a reader can compare their environment with the recorded output.
library_versions = [
    ("Unsloth Version\t\t", unsloth.__version__),
    ("Torch Version\t\t", torch.__version__),
    ("BitsAndBytes Version\t", bnb.__version__),
    ("Transformers Version\t", transformers.__version__),
    ("Accelerate Version\t", accelerate.__version__),
    ("Datasets Version\t", ds.__version__),
    ("TRL Version\t\t", trl.__version__),
]
for label, version in library_versions:
    print(f"{label}: {Style.BRIGHT}{Fore.CYAN}{version}{Style.RESET_ALL}")

# The leading newline on the first label separates the CUDA section from the versions.
cuda_status = [
    ("\nCUDA Availability\t", torch.cuda.is_available()),
    ("CUDA Device Count\t", torch.cuda.device_count()),
]
for label, status in cuda_status:
    print(f"{label}: {Style.BRIGHT}{Fore.GREEN}{status}{Style.RESET_ALL}")

Unsloth Version		: [1m[36m2025.6.2[0m
Torch Version		: [1m[36m2.7.0+cu126[0m
BitsAndBytes Version	: [1m[36m0.46.0[0m
Transformers Version	: [1m[36m4.52.4[0m
Accelerate Version	: [1m[36m1.7.0[0m
Datasets Version	: [1m[36m3.6.0[0m
TRL Version		: [1m[36m0.18.1[0m

CUDA Availability	: [1m[32mTrue[0m
CUDA Device Count	: [1m[32m1[0m


# 3. Variables

## 3.1. Environment Variables

In [166]:
# Environment flags for Unsloth / tokenizers, needed mainly when running on Colab.
# NOTE(review): `unsloth` is already imported in the Imports cell above. If the library
# reads UNSLOTH_COMPILE_DISABLE at import time, this assignment comes too late — confirm,
# and move this cell before the imports if so.
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
# Turns off parallelism in the `tokenizers` library (presumably to avoid fork-related
# warnings when worker processes are used — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## 3.2. User Variables

In [167]:
# Number of training steps; passed to ProductionLoraConfig in the "Run training code" cell.
max_steps: int = 1000

# Defaults picked up by prepare_dataset_from_hugging_face when it is called without arguments.
# NOTE(review): the training run uses ProductionLoraConfig.model_name
# ("openai-community/gpt2"), not this value — confirm "distilgpt2" is intended here.
model_name: str = "distilgpt2"
dataset_name: str = "timdettmers/openassistant-guanaco"
# Bytes per GiB; used as the divisor when printing CUDA memory usage.
gigs: int = 1024**3

# 4. Code

## 4.1. Prepare dataset
Steps:
<ol>
    <li>Fetch a dataset</li>
    <li>Put an <font color="#22819F"><b>eos_token</b></font> if the examples do not have one at the end</li>
    <li>In the labels of the tokenized dataset, replace the ids of pad tokens with the reserved ignore index (<i>-100 here</i>) so that padding does not contribute to the loss</li>
    <li>Tokenize</li>
</ol>

### 4.1.1. Fetch a dataset
(from the Hugging Face Hub)

In [168]:
def get_dataset(dataset_name: str, max_samples: int | None = None
    ) -> Dataset | DatasetDict | IterableDataset | IterableDatasetDict:
    """
    Load the "train" split of a dataset from the Hugging Face Hub.

    Args:
        dataset_name (str): Name of the dataset on the Hub.
        max_samples (int | None): Maximum number of samples to keep. The first
            `max_samples` rows are selected. `None` keeps the whole split.

    Returns:
        Dataset | DatasetDict | IterableDataset | IterableDatasetDict:
            The loaded (and possibly truncated) train split.

    Raises:
        ValueError: If `max_samples` is negative.
    """
    # Fail fast: a negative limit would otherwise silently produce an empty dataset.
    if max_samples is not None and max_samples < 0:
        raise ValueError(f"max_samples must be non-negative or None, got {max_samples}")

    dataset = load_dataset(dataset_name, split="train")

    # Only truncate when the limit is actually smaller than the dataset.
    if max_samples is not None and max_samples < len(dataset):
        dataset = dataset.select(range(max_samples))

    print(f"Dataset {Fore.LIGHTBLUE_EX}{dataset_name}{Style.RESET_ALL} loaded successfully.")

    return dataset

In [169]:
def get_tokenizer(model_name: str) -> Any:
    """
    Load the fast tokenizer for a model and make sure it has a pad token.

    Args:
        model_name (str): Name of the model whose tokenizer is loaded.

    Returns:
        Any: The tokenizer; its pad token is set to the EOS token when it had none.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    print(f"Tokenizer {Fore.LIGHTBLUE_EX}{model_name}{Style.RESET_ALL} loaded successfully.")

    # Padding to a fixed length needs a pad token; fall back to EOS when none is defined.
    pad_token_missing = loaded_tokenizer.pad_token is None
    if pad_token_missing:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    return loaded_tokenizer

### 4.1.2. Put an <font color="#22819F"><b>eos_token</b></font> if the examples do not have one at the end

In [170]:
def add_pad_tokens_at_end(texts, tokenizer) -> list[str]:
    """
    Make sure every text ends with the tokenizer's EOS token.

    Despite the function name, the token appended is `tokenizer.eos_token`.

    Args:
        texts: Iterable of strings to process.
        tokenizer: Object exposing an `eos_token` string attribute.

    Returns:
        list[str]: The texts in their original order; those that did not already
        end with the EOS token have it appended.
    """
    return [
        text if text.endswith(tokenizer.eos_token) else text + tokenizer.eos_token
        for text in texts
    ]

### 4.1.3. Change <font color="#22819F">pad_token id</font> to reserved keyword

In [171]:
def change_pad_tokens_ids_to_res_keyword(tokenized_text_id: list[int], tokenizer: Any) -> list[int]:
    tokenized_texts_label_ids = []

    for label_token_ids in tokenized_text_id:
        processed_label_token_ids: list[str] = []

        for token_id in label_token_ids:
            if token_id == tokenizer.pad_token_id:
                token_id = -100

            processed_label_token_ids.append(token_id)

        tokenized_texts_label_ids.append(processed_label_token_ids)

    return tokenized_texts_label_ids

### 4.1.4. Tokenize

In [172]:
def tokenize(examples: Any, tokenizer: Any, max_length: int):
    """
    Tokenize a batch of examples and attach training labels.

    Used as the batched function for `dataset.map`.

    Args:
        examples: Batch exposing a "text" column (a list of strings).
        tokenizer: Tokenizer used to encode the texts.
        max_length (int): Sequences are truncated / padded to this length.

    Returns:
        The tokenizer output with an extra "labels" entry: a copy of
        "input_ids" in which padding positions are set to -100.
    """
    texts: list[str] = examples["text"]
    processed_texts: list[str] = add_pad_tokens_at_end(texts, tokenizer)

    tokenized_texts = tokenizer(
        processed_texts,
        max_length = max_length,
        truncation = True,
        padding = "max_length",
        return_attention_mask = True,
        return_tensors = "pt")

    # Mask padding by attention mask rather than by token id. get_tokenizer sets
    # pad_token = eos_token when the model has no pad token, so masking by id
    # would also hide the EOS token appended above.
    labels = tokenized_texts["input_ids"].clone()
    labels[tokenized_texts["attention_mask"] == 0] = -100
    tokenized_texts["labels"] = labels

    return tokenized_texts

In [173]:
def prepare_dataset_from_hugging_face(
    dataset_name: str = dataset_name,
    model_name: str = model_name,
    max_samples: int = None,
    max_length: int = 512
    ) -> tuple[Dataset, Any]:
    """
    Download a dataset, load the matching tokenizer and tokenize the dataset.

    The defaults for `dataset_name` and `model_name` are the values the
    module-level variables of the same name hold when this cell is executed.

    Args:
        dataset_name (str): Name of the dataset.
        model_name (str): Name of the model whose tokenizer is used.
        max_samples (int): Maximum number of samples to use; None uses all.
        max_length (int): Length every example is truncated / padded to.

    Returns:
        tuple[ds.Dataset, AutoTokenizer]: Tokenized dataset and tokenizer.
    """
    raw_dataset = get_dataset(dataset_name, max_samples)
    tokenizer = get_tokenizer(model_name)

    print(f"{Fore.CYAN}Tokenization started{Style.RESET_ALL}")
    # The original columns are dropped, so only the columns produced by `tokenize` remain.
    tokenized_dataset = raw_dataset.map(
        tokenize,
        batched=True,
        remove_columns=raw_dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
        desc="Tokenizing dataset",
    )
    print(f"{Fore.CYAN}Tokenization completed{Style.RESET_ALL}")

    return tokenized_dataset, tokenizer

### 4.1.5. Test Data preparation module

In [174]:
# tokenized_dataset, tokenizer = prepare_dataset_from_hugging_face()

In [175]:
# tokenized_dataset

In [176]:
# tokenizer

## 4.2. Train with <font color = "#EE0099"><b>Lo</b></font>w <font color = "#EE0099"><b>R</b></font>ank <font color = "#EE0099"><b>A</b></font>daptation

## 4.2.1. Prepare <font color = "#FF0077">LoRA</font> Config
<i>Note: This config class is only a carrier for all settings (defaults and user-assigned values). It does not construct or apply LoRA itself.</i>

In [177]:
from dataclasses import dataclass
import math


@dataclass
class ProductionLoraConfig:
    model_name: str = "openai-community/gpt2"
    max_seq_length: int = 512
    dtype: torch.dtype = torch.float16
    lora_r: int = 16 # lora attention dimension, or rank of matrices: the minimum number of independent rows / columns in a matrix is rank.
    lora_alpha: int = 16 # alpha parameter: factor which is multiplied to LoRA module matrices before it is added to original params. lower values makes LoRA params less significant.
    lora_dropout: float = 0.05 # probability for dropping out the LoRA elements in low rank matrices to prevent overfitting. it is regularisation technique.
    target_modules: list = None # modules are q_proj, K_proj, etc depending on the architecture
    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: torch.dtype = torch.float16
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True
    bnb_8bit_quant_type: str = "nf4"
    bnb_8bit_use_double_quant: bool = True
    bnb_8bit_compute_dtype: torch.dtype = torch.float16
    max_steps: int = 2000 # risk of overfitting
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    learning_rate: float = 2e-4
    weight_decay: float = 0.001
    warmup_steps: int = 100
    warmup_ratio: int = 0.1
    optimizer: str = "adamw_8bith"
    save_steps: int = 250
    save_total_limit: int = 3
    eval_strategy: str = "steps"
    eval_steps: int = 250
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "loss"
    greater_is_better: bool = False # for "accuracy" metric, make it true
    gradient_checkpointing: bool = True # recommended for large models on a limited GPU memory. In Hugging Face platform, it is implemented as model.gradient_checkpointing_enabled().
    dataloader_num_workers: int = 4
    remove_unused_columns: bool = False
    group_by_length: bool = True
    ddp_find_unused_parameters: bool = False
    logging_steps: int = 10
    report_to: str = "tensorboard" # "wandb" or "tensorboard" or "none"
    # task_type is ignored here as a param as we choose the type of LLM later on.


    def __post_init__(self):
        if self.target_modules is None:
            if "openai-community/gpt2" in self.model_name:
                self.target_modules = ["c_attn", "c_proj"]
            else:
                self.target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]


    def get_effective_batch_size(self) -> tuple[int, int]:
        num_gpus: int = 1

        if torch.cuda.is_available():
            num_gpus = torch.cuda.device_count()

        effective_batch_size: int = self.per_device_train_batch_size * num_gpus * self.gradient_accumulation_steps

        return effective_batch_size, num_gpus


    def find_max_tab_count(self, items: str) -> int:
        max_length: int = 0

        for item in items:
            if item.startswith("_"):
                continue

            if len(item) > max_length:
                max_length = len(item)

        return int(math.ceil(max_length / 4))


    def print_config(self):
        print("LoRA Configuration")

        max_tab_count = self.find_max_tab_count(self.__dict__.keys())

        for field, value in self.__dict__.items():
            if not field.startswith("_"):
                tab_count = max_tab_count - int(math.floor(len(field) / 4))
                tabs: str = "\t" * tab_count

                print(f"{field}{tabs}: {Style.BRIGHT}{Fore.CYAN}{value}{Style.RESET_ALL}")

        effective_batch_size, num_gpus = self.get_effective_batch_size()

        print(f"\n{'Effective Batch Size':30s}\t: {Style.BRIGHT}{Fore.CYAN}{effective_batch_size}{Style.RESET_ALL} across {Style.BRIGHT}{Fore.RED}{num_gpus}{Style.RESET_ALL} GPU(s)\n\n")

## 4.2.2. Configure LoRA using the above carrier config class

In [178]:
from transformers import BitsAndBytesConfig


def get_model_and_tokenizer(config: ProductionLoraConfig):
    """
    Load the quantized base model and its tokenizer through Unsloth.

    Args:
        config (ProductionLoraConfig): Supplies the model name, sequence
            length, dtype and the 4-bit quantization settings.

    Returns:
        tuple: (model, tokenizer). The tokenizer's pad token falls back to its
        EOS token when it has none.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit = config.load_in_4bit,
        bnb_4bit_compute_dtype = config.bnb_4bit_compute_dtype,
        bnb_4bit_quant_type = config.bnb_4bit_quant_type,
        bnb_4bit_use_double_quant = config.bnb_4bit_use_double_quant
    )

    # Automatic device placement is requested only when more than one GPU is visible.
    device_map = None
    if torch.cuda.device_count() > 1:
        device_map = "auto"

    loaded = FastLanguageModel.from_pretrained(
        model_name = config.model_name,
        max_seq_length = config.max_seq_length,
        dtype = config.dtype,
        quantization_config = quantization,
        device_map = device_map,
        # NOTE(review): this allows code shipped with the checkpoint to run; confirm it is needed.
        trust_remote_code = True
    )
    model, tokenizer = loaded

    # Keep pad token and pad token id in sync when falling back to EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer

## 4.2.3. Inject LoRA in the model using <font color = "#5577FF">unsloth.FastLanguageModel.<b>get_peft_model(...)</b></font>

In [179]:
def inject_lora_in_model(model: FastLanguageModel, config: ProductionLoraConfig):
    """
    Wrap the base model with LoRA adapters via Unsloth and report memory usage.

    Args:
        model: Base model returned by `FastLanguageModel.from_pretrained`.
        config (ProductionLoraConfig): Supplies rank, alpha, dropout, target
            modules, sequence length and the gradient-checkpointing switch.

    Returns:
        The model with LoRA adapters attached.
    """
    lora_injected_model: FastLanguageModel = FastLanguageModel.get_peft_model(
        model = model,
        r = config.lora_r,
        target_modules = config.target_modules,
        lora_alpha = config.lora_alpha,
        lora_dropout = config.lora_dropout,
        bias = "none",
        # Honour the config switch instead of always forcing Unsloth's checkpointing.
        use_gradient_checkpointing = "unsloth" if config.gradient_checkpointing else False,
        random_state = 42,
        max_seq_length = config.max_seq_length,
        use_rslora = False # Rank-Stabilized LoRA is not used
    )

    # `gradient_checkpointing` is a bool, so test its truth value. The previous
    # `is not None` check was always true and ignored a False setting.
    if config.gradient_checkpointing:
        lora_injected_model.gradient_checkpointing_enable()

    lora_injected_model.print_trainable_parameters()

    if torch.cuda.is_available():
        device_count: int = torch.cuda.device_count()

        for i in range(device_count):
            allocated = torch.cuda.memory_allocated(i) / gigs
            cached = torch.cuda.memory_reserved(i) / gigs
            print(f"Memory allocated for GPU {Fore.MAGENTA}{i}{Style.RESET_ALL}:\n\tAllocated: {Style.BRIGHT}{Fore.GREEN}{allocated:.2f} GB{Style.RESET_ALL}\n\tCached: {Style.BRIGHT}{Fore.GREEN}{cached:.2f} GB{Style.RESET_ALL}")

    # Status message is independent of CUDA, so it is printed on every run.
    print("Model and Tokenizer loaded with LoRA config")

    return lora_injected_model

## 4.2.4. Train with unsloth

In [180]:
from transformers import TrainerCallback


class ProductionTrainerCallback(TrainerCallback):
    """
    Trainer callback that prints training progress to stdout.

    It prints the start / end time and duration of training, the loss and
    learning rate of every training log, the best training loss seen so far,
    the allocated CUDA memory on steps divisible by 50, and evaluation losses.
    """

    def __init__(self):
        self.start_time = None
        self.best_loss = float('inf')
        self.time_format = "%Y-%m-%d %H:%M:%S"


    def on_train_begin(self, args, state, control, **kwargs):
        """Record and print the training start time."""
        self.start_time = datetime.now()

        print(f"Training started at {Fore.GREEN}{self.start_time.strftime(self.time_format)}{Style.RESET_ALL}")


    def on_train_end(self, args, state, control, **kwargs):
        """Print the end time, the duration (when the start is known) and the best loss."""
        end_time = datetime.now()
        print(f"Training completed at {Fore.GREEN}{end_time.strftime(self.time_format)}{Style.RESET_ALL}")

        # start_time stays None if on_train_begin never ran for this instance.
        if self.start_time is not None:
            duration = end_time - self.start_time
            print(f"Training duration: {Fore.GREEN}{duration}{Style.RESET_ALL}")

        print(f"Best loss: {Fore.GREEN}{self.best_loss}{Style.RESET_ALL}")


    def on_log(self, args, state, control, logs = None, **kwargs):
        """Print loss / learning rate for training logs and track the best loss."""
        # Only logs that carry a "loss" entry are training-step logs. Other logs
        # (e.g. the end-of-training summary) must not touch best_loss. The old
        # `-1` placeholder made best_loss permanently -1 on such logs.
        if not logs or "loss" not in logs:
            return

        train_loss = logs["loss"]
        current_learning_rate = logs.get("learning_rate")
        learning_rate_text = f"{current_learning_rate:.2e}" if current_learning_rate is not None else "n/a"

        if train_loss < self.best_loss:
            self.best_loss = train_loss

        print(f"Step {Fore.RED}{state.global_step:>4d}{Style.RESET_ALL} | Loss: {Fore.GREEN}{train_loss:.4f}{Style.RESET_ALL} | LR: {Fore.GREEN}{learning_rate_text}{Style.RESET_ALL} | Best Loss: {Fore.GREEN}{self.best_loss:.4f}{Style.RESET_ALL}")

        if torch.cuda.is_available() and state.global_step % 50 == 0:
            for i in range(torch.cuda.device_count()):
                allocated = torch.cuda.memory_allocated(i) / gigs

                print(f"GPU {Fore.MAGENTA}{i}{Style.RESET_ALL}\tMemory allocated: {Fore.GREEN}{allocated:.2f}{Style.RESET_ALL}GB")


    def on_evaluate(self, args, state, control, logs = None, metrics = None, **kwargs):
        """
        Print the evaluation loss.

        The Trainer passes evaluation results as `metrics`; `logs` is kept so
        existing callers that pass it keep working.
        """
        results = metrics if metrics is not None else logs

        if not results or "eval_loss" not in results:
            return

        eval_loss = results["eval_loss"]

        print(f"Step {Fore.RED}{state.global_step}{Style.RESET_ALL} | Evaluation Loss: {Fore.GREEN}{eval_loss:.4f}{Style.RESET_ALL} | Best Loss: {Fore.GREEN}{self.best_loss:.4f}{Style.RESET_ALL}")

In [181]:
from transformers import TrainingArguments
from trl import SFTTrainer



def create_production_trainer(model, tokenizer, train_dataset, eval_dataset, config:ProductionLoraConfig):
    """
    Build the SFT trainer and the timestamped output directory name.

    Several settings are fixed here regardless of `config`: evaluation is
    disabled, reporting is "none", no dataloader workers are used, and
    length grouping and best-model loading are off. `eval_dataset` is accepted
    but not passed to the trainer.

    NOTE(review): both `warmup_ratio` and `warmup_steps` are passed; check the
    TrainingArguments documentation for which one takes effect.

    Args:
        model: Model with LoRA adapters attached.
        tokenizer: Tokenizer handed to the trainer.
        train_dataset: Tokenized training dataset.
        eval_dataset: Currently unused.
        config (ProductionLoraConfig): Training hyperparameters.

    Returns:
        tuple: (trainer, output_dir).
    """
    run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_dir: str = f"./lora_production_{run_stamp}"

    # Mixed-precision mode follows the dtype chosen in the config.
    use_fp16 = config.dtype == torch.float16
    use_bf16 = config.dtype == torch.bfloat16

    training_args = TrainingArguments(
        # Output and checkpoints
        output_dir = output_dir,
        overwrite_output_dir = True,
        save_strategy = "steps",
        save_steps = config.save_steps,
        save_total_limit = config.save_total_limit,
        load_best_model_at_end = False, # Temporary fix for colab. avoid conflict with Supervised Fine Tuning trainer with multi processes.

        # Schedule and batch sizes
        max_steps = config.max_steps,
        per_device_train_batch_size = config.per_device_train_batch_size,
        per_device_eval_batch_size = config.per_device_train_batch_size,
        gradient_accumulation_steps = config.gradient_accumulation_steps,

        # Optimisation
        optim = config.optimizer,
        learning_rate = config.learning_rate,
        weight_decay = config.weight_decay,
        warmup_ratio = config.warmup_ratio,
        warmup_steps = config.warmup_steps,
        gradient_checkpointing = config.gradient_checkpointing,
        fp16 = use_fp16,
        bf16 = use_bf16,

        # Evaluation, logging and reporting
        eval_strategy = "no",
        logging_strategy = "steps",
        logging_steps = config.logging_steps,
        report_to = "none",
        skip_memory_metrics = False,

        # Data loading
        dataloader_num_workers = 0,
        dataloader_pin_memory = True,
        remove_unused_columns = False,
        group_by_length = False,

        # Multi GPU training
        ddp_find_unused_parameters = config.ddp_find_unused_parameters
    )

    trainer = SFTTrainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        tokenizer = tokenizer,
        max_seq_length = config.max_seq_length,
        callbacks = [ProductionTrainerCallback()]
    )

    return trainer, output_dir

## 4.2.5. Train production model method

### 4.2.5.1. Step 1: Dataset preparation code

In [182]:
def prepare_dataset(dataset_name: str, model_name: str, max_samples: int, max_seq_length: int):
    """
    Prepare the tokenized training data for the pipeline.

    Args:
        dataset_name (str): Name of the dataset.
        model_name (str): Name of the model whose tokenizer is used.
        max_samples (int): Maximum number of samples to use.
        max_seq_length (int): Length every example is truncated / padded to.

    Returns:
        tuple: (train_dataset, eval_dataset, tokenizer). The whole tokenized
        dataset is used for training and `eval_dataset` is always None.
    """
    train_dataset, tokenizer = prepare_dataset_from_hugging_face(
        dataset_name,
        model_name,
        max_samples,
        max_seq_length,
    )

    # No evaluation split is created at this stage.
    return train_dataset, None, tokenizer

### 4.2.5.2. Step 5: Save objects
Notes:
<ul>
    <li>Saves adapter and tokenizer.</li>
    <li>Steps 2, 3, and 4 are in the training production model method</li>
</ul>

In [183]:
from peft import PeftModel
from transformers import AutoModelForCausalLM


def merge_models(model, root_dir, base_model_name: str = "openai-community/gpt2"):
    """
    Merge the saved LoRA adapter into a freshly loaded base model.

    Args:
        model: Unused; kept so existing callers keep working. The base model
            is reloaded from `base_model_name`.
        root_dir: Directory containing the "adapter" sub-directory.
        base_model_name (str): Base checkpoint the adapter was trained on.

    Returns:
        The base model with the adapter weights merged in.
    """
    adapter_dir = os.path.join(root_dir, "adapter")
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
    peft_model = PeftModel.from_pretrained(base_model, adapter_dir)

    # Merge the adapters, and release the memory
    return peft_model.merge_and_unload()


# merge_and_save_objects has a boolean parameter that is also called `merge_models`
# and shadows the function inside it; this alias keeps the function reachable.
_merge_adapter_into_base = merge_models


def save_object(obj, root_dir, dir_name, is_sfttrainer: bool = False):
    """
    Save one object into `root_dir/dir_name`, creating the directory if needed.

    Args:
        obj: Object exposing `save_model` (trainer) or `save_pretrained`.
        root_dir: Parent output directory.
        dir_name: Sub-directory name for this object.
        is_sfttrainer (bool): When True `obj.save_model` is called, otherwise
            `obj.save_pretrained`.
    """
    target_dir = os.path.join(root_dir, dir_name)
    os.makedirs(target_dir, exist_ok = True)

    if is_sfttrainer:
        obj.save_model(target_dir)
    else:
        obj.save_pretrained(target_dir)


def save_objects(model, tokenizer, trainer, root_dir):
    """Save the model to "adapter", the tokenizer to "tokenizer" and the trainer's model to "final_model"."""
    save_object(model, root_dir, "adapter")
    save_object(tokenizer, root_dir, "tokenizer")
    save_object(trainer, root_dir, "final_model", True)


def merge_and_save_objects(model, tokenizer, trainer, root_dir, merge_models: bool = False,
                           base_model_name: str = "openai-community/gpt2"):
    """
    Save adapter, tokenizer and trainer model, and optionally a merged model.

    Args:
        model: Model with LoRA adapters.
        tokenizer: Tokenizer to save.
        trainer: Trainer whose model is saved to "final_model".
        root_dir: Output directory.
        merge_models (bool): When True, the adapter is also merged into the
            base model and the result is saved to "merged_model".
        base_model_name (str): Base checkpoint used for the merge.
    """
    # Save first: the merge step reads the adapter back from root_dir/"adapter".
    save_objects(model, tokenizer, trainer, root_dir)

    if merge_models:
        merged_model = _merge_adapter_into_base(model, root_dir, base_model_name)
        save_object(merged_model, root_dir, "merged_model")

### 4.2.5.3. train production model method

In [184]:
def train_production_model(dataset_name: str = "timdettmers/openassistant-guanaco", config: ProductionLoraConfig = None, max_samples: int = None, eval_split_ratio = 0.0):
    """
    Run the whole LoRA fine-tuning pipeline: data, model, training, saving.

    Args:
        dataset_name (str): Dataset to fine-tune on.
        config (ProductionLoraConfig): Pipeline settings; a default
            configuration is created when None.
        max_samples (int): Maximum number of samples to use; None uses all.
        eval_split_ratio: Currently unused; no evaluation split is created.

    Returns:
        tuple: (trainer, output_dir) — the trained trainer and the directory
        the artefacts were saved to.
    """
    if config is None:
        config = ProductionLoraConfig()

    print("LoRA pipeline started")
    config.print_config()

    # Step 1: call dataset preparation code
    train_dataset, eval_dataset, tokenizer = prepare_dataset(dataset_name, config.model_name, max_samples, config.max_seq_length)

    # Step 2: load model (the tokenizer returned here replaces the one from step 1)
    model, tokenizer = get_model_and_tokenizer(config)
    model = inject_lora_in_model(model, config)

    # Step 3: Training setup
    trainer, output_dir = create_production_trainer(model, tokenizer, train_dataset, eval_dataset, config)

    # Step 4: Train
    trainer.train()

    # Step 5: Save model
    merge_and_save_objects(model, tokenizer, trainer, output_dir)

    return trainer, output_dir

### 4.2.5.4. Run training code

In [185]:
# Settings for this run, grouped by purpose. Anything not listed keeps the
# default declared on ProductionLoraConfig.
production_config = ProductionLoraConfig(
    # Model, precision and quantization
    model_name = "openai-community/gpt2",
    dtype = torch.float16,
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",

    # LoRA adapter
    lora_r = 16,
    lora_alpha = 16,

    # Optimisation schedule
    max_steps = max_steps,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    warmup_steps = 100,
    optimizer = "adamw_8bit",

    # Checkpointing and logging
    save_steps = 250,
    logging_steps = 25,

    # Evaluation is switched off for this run
    eval_strategy = "no",
    eval_steps = None,
    load_best_model_at_end = False
)

print("Production training pipeline started")

Production training pipeline started


In [186]:
# Fine-tune on at most 1000 samples of the dataset using the configuration built above.
# `output_dir` is the timestamped directory the artefacts are written to.
run_arguments = dict(
    dataset_name = "timdettmers/openassistant-guanaco",
    config = production_config,
    max_samples = 1000,
    eval_split_ratio = 0.0,
)
trainer, output_dir = train_production_model(**run_arguments)

print("Production training pipeline ended")

LoRA pipeline started
LoRA Configuration
model_name					: [1m[36mopenai-community/gpt2[0m
max_seq_length				: [1m[36m512[0m
dtype						: [1m[36mtorch.float16[0m
lora_r						: [1m[36m16[0m
lora_alpha					: [1m[36m16[0m
lora_dropout				: [1m[36m0.05[0m
target_modules				: [1m[36m['c_attn', 'c_proj'][0m
load_in_4bit				: [1m[36mTrue[0m
bnb_4bit_compute_dtype		: [1m[36mtorch.float16[0m
bnb_4bit_quant_type			: [1m[36mnf4[0m
bnb_4bit_use_double_quant	: [1m[36mTrue[0m
bnb_8bit_quant_type			: [1m[36mnf4[0m
bnb_8bit_use_double_quant	: [1m[36mTrue[0m
bnb_8bit_compute_dtype		: [1m[36mtorch.float16[0m
max_steps					: [1m[36m1000[0m
per_device_train_batch_size	: [1m[36m2[0m
gradient_accumulation_steps	: [1m[36m4[0m
learning_rate				: [1m[36m0.0002[0m
weight_decay				: [1m[36m0.001[0m
warmup_steps				: [1m[36m100[0m
warmup_ratio				: [1m[36m0.1[0m
optimizer					: [1m[36madamw_8bit[0m
save_steps					: [1m[36m250[0m
save_total_limit	

Repo card metadata block was not found. Setting CardData to empty.


Dataset [94mtimdettmers/openassistant-guanaco[0m loaded successfully.
Tokenizer [94mopenai-community/gpt2[0m loaded successfully.
[36mTokenization started[0m
[36mTokenization completed[0m
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.2: Fast Gpt2 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gpt2 does not support SDPA - switching to eager!
openai-community/gpt2 does not have a padding token! Will use pad_token = <|endoftext|>.
Unsloth: Making `model.base_model.model.transformer` require gradients
trainable params: 1,622,016 || all params: 126,061,824 || trainable%: 1.2867
Memory

Step,Training Loss
25,26.071
50,15.6258
75,11.1537
100,9.6104
125,9.5972
150,9.5498
175,9.5345
200,9.3698
225,9.5346
250,9.824


Step [31m  25[0m | Loss: [32m26.0710[0m | LR: Loss: [32m4.80e-05[0m | Best Loss: [32m26.0710[0m
Step [31m  50[0m | Loss: [32m15.6258[0m | LR: Loss: [32m9.80e-05[0m | Best Loss: [32m15.6258[0m
GPU [35m0[0m	Memory allocated: [32m0.51[0mGB
Step [31m  75[0m | Loss: [32m11.1537[0m | LR: Loss: [32m1.48e-04[0m | Best Loss: [32m11.1537[0m
Step [31m 100[0m | Loss: [32m9.6104[0m | LR: Loss: [32m1.98e-04[0m | Best Loss: [32m9.6104[0m
GPU [35m0[0m	Memory allocated: [32m0.51[0mGB
Step [31m 125[0m | Loss: [32m9.5972[0m | LR: Loss: [32m1.95e-04[0m | Best Loss: [32m9.5972[0m
Step [31m 150[0m | Loss: [32m9.5498[0m | LR: Loss: [32m1.89e-04[0m | Best Loss: [32m9.5498[0m
GPU [35m0[0m	Memory allocated: [32m0.51[0mGB
Step [31m 175[0m | Loss: [32m9.5345[0m | LR: Loss: [32m1.84e-04[0m | Best Loss: [32m9.5345[0m
Step [31m 200[0m | Loss: [32m9.3698[0m | LR: Loss: [32m1.78e-04[0m | Best Loss: [32m9.3698[0m
GPU [35m0[0m	Memory allocated

## 4.3. Infer

In [187]:
from transformers import pipeline

In [210]:
# Open-ended prompts that the fine-tuned model is asked to complete in the inference loop below.
prompts = [
    "The future of AI is",
    "The story of Terminator 2 movie is",
    "The most important lesson from the story of David and Goliath is",
    "Climate change represents a challenge that",
    "Innovation in healthcare could lead to"
]

In [211]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch

class Completer:
    """
    Text-completion helper around a saved causal language model.

    Loads the tokenizer and model from `model_path`, wraps them in a
    text-generation pipeline and completes prompts with sampling.
    """

    def __init__(self, model_path, max_new_tokens: int = 200):
        """
        Args:
            model_path: Directory holding the saved model and tokenizer.
            max_new_tokens (int): Maximum number of tokens generated per prompt.
        """
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_path)
        print("Final model loaded")

        self.generator = pipeline(
            task = "text-generation",
            model = self.model,
            tokenizer = self.tokenizer,
            max_new_tokens = max_new_tokens,
            device = 0 if torch.cuda.is_available() else -1,
            pad_token_id = self.tokenizer.eos_token_id,
            bos_token_id = self.tokenizer.eos_token_id # Set bos_token_id here as well
        )

        # A single length limit is used everywhere. The previous "max_length": 400
        # conflicted with the pipeline's max_new_tokens=200.
        self.generation_config: dict = {
            "max_new_tokens": max_new_tokens,
            "num_return_sequences": 1,
            "temperature": 0.7,
            "do_sample": True,
            "pad_token_id": self.tokenizer.eos_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "bos_token_id": self.tokenizer.eos_token_id
        }


    def complete_prompt(self, prompt: str) -> str:
        """
        Generate a completion for `prompt`.

        Args:
            prompt (str): Text to continue.

        Returns:
            str: The "generated_text" of the first returned sequence.
        """
        # Rebuilt on every call so edits to self.generation_config take effect.
        gen_config = GenerationConfig.from_dict(self.generation_config)
        outputs = self.generator(prompt, generation_config = gen_config)

        return outputs[0]["generated_text"]

In [212]:
# Load the model saved by the training run above. Deriving the path from `output_dir`
# (instead of a hard-coded timestamped directory) keeps this cell valid after
# "Restart Kernel -> Run All", where a new timestamped directory is created.
model_path = os.path.join(output_dir, "final_model")

text_completer: Completer = Completer(model_path)
print()

# Complete every prompt and print the prompt next to the generated text.
for i, prompt in enumerate(prompts):
    print(f"Prompt {Fore.LIGHTRED_EX}{i}{Style.RESET_ALL}:\n\t{Fore.BLUE}{prompt}...{Style.RESET_ALL}")
    generated_text: str = text_completer.complete_prompt(prompt)
    print(f"\nCompleted prompt:\n\t{Style.BRIGHT}{Fore.GREEN}{generated_text}{Style.RESET_ALL}\n\n")

print("Code completed!")

Device set to use cuda:0


Final model loaded

Prompt [91m0[0m:
	[34mThe future of AI is...[0m

Completed prompt:
	[1m[32mThe future of AI is always up for debate.[0m


Prompt [91m1[0m:
	[34mThe story of Terminator 2 movie is...[0m

Completed prompt:
	[1m[32mThe story of Terminator 2 movie is a story of a robot who is trying to take over the world and kill humans. The robot tries to kill humans, but the humans are able to stop him and make him believe he is the only one left.

The real Terminator 3 is a story of a robot who has gone rogue and is trying to save humanity from a mysterious force.

The real Terminator 4 is a story of a robot who is trying to take over the world and kill humans. The robot tries to save humanity from a mysterious force.[0m


Prompt [91m2[0m:
	[34mThe most important lesson from the story of David and Goliath is...[0m

Completed prompt:
	[1m[32mThe most important lesson from the story of David and Goliath is to not dwell on the small things. They had nothing to lose.

# END