## <center>**`Implementation`**</center>

#### Check gpus availability

In [1]:
# check gpus availability
import torch

print(f"GPU Available: {torch.cuda.is_available()}")  
print(f"GPU Name: {torch.cuda.get_device_name(0)}")

GPU Available: True
GPU Name: NVIDIA GeForce RTX 4080 SUPER


### Load libraries

In [2]:
import os
import sys
import json
import math
import numpy as np
from datetime import datetime

import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import (
    TrainingArguments,
    TrainerCallback,
    EarlyStoppingCallback,)

import wandb
import gc

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Params

In [3]:
# Model and dataset configuration
model_name = "meta-llama/Meta-Llama-3-8B"
dataset_path = "fka/awesome-chatgpt-prompts"
working_dir = './'
output_dir = os.path.join(working_dir, "unsloth_lab_outputs")

# Training parameters - Proven stable hyperparameters from memory
learning_rate = 2e-5  # Conservative learning rate for stability
num_epochs = 100
batch_size = 4
gradient_accumulation_steps = 2
max_seq_length = 2048
warmup_steps = 50
weight_decay = 0.1  # Strong regularization
max_grad_norm = 0.3  # Conservative gradient clipping

# LoRA parameters - Reduced for stability
lora_r = 16  # Reduced rank for stability
lora_alpha = 16
lora_dropout = 0.2
'''
target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
'''
target_modules = ["q_proj", "v_proj"]

# Early stopping and stability
early_stopping_patience = 4
eval_steps = 50
save_steps = 100

# Weights & Biases
#wandb_project = "llama-3.1-8b-instruct-sg-legislation"
wandb_project = "llama-3-8B-finetuning"
wandb_run_name = None

# Other parameters
resume_from_checkpoint = None
test_dataset_split = 0.2
seed = 42

### Config

In [4]:
import logging

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("unsloth_finetuning.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)


### Helper functions

#### Load and prepare dataset

In [5]:
def load_and_prepare_dataset(dataset_path, tokenizer):
    """Load and prepare dataset."""
    logger.info(f"Loading dataset from {dataset_path}")
    try:
        dataset = load_dataset(dataset_path)
        logger.info(f"Dataset loaded successfully: {dataset}")

        dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)
        dataset = dataset["train"]

        dataset = dataset.remove_columns('act')

        # Verify dataset structure
        logger.info(
            (
            f"Sample formatted dataset: "
            f"{dataset[:1]}..." )
        )

        return dataset
    
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}")
        raise

#### Load foundation model and tokenizer

In [6]:
def load_model_unsloth(model_name, max_seq_length=2048, load_in_4bit=False, dtype=None):
    """ Load model and tokenizer with Unsloth optimizations """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map="auto"    
    )
    return model, tokenizer

#### PEFT version of the foundation model

In [7]:
def load_peft_model(model, target_modules, lora_r=16, lora_alpha=16, lora_dropout=0.0, bias="none", random_state=42):
    """ Set LoRA config and return PEFT model """

    peft_model = FastLanguageModel.get_peft_model(
        model=model,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias=bias,
        use_gradient_checkpointing="unsloth",
        random_state=random_state
    )
    return peft_model

#### Create training arguments

In [8]:
def create_training_arguments(output_dir, learning_rate, num_epochs, batch_size=4,
                              warmup_steps=0, weight_decay=0., 
                              lr_scheduler_type='linear', max_grad_norm=1.0,
                              gradient_accumulation_steps=1, run_name=None, 
                              logging_steps=500, eval_steps=250, save_steps=500, seed=42):
    """ Create training args"""

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        max_grad_norm=max_grad_norm,
        warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type,  # Stable cosine scheduler
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        eval_strategy="steps",  # Fixed: was evaluation_strategy
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="wandb",
        run_name=run_name,
        seed=seed,
        data_seed=seed,
        # RTX optimizations
        #bf16=True,  # Use bf16 to match model precision
        #fp16=False,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        dataloader_pin_memory=True,
        dataloader_num_workers=4,
        remove_unused_columns=False,
        # Stability improvements
        save_safetensors=True,
        ddp_find_unused_parameters=False,
    )
    return training_args

#### Trainer

In [9]:
def create_trainer(model, tokenizer, training_args, 
                   train_dataset, eval_dataset = None,
                   max_seq_length=2048, dataset_text_field="text",
                   callbacks=[]):
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field=dataset_text_field,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=True,
        dataset_kwargs={
            "add_special_tokens": False,
            "append_concat_token": False,
        },
        callbacks=callbacks,        
    )
    return trainer

#### Callbacks

In [10]:
class StabilityCallback(TrainerCallback):
    """Callback for training stability and divergence detection."""

    def __init__(
        self,
        divergence_threshold=1.3,
        min_steps_before_check=100,
        patience=2,
        no_improvement_patience=5,
        gradient_explosion_threshold=5.0,
    ):
        super().__init__()
        self.divergence_threshold = divergence_threshold
        self.min_steps_before_check = min_steps_before_check
        self.patience = patience
        self.no_improvement_patience = no_improvement_patience
        self.gradient_explosion_threshold = gradient_explosion_threshold
        self.best_val_loss = float("inf")
        self.divergence_count = 0
        self.no_improvement_count = 0
        self.validation_losses = []
        self.gradient_norms = []
        self.consecutive_high_gradients = 0

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Monitor gradient norms and training stability."""
        if logs:
            # Track gradient norms
            if "grad_norm" in logs:
                self.gradient_norms.append(logs["grad_norm"])

                # Check for gradient explosion
                if logs["grad_norm"] > self.gradient_explosion_threshold:
                    self.consecutive_high_gradients += 1
                    logger.error(
                        f"""Gradient explosion detected: {
                            logs['grad_norm']:.4f} > {
                            self.gradient_explosion_threshold}"""
                    )

                    if self.consecutive_high_gradients >= 3:
                        logger.error(
                            (
                            "Stopping training due to persistent "
                            "gradient explosion"
                        )
                        )
                        control.should_training_stop = True
                        return
                elif logs["grad_norm"] > 1.0:
                    logger.warning(
                        f"""High gradient norm detected: {
                            logs['grad_norm']:.4f}"""
                    )
                    self.consecutive_high_gradients = max(
                        0, self.consecutive_high_gradients - 1
                    )
                else:
                    self.consecutive_high_gradients = 0

                # Log gradient statistics
                if len(self.gradient_norms) >= 10:
                    recent_norms = self.gradient_norms[-10:]
                    avg_norm = np.mean(recent_norms)
                    max_norm = np.max(recent_norms)

                    wandb.log(
                        {
                            "gradient_norm_avg_10": avg_norm,
                            "gradient_norm_max_10": max_norm,
                            "gradient_norm_current": logs["grad_norm"],
                            "consecutive_high_gradients":
                                self.consecutive_high_gradients,
                        },
                        step=state.global_step,
                    )

            # Check for NaN/Inf in training loss
            if "train_loss" in logs:
                if np.isnan(logs["train_loss"]) or np.isinf(
                    logs["train_loss"]
                ):
                    logger.error(
                        (
                        f"NaN/Inf training loss detected at "
                        f"step {state.global_step}"
                    )
                    )
                    control.should_training_stop = True

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        """Callback for divergence detection and early stopping."""
        if (
            logs
            and "eval_loss" in logs
            and state.global_step >= self.min_steps_before_check
        ):
            current_val_loss = logs["eval_loss"]
            self.validation_losses.append(current_val_loss)

            # Check for improvement
            if current_val_loss < self.best_val_loss:
                improvement = self.best_val_loss - current_val_loss
                self.best_val_loss = current_val_loss
                self.divergence_count = 0
                self.no_improvement_count = 0
                logger.info(
                    f"""New best validation loss: {
                        self.best_val_loss:.4f} (improvement: {
                        improvement:.4f})"""
                )
            else:
                self.no_improvement_count += 1

                # Check for divergence
                if (
                    current_val_loss
                    > self.best_val_loss * self.divergence_threshold
                ):
                    self.divergence_count += 1
                    logger.warning(
                        f"""Potential divergence detected: {
                            current_val_loss:.4f} > {
                            self.best_val_loss *
                            self.divergence_threshold:.4f} (count: {
                            self.divergence_count})"""
                    )

                    if self.divergence_count >= self.patience:
                        logger.error(
                            f"""Training diverged! Stopping at step {
                                state.global_step}"""
                        )
                        control.should_training_stop = True
                        return

                # Check for no improvement
                if (
                    self.no_improvement_count
                    >= self.no_improvement_patience
                ):
                    logger.warning(
                        (
                        (f"No improvement for {self.no_improvement_count} "
                         f"evaluations. Stopping training.")
                    )
                    )
                    control.should_training_stop = True
                    return

            # Calculate perplexity
            perplexity = math.exp(current_val_loss)

            # Enhanced logging
            wandb.log(
                {
                    "eval_perplexity": perplexity,
                    "best_val_loss": self.best_val_loss,
                    "divergence_count": self.divergence_count,
                    "no_improvement_count": self.no_improvement_count,
                    "val_loss_trend": (
                        current_val_loss - self.validation_losses[-2]
                        if len(self.validation_losses) >= 2
                        else 0
                    ),
                },
                step=state.global_step,
            )

            logger.info(
                f"""Step {
                    state.global_step}: Validation Loss: {
                    current_val_loss:.4f}, Perplexity: {
                    perplexity:.2f}, No improvement: {
                    self.no_improvement_count}"""
            )




#### Get latest checkpoint

In [11]:
def find_latest_checkpoint(output_dir):
    """Find the latest checkpoint in the output directory."""
    if not os.path.exists(output_dir):
        return None
    
    checkpoints = []
    for item in os.listdir(output_dir):
        if item.startswith("checkpoint-") and os.path.isdir(
            os.path.join(output_dir, item)
        ):
            try:
                step_num = int(item.split("-")[1])
                checkpoints.append(
                    (step_num, os.path.join(output_dir, item))
                )
            except (ValueError, IndexError):
                continue

    if checkpoints:
        # Return the checkpoint with the highest step number
        latest_step, latest_path = max(checkpoints, key=lambda x: x[0])
        logger.info(
            f"Found latest checkpoint: {latest_path} (step {latest_step})"
        )
        return latest_path
    
    return None

#### System infos

In [12]:
def print_system_info():
    """System information."""
    logger.info("=== System Information ===")
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        logger.info(f"CUDA version: {torch.version.cuda}")
        logger.info(f"GPU count: {torch.cuda.device_count()}")
        logger.info(f"GPU name: {torch.cuda.get_device_name(0)}")
        logger.info(f"GPU memory: { torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    logger.info("=== End System Information ===")

#### Main

In [13]:
def main():
    # Print system information
    print_system_info()

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    # Initialize Weights & Biases
    run_name = (
        wandb_run_name
        or f"llama-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    )

    wandb.init(
        project=wandb_project,
        name=run_name,
        config={
            "model_name": model_name,
            "dataset_path": dataset_path,
            "learning_rate": learning_rate,
            "num_epochs": num_epochs,
            "batch_size": "auto",
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "max_seq_length": max_seq_length,
            "warmup_steps": warmup_steps,
            "weight_decay": weight_decay,
            "max_grad_norm": max_grad_norm,
            "lora_r": lora_r,
            "lora_alpha": lora_alpha,
            "lora_dropout": lora_dropout,
            "early_stopping_patience": early_stopping_patience,
            "seed": seed,
        },
    )
    
    logger.info(
        f"Starting {model_name} fine-tuning ..."
    )
    logger.info(
        (f"Fixed parameters: model={model_name}, lr={learning_rate}, "
         f"epochs={num_epochs}")
    )
    
    # Auto-detect checkpoint if not specified
    resume_checkpoint = resume_from_checkpoint
    if resume_checkpoint is None:
        latest_checkpoint = find_latest_checkpoint(output_dir)
        if latest_checkpoint:
            logger.info(
                (
                f"Auto-resuming from latest checkpoint: "
                f"{latest_checkpoint}"
            )
            )
            resume_checkpoint = latest_checkpoint
        else:
            logger.info(
                "No existing checkpoints found. Starting fresh training."
            )
    elif resume_checkpoint and not os.path.exists(
        resume_checkpoint
    ):
        logger.warning(
            (f"Checkpoint {resume_checkpoint} not found. "
 "Starting fresh training.")
        )
        resume_checkpoint = None

    
    # Load model and tokenizer with Unsloth optimizations
    logger.info("Loading model and tokenizer with Unsloth...")
    model, tokenizer = load_model_unsloth(model_name=model_name, 
                                          max_seq_length=max_seq_length, 
                                          load_in_4bit=True, 
                                          dtype=None)
        
    
    # Load dataset
    dataset = load_and_prepare_dataset(dataset_path, tokenizer)
    dataset = dataset.shuffle(seed=seed)
    dataset = dataset.train_test_split(test_size=test_dataset_split)

    # Log dataset information
    logger.info(f"Training samples: {len(dataset['train'])}")
    logger.info(f"Validation samples: {len(dataset['test'])}")
    
    # Create PEFT vesion of the foundation model
    model = load_peft_model(model=model, 
                            target_modules=target_modules, 
                            lora_r=lora_r, 
                            lora_alpha=lora_alpha, 
                            lora_dropout=lora_dropout, 
                            bias="none", 
                            random_state=seed)    

    # Set Training Args
    training_args = create_training_arguments(output_dir=output_dir, 
                                              learning_rate=learning_rate, 
                                              num_epochs=num_epochs, 
                                              batch_size=batch_size,
                                              warmup_steps=warmup_steps, 
                                              weight_decay=weight_decay,
                                              lr_scheduler_type='cosine', 
                                              max_grad_norm=max_grad_norm,
                                              gradient_accumulation_steps=gradient_accumulation_steps, 
                                              run_name=run_name,
                                              logging_steps=50, 
                                              eval_steps=eval_steps, 
                                              save_steps=save_steps, 
                                              seed=seed)    
    
    # Initialize callbacks with enhanced stability monitoring
    stability_callback = StabilityCallback(
        divergence_threshold=1.3,  # More conservative threshold
        min_steps_before_check=100,  # Check earlier
        patience=2,  # Less patience for divergence
        no_improvement_patience=5,  # Stop if no improvement
        gradient_explosion_threshold=5.0,  # Stop if gradients exceed 5.0
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=early_stopping_patience,
    )    

    # Set Trainer
    logger.info("Initializing trainer...")    
    trainer = create_trainer(model=model, 
                             tokenizer=tokenizer, 
                             training_args=training_args,
                             train_dataset=dataset['train'], 
                             eval_dataset = dataset['test'],
                             max_seq_length=max_seq_length, 
                             dataset_text_field="prompt",
                             callbacks=[stability_callback, early_stopping_callback]
                             )    
    
    # Train
    logger.info("Starting training...")
    try:
        trainer.train(resume_from_checkpoint=resume_checkpoint)
        logger.info("Training completed successfully!")
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise    
    
    # Save the model
    logger.info("Saving model...")
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    # Free some memory
    del model
    del trainer
    del dataset
    torch.cuda.empty_cache()
    gc.collect()

    # Finalize
    logger.info(f"Model saved to {output_dir}")
    logger.info("Fine-tuning completed successfully!")

    # Finish wandb
    wandb.finish()
    

In [14]:
if __name__ == "__main__":
    main()

2025-09-02 08:45:23,411 - INFO - === System Information ===
2025-09-02 08:45:23,412 - INFO - Python version: 3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]
2025-09-02 08:45:23,412 - INFO - PyTorch version: 2.8.0+cu128
2025-09-02 08:45:23,413 - INFO - CUDA available: True
2025-09-02 08:45:23,413 - INFO - CUDA version: 12.8
2025-09-02 08:45:23,414 - INFO - GPU count: 1
2025-09-02 08:45:23,415 - INFO - GPU name: NVIDIA GeForce RTX 4080 SUPER
2025-09-02 08:45:23,416 - INFO - GPU memory: 16.0 GB
2025-09-02 08:45:23,416 - INFO - === End System Information ===


[34m[1mwandb[0m: Currently logged in as: [33msilverkonlambigue[0m ([33msilverkonlambigue-skd[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-09-02 08:45:25,268 - INFO - Starting meta-llama/Meta-Llama-3-8B fine-tuning ...
2025-09-02 08:45:25,269 - INFO - Fixed parameters: model=meta-llama/Meta-Llama-3-8B, lr=2e-05, epochs=100
2025-09-02 08:45:25,272 - INFO - No existing checkpoints found. Starting fresh training.
2025-09-02 08:45:25,273 - INFO - Loading model and tokenizer with Unsloth...
==((====))==  Unsloth 2025.7.2: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4080 SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
2025-09-02 08:45:27,626 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.2.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


2025-09-02 08:45:42,079 - INFO - Initializing trainer...
2025-09-02 08:45:42,509 - INFO - Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 162 | Num Epochs = 100 | Total steps = 2,100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 6,815,744 of 8,037,076,992 (0.08% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,2.7333,2.699003
100,2.5632,2.409425
150,2.3029,2.21694
200,2.1395,2.129347
250,2.0406,2.10809
300,1.9854,2.094071
350,1.9186,2.091607
400,1.871,2.091548
450,1.7972,2.106251
500,1.7692,2.114968


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


2025-09-02 08:54:42,185 - INFO - Training completed successfully!
2025-09-02 08:54:42,186 - INFO - Saving model...
2025-09-02 08:54:42,975 - INFO - Model saved to ./unsloth_lab_outputs
2025-09-02 08:54:42,976 - INFO - Fine-tuning completed successfully!


0,1
consecutive_high_gradients,▁▁▁
eval/loss,█▅▂▁▁▁▁▁▁▁▂▂
eval/runtime,█▁▂▂▂▂▁▂▁▂▂▂
eval/samples_per_second,▁█▇▇▇▇█▇█▇▇▇
eval/steps_per_second,▁█▇▇▇▇█▇█▇▇▇
gradient_norm_avg_10,▁▅█
gradient_norm_current,▁█▃
gradient_norm_max_10,▁██
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███

0,1
consecutive_high_gradients,0.0
eval/loss,2.16148
eval/runtime,1.5834
eval/samples_per_second,25.893
eval/steps_per_second,6.947
gradient_norm_avg_10,2.15972
gradient_norm_current,3.17144
gradient_norm_max_10,3.72582
total_flos,3.0603010736553984e+16
train/epoch,28.58537


In [15]:
stop

NameError: name 'stop' is not defined

## Inference

In [16]:
import os
import torch
from unsloth import FastLanguageModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import argparse
import readline

In [17]:
def load_model(model_path, device):
    """
    Load the model and tokenizer from the specified path.
    """
    # Set dtype accordingly
    torch_dtype = (
        torch.bfloat16
        if device == "cuda" and torch.cuda.is_bf16_supported()
        else torch.float16
    )

    print(f"Loading model from {model_path}...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)    
    
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch_dtype,
        device_map="auto"
    )
    print("Model loaded successfully.")
    return model, tokenizer

def load_model_unsloth(model_name, max_seq_length=2048, load_in_4bit=False, dtype=None):
    """ Load model and tokenizer with Unsloth optimizations """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map="auto"    
    )
    return model, tokenizer

def generate_response(
    model, tokenizer, prompt, max_new_tokens=512, temperature=0.7
):
    """
    Generate a response from the model.
    """

    inputs = tokenizer(prompt, return_tensors="pt").to(
        model.device
    )

    streamer = TextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Generate the response
    _ = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,        
        # Stop generation when these tokens are encountered.        
        #eos_token_id=[
        #    tokenizer.eos_token_id,
        #    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        #],
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=False, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    return ""  # Streamer handles the output

In [19]:
def main():
    """
    Main function to run the inference script.
    """
    
    working_dir = './'
    model_path = os.path.join(working_dir, "unsloth_lab_outputs")

    prompt = "I want you to act as a motivational coach. "
    max_new_tokens = 100
    temperature = 0.2

    # Determine the device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load the model and tokenizer
    #model, tokenizer = load_model(model_path, device)
    model, tokenizer = load_model_unsloth(model_path, 
                                          load_in_4bit=True)

    # Single prompt mode
    print(f"Prompt: {prompt}")
    print("\nResponse:")
    generate_response(
        model,
        tokenizer,
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )
    print("\n" + "=" * 80 + "\n")    

In [20]:
if __name__ == "__main__":
    main()

Using device: cuda
==((====))==  Unsloth 2025.7.2: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4080 SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
2025-09-02 11:37:02,981 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Prompt: I want you to act as a motivational coach. 

Response:
 I will provide some information about an individual, and your task is help them develop the mindset necessary for achieving their goals by providing positive affirmations or advice on how they can overcome any obstacles s