# 0. Introduction
Fine-tune an LLM using the Hugging Face platform.

# 1. Install
_Run once_

In [1]:
!pip install --upgrade pip

!pip install unsloth
!pip install "torch >= 2.0.0"
!pip install "bitsandbytes >= 0.40.0"
!pip install "transformers >= 4.30.0"
!pip install "accelerate >= 0.20.3"
!pip install "datasets >= 2.12.0"
!pip install "trl > 0.6.0"
!pip install "unsloth >= 0.1.0"
!pip install "peft >= 0.4.0"
!pip install "evaluate"
!pip install "tensorboard"

!pip install colorama

# 2. Imports

In [2]:
import unsloth
import torch
import bitsandbytes as bnb
import transformers
import accelerate
import datasets as ds
import trl

import os

from colorama import Fore, Back, Style

ModuleNotFoundError: No module named 'unsloth'

## 2.1. Check versions

In [None]:
# Print the installed version of every core library, highlighted in cyan.
# The labels keep their trailing tabs so the columns line up as before.
version_highlight = f"{Style.BRIGHT}{Fore.CYAN}"
for version_label, library in (
    ("Unsloth Version\t\t", unsloth),
    ("Torch Version\t\t", torch),
    ("BitsAndBytes Version\t", bnb),
    ("Transformers Version\t", transformers),
    ("Accelerate Version\t", accelerate),
    ("Datasets Version\t", ds),
    ("TRL Version\t\t", trl),
):
    print(f"{version_label}: {version_highlight}{library.__version__}{Style.RESET_ALL}")

# Print the CUDA status reported by torch, highlighted in green.
cuda_highlight = f"{Style.BRIGHT}{Fore.GREEN}"
for cuda_label, cuda_probe in (
    ("\nCUDA Availability\t", torch.cuda.is_available),
    ("CUDA Device Count\t", torch.cuda.device_count),
):
    print(f"{cuda_label}: {cuda_highlight}{cuda_probe()}{Style.RESET_ALL}")

# 3. Variables

## 3.1. Environment Variables

In [None]:
# Environment flags for Unsloth and the tokenizers library, set mainly for
# Colab notebook runs.
# NOTE(review): these are set after `import unsloth` has already run in the
# imports cell; if the libraries read them at import time they may need to be
# set earlier - confirm.
os.environ.update(
    {
        "UNSLOTH_COMPILE_DISABLE": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

## 3.2. User Variables

In [None]:
# Hugging Face Hub identifiers; the data-preparation function below uses
# them as its default arguments.
dataset_name: str = "timdettmers/openassistant-guanaco"  # dataset repository id
model_name: str = "distilgpt2"  # model / tokenizer repository id

# 4. Code

## 4.1. Prepare dataset
Steps:
<ol>
    <li>Fetch a dataset</li>
    <li>Put an <font color="#22819F"><b>eos_token</b></font> if the examples do not have one at the end</li>
    <li>In the tokenized dataset, replace the label ids of pad tokens with a reserved ignore value (<i>-100 here</i>) so that padding is excluded from the loss</li>
    <li>Tokenize</li>
</ol>

In [None]:
from datasets import load_dataset
from datasets import Dataset
from datasets import DatasetDict
from datasets import IterableDataset
from datasets import IterableDatasetDict

from transformers import AutoTokenizer

from typing import Any

### 4.1.1. Fetch a dataset
(from Hugging Face)

In [None]:
def get_dataset(dataset_name: str, max_samples: int | None = None, split: str = "train"
    ) -> Dataset | DatasetDict | IterableDataset | IterableDatasetDict:
    """
    Get dataset from HuggingFace.

    Args:
        dataset_name (str): Name of the dataset.
        max_samples (int | None): Maximum number of samples in the dataset to use.
            None (the default) keeps every sample of the split.
        split (str): Name of the split to load. Defaults to "train".

    Returns:
        Dataset | DatasetDict | IterableDataset | IterableDatasetDict

    Raises:
        ValueError: If max_samples is negative.
    """

    # Fail early: a negative limit would otherwise select an empty dataset
    # without any warning.
    if max_samples is not None and max_samples < 0:
        raise ValueError(f"max_samples must be non-negative or None, got {max_samples}")

    dataset = load_dataset(dataset_name, split=split)

    # Keep only the first max_samples rows when a limit smaller than the
    # split size was requested.
    if max_samples is not None and max_samples < len(dataset):
        dataset = dataset.select(range(max_samples))

    print(f"Dataset {Fore.LIGHTBLUE_EX}{dataset_name}{Style.RESET_ALL} loaded successfully.")

    return dataset

In [None]:
def get_tokenizer(model_name: str) -> Any:
    """
    Load the fast tokenizer that belongs to a model.

    Args:
        model_name (str): Name of the model whose tokenizer is loaded.

    Returns:
        Any: The loaded tokenizer. If it had no pad token, its pad token is
        set to its eos token before it is returned.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    print(f"Tokenizer {Fore.LIGHTBLUE_EX}{model_name}{Style.RESET_ALL} loaded successfully.")

    # Reuse the end-of-sequence token for padding when none is defined.
    pad_token_missing = loaded_tokenizer.pad_token is None
    if pad_token_missing:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    return loaded_tokenizer

### 4.1.2. Put an <font color="#22819F"><b>eos_token</b></font> if the examples do not have one at the end

In [None]:
def add_pad_tokens_at_end(texts, tokenizer) -> list[str]:
    """
    Make sure every text ends with the tokenizer's eos token.

    Despite its name, this function appends ``tokenizer.eos_token`` (not the
    pad token) to each text that does not already end with it.

    Args:
        texts: Iterable of strings to process.
        tokenizer: Object exposing an ``eos_token`` attribute.

    Returns:
        list[str]: New list with the processed texts, in the original order.
        When the tokenizer has no eos token (None or empty), the texts are
        returned unchanged.
    """
    eos_token = tokenizer.eos_token

    # Without an eos token there is nothing to append; `str.endswith(None)`
    # would raise a TypeError.
    if not eos_token:
        return list(texts)

    return [
        text if text.endswith(eos_token) else text + eos_token
        for text in texts
    ]

### 4.1.3. Change <font color="#22819F">pad_token id</font> to reserved keyword

In [None]:
def change_pad_tokens_ids_to_res_keyword(tokenized_text_id: list[list[int]], tokenizer: Any) -> list[list[int]]:
    """
    Replace every pad token id with the reserved value -100.

    Matching is done purely by id: if the tokenizer's pad token id equals
    another token's id (for example when the pad token was set to the eos
    token), those tokens are replaced as well.

    Args:
        tokenized_text_id: Batch of token-id sequences. Either nested lists of
            ints, or any object (such as a tensor) exposing ``tolist()``.
        tokenizer: Object exposing a ``pad_token_id`` attribute.

    Returns:
        list[list[int]]: New nested list with the same layout as the input,
        where each id equal to ``tokenizer.pad_token_id`` is -100.
    """
    pad_token_id = tokenizer.pad_token_id

    def _as_python(values):
        # Tensors/arrays are converted to plain Python values so the result
        # contains ints instead of zero-dimensional tensor elements.
        return values.tolist() if hasattr(values, "tolist") else values

    return [
        [-100 if token_id == pad_token_id else token_id for token_id in _as_python(row)]
        for row in _as_python(tokenized_text_id)
    ]

### 4.1.4. Tokenize

In [None]:
def tokenize(examples: dict[str, list[str]],
             tokenizer: Any, max_length: int):
    """
    Tokenize a batch of examples and attach training labels.

    Args:
        examples: Batch of examples; its "text" entry holds the raw strings.
        tokenizer: Tokenizer used to encode the texts.
        max_length (int): Every sequence is truncated / padded to this length.

    Returns:
        The tokenizer output with an additional "labels" entry: a copy of
        "input_ids" in which padding positions are set to -100.
    """
    texts: list[str] = examples["text"]
    processed_texts: list[str] = add_pad_tokens_at_end(texts, tokenizer)

    tokenized_texts = tokenizer(
        processed_texts,
        max_length = max_length,
        truncation = True,
        padding = "max_length",
        return_tensors = "pt")

    input_ids = tokenized_texts["input_ids"]
    attention_mask = tokenized_texts.get("attention_mask")

    if attention_mask is not None:
        # Mask by attention mask instead of by pad token id: the pad token may
        # share its id with the eos token, and the eos token appended above
        # must stay in the labels so it still contributes to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        tokenized_texts["labels"] = labels
    else:
        # No attention mask available: fall back to matching the pad token id.
        tokenized_texts["labels"] = change_pad_tokens_ids_to_res_keyword(input_ids.clone(), tokenizer)

    return tokenized_texts

In [None]:
def prepare_dataset_from_hugging_face(
    dataset_name: str = dataset_name,
    model_name: str = model_name,
    max_samples: int = None,
    max_length: int = 512
    ) -> tuple[Dataset, Any]:
    """
    Load a Hugging Face dataset and tokenize it for training.

    The defaults for dataset_name and model_name are taken from the
    module-level variables of the same name at the time this function is
    defined.

    Args:
        dataset_name (str): Name of the dataset.
        model_name (str): Name of the model whose tokenizer is used.
        max_samples (int): Maximum number of samples in the dataset to use.
        max_length (int): Maximum length of the input.

    Returns:
        tuple[ds.Dataset, AutoTokenizer]: Prepared dataset and tokenizer.
    """
    raw_dataset = get_dataset(dataset_name, max_samples)
    tokenizer = get_tokenizer(model_name)

    print(f"{Fore.CYAN}Tokenization started{Style.RESET_ALL}")
    # Tokenize in batches and drop all original columns, so only the
    # tokenizer output remains in the resulting dataset.
    tokenized_dataset = raw_dataset.map(
        tokenize,
        batched=True,
        remove_columns=raw_dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
        desc="Tokenizing dataset",
    )
    print(f"{Fore.CYAN}Tokenization completed{Style.RESET_ALL}")

    return tokenized_dataset, tokenizer

### 4.1.5. Test Data preparation module

In [None]:
# tokenized_dataset, tokenizer = prepare_dataset_from_hugging_face()

In [None]:
# tokenized_dataset

In [None]:
# tokenizer

## 4.2. <font color = "#EE0099"><b>Lo</b></font>w <font color = "#EE0099"><b>R</b></font>ank <font color = "#EE0099"><b>A</b></font>daptation

### 4.2.1. Prepare <font color = "#FF0077">LoRA</font> Config
<i>Note: This config class only carries values (the defaults and any user-assigned overrides). It is not the LoRA constructor itself.</i>

In [None]:
from dataclasses import dataclass
import torch


@dataclass
class ProductionLoRAConfig:
    model_name: str = "openai-community/gpt2"
    max_seq_length: int = 512
    dtype: torch.dtype = torch.float16
    lora_r: int = 16 # lora attention dimension, or rank of matrices: the minimum number of independent rows / columns in a matrix is rank.
    lora_alpha: int = 16 # alpha parameter: factor which is multiplied to LoRA module matrices before it is added to original params. lower values makes LoRA params less significant.
    lora_dropout: float = 0.05 # probability for dropping out the LoRA elements in low rank matrices to prevent overfitting. it is regularisation technique.
    target_modules: list = None # modules are q_proj, K_proj, etc depending on the architecture
    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: torch.dtype = torch.float16
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True
    bnb_8bit_quant_type: str = "nf4"
    bnb_8bit_use_double_quant: bool = True
    bnb_8bit_compute_dtype: torch.dtype = torch.float16
    max_steps: int = 2000 # risk of overfitting
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    learning_rate: float = 2e-4
    weight_decay: float = 0.001
    warmup_steps: int = 100
    warmup_ratio: int = 0.1
    optimizer: str = "adamw_8bith"
    save_steps: int = 250
    save_total_limit: int = 3
    eval_strategy: str = "steps"
    eval_steps: int = 250
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "loss"
    greater_is_better: bool = False # for "accuracy" metric, make it true
    gradient_checkpointing: bool = True # recommended for large models on a limited GPU memory. In Hugging Face platform, it is implemented as model.gradient_checkpointing_enabled().
    dataloader_num_workers: int = 4
    remove_unused_columns: bool = False
    group_by_length: bool = True
    ddp_find_unused_parameters: bool = False
    logging_steps: int = 10
    report_to: str = "tensorboard" # "wandb" or "tensorboard" or "none"
    # task_type is ignored here as a param as we choose the type of LLM later on.


    def __post_init__(self):
        if self.target_modules is None:
            if "openai-community/gpt2" in self.model_name:
                self.target_modules = ["c_attn", "c_proj"]
            else:
                self.target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]


    def get_effective_batch_size(self) -> tuple[int, int]:
        num_gpus: int = 1

        if torch.cuda.is_available():
            num_gpus = torch.cuda.device_count()

        effective_batch_size: int = self.per_device_train_batch_size * num_gpus * self.gradient_accumulation_steps

        return effective_batch_size, num_gpus


    def print_config(self):
        print("LoRA Configuration")

        for field, value in self.__dict__.items():
            if not field.startswith("_"):
                print(f"{field}\t: {Style.BRIGHT}{Fore.CYAN}{value}{Style.RESET_ALL}")


        effective_batch_size, num_gpus = self.get_effective_batch_size()

        print(f"\n{'Effective Batch Size':30s}\t: {Style.BRIGHT}{Fore.CYAN}{effective_batch_size}{Style.RESET_ALL} across {Style.BRIGHT}{Fore.RED}{num_gpus}{Style.RESET_ALL} GPU(s)")

### 4.2.2. Configure LoRA using the above carrier config class

### 4.2.3. Inject LoRA into the model using <font color = "#5577FF">unsloth.FastLanguageModel.<b>get_peft_model(...)</b></font>

### 4.2.4. Train with unsloth