# 1. Package Installation

In [None]:
!nvidia-smi

Thu Jul  4 23:40:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   52C    P8              13W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
#@title Requirements
%%writefile requirements.txt
peft
fire
accelerator
transformers
datasets
evaluate
pyarrow
galore-torch
pytorch-ignite
rouge-score
nltk
py7zr
optimum[exporters]
trl
lightning
jsonargparse[signatures]
deepspeed
colossalai
wandb

Writing requirements.txt


In [2]:
#@title Install Packages
%%capture
!CUDA_EXT=1 DS_BUILD=1 pip install --no-cache -r requirements.txt

In [3]:
#@title Huggingface Login
#@markdown huggingface weight 를 이용하고 싶다면 로그인 필수
!huggingface-cli login --add-to-git-credential



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [4]:
#@title Weight and Bias Train Logger Login
#@markdown weight and bias 로그인
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# 2. Load Model


In [14]:
#@title Get peft model from huggingface
#@markdown Colab 고용량 Ram CPU에서 가능한 범위 ~8B(테스트 중)
#@markdown
#@markdown  |모델       | Normal   | DeepSpeed |
#@markdown  |---        | ---      | ---       |
#@markdown  |Llama3-8B  |  X       |   O       |
#@markdown  |Mistral-7B |  X       |   O       |
#@markdown  |Llama3-70B |  X       |   X       |

%%writefile peft_model.py

import os
import fire
import torch
from peft import AutoPeftModelForCausalLM
from peft import LoraConfig
from peft import inject_adapter_in_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from random import randint

base_model_id = "Qwen/Qwen2-1.5B-Instruct" # @param ["Gunulhona/tb_pretrained_sts", "Gunulhona/tb_pretrained", "google/flan-t5-xxl", "meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-70B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3", "Qwen/Qwen2-7B-Instruct", "google/gemma-7b", "MLP-KTLim/llama-3-Korean-Bllossom-8B", "EleutherAI/polyglot-ko-12.8b", "vilm/vulture-40b", "arcee-ai/Arcee-Spark", "Qwen/Qwen2-1.5B-Instruct", "OuteAI/Lite-Mistral-150M"] {allow-input: true}

peft_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True)

# adapter configuration
lora_config = LoraConfig(
    target_modules=["q_proj", "k_proj"],
    init_lora_weights="gaussian", #"gaussian", "pissa", "pissa_niter_{n}", "loftq", False
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
    use_dora=False,
)

# peft_model.add_adapter(lora_config, adapter_name="adapter_1")
inject_adapter_in_model(lora_config, peft_model, "adapter_1")

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True)
tokenizer.model_input_names=['input_ids', 'attention_mask']
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Overwriting peft_model.py


#3. Load Dataset

In [6]:
#@title Load data From huggingface datasets
#@markdown summary task에 대해 우선적으로 실험
%%writefile finetuning_datasets.py
import numpy as np
from datasets import load_dataset, concatenate_datasets

from evaluate import load
from peft_model import tokenizer

dataset_path = "Samsung/samsum" # @param ["Samsung/samsum", "emozilla/soda_synthetic_dialogue", "frcp/summary-alpaca-v01"] {allow-input: true}

dataset = load_dataset(
  dataset_path,
  trust_remote_code=True,
  revision="main"  # tag name, or branch name, or commit hash
)

metric = load("rouge")
full_dataset = concatenate_datasets([dataset["train"], dataset["test"]])
tokenized_inputs = full_dataset.map(
    lambda x: tokenizer(x["dialogue"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"])

input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
# take 85 percentile of max length for better utilization
max_source_length = int(np.percentile(input_lenghts, 85))

tokenized_targets = full_dataset.map(
    lambda x: tokenizer(x["summary"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"])
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
# take 90 percentile of max length for better utilization
max_target_length = int(np.percentile(target_lenghts, 90))


def preprocess_function(sample, max_source_length, max_target_length, padding="max_length"):
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["dialogue"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, )

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["summary"],
                       max_length=max_target_length,
                       padding=padding,
                       truncation=True,)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else 1) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = dataset.map(preprocess_function,
                      batched=True,
                      remove_columns=["dialogue", "summary", "id"],
                      fn_kwargs={
                          "max_source_length": max_source_length,
                           "max_target_length": max_source_length
                          },)

if any([d for d in dataset.values() if "token_type_ids" in d.features]):
    dataset = dataset.map(lambda x: x,
                          batched=True,
                          remove_columns=["token_type_ids"], )


Writing finetuning_datasets.py


#4. Train

In [7]:
#@title Start Training
#@markdown transformers trainer 이용, 추후 lightning 으로 이전 가능
%%writefile train.py
import nltk
import numpy as np
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from ignite.metrics import Rouge

from peft_model import peft_model, tokenizer
from finetuning_datasets import dataset, metric


# Callback Class
class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, num_steps=10):
        self.num_steps = num_steps

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step >= self.num_steps:
            control.should_training_stop = True

        return control

# metric function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)

training_args = TrainingArguments(
    output_dir="llm_output",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    # use_cpu=True,
    # load_best_model_at_end=True,
    remove_unused_columns=False,
    push_to_hub=True,
    logging_steps=1000,
    save_steps=1000,
    fp16=True,
    save_total_limit=3,
    # logging_dir="llm_output/logs",
    optim="adamw_hf",
    report_to="tensorboard",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()],
)

trainer.train()


Writing train.py


In [None]:
!python train.py

## Training code to Lightning module

In [8]:
#@title Lightning Data Moudle
%%writefile l_datamodule.py

import lightning as L
from torch.utils.data import DataLoader

from transformers import DataCollatorForSeq2Seq
from peft_model import peft_model, tokenizer
from finetuning_datasets import dataset


class FTDataModule(L.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, test_dataset, data_collator, train_batch_size, eval_batch_size,training_args,):
        super().__init__()
        self.train_dataset = dataset["train"]
        self.val_dataset = dataset["validation"]
        self.test_dataset = dataset["test"]
        self.data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.training_args = training_args

    def _get_dataloader(self, dataset, eval_mode: bool = False):
        return DataLoader(dataset=dataset,
                          batch_size=self.train_batch_size if eval_mode else self.eval_batch_size,
                          shuffle=not eval_mode,
                          num_workers=8,
                          collate_fn=self.data_collator)

    def train_dataloader(self):
        return self._get_dataloader(dataset=self.train_dataset)

    def val_dataloader(self):
        return self._get_dataloader(dataset=self.val_dataset, eval_mode=True)

    def test_dataloader(self):
        return self._get_dataloader(dataset=self.test_dataset, eval_mode=True)


Writing l_datamodule.py


In [9]:
#@title Lightning Model
%%writefile l_model.py

import lightning as L
import torch

from transformers import DataCollatorForSeq2Seq
from peft_model import peft_model, tokenizer
from bitsandbytes.optim import AdamW, Lion
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from finetuning_datasets import dataset
from torchmetrics.functional.text.rouge import rouge_score

class LLamaFTLightningModule(L.LightningModule):
    def __init__(self, data_collator, learning_rate: float = 2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = peft_model
        self.tokenizer = tokenizer
        self.data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)
        self.learning_rate = learning_rate

    def _get_rouge_score(self, predictions, labels):
        generated_tokens = predictions.argmax(dim=-1)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        return rouge_score(preds=decoded_preds, target=decoded_labels)

    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        rouge_score = self._get_rouge_score(outputs.logits, batch.labels)

        loss = outputs.loss
        self.log("train_loss",
                 loss,
                 prog_bar=True, on_step=True, on_epoch=True)
        for k, v in rouge_score.items():
            self.log(f"train_{k}",
                     v,
                     prog_bar=True, on_step=True, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        rouge_score = self._get_rouge_score(outputs.logits, batch.labels)
        val_loss = outputs.loss
        self.log("val_loss",
                 val_loss,
                 prog_bar=True, on_step=True, on_epoch=True)
        for k, v in rouge_score.items():
            self.log(f"val_{k}",
                     v,
                     prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = Lion(params=self.model.parameters(),
                         lr=self.learning_rate,
                         weight_decay=0.01,
                         optim_bits=32,)
        scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                T_0=10,
                                                T_mult=2,
                                                eta_min=0.00001)
        # scheduler = ReduceLROnPlateau(optimizer=optimizer, mode="min")
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",
                "interval": "step",
                "frequency": 1,

            },
        }




Writing l_model.py


In [10]:
#@title Trainer
%%writefile l_trainer.py

import os
import lightning as L
from lightning.pytorch.cli import LightningCLI, LightningArgumentParser
from lightning.pytorch.strategies.deepspeed import DeepSpeedStrategy
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from transformers import DataCollatorForSeq2Seq

from l_datamodule import FTDataModule
from l_model import LLamaFTLightningModule
from peft_model import peft_model, tokenizer
from finetuning_datasets import dataset

os.environ["TOKENIZERS_PARALLELISM"] = "0"


if __name__ == "__main__":
    L.pytorch.cli_lightning_logo()
    training_args = LightningArgumentParser()
    cli = LightningCLI(
        model_class=LLamaFTLightningModule,
        datamodule_class=FTDataModule,
        seed_everything_default=42,
        trainer_defaults={
            "reload_dataloaders_every_n_epochs": 1,
            "strategy": "deepspeed",
            "precision": "bf16-mixed",
            "profiler": "PassThroughProfiler",
            "logger": [WandbLogger(project="LLM-Finetuning"),],
            "callbacks": [EarlyStopping(monitor="val_loss", patience=5), LearningRateMonitor()]
        },
        save_config_callback=None)
    # cli.add_arguments_to_parser(training_args)


Writing l_trainer.py


In [None]:
#@title Start Training
#@markdown 실험 결과
#@markdown
#@markdown * batch_size <b>2</b> 넘기는 경우 OOM
#@markdown * DeepSpeed의 경우 GPU Ram 20GB로 7B finetuning 가능
#@markdown * DeepSpeed의 경우, 7B L4 GPU에서 사용 가능
#@markdown * 70B의 경우 RAM에서 Weight 가져오다 OOM
%%shell

python l_trainer.py fit \
    --trainer.max_epochs 4 \
    --model.learning_rate 5e-5 \
    --data.train_batch_size 2 \
    --data.eval_batch_size 2

#    --trainer.fast_dev_run 1\

2024-07-10 05:40:02.264868: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 05:40:02.264921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 05:40:02.266283: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100% 660/660 [00:00<00:00, 4.04MB/s]
model.safetensors: 100% 3.09G/3.09G [01:46<00:00, 29.0MB/s]
generation_config.json: 100% 242/242 [00:00<00:00, 1.81MB/s]
tokenizer_config.json: 100% 1.29k/1.29k [00:00<00:00, 9.94MB/s]
vocab.json: 100% 2.78M/2.78M [00:00<00:00, 3.27MB/s]
merges.txt: 100% 1.67M/1.67M [00:00<00:00, 2.39MB/s]
tokenizer.json: 100% 7.0

# Model Saving

In [None]:
#@title ONNX model save
#@markdown ONNX 로 모델 변형 후 저장
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTModelForCausalLM

model_checkpoint = "./" #@param{"type":"string"}
save_directory = "./" #@param{"type":"string"}

ort_model = ORTModelForCausalLM.from_pretrained(model_checkpoint, export=True)
ort_model.save_pretrained(save_directory)