# Train models using HuggingFace libraries

This notebook takes parameters from a `params.json` file, which is automatically
created by the Substratus K8s operator.

The following parameters influence what happens in this notebook:
- `dataset_urls`: A comma-separated list of URLs. The URLs should point to
  JSON files that contain your training dataset. If unset, a `.json` or `.jsonl`
  file should be present under the `/content/data/` directory.
- `prompt_template`: The prompt template to use for training
- `push_to_hub`: If this variable is set to a repo ID, the trained
  model will be pushed to the HuggingFace Hub. For example,
  set it to "substratusai/my-model" to publish to the substratusai HF org.

In [1]:
import json
from pathlib import Path

# Read the operator-provided parameters. When the file is absent, keep an
# empty dict so later `params.get(...)` calls fall back to their defaults.
params_path = Path("/content/params.json")
if params_path.is_file():
    params = json.loads(params_path.read_text(encoding="UTF-8"))
else:
    params = {}

params


In [3]:
import os 
from datasets import load_dataset

# `dataset_urls` is an optional comma-separated string of JSON/JSONL URLs.
dataset_urls = params.get("dataset_urls")
# Drop blank entries so a trailing comma or stray whitespace does not hand an
# empty path to `load_dataset`.
urls = [u.strip() for u in (dataset_urls or "").split(",") if u.strip()]
if urls:
    print(f"Using the following URLs for the dataset: {urls}")
    data = load_dataset("json", data_files=urls)
else:
    # No usable URL was given: fall back to the files under /content/data/.
    data = load_dataset("json", data_files="/content/data/*.json*")
data

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 282
    })
})

In [None]:
import transformers
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

# Locations used throughout the notebook: the base model to fine-tune, the
# directory for the final artifacts, and the sub-directory for the LoRA adapter.
model_path = "/content/model/"
trained_model_path = "/content/artifacts"
trained_model_path_lora = "/content/artifacts/lora"

# The tokenizer is read from local files only, using the slow implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    use_fast=False,
)

# Load the base model in 8-bit and let `device_map="auto"` place its layers.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    load_in_8bit=True,
)
model

Prompt before fine tuning

In [4]:
# Fallback template, used when params.json does not define `prompt_template`.
# Its placeholders are filled from the dataset columns of the same name.
default_prompt = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{prompt}
### Response:
{completion}
"""

prompt = params.get("prompt_template", default_prompt)

# Render the template with the first training example as a sanity check.
first_example = data["train"][0]
print(prompt.format_map(first_example))

In [None]:
from typing import Dict
# source: https://github.com/artidoro/qlora
# Pad token to add when the tokenizer has none; overridable via `pad_token`.
DEFAULT_PAD_TOKEN = params.get("pad_token", "[PAD]")

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Register special tokens and grow the model's embeddings to match.

    The embedding rows created for newly added tokens are set to the mean of
    the pre-existing rows, in both the input and the output embedding
    matrices.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to `tokenizer.add_special_tokens`.
        tokenizer: Tokenizer that receives the special tokens.
        model: Model whose token embeddings are resized to `len(tokenizer)`.
    """
    added_count = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if added_count <= 0:
        return

    embedding_weights = (
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    )
    # Compute both means before writing anything, so the averages only ever
    # see the rows that existed before the resize.
    old_row_means = [
        weights[:-added_count].mean(dim=0, keepdim=True)
        for weights in embedding_weights
    ]
    for weights, row_mean in zip(embedding_weights, old_row_means):
        weights[-added_count:] = row_mean

# Only add a pad token (and resize the embeddings) when none is defined yet.
if tokenizer._pad_token is None:
    smart_tokenizer_and_embedding_resize(
        {"pad_token": DEFAULT_PAD_TOKEN},
        tokenizer,
        model,
    )

if isinstance(tokenizer, transformers.LlamaTokenizer):
    # LLaMA tokenizer may not have correct special tokens set.
    # Check and add them if missing to prevent them from being parsed into different tokens.
    # Note that these are present in the vocabulary.
    # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
    print('Adding special tokens.')
    # A pad id of -1 in the model config means "unset"; use the tokenizer's
    # pad id in that case.
    unk_source_id = model.config.pad_token_id
    if unk_source_id == -1:
        unk_source_id = tokenizer.pad_token_id
    llama_special_tokens = {
        "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
        "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
        "unk_token": tokenizer.convert_ids_to_tokens(unk_source_id),
    }
    tokenizer.add_special_tokens(llama_special_tokens)

tokenizer

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7224d2ad124fbca0.arrow


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 282
    })
})
After tokenizing: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 282
    })
})


In [None]:
from typing import Dict

def generate_prompt(dataset_entry: Dict):
    """Tokenize one dataset row rendered through the prompt template.

    Args:
        dataset_entry: A dataset row; its keys fill the placeholders of the
            notebook-level `prompt` template.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`, ...). It must be
        returned so that `datasets.map` can merge these columns into the row;
        a function returning `None` leaves the dataset without tokenized
        columns.
    """
    tokens = tokenizer(prompt.format_map(dataset_entry), padding='max_length', truncation=True)
    input_ids = tokens["input_ids"]
    # Guard against an empty encoding before looking at the last token.
    last_token = input_ids[-1] if input_ids else None
    # Terminate the sequence with EOS when it is missing and there is room.
    if last_token != tokenizer.eos_token_id and len(input_ids) < tokenizer.model_max_length:
        input_ids.append(tokenizer.eos_token_id)
        tokens["attention_mask"].append(1)
        # Keep every per-token column the same length as `input_ids`.
        if "token_type_ids" in tokens:
            tokens["token_type_ids"].append(0)
    return tokens

# Run `generate_prompt` over every row and rebind `data` to the result.
# Note: this rebinds `data`, so re-running the cell maps the already-mapped dataset.
data = data.map(generate_prompt)

# Show the resulting dataset structure (splits, columns, row counts).
print("After tokenizing:", data)

In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# Optional comma-separated module names that should receive LoRA adapters.
target_modules = params.get("target_modules")

lora_config2 = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
if target_modules:
    # Override the target modules only when the parameter was provided.
    lora_config2.target_modules = [name.strip() for name in target_modules.split(",")]

model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config2)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 6,926,439,296 || trainable%: 0.06812435363037071


In [None]:
from utils import parse_training_args

# Build the training arguments from the notebook parameters.
# NOTE(review): `parse_training_args` lives in the local `utils` module, which is
# not visible here — confirm which keys of `params` it reads and what it returns.
training_args = parse_training_args(params)
training_args

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

checkpoint_path = Path("/content/artifacts/checkpoints")

# Resume only when at least one `checkpoint-*` directory exists. Checking for
# "any file" is not enough: other content in the directory (e.g. logs from a
# run that stopped before its first save) would request a resume, and
# `Trainer.train` raises when it finds no valid checkpoint.
# NOTE(review): assumes `training_args.output_dir` points at `checkpoint_path`,
# since `resume_from_checkpoint=True` makes the Trainer search its output_dir.
if checkpoint_path.is_dir():
    saved_checkpoints = sorted(
        entry for entry in checkpoint_path.glob("checkpoint-*") if entry.is_dir()
    )
else:
    saved_checkpoints = []
resume_from_checkpoint = bool(saved_checkpoints)
if resume_from_checkpoint:
    print("Resuming from checkpoint:", saved_checkpoints)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

trainer.save_model(trained_model_path)

In [None]:
# Show the current GPU status via the `nvidia-smi` command-line tool.
! nvidia-smi

In [None]:
# List the contents of the artifacts directory, with sizes.
! ls -lash {trained_model_path}

In [None]:
# Optional smoke test: runs only when `inference_prompt_template` is set.
inference_prompt = params.get("inference_prompt_template")
if inference_prompt:
    # Training leaves the model in train mode. Switch to eval so that LoRA
    # dropout is disabled and does not perturb the generated text.
    model.eval()
    model.config.use_cache = True
    device = "cuda"
    # Render the template with the first training example and tokenize it.
    model_inputs = tokenizer([inference_prompt.format_map(data["train"][0])],
                             return_tensors="pt").to(device)

    generated_ids = model.generate(**model_inputs,
                                   max_new_tokens=300)

    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

In [None]:
# Save the LoRA adapter into its own directory so it can be reloaded and merged
# into the base model later. The directory is created with pathlib rather than
# an unquoted shell escape, so the cell does not depend on shell interpolation.
Path(trained_model_path_lora).mkdir(parents=True, exist_ok=True)
model.save_pretrained(trained_model_path_lora)

In [None]:
from peft import PeftModel

import gc

print("Clearing existing GPU memory to merge lora with base model")
del model
# The Trainer still holds a reference to the 8-bit model. Drop it as well,
# otherwise the memory cannot be reclaimed before the 16-bit reload.
trainer = None
gc.collect()
torch.cuda.empty_cache()

# Note this needs to happen in 16 bit hence the reload of the model
base_model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)

model = PeftModel.from_pretrained(base_model, trained_model_path_lora, torch_dtype=torch.float16)
# `merge_and_unload()` returns the base model with the LoRA weights merged in.
# Rebind `model` to that result: otherwise `model` stays a PeftModel, and a
# later `model.save_pretrained(...)` writes adapter files, not the merged model.
model = model.merge_and_unload()
model

In [None]:
# Write `model` and the tokenizer side by side into the artifacts directory.
model.save_pretrained(trained_model_path)
# Last expression of the cell: its return value is shown as the cell output.
tokenizer.save_pretrained(trained_model_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# List the artifacts directory again to inspect the saved files and their sizes.
! ls -lash {trained_model_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 13G
4.0K drwxr-xr-x 1 root root 4.0K Jul  5 22:35 .
8.0K drwxr-xr-x 1 root root 4.0K Jul  5 10:00 ..
4.0K -rw-r--r-- 1 root root  707 Jul  5 22:35 config.json
4.0K -rw-r--r-- 1 root root  116 Jul  5 22:35 generation_config.json
9.3G -rw-r--r-- 1 root root 9.3G Jul  5 22:35 pytorch_model-00001-of-00002.bin
3.7G -rw-r--r-- 1 root root 3.7G Jul  5 22:35 pytorch_model-00002-of-00002.bin
 20K -rw-r--r-- 1 root root  17K Jul  5 22:35 pytorch_model.bin.index.json


In [None]:
print("Running inference after merging lora layer")
# Generation only runs when `inference_prompt_template` is set.
inference_prompt = params.get("inference_prompt_template")
if inference_prompt:
    device = "cuda"
    model.config.use_cache = True

    # Render the inference template with the first training example.
    sample_text = inference_prompt.format_map(data["train"][0])
    model_inputs = tokenizer([sample_text], return_tensors="pt").to(device)

    generated_ids = model.generate(max_new_tokens=300, **model_inputs)

    # Decode the single generated sequence back to text, without special tokens.
    first_sequence = generated_ids[0]
    print(tokenizer.decode(first_sequence, skip_special_tokens=True))

In [None]:
from huggingface_hub import HfApi
import shutil

# Carry the base model's `tokenizer.model` file over to the trained artifacts
# when the base model has one and the artifacts directory does not.
tokenizer_model_path_base = Path(model_path, "tokenizer.model")
tokenizer_model_path_trained = Path(trained_model_path, "tokenizer.model")
if tokenizer_model_path_base.exists() and not tokenizer_model_path_trained.exists():
    shutil.copy(tokenizer_model_path_base, tokenizer_model_path_trained)

# Publishing is opt-in: it only happens when `push_to_hub` holds a repo id.
repo_id = params.get("push_to_hub")
if repo_id:
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    hf_api = HfApi()
    logs_path = Path("/content/artifacts/src/train.ipynb")
    # Extra files uploaded next to the model when they exist locally:
    # the base model's tokenizer.model, then the train.ipynb file.
    for extra_file in (tokenizer_model_path_base, logs_path):
        if extra_file.exists():
            hf_api.upload_file(
                path_or_fileobj=extra_file,
                path_in_repo=extra_file.name,
                repo_id=repo_id,
            )
