# Train models using HuggingFace libraries

This notebook takes parameters from a `params.json` file, which is automatically
created by the Substratus K8s operator.

The following parameters influence what happens in this notebook:
- `dataset_urls`: A comma separated list of URLs. The URLs should point to
  json files that contain your training dataset.
- `prompt_template`: The prompt template to use for training
- `push_to_hub`: If this variable is set to a repo id, the trained
  model is pushed to the HuggingFace Hub. For example,
  set it to "substratusai/my-model" to publish to the substratusai HF org.

In [1]:
import json
from pathlib import Path

# Notebook parameters are read from a JSON file when it exists; otherwise
# every later `params.get(...)` lookup sees an empty dict.
params_path = Path("/content/params.json")
params = (
    json.loads(params_path.read_text(encoding="UTF-8"))
    if params_path.is_file()
    else {}
)


In [3]:
import os 
from datasets import load_dataset

# `dataset_urls` is an optional comma separated string of JSON file URLs.
dataset_urls = params.get("dataset_urls")

# Split on commas and drop blank entries, so a trailing or doubled comma
# (e.g. "a.json, b.json,") does not hand an empty path to `load_dataset`.
urls = [u.strip() for u in dataset_urls.split(",") if u.strip()] if dataset_urls else []

if urls:
    print(f"Using the following URLs for the dataset: {urls}")
    data = load_dataset("json", data_files=urls)
else:
    # No usable URL was supplied: fall back to the JSON files under /content/data.
    data = load_dataset("json", data_files="/content/data/*.json*")
data

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 282
    })
})

In [None]:
import transformers
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory the base model is loaded from, and directory the trained model is
# written to by later cells.
model_path = "/content/saved-model/"
trained_model_path = "/content/model"

# The tokenizer is restricted to local files only.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# Load the causal LM in float16 with automatic device placement.
# NOTE(review): trust_remote_code=True permits custom model code shipped with
# the checkpoint to run — confirm the contents of `model_path` are trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)


Prompt before fine tuning

In [4]:
# Fallback template used when params has no "prompt_template" key. The
# {prompt} and {completion} placeholders are filled from a dataset row.
default_prompt = (
    "\n"
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
    "### Instruction:\n"
    "{prompt}\n"
    "### Response:\n"
    "{completion}\n"
)

prompt = params.get("prompt_template", default_prompt)

# Render the template with the first training row as a quick visual check.
first_example = data["train"][0]
prompt.format_map(first_example)

In [None]:
# Padding needs a pad token: when the tokenizer has none, register "[PAD]"
# and resize the model's embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


def tokenize_example(example):
    """Render the prompt template with one dataset row and tokenize the text."""
    rendered = prompt.format_map(example)
    return tokenizer(rendered, padding='max_length', truncation=True)


print(data)
data = data.map(tokenize_example)

print("After tokenizing:", data)

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-2d69f16079490881/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7224d2ad124fbca0.arrow


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 282
    })
})
After tokenizing: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 282
    })
})


In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# LoRA adapter hyper-parameters for causal language modelling.
lora_config2 = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # target modules should be unset so it can detect target_modules automatically
    # target_modules=["query_key_value"],
)

# Run PEFT's k-bit training preparation on the base model, then wrap the
# result with the LoRA adaptor.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config2)

# Report how many parameters are trainable compared with the total.
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 6,926,439,296 || trainable%: 0.06812435363037071


In [None]:
from utils import parse_training_args

# Build the training arguments from the notebook parameters using the
# project-local `parse_training_args` helper (imported from `utils`, whose
# implementation is not shown in this notebook).
training_args = parse_training_args(params)
# Display the resulting arguments as the cell output.
training_args

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# Causal-LM training: mlm=False selects the language-modeling collator's
# non-masked mode.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

checkpoint_path = Path("/content/model/checkpoints")

# Resume only when the directory actually holds a `checkpoint-<step>` folder.
# Testing merely for "directory is not empty" makes `Trainer.train` raise when
# the directory has other content (e.g. logs) but no checkpoint yet, so the
# most recent checkpoint is resolved explicitly and passed by path.
# NOTE(review): assumes `training_args.output_dir` points at `checkpoint_path`
# — confirm against `parse_training_args`.
last_checkpoint = (
    get_last_checkpoint(str(checkpoint_path)) if checkpoint_path.is_dir() else None
)
resume_from_checkpoint = last_checkpoint is not None
if resume_from_checkpoint:
    print("Resuming from checkpoint:", last_checkpoint)
# `last_checkpoint` is None when there is nothing to resume from, which starts
# training from scratch.
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the trained model to the trained-model directory.
trainer.save_model(trained_model_path)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.7497
2,1.5701
3,1.6786
4,1.6526
5,1.5714
6,1.3363
7,1.4101
8,1.6196
9,1.4076
10,1.2498


In [None]:
# Shell escape: show the current GPU status (memory usage, utilisation) after training.
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Jul  5 22:32:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA L4           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0    57W /  72W |   7314MiB / 23034MiB |     89%      Default |
|                               |            

In [None]:
# Shell escape: list the trained-model directory (long format, hidden files,
# human-readable sizes) after `trainer.save_model`.
! ls -lash {trained_model_path}

In [None]:
# Print the text representation of the model, which at this point is the
# object returned by `get_peft_model`.
print(model)

In [None]:
# Replace the PEFT-wrapped model with the result of `merge_and_unload()`
# (presumably the base model with the LoRA weights merged in — see PEFT docs).
# NOTE(review): this rebinds `model`, so re-running this cell on its own calls
# `merge_and_unload()` on the already-merged object, which may not provide it.
model = model.merge_and_unload()

In [None]:
# Write the merged model and the tokenizer into the trained-model directory,
# the same directory `trainer.save_model` wrote to earlier.
model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_model_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Shell escape: list the trained-model directory again to inspect what
# `save_pretrained` wrote.
! ls -lash {trained_model_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 13G
4.0K drwxr-xr-x 1 root root 4.0K Jul  5 22:35 .
8.0K drwxr-xr-x 1 root root 4.0K Jul  5 10:00 ..
4.0K -rw-r--r-- 1 root root  707 Jul  5 22:35 config.json
4.0K -rw-r--r-- 1 root root  116 Jul  5 22:35 generation_config.json
9.3G -rw-r--r-- 1 root root 9.3G Jul  5 22:35 pytorch_model-00001-of-00002.bin
3.7G -rw-r--r-- 1 root root 3.7G Jul  5 22:35 pytorch_model-00002-of-00002.bin
 20K -rw-r--r-- 1 root root  17K Jul  5 22:35 pytorch_model.bin.index.json


In [None]:
from huggingface_hub import HfApi

def _upload_single_file(api, local_file, target_repo):
    """Upload one local file to the root of the given Hub repo, keeping its file name."""
    api.upload_file(
        path_or_fileobj=local_file,
        path_in_repo=local_file.name,
        repo_id=target_repo,
    )


# Publishing is opt-in: nothing happens unless "push_to_hub" holds a repo id.
repo_id = params.get("push_to_hub")
if repo_id:
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)

    tokenizer_model_path_base = Path(model_path) / "tokenizer.model"
    tokenizer_model_path_trained = Path(trained_model_path) / "tokenizer.model"
    hf_api = HfApi()

    # Upload tokenizer.model if it was not included in trained model but was in base model
    missing_in_trained = not tokenizer_model_path_trained.exists()
    if tokenizer_model_path_base.exists() and missing_in_trained:
        _upload_single_file(hf_api, tokenizer_model_path_base, repo_id)

    # Attach the training notebook log when it is present.
    logs_path = Path("/content/logs/train.ipynb")
    if logs_path.exists():
        _upload_single_file(hf_api, logs_path, repo_id)
