# Get and Upload your own checkpoint to SambaStudio

In [1]:
# Import libraries
import os
import sys
import re
import json
from huggingface_hub import hf_hub_download, HfApi

# Get absolute paths for kit_dir and repo_dir.
# Both are derived from os.getcwd(): kit_dir is the parent of the current working
# directory and repo_dir is the parent of kit_dir, so the result depends on the
# directory the kernel is running in.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))

# Adding directories to the Python module search path.
# Guarded so that re-running this cell does not append duplicate entries.
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

from utils.byoc.src.snsdk_byoc_wrapper import BYOC

  from .autonotebook import tqdm as notebook_tqdm


# Step by Step / Manual setting

In [2]:
# Create the BYOC wrapper with no arguments; the log output of this cell reports
# that variables from the Snapi config are used to set up Snsdk.
byoc = BYOC()

2024-11-25 12:20:51,796 [INFO] Using variables from Snapi config to set up Snsdk.


### Download a base checkpoint from Hugging Face

In [3]:
# Local folder that the download cell passes to hf_hub_download as cache_dir
target_dir = os.path.join(kit_dir, 'data', 'models')
# Hugging Face repo id of the base checkpoint to download
hf_model = 'lightblue/suzume-llama-3-8B-multilingual'

In [6]:
# Make sure the local cache directory exists. exist_ok=True replaces the separate
# existence check and removes the gap between checking and creating.
os.makedirs(target_dir, exist_ok=True)

# Download every file listed for the repo, using target_dir as the cache directory.
repo_files = HfApi().list_repo_files(hf_model)
for file_name in repo_files:
    hf_hub_download(repo_id=hf_model, filename=file_name, cache_dir=target_dir)

In [4]:
# Find the snapshot folder.
# Reset first so a stale value from an earlier run of this cell is never reused.
checkpoint_folder = None
model_cache_name = hf_model.replace("/", "--")
for root, dirs, files in os.walk(target_dir):
    # Require a non-empty dirs list: an empty snapshots directory has nothing to pick.
    if "snapshots" in root and model_cache_name in root and dirs:
        checkpoint_folder = os.path.join(root, dirs[0])
        break

# Fail with a clear message instead of a NameError/IndexError further down.
if checkpoint_folder is None:
    raise FileNotFoundError(
        f"No snapshot folder for '{hf_model}' found under '{target_dir}'; "
        "run the download cell first."
    )

checkpoint_folder

'/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/models/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/0cb15aa9ec685eef494f9a15f65aefcfe3c04c66'

### Set checkpoint configs

In [5]:
# Metadata for the checkpoint to upload; checkpoint_path is the snapshot folder
# found in the previous step.
checkpoint = dict(
    model_name='Suzume-Llama-3-8B-Multilingual',
    publisher="lightblue",
    description=" Suzume 8B, a multilingual finetune of Llama 3",
    param_count=8,  # number in billions of parameters
    checkpoint_path=checkpoint_folder,
)

### Set and check chat template (Optional) 

In [6]:
# Chat template built around the <|start_header_id|> / <|end_header_id|> / <|eot_id|>
# markers, written over several indented lines for readability.
jinja_chat_template = """ 
{% for message in messages %}
    {% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] | trim + '<|eot_id|>'+'\n' %}
    {% if loop.index0 == 0 %}{% set content = bos_token + content %}
    {% endif %}
    {{content}}
{% endfor %}
{{'<|start_header_id|>assistant<|end_header_id|>'+'\n'}}
"""
# Collapse the layout into a single-line template:
#  1. drop every newline that has no single quote directly before or after it, so
#     the quoted newlines inside the template expressions are kept;
#  2. trim leading/trailing whitespace;
#  3. remove the double-space indentation.
layout_newline = re.compile(r"(?<!')\n(?!')")
jinja_chat_template = layout_newline.sub("", jinja_chat_template)
jinja_chat_template = jinja_chat_template.strip().replace('  ','')

In [7]:
# Store the chat template in the checkpoint's tokenizer_config.json.
tokenizer_config_path = os.path.join(checkpoint['checkpoint_path'], 'tokenizer_config.json')

# Read and write in two steps with an explicit UTF-8 encoding: without it, open()
# falls back to the platform's locale encoding, which may not be UTF-8.
with open(tokenizer_config_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

data['chat_template'] = jinja_chat_template

# Mode 'w' truncates the file before the updated JSON is written back.
with open(tokenizer_config_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

In [8]:
# Sample conversation (system, user, assistant, user) used to render the chat
# template stored in the checkpoint folder.
test_messages = [
    {"role": role, "content": content}
    for role, content in (
        ("system", "This is a system prompt."),
        ("user", "This is a user prompt."),
        ("assistant", "This is a response from the assistant."),
        ("user", "This is an user follow up"),
    )
]
byoc.check_chat_templates(test_messages, checkpoint_paths=checkpoint['checkpoint_path'])

2024-11-25 12:21:29,117 [INFO] Raw chat template for checkpoint in /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/models/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/0cb15aa9ec685eef494f9a15f65aefcfe3c04c66:
{% for message in messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '
' + message['content'] | trim + '<|eot_id|>'+'
' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{content}}{% endfor %}{{'<|start_header_id|>assistant<|end_header_id|>'+'
'}}

2024-11-25 12:21:29,121 [INFO] Rendered template with input test messages:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
This is a system prompt.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
This is a user prompt.<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
This is a response from the assistant.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
This is an user follow up<|eot_id|>
<|start_

### Set padding token (required for training)

In [9]:
# Update pad_token_id in the checkpoint's config.json.
config_path = os.path.join(checkpoint['checkpoint_path'], 'config.json')

# Read and write in two steps with an explicit UTF-8 encoding: without it, open()
# falls back to the platform's locale encoding, which may not be UTF-8.
with open(config_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# NOTE(review): None is serialized as JSON null — confirm this is the intended
# padding-token setting.
data['pad_token_id']=None

# Mode 'w' truncates the file before the updated JSON is written back.
with open(config_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

### Get model params and suitable SambaStudio apps

In [10]:
# Look up the config params for the checkpoint folder and merge the first result
# into the checkpoint dict.
config_params_found = byoc.find_config_params(checkpoint_paths=checkpoint['checkpoint_path'])
checkpoint_config_params = config_params_found[0]
checkpoint.update(checkpoint_config_params)

2024-11-25 12:21:33,276 [INFO] Params for checkpoint in /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/models/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/0cb15aa9ec685eef494f9a15f65aefcfe3c04c66:
[{'model_arch': 'llama', 'seq_length': 8192, 'vocab_size': 128256}]


In [11]:
# Ask the BYOC wrapper which apps suit this checkpoint; the wrapper logs the
# matching apps (id and name) as this cell's output.
suitable_apps = byoc.get_suitable_apps(checkpoint)

2024-11-25 12:21:35,336 [INFO] Checkpoint Suzume-Llama-3-8B-Multilingual suitable apps:
[{'id': '61fa0993-04a2-42ca-9db1-1eff693ea978', 'name': 'Samba1 Llama3 Experts'}, {'id': 'ad39e323-9878-4914-8e29-82c9f2939475', 'name': 'Llama 3'}]


In [12]:
# Use the 'id' of the first app in the first result list as the target app
# (presumably there is one result list per checkpoint passed in — TODO confirm).
checkpoint["app_id"]=suitable_apps[0][0]['id']

In [13]:
# Display the assembled checkpoint dict, now including the config params and app_id.
checkpoint

{'model_name': 'Suzume-Llama-3-8B-Multilingual',
 'publisher': 'lightblue',
 'description': ' Suzume 8B, a multilingual finetune of Llama 3',
 'param_count': 8,
 'checkpoint_path': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/models/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/0cb15aa9ec685eef494f9a15f65aefcfe3c04c66',
 'model_arch': 'llama',
 'seq_length': 8192,
 'vocab_size': 128256,
 'app_id': '61fa0993-04a2-42ca-9db1-1eff693ea978'}

### Upload the checkpoint

If uploading only one checkpoint

In [None]:
# Upload the single checkpoint. Every listed field is read from the checkpoint dict
# and forwarded as a keyword argument of the same name, in this order.
upload_fields = (
    'model_name',
    'checkpoint_path',
    'description',
    'publisher',
    'param_count',
    'model_arch',
    'seq_length',
    'vocab_size',
    'app_id',
)
model_id = byoc.upload_checkpoint(
    **{field: checkpoint[field] for field in upload_fields},
    retries=3,
)

2024-11-25 12:21:42,947 [INFO] Model with name 'Suzume-Llama-3-8B-Multilingual' not found
2024-11-25 12:21:43,138 [INFO] App with name '61fa0993-04a2-42ca-9db1-1eff693ea978' found with id 61fa0993-04a2-42ca-9db1-1eff693ea978


In [None]:
# Query the status of the checkpoint uploaded above, using the returned model_id.
byoc.get_checkpoints_status(model_id)

Alternatively, if uploading multiple checkpoints in parallel:

In [None]:
# Upload a list of checkpoints (here a single one) with up to 4 parallel jobs,
# then query the status of every returned model by its 'id'.
models = byoc.upload_checkpoints([checkpoint], max_parallel_jobs=4, retries=3)
uploaded_ids = [model['id'] for model in models]
byoc.get_checkpoints_status(uploaded_ids)

## Streamlined Execution

In [None]:
# Build the path to checkpoints_config.yaml directly under repo_dir.
# NOTE(review): the path uses repo_dir rather than kit_dir — confirm the file's location.
config_file = os.path.join(repo_dir, 'checkpoints_config.yaml')
# Rebinds byoc to a new BYOC instance created from the config file.
byoc = BYOC(config_file)
# The calls below take no arguments, unlike the manual steps earlier in the notebook;
# presumably the checkpoints are taken from the config file — TODO confirm.
byoc.find_config_params()
byoc.upload_checkpoints()
byoc.get_checkpoints_status()