# 05. Draft Model training

In [104]:
import os
import sys
from pprint import pprint
import yaml

current_dir = os.getcwd()
kit_dir =  os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))
sys.path.append(repo_dir)

from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

## Step by Step / Manual setting

First instantiate the SambaStudio client 

In [105]:
sambastudio_client = SnsdkWrapper()

2025-04-02 23:38:51,871 [INFO] Using variables from .snapi config to set up Snsdk.


In [106]:
# Load the data generation config
config_draft_model_training_yaml = '../05_config_draft_model_training.yaml'

# Open and load the YAML file into a dictionary
with open(config_draft_model_training_yaml, 'r') as file:
    config_draft_model_training = yaml.safe_load(file)
pprint('Draft model training:')
pprint(config_draft_model_training)

model_name = config_draft_model_training['model_checkpoint']['model_name']
dataset_name = config_draft_model_training['dataset']['dataset_name']

'Draft model training:'
{'dataset': {'dataset_description': 'xxx', 'dataset_name': 'dummy_example'},
 'model_checkpoint': {'checkpoint_name': '',
                      'model_description': 'The Meta Llama 3.1 collection of '
                                           'multilingual large language models '
                                           '(LLMs) is a collection of '
                                           'pretrained and instruction tuned '
                                           'generative models in 8B, 70B and '
                                           '405B sizes (text in/text out).',
                      'model_name': 'meta-llama-3.1-8b-instruct',
                      'model_type': 'finetuned'},
 'project': {'project_description': 'This project will be used to test the E2E '
                                    'Draft Model Training pipeline '
                                    'implementation.',
             'project_name': 'e2e-draft-model-training-project'},


### Check model and dataset

In [107]:
available_models = [model["model_checkpoint_name"]for model in sambastudio_client.list_models(filter_job_types=["train"])]

assert model_name in available_models

### List available datasets

In [108]:
assert dataset_name in [dataset["dataset_name"] for dataset in sambastudio_client.list_datasets()]

### Create a project

#### Set Project configs 

In [109]:
project = {
    'project_name': config_draft_model_training['project']['project_name'],
    'project_description': config_draft_model_training['project']['project_description']
}

In [110]:
# Execute the create project method from client with project parameters
sambastudio_client.create_project(
    project_name = project['project_name'],
    project_description = project['project_description']
)

2025-04-02 23:38:58,020 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:38:58,020 [INFO] Project with name 'e2e-draft-model-training-project' already exists with id '0b4bf4f5-5e29-409c-8a40-7a2e4183ec95', using it


'0b4bf4f5-5e29-409c-8a40-7a2e4183ec95'

### Set train job config

In [111]:
# check required hyperparams for training job 
hyperparams = sambastudio_client.get_default_hyperparms(model,'train')
pprint(hyperparams)

2025-04-02 23:38:59,390 [INFO] Default Hyperparameters for train in SN40L-8 for meta-llama-3.1-8b-instruct: 

                    ['batch_size:`8`', 'debug_mode:`off`', 'do_eval:`false`', 'dump_inputs:`false`', 'eval_steps:`50`', 'evaluation_strategy:`no`', 'fix_rank_rdu_mapping:`false`', 'grad_accumulation_steps:`1`', 'learning_rate:`1.0e-05`', 'logging_steps:`1`', 'lr_schedule:`fixed_lr`', 'max_seq_length:`8192`', 'model_parallel_rdus:`1`', 'model_parameter_count:`8b`', 'num_iterations:`100`', 'prompt_loss_weight:`0.0`', 'run_mode:`balanced`', 'safe_mode:`off`', 'save_optimizer_state:`true`', 'save_steps:`50`', 'skip_checkpoint:`false`', 'subsample_eval:`0.01`', 'subsample_eval_seed:`123`', 'use_token_type_ids:`true`', 'vocab_size:`128256`', 'warmup_steps:`0`', 'weight_decay:`0.1`']

                    


{'SN40L-8': [{'constrains': None,
              'description': 'The per-worker batch size',
              'field_name': 'batch_size',
              'settings': {'DEFAULT': '8', 'USER_MODIFIABLE': True}},
             {'constrains': {'ge': '',
                             'gt': '',
                             'le': '',
                             'lt': '',
                             'values': ['on', 'off']},
              'description': "Toggles debug mode. Debug mode 'on' persists "
                             'logs to the RDU host to help diagnose certain '
                             'issues during training.',
              'field_name': 'debug_mode',
              'settings': {'DEFAULT': 'off', 'USER_MODIFIABLE': True}},
             {'constrains': {'ge': '',
                             'gt': '',
                             'le': '',
                             'lt': '',
                             'values': ['true', 'false']},
              'description': 'whether or not 

In [112]:
job = {
    'job_name': 'e2e_draft_model_training_job',
    'job_description': 'dummy_description',
    'job_type': 'train',
    'model': model_name,
    'model_version': '1',
    'parallel_instances': '1',
    'dataset_name': dataset_name,
    'load_state': False,
    'sub_path': '',
    'hyperparams': {
        "batch_size": 8,
        "do_eval": False,
        "eval_steps":50,
        "evaluation_strategy": "no",
        "learning_rate": 0.00001,
        "logging_steps": 1,
        "lr_schedule": "fixed_lr",
        "max_sequence_length": 8192,
        "num_iterations": 100,
        "prompt_loss_weight": 0.0,
        "save_optimizer_state": True,
        "save_steps": 50,
        "skip_checkpoint": False,
        "subsample_eval": 0.01,
        "subsample_eval_seed": 123,
        "use_token_type_ids": True,
        "vocab_size": 128256,
        "warmup_steps": 0,
        "weight_decay": 0.1,
    }
}

### Execute training job

In [113]:
sambastudio_client.run_training_job(
    project_name = project["project_name"],
    job_name = job['job_name'],
    job_description = job['job_description'],
    job_type = job['job_type'],
    model = job['model'],
    model_version = job['model_version'],
    dataset_name = job['dataset_name'],
    parallel_instances = job['parallel_instances'],
    load_state = job['load_state'],
    sub_path = job['sub_path'],
    rdu_arch = 'SN40L-8',
    hyperparams = job['hyperparams']
)

2025-04-02 23:39:08,832 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:39:12,076 [INFO] Model 'meta-llama-3.1-8b-instruct' with id 'd5b59dcf-84db-4f84-a1de-077b217f9b49' available for training and deployment found
2025-04-02 23:39:12,312 [INFO] Dataset with name 'dummy_example' found with id a77fc17c-5a75-4e41-98f7-35f1f509bd04
2025-04-02 23:39:12,527 [ERROR] Failed to create job with name 'e2e_draft_model_training_job'. Details: {'code': 3, 'message': 'Job with e2e_draft_model_training_job name already exists in project 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95', 'details': [], 'status_code': 400, 'headers': {'access-control-allow-headers': 'Accept, Content-Type, Content-Length, Accept-Encoding, Authorization, ResponseType, Access-Control-Allow-Origin', 'access-control-allow-methods': 'GET, POST, PATCH, DELETE', 'access-control-allow-origin': 'https://sjc3-demo2.sambanova.net', 'content-security-policy': "default

Exception: Error message: {'code': 3, 'message': 'Job with e2e_draft_model_training_job name already exists in project 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95', 'details': [], 'status_code': 400, 'headers': {'access-control-allow-headers': 'Accept, Content-Type, Content-Length, Accept-Encoding, Authorization, ResponseType, Access-Control-Allow-Origin', 'access-control-allow-methods': 'GET, POST, PATCH, DELETE', 'access-control-allow-origin': 'https://sjc3-demo2.sambanova.net', 'content-security-policy': "default-src 'self'", 'content-type': 'application/json,application/grpc', 'permissions-policy': 'none', 'referrer-policy': 'no-referrer', 'strict-transport-security': 'max-age=31536000; includeSubDomains, max-age=31536000; includeSubDomains', 'x-content-type-options': 'nosniff', 'x-correlation-id': '39cf5f06-d0c0-484c-8b4c-e4c80860e415', 'x-frame-options': 'DENY', 'date': 'Wed, 02 Apr 2025 22:39:12 GMT', 'content-length': '141', 'x-envoy-upstream-service-time': '60', 'server': 'istio-envoy'}}

In [114]:
sambastudio_client.check_job_progress(
    project_name=project['project_name'],
    job_name=job['job_name'],
    verbose=True,
    wait=False
)

2025-04-02 23:39:17,644 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:39:17,885 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:39:18,126 [INFO] Job with name 'e2e_draft_model_training_job' in project 'e2e-draft-model-training-project' found with id '63757c29-244e-48da-9972-dc9256edc474'
2025-04-02 23:39:18,402 [INFO] Job `e2e_draft_model_training_job` with progress status: TRAINING


{'job_id': '63757c29-244e-48da-9972-dc9256edc474',
 'job_name': 'e2e_draft_model_training_job',
 'job_type': 'train',
 'user_id': 'francesca.raimondi',
 'project_id': '0b4bf4f5-5e29-409c-8a40-7a2e4183ec95',
 'tenant_id': 'cf1dd082-103d-4236-b3f2-ca792d73b77d',
 'rdu_arch': 'SN40L-8',
 'result_path': '',
 'parallel_instances': 1,
 'app_id': '45c7e5b8-2c12-45c3-aecc-713dded73b8f',
 'model_checkpoint': 'meta-llama-3.1-8b-instruct',
 'checkpoint_id': '',
 'dataset_id': 'a77fc17c-5a75-4e41-98f7-35f1f509bd04',
 'description': 'dummy_description',
 'status': 'TRAINING',
 'image_version': '',
 'variant_set_version': '',
 'variant_name': '',
 'project_name': '',
 'dataset_name': '',
 'input_data_path': '',
 'hyperparams': [{'DATATYPE': '',
   'DESCRIPTION': 'The per-worker batch size',
   'FIELD_NAME': 'batch_size',
   'MESSAGE': '',
   'TASK_TYPE': [],
   'TYPE_SPECIFIC_SETTINGS': {},
   'CONSTRAINTS': None,
   'VARIANT_SELECTION': False,
   'FIELD_VALUE': '8'},
  {'DATATYPE': '',
   'DESCRIPT

### Promote Checkpoint

In [100]:
# we will promote the checkpoint with less training loss so we list it sorted 
checkpoints = sambastudio_client.list_checkpoints(
    project_name=project['project_name'],
    job_name=job['job_name'],
    sort=True
)
checkpoints

2025-04-02 23:27:25,731 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:27:25,959 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-02 23:27:26,202 [INFO] Job with name 'e2e_draft_model_training_job' in project 'e2e-draft-model-training-project' found with id '63757c29-244e-48da-9972-dc9256edc474'


[]

#### Promoted checkpoint config

In [101]:
# set checkpoint to promote config
model_checkpoint = {
    'checkpoint_name': checkpoints[0]['name'],
    'model_name': 'Suzume-Llama-3-8B-Multilingual-Publichealth',
    'model_description': 'finetuned suzume multilingual in public health qa dataset',
    'model_type': 'finetuned'
}

IndexError: list index out of range

In [None]:
# Execute the promote_checkpoint method from client with checkpoint parameters
sambastudio_client.promote_checkpoint(
    checkpoint_name = model_checkpoint['name'],
    project_name=project['project_name'],
    job_name=job['job_name'],
    model_name=model_checkpoint['model_name'],
    model_description=model_checkpoint['model_description'],
    model_type=model_checkpoint['model_type']
)

2024-11-25 16:03:51,838 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:52,088 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:52,329 [INFO] Job with name 'e2e_fc_taining_job' in project 'byoc fine-tuning project' found with id '1819ba81-9f93-4197-a7c3-51df6a3f8f0e'
2024-11-25 16:03:53,245 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10' promoted to model 'Suzume-Llama-3-8B-Multilingual-Publichealth'


'c867b392-2d02-453d-9fd8-e14016e39153'

In [14]:
# check the promoted model is now in SambaStudio models
[model for model in sambastudio_client.list_models() if model['model_checkpoint_name']==model_checkpoint['model_name']]

[{'model_id': 'c867b392-2d02-453d-9fd8-e14016e39153',
  'model_checkpoint_name': 'Suzume-Llama-3-8B-Multilingual-Publichealth',
  'version': 1}]

#### Delete all saved training checkpoints, after promotion (optional)

In [None]:
# We can delete all intermediate checkpoints saved during the training job 
for checkpoint in checkpoints:
    sambastudio_client.delete_checkpoint(checkpoint["name"])

2024-11-25 16:06:20,296 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10' deleted
2024-11-25 16:06:20,586 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-5' deleted
