# Fine-Tuning: Executing a Training Job

In [1]:
import os
import sys
from pprint import pprint

# Walk up from the working directory: one level up is the kit directory, two
# levels up is the repository root. The repository root is appended to the
# import path so the `utils` package imported right after this can be resolved.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))
sys.path.append(repo_dir)

from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

## Step by Step / Manual setting

First, instantiate the SambaStudio client.

In [2]:
# Build the SambaStudio client wrapper used by every cell below. Called with no
# arguments; the log output of this cell reports that it takes its settings from
# the Snapi config.
sambastudio_client = SnsdkWrapper()

2024-12-19 12:48:48,995 [INFO] Using variables from Snapi config to set up Snsdk.


### List trainable models

In [3]:
# Show the checkpoint name of every model returned for the "train" job type
[
    trainable["model_checkpoint_name"]
    for trainable in sambastudio_client.list_models(filter_job_types=["train"])
]

['meta-llama-3-8b-instruct-128384-vocab',
 'E5 Large V2',
 'GPT13B 2k SS ITv3',
 'GPT_1.5B_GT_Finetuned',
 'Multilingual E5 Large',
 'GPT_1.5B_Base_Model',
 'CLIP-ViT-B-32-laion2B-s34B-b79k',
 'CLIP ViT-B-32 Backbone (Deprecated)',
 'meta-llama-3.1-70b',
 'llava-v1.5-7b',
 'Multilingual E5 Large Instruct',
 'Hubert_ASR',
 'GPT_1.5B_GT_Pretrained',
 'Suzume-Llama-3-8B-Multilingual-Publichealth',
 'FakeBox',
 'Deepseek-coder-6.7b-instruct',
 'RC4_VIEW_TEST',
 'SimpleTextClassGenerativeTrained',
 'RC4_Colab_Test',
 'Suzume-Llama-3-8B-Multilingual',
 'Deepseek-coder-6.7b-base',
 'meta-llama-3.1-70b-instruct',
 'meta-llama-3.1-8b-instruct',
 '1218_SN10_GPT13B 8k SS HAv3',
 'GPT_13B_Human_Aligned_Instruction_Tuned_V2',
 '1218_meta-llama-3.1-8b',
 'meta-llama-3.1-8b',
 '1218_meta-llama-3-70b-128256-vocab',
 'Llama-2-7b-16k-hf',
 'YANZHEC_TEST_SNAPI_GPT1.5B_GT_Finetuned',
 'meta-llama-3-8b-instruct-128256-vocab',
 'GPT_1.5B_Dialog_Act_Classification_Finetuned',
 'LlamaGuard_7b',
 'meta-llama-3

In [4]:
# Checkpoint to fine-tune; this name appears in the trainable-models listing above
model = 'Suzume-Llama-3-8B-Multilingual'

### List available datasets

In [5]:
# Show the name of every dataset returned by the client
[
    available["dataset_name"]
    for available in sambastudio_client.list_datasets()
]

['openthaigpt_50k_IT0913',
 'Generative_Inference_Dataset',
 'GPT_13B_Inference_Dataset',
 'FiQA',
 'Super_Glue_4k_SS',
 'E5_Large_V2_Inference_Example',
 'ASR_With_Diarization_Dataset',
 'Restore_Punctuation_Data',
 'ASR_Without_Diarization_Dataset',
 'GPT_13B_8k_SS_Toy_Training_Dataset',
 'Librispeech',
 'GPT_1.5B_Training_Dataset',
 'GPT_13B_Training_Dataset',
 'Speaker_Diarization',
 'test',
 'Coding_Generative_Train_4k_SS_Dataset',
 'Mistral_Tokenized_Copa',
 'thai-dpo-sft-ss4k',
 'RBAC_Test_Curl',
 'test_upload',
 'Coding_Generative_Inference_Dataset',
 'E5_Large_V2_Training_MSMarco_Distillation',
 'console_upload',
 'Super_Glue_8k_SS_128k_vocab',
 'yc_snapi_add_localmachine_test_13B_2451_rc3',
 '0606qa03orgadmin',
 'LLaVA-example',
 'openwebtext_ss4096_32k_vocab',
 'aniket-e5-dataset-upload-trial5',
 'GPT_1.5B_Inference_Dataset',
 'test_dataset',
 'Super_Glue_16k_SS',
 'Caltech_256_Clip',
 'Superglue_Sarashina_4k',
 'Superglue_Sarashina_8k',
 '1029test',
 '1113AWS',
 'smol_sql_d

In [6]:
# Name of the dataset used for training; it is looked up by name when the
# training job is created, so it must already exist in SambaStudio
dataset_name = 'publichealth'

### Create a project

#### Set Project configs 

In [7]:
# Name and description of the SambaStudio project that will hold the training job
project = dict(
    project_name='byoc fine-tuning project',
    project_description='this project will be used to test the BYOC and Fine-tuning e2e pipeline implementation',
)

In [8]:
# Create the project from the settings above. As the log output shows, a project
# that already exists under this name is reused; the cell displays the project id.
sambastudio_client.create_project(
    project_name=project["project_name"],
    project_description=project["project_description"],
)

2024-12-19 12:48:56,862 [INFO] Project with name 'byoc fine-tuning project' found with id 296e0c05-1338-4119-9ef6-0a5667b55b09
2024-12-19 12:48:56,866 [INFO] Project with name 'byoc fine-tuning project' already exists with id '296e0c05-1338-4119-9ef6-0a5667b55b09', using it


'296e0c05-1338-4119-9ef6-0a5667b55b09'

### Set train job config

In [9]:
# Look up the default hyperparameters of a 'train' job for the selected model
# and pretty-print them, so the job config below can use matching field names
hyperparams = sambastudio_client.get_default_hyperparms(model, "train")
pprint(hyperparams)

2024-12-19 12:48:58,118 [INFO] Default Hyperparameters for train in SN40L-8 for Suzume-Llama-3-8B-Multilingual: 

                    ['batch_size:`8`', 'debug_mode:`off`', 'do_eval:`false`', 'dump_inputs:`false`', 'eval_steps:`50`', 'evaluation_strategy:`no`', 'fix_rank_rdu_mapping:`false`', 'grad_accumulation_steps:`1`', 'learning_rate:`1.0e-05`', 'logging_steps:`1`', 'lr_schedule:`fixed_lr`', 'max_seq_length:`8192`', 'model_parallel_rdus:`1`', 'model_parameter_count:`8b`', 'num_iterations:`100`', 'prompt_loss_weight:`0.0`', 'run_mode:`balanced`', 'save_optimizer_state:`true`', 'save_steps:`50`', 'skip_checkpoint:`false`', 'subsample_eval:`0.01`', 'subsample_eval_seed:`123`', 'use_token_type_ids:`true`', 'vocab_size:`128256`', 'warmup_steps:`0`', 'weight_decay:`0.1`']

                    


{'SN40L-8': [{'constrains': None,
              'description': 'The per-worker batch size',
              'field_name': 'batch_size',
              'settings': {'DEFAULT': '8', 'USER_MODIFIABLE': True}},
             {'constrains': None,
              'description': "Toggles debug mode. Debug mode 'on' mode turns "
                             'on additional output to help diagnose certain '
                             "issues during training. Please keep 'off' unless "
                             'advised otherwise by your SambaNova admin.',
              'field_name': 'debug_mode',
              'settings': {'DEFAULT': 'off', 'USER_MODIFIABLE': True}},
             {'constrains': {'ge': '',
                             'gt': '',
                             'le': '',
                             'lt': '',
                             'values': ['true', 'false']},
              'description': 'whether or not to do final evaluation',
              'field_name': 'do_eval',
           

In [30]:
# Full configuration of the training job.
# The top-level keys are passed one by one to `run_training_job`; `hyperparams`
# overrides the platform defaults listed by `get_default_hyperparms` above.
job = {
    'job_name': 'e2e_fc_taining_job2',
    'job_description': 'e2e finetuning training job public health for suzume multilingual',
    'job_type': 'train',
    'model': model,
    'model_version': '1',
    'parallel_instances': '1',
    'dataset_name': dataset_name,
    'load_state': False,
    'sub_path': '',
    # Hyperparameter names must match the `field_name` values reported by the
    # platform for this model (see the default hyperparameters printed above).
    'hyperparams': {
        "batch_size": 8,
        "do_eval": False,
        "eval_steps": 50,
        "evaluation_strategy": "no",
        "learning_rate": 0.00001,
        "logging_steps": 1,
        "lr_schedule": "fixed_lr",
        # The platform field is `max_seq_length` (not `max_sequence_length`);
        # a misspelled key would not override the intended hyperparameter.
        "max_seq_length": 8192,
        "num_iterations": 100,
        "prompt_loss_weight": 0.0,
        "save_optimizer_state": True,
        "save_steps": 50,
        "skip_checkpoint": False,
        "subsample_eval": 0.01,
        "subsample_eval_seed": 123,
        "use_token_type_ids": True,
        "vocab_size": 128256,
        "warmup_steps": 0,
        "weight_decay": 0.1,
    }
}

### Execute training job

In [29]:
# Submit the training job described by `job` inside the project created above.
# The RDU architecture is fixed to 'SN40L-8', the one the default hyperparameters
# were reported for; the cell displays the id of the created job.
sambastudio_client.run_training_job(
    project_name=project["project_name"],
    job_name=job["job_name"],
    job_description=job["job_description"],
    job_type=job["job_type"],
    model=job["model"],
    model_version=job["model_version"],
    dataset_name=job["dataset_name"],
    parallel_instances=job["parallel_instances"],
    load_state=job["load_state"],
    sub_path=job["sub_path"],
    rdu_arch="SN40L-8",
    hyperparams=job["hyperparams"],
)

2024-12-18 14:42:42,232 [INFO] Project with name 'byoc fine-tuning project' found with id 296e0c05-1338-4119-9ef6-0a5667b55b09
2024-12-18 14:42:45,306 [INFO] Model 'Suzume-Llama-3-8B-Multilingual' with id '35847978-1a28-45da-99b1-0c2cea94d116' available for training and deployment found
2024-12-18 14:42:45,662 [INFO] Dataset with name 'publichealth' found with id 6ac585ad-107c-45f5-a2de-129dd1a69279
2024-12-18 14:42:45,983 [INFO] Job with name 'e2e_fc_taining_job2' created: '{'job_id': '0a09cb4e-01be-4807-911a-f1d2cc3904bf', 'job_name': 'e2e_fc_taining_job2', 'job_type': 'train', 'user_id': 'jorge.piedrahita', 'project_id': '296e0c05-1338-4119-9ef6-0a5667b55b09', 'tenant_id': 'f254d0b5-fb45-4501-9740-93183e7c6f4c', 'rdu_arch': 'SN40L-8', 'result_path': '', 'parallel_instances': 1, 'app_id': '61fa0993-04a2-42ca-9db1-1eff693ea978', 'model_checkpoint': 'Suzume-Llama-3-8B-Multilingual', 'checkpoint_id': '', 'dataset_id': '6ac585ad-107c-45f5-a2de-129dd1a69279', 'description': 'e2e finetunin

'0a09cb4e-01be-4807-911a-f1d2cc3904bf'

In [32]:
# Fetch the current state of the training job and display the returned job record.
# NOTE(review): wait=False presumably returns immediately instead of blocking
# until the job finishes - confirm against SnsdkWrapper.check_job_progress.
sambastudio_client.check_job_progress(
    project_name=project["project_name"],
    job_name=job["job_name"],
    verbose=True,
    wait=False,
)

2024-12-18 14:43:19,561 [INFO] Project with name 'byoc fine-tuning project' found with id 296e0c05-1338-4119-9ef6-0a5667b55b09
2024-12-18 14:43:20,002 [INFO] Project with name 'byoc fine-tuning project' found with id 296e0c05-1338-4119-9ef6-0a5667b55b09
2024-12-18 14:43:20,704 [INFO] Job with name 'e2e_fc_taining_job2' in project 'byoc fine-tuning project' found with id '0a09cb4e-01be-4807-911a-f1d2cc3904bf'
2024-12-18 14:43:21,005 [INFO] Job `e2e_fc_taining_job2` with progress status: TRAINING


{'job_id': '0a09cb4e-01be-4807-911a-f1d2cc3904bf',
 'job_name': 'e2e_fc_taining_job2',
 'job_type': 'train',
 'user_id': 'jorge.piedrahita',
 'project_id': '296e0c05-1338-4119-9ef6-0a5667b55b09',
 'tenant_id': 'f254d0b5-fb45-4501-9740-93183e7c6f4c',
 'rdu_arch': 'SN40L-8',
 'result_path': '',
 'parallel_instances': 1,
 'app_id': '61fa0993-04a2-42ca-9db1-1eff693ea978',
 'model_checkpoint': 'Suzume-Llama-3-8B-Multilingual',
 'checkpoint_id': '',
 'dataset_id': '6ac585ad-107c-45f5-a2de-129dd1a69279',
 'description': 'e2e finetuning training job public health for suzume multilingual',
 'status': 'TRAINING',
 'image_version': '',
 'variant_set_version': '',
 'variant_name': '',
 'project_name': '',
 'dataset_name': '',
 'input_data_path': '',
 'hyperparams': [{'DATATYPE': '',
   'DESCRIPTION': 'The per-worker batch size',
   'FIELD_NAME': 'batch_size',
   'MESSAGE': '',
   'TASK_TYPE': [],
   'TYPE_SPECIFIC_SETTINGS': {},
   'CONSTRAINTS': None,
   'VARIANT_SELECTION': False,
   'FIELD_VALU

### Promote Checkpoint

In [8]:
# List the checkpoints saved by the training job, asking the client to sort them.
# The first entry of this list is the one promoted further down; the intent is
# that sorting puts the checkpoint with the lowest training loss first.
checkpoints = sambastudio_client.list_checkpoints(
    project_name=project["project_name"],
    job_name=job["job_name"],
    sort=True,
)
checkpoints

2024-11-25 16:03:15,340 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:15,664 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:15,905 [INFO] Job with name 'e2e_fc_taining_job' in project 'byoc fine-tuning project' found with id '1819ba81-9f93-4197-a7c3-51df6a3f8f0e'


[{'checkpoint_name': '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10',
  'checkpoint_id': '6925d395-3251-4465-8ec6-225761536680',
  'steps': 10,
  'time_created': '2024-11-25T20:55:13.923615Z',
  'metrics': {'single_value': {'train_learning_rate': 0.0,
    'train_loss': 1.6116},
   'multi_value': {},
   'last_batch_omitted': []},
  'labels': None,
  'job_id': '1819ba81-9f93-4197-a7c3-51df6a3f8f0e',
  'app_id': '61fa0993-04a2-42ca-9db1-1eff693ea978',
  'app_name': 'Samba1 Llama3 Experts',
  'path': 'default/default/b11867e6-7ca8-45bd-b09b-41cbc7ba73ce/jobs/1819ba81-9f93-4197-a7c3-51df6a3f8f0e/checkpoints/1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10',
  'transformers_version': '',
  'torch_version': '',
  'user_id': 'jorge.piedrahita',
  'tenant_id': 'f254d0b5-fb45-4501-9740-93183e7c6f4c',
  'image_version': '1.1.6-20241025',
  'dependent_jobs': []},
 {'checkpoint_name': '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-5',
  'checkpoint_id': '19a5cf40-1663-406c-816a-fceb759624f5',
  'steps': 5,
  'time_created

#### Promoted checkpoint config

In [None]:
# Settings for promoting a checkpoint to a model: the first checkpoint of the
# sorted list is promoted under the given model name, description and type
model_checkpoint = dict(
    checkpoint_name=checkpoints[0]["checkpoint_name"],
    model_name='Suzume-Llama-3-8B-Multilingual-Publichealth',
    model_description='finetuned suzume multilingual in public health qa dataset',
    model_type='finetuned',
)

In [10]:
# Promote the selected checkpoint of the training job to a SambaStudio model;
# the cell displays the id of the resulting model
sambastudio_client.promote_checkpoint(
    checkpoint_name=model_checkpoint["checkpoint_name"],
    project_name=project["project_name"],
    job_name=job["job_name"],
    model_name=model_checkpoint["model_name"],
    model_description=model_checkpoint["model_description"],
    model_type=model_checkpoint["model_type"],
)

2024-11-25 16:03:51,838 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:52,088 [INFO] Project with name 'byoc fine-tuning project' found with id b11867e6-7ca8-45bd-b09b-41cbc7ba73ce
2024-11-25 16:03:52,329 [INFO] Job with name 'e2e_fc_taining_job' in project 'byoc fine-tuning project' found with id '1819ba81-9f93-4197-a7c3-51df6a3f8f0e'
2024-11-25 16:03:53,245 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10' promoted to model 'Suzume-Llama-3-8B-Multilingual-Publichealth'


'c867b392-2d02-453d-9fd8-e14016e39153'

In [14]:
# Confirm the promoted model is now listed among the SambaStudio models
[
    listed
    for listed in sambastudio_client.list_models()
    if listed["model_checkpoint_name"] == model_checkpoint["model_name"]
]

[{'model_id': 'c867b392-2d02-453d-9fd8-e14016e39153',
  'model_checkpoint_name': 'Suzume-Llama-3-8B-Multilingual-Publichealth',
  'version': 1}]

#### Delete all saved training checkpoints, after promotion (optional)

In [15]:
# Optional clean-up: remove every checkpoint the training job saved, one by one
for saved_checkpoint in checkpoints:
    checkpoint_to_delete = saved_checkpoint["checkpoint_name"]
    sambastudio_client.delete_checkpoint(checkpoint_to_delete)

2024-11-25 16:06:20,296 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-10' deleted
2024-11-25 16:06:20,586 [INFO] Model checkpoint '1819ba81-9f93-4197-a7c3-51df6a3f8f0e-5' deleted


## Streamlined Execution

The training job and checkpoint promotion can also be run in a streamlined way: set all the job and checkpoint parameters in a config file, as in the [finetune_config.yaml](../finetune_config.yaml) example, and then execute: