# Fine-Tuning: Executing a Training Job

In [None]:
import os
import sys

# Resolve the repository root, two levels above the notebook's working
# directory, so the project-local `utils` package imported below is found.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))

# Only add the path once: re-running this cell must not pile up duplicate
# entries in sys.path.
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

## Step-by-Step / Manual Setup

In [None]:
# Create the SambaStudio client without passing a config file; the log line
# captured under this cell reports that the Snapi config variables are used.
sambastudio_client = SnsdkWrapper()

2024-11-22 18:57:38,619 [INFO] Using variables from Snapi config to set up Snsdk.


### List trainable models

In [20]:
# Show the checkpoint name of every model returned for the "train" job type.
[
    entry["model_checkpoint_name"]
    for entry in sambastudio_client.list_models(filter_job_types=["train"])
]

['meta-llama-3-8b-instruct-128384-vocab',
 'E5 Large V2',
 'GPT13B 2k SS ITv3',
 'GPT_1.5B_GT_Finetuned',
 'Multilingual E5 Large',
 'GPT_1.5B_Base_Model',
 'CLIP-ViT-B-32-laion2B-s34B-b79k',
 'CLIP ViT-B-32 Backbone (Deprecated)',
 'llava-v1.5-7b',
 'Multilingual E5 Large Instruct',
 'Hubert_ASR',
 'GPT_1.5B_GT_Pretrained',
 'TR_Sarashina2-70B_Superglue_Sarashina_8k_SN40L-8_4RDU-ckpt10',
 'FakeBox',
 'Deepseek-coder-6.7b-instruct',
 'RC4_VIEW_TEST',
 'SimpleTextClassGenerativeTrained',
 'RC4_Colab_Test',
 'Deepseek-coder-6.7b-base',
 'HermesProInstructV10',
 'GPT_13B_Human_Aligned_Instruction_Tuned_V2',
 'Llama-2-7b-16k-hf',
 'Suzume-Llama-3-8B-Multilingual',
 'YANZHEC_TEST_SNAPI_GPT1.5B_GT_Finetuned',
 'meta-llama-3-8b-instruct-128256-vocab',
 'GPT_1.5B_Dialog_Act_Classification_Finetuned',
 'LlamaGuard_7b',
 'meta-llama-3-70b-instruct-128256-vocab',
 'Thai_LLaMA_70B',
 'GPT_13B_Generative_Inference',
 'Llama-2-13b-hf',
 'meta-llama-3-8b-nan-generator',
 'meta-llama-guard-2-8b-128384

In [None]:
# Base model to fine-tune; must be one of the trainable checkpoint names.
model = "Suzume-Llama-3-8B-Multilingual"

### List available datasets

In [13]:
# Show the name of every dataset returned by the client.
[
    entry["dataset_name"]
    for entry in sambastudio_client.list_datasets()
]

['openthaigpt_50k_IT0913',
 'Generative_Inference_Dataset',
 'GPT_13B_Inference_Dataset',
 'FiQA',
 'Super_Glue_4k_SS',
 'E5_Large_V2_Inference_Example',
 'ASR_With_Diarization_Dataset',
 'Restore_Punctuation_Data',
 'ASR_Without_Diarization_Dataset',
 'GPT_13B_8k_SS_Toy_Training_Dataset',
 'Librispeech',
 'GPT_1.5B_Training_Dataset',
 'GPT_13B_Training_Dataset',
 'Speaker_Diarization',
 'test',
 'Coding_Generative_Train_4k_SS_Dataset',
 'Mistral_Tokenized_Copa',
 'thai-dpo-sft-ss4k',
 'RBAC_Test_Curl',
 'test_upload',
 'Coding_Generative_Inference_Dataset',
 'E5_Large_V2_Training_MSMarco_Distillation',
 'console_upload',
 'Super_Glue_8k_SS_128k_vocab',
 'yc_snapi_add_localmachine_test_13B_2451_rc3',
 '0606qa03orgadmin',
 'LLaVA-example',
 'openwebtext_ss4096_32k_vocab',
 'aniket-e5-dataset-upload-trial5',
 'GPT_1.5B_Inference_Dataset',
 'test_dataset',
 'Super_Glue_16k_SS',
 'Caltech_256_Clip',
 'Superglue_Sarashina_4k',
 'Superglue_Sarashina_8k',
 '1029test',
 '1113AWS',
 'smol_sql_d

In [None]:
# Dataset the training job will use.
# NOTE(review): presumably must match a name from the dataset listing — confirm.
dataset_name = "publichealth"

### Create a project

### Set Project configs 

In [None]:
# Project settings, read by the project, job and checkpoint cells further down.
project = dict(
    project_name="byoc fine-tuning project",
    project_description=(
        "this project will be used to test the BYOC and Fine-tuning e2e pipeline implementation"
    ),
)

In [None]:
# Create the project. The captured log shows that an existing project with the
# same name is reused, and the cell output is the project id.
sambastudio_client.create_project(
    project_name=project["project_name"],
    project_description=project["project_description"],
)

2024-11-22 18:57:48,601 [INFO] Project with name 'byoc fine-tuning project' found with id 08b2e9e9-cebe-4f57-9271-a7e6c6f1561d
2024-11-22 18:57:48,601 [INFO] Project with name 'byoc fine-tuning project' already exists with id '08b2e9e9-cebe-4f57-9271-a7e6c6f1561d', using it


'08b2e9e9-cebe-4f57-9271-a7e6c6f1561d'

### Set train job config

In [None]:
# Training job definition, read by the run/check/checkpoint cells below.
# NOTE(review): 'taining' in the job name looks like a typo for 'training';
# it is kept unchanged because the name identifies the job on the platform.
job = {
    "job_name": "e2e_fc_taining_job",
    "job_description": "e2e finetuning training job public health for suzume multilingual",
    "job_type": "train",
    # Base model and dataset come from the selection cells above.
    "model": model,
    "model_version": "1",
    "parallel_instances": "1",
    "dataset_name": dataset_name,
    "load_state": False,
    "sub_path": "",
    # Hyperparameters are handed to the training job unchanged.
    "hyperparams": {
        "batch_size": 8,
        "do_eval": False,
        "eval_steps": 5,
        "evaluation_strategy": "no",
        "learning_rate": 1e-5,
        "logging_steps": 1,
        "lr_schedule": "fixed_lr",
        "max_sequence_length": 8192,
        "num_iterations": 10,
        "prompt_loss_weight": 0.0,
        "save_optimizer_state": True,
        "save_steps": 5,
        "skip_checkpoint": False,
        "subsample_eval": 0.01,
        "subsample_eval_seed": 123,
        "use_token_type_ids": True,
        "vocab_size": 128256,
        "warmup_steps": 0,
        "weight_decay": 0.1,
    },
}

### Execute training job

In [None]:
# Submit the training job described by `job` inside the configured project.
# The RDU architecture is fixed to 'SN40L-8' here rather than read from `job`.
sambastudio_client.run_training_job(
    project_name=project["project_name"],
    job_name=job["job_name"],
    job_description=job["job_description"],
    job_type=job["job_type"],
    model=job["model"],
    model_version=job["model_version"],
    dataset_name=job["dataset_name"],
    parallel_instances=job["parallel_instances"],
    load_state=job["load_state"],
    sub_path=job["sub_path"],
    rdu_arch="SN40L-8",
    hyperparams=job["hyperparams"],
)

In [None]:
# Follow the job's progress. The captured log shows repeated status lines and
# an exception once the job reports FAILED.
# NOTE(review): wait=True presumably blocks until a terminal status — confirm.
sambastudio_client.check_job_progress(
    project_name=project["project_name"],
    job_name=job["job_name"],
    wait=True,
)

2024-11-22 19:08:17,198 [INFO] Project with name 'byoc fine-tuning project' found with id 08b2e9e9-cebe-4f57-9271-a7e6c6f1561d
2024-11-22 19:08:17,437 [INFO] Project with name 'byoc fine-tuning project' found with id 08b2e9e9-cebe-4f57-9271-a7e6c6f1561d
2024-11-22 19:08:17,652 [INFO] Job with name 'e2e_fc_taining_job2' in project 'byoc fine-tuning project' found with id 'e2b76179-05e2-4e2c-9534-56b22e7c081e'
2024-11-22 19:08:17,894 [INFO] Job `e2e_fc_taining_job2` with progress status: PENDING_RDU
2024-11-22 19:09:18,148 [INFO] Job `e2e_fc_taining_job2` with progress status: FAILED
2024-11-22 19:09:18,149 [ERROR] Job failed. Details: {'job_id': 'e2b76179-05e2-4e2c-9534-56b22e7c081e', 'job_name': 'e2e_fc_taining_job2', 'job_type': 'train', 'status': 'FAILED', 'time_created': '2024-11-23T00:06:50.391725000Z'}


Exception: Job failed. Details: {'job_id': 'e2b76179-05e2-4e2c-9534-56b22e7c081e', 'job_name': 'e2e_fc_taining_job2', 'job_type': 'train', 'status': 'FAILED', 'time_created': '2024-11-23T00:06:50.391725000Z'}

### Promote Checkpoint

In [None]:
# Fetch the checkpoints saved by the training job and display them.
# NOTE(review): sort=True presumably orders them so the preferred checkpoint
# comes first — confirm the sort key against the wrapper.
checkpoints = sambastudio_client.list_checkpoints(
    project_name=project["project_name"],
    job_name=job["job_name"],
    sort=True,
)
checkpoints

#### Promoted checkpoint config

In [None]:
# Settings for promoting a training checkpoint to a standalone model.
# The first entry of `checkpoints` is the one promoted. Fail early with an
# actionable message instead of a bare IndexError when the job saved no
# checkpoints (for example because it failed).
if not checkpoints:
    raise ValueError(
        f"No checkpoints found for job '{job['job_name']}'; "
        "make sure the training job completed successfully before promoting."
    )

model_checkpoint = {
    'checkpoint_name': checkpoints[0]['checkpoint_name'],
    'model_name': 'Suzume-Llama-3-8B-Multilingual-Publichealth',
    'model_description': 'finetuned suzume multilingual in public health qa dataset',
    'model_type': 'finetuned'
}

In [None]:
# Promote the selected checkpoint using the name, description and type from
# `model_checkpoint`.
sambastudio_client.promote_checkpoint(
    checkpoint_name=model_checkpoint["checkpoint_name"],
    project_name=project["project_name"],
    job_name=job["job_name"],
    model_name=model_checkpoint["model_name"],
    model_description=model_checkpoint["model_description"],
    model_type=model_checkpoint["model_type"],
)

In [None]:
# List the models returned for the "deploy" job type.
# NOTE(review): presumably the newly promoted model should appear here — verify.
sambastudio_client.list_models(filter_job_types=["deploy"])

#### Delete all saved training checkpoints after promotion (optional)

In [None]:
# Remove every checkpoint listed for the training job, one call per checkpoint.
for ckpt in checkpoints:
    sambastudio_client.delete_checkpoint(ckpt["checkpoint_name"])

## Streamlined Execution

In [None]:
# End-to-end run driven by a YAML config file instead of per-call arguments.
config_file = os.path.join(repo_dir, "finetune_config.yaml")
sambastudio_client = SnsdkWrapper(config_file)

# Create the project, launch the training job and follow its progress.
sambastudio_client.create_project()
sambastudio_client.run_training_job()
sambastudio_client.check_job_progress(wait=True)

# Promote the first checkpoint of the sorted list.
checkpoints = sambastudio_client.list_checkpoints(sort=True)
sambastudio_client.promote_checkpoint(checkpoints[0]["checkpoint_name"])

# Delete every checkpoint that was listed for the job.
for ckpt in checkpoints:
    sambastudio_client.delete_checkpoint(ckpt["checkpoint_name"])