# Dataset preparation for Fine-Tuning

In [4]:
import os
import sys
from datasets import load_dataset

current_dir = os.getcwd()
kit_dir =  os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))
sys.path.append(repo_dir)

from utils.fine_tuning.src import sambastudio_utils
from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

# Step by Step / Manual setting

First instantiate the SambaStudio client 

In [5]:
sambastudio_client = SnsdkWrapper()

2025-04-01 14:21:35,339 [INFO] Using variables from .snapi config to set up Snsdk.


### Select your training dataset
You can use your own dataset (see [synthetic data generation util](../synthetic_data_gen/notebooks/quickstart_synthetic_data_gen.ipynb)) xxx.

In [7]:
target_dir = os.path.join(kit_dir, "data","datasets")
# Create target dir if not exist
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

### Prepare Dataset

To upload a dataset to SambaStudio we need first to convert it to a suitable format (hdf5), for this we will use the generative data prep utility 

In [11]:
hdf5_dataset_path = sambastudio_utils.gen_data_prep_pipeline(
    input_files = os.path.join(target_dir, "bio_train_completions/bio_train_completions.jsonl"),
    output_path = os.path.join(target_dir, "bio_train_completions/fine_tuning_bio_train_completions"),
    tokenizer = "meta-llama/Llama-3.1-70B-Instruct", # use the tokenizer of the model to train with
    max_seq_length = 8192,
    shuffle = 'on_RAM',
    input_packing_config = 'single::truncate_right', 
    prompt_keyword = 'prompt',
    completion_keyword = 'completion',
    # num_training_splits = 8, xxx
    num_training_splits = 3,
    apply_chat_template = False,
    )

2025-04-01 14:23:36,713 [INFO] input jsonl files merged into /Users/francescar/Documents/ai-starter-kit/e2e_draft_model_training/data/datasets/bio_train_completions/bio_train_completions_merged.jsonl
2025-04-01 14:23:36,716 [INFO] generative_data_prep is already installed. Skipping installation.


Find more details about the gen data prep parameters [here](https://github.com/sambanova/generative_data_prep?tab=readme-ov-file#flags)

### Set dataset configs

Some parameter should be provided to upload a previously created checkpoint, for this we will keep these parameters in a dataset dict 

In [20]:
dataset = {
    'dataset_path': hdf5_dataset_path,
    'dataset_name': "publichealth",
    'dataset_description': 'This dataset contains question and answer pairs sourced from Q&A pages and FAQs from CDC and WHO pertaining to COVID-19',
    'dataset_job_types': ["evaluation", "train"],
    'dataset_source_type': 'localMachine',
    'dataset_language': 'english',
    'dataset_filetype': 'hdf5',
    # 'dataset_url': "https://huggingface.co/datasets/xhluca/publichealth-qa", xxx
    'dataset_metadata':{}
}

You should indicate for which apps the uploaded dataset will be available, if not sure you can list all the aps in SambaStudio ans select those you want 

In [21]:
avaliable_apps = sambastudio_client.list_apps()
avaliable_apps

[{'id': '89fbfbe6-ee77-4f5c-9ff6-56e2ab69f6ee', 'name': 'Text Embedding'},
 {'id': 'a0547cc1-bf29-4774-abd0-5f1b2ac87eb2', 'name': 'Symphony CoE App'},
 {'id': '3e91cc69-8a8f-4f11-b987-2445d226a666',
  'name': 'Studio Inference Engine App'},
 {'id': 'ecf84906-0924-4ce1-a1a2-c008f5334820', 'name': 'Speech Recognition'},
 {'id': 'e580a1b7-0f23-4644-8959-98eba2dae86e', 'name': 'Spec Decoding'},
 {'id': 'cbba6d31-104a-4295-ac21-7e91da09ab9b', 'name': 'Speaker Diarization'},
 {'id': 'f67c5390-da52-4105-ae17-12434fa7d03b', 'name': 'Sentence Detection'},
 {'id': 'c3178605-2f2a-409d-9dbd-e4efebd2ade5',
  'name': 'Samba 1 Turbo Spec Decoding'},
 {'id': '6c24ed43-f150-4fa6-a76c-38ff5e1372bc', 'name': 'Samba 1 Turbo App'},
 {'id': '99f31849-8911-4118-bd2c-587ac843e85c',
  'name': 'Samba1 Solar Experts'},
 {'id': 'd2afdaf6-bbf0-484f-abb1-3e93db210246',
  'name': 'Samba1 Qwen2 Experts'},
 {'id': 'ec67a215-4d16-44ca-b3bc-3ae5addd9eb1',
  'name': 'Samba1 Qwen2 Audio Experts'},
 {'id': '12e55537-0e6a-

In [22]:
# In this case we will train a llama3 model so wi will include all the llama3 apps
llama3_apps=[app['name'] for app in avaliable_apps if 'llama3' in app['name'].replace(' ','').lower()]
dataset['dataset_apps_availability'] = llama3_apps

In [23]:
# We can see here all the parameters required to upload the dataset
dataset

{'dataset_path': '/Users/francescar/Documents/ai-starter-kit/e2e_draft_model_training/data/datasets/bio_train_completions/fine_tuning_bio_train_completions',
 'dataset_name': 'publichealth',
 'dataset_description': 'This dataset contains question and answer pairs sourced from Q&A pages and FAQs from CDC and WHO pertaining to COVID-19',
 'dataset_job_types': ['evaluation', 'train'],
 'dataset_source_type': 'localMachine',
 'dataset_language': 'english',
 'dataset_filetype': 'hdf5',
 'dataset_metadata': {},
 'dataset_apps_availability': ['Samba1 Llama3 Experts',
  'Samba1 Llama3.3 Experts',
  'Samba1 Llama3.2 Vision Experts',
  'Samba1 Llama3.2 Experts',
  'Samba1 Llama3.1 Experts',
  'Llama 3.2',
  'Llama 3.1',
  'Llama 3']}

### Upload Dataset to SambaStudio

In [26]:
# Execute the create dataset method from client with dataset parameters (this can take a while)
sambastudio_client.create_dataset(
    dataset_path = dataset['dataset_path'],
    dataset_name = dataset['dataset_name'],
    dataset_description = dataset['dataset_description'],
    dataset_job_types = dataset['dataset_job_types'],
    dataset_source_type = dataset['dataset_source_type'],
    dataset_language = dataset['dataset_language'],
    dataset_url = dataset['dataset_url'],
    dataset_apps_availability = dataset['dataset_apps_availability'],
    dataset_filetype = dataset['dataset_filetype'],
    dataset_metadata = dataset['dataset_metadata']
)

2025-03-31 22:28:36,965 [INFO] App with name 'Samba1 Llama3 Experts' found with id 61fa0993-04a2-42ca-9db1-1eff693ea978
2025-03-31 22:28:37,252 [INFO] App with name 'Samba1 Llama3.3 Experts' found with id fd3c25e0-4091-41d6-97b2-f676f48f951b
2025-03-31 22:28:37,511 [INFO] App with name 'Samba1 Llama3.2 Vision Experts' found with id f0ceb1ed-c407-4088-9c25-deaa20a81ef9
2025-03-31 22:28:37,914 [INFO] App with name 'Samba1 Llama3.2 Experts' found with id 49683c7f-3e42-4217-96dd-6f975d17c393
2025-03-31 22:28:38,219 [INFO] App with name 'Samba1 Llama3.1 Experts' found with id eb0aaad1-694f-41b6-958a-b974737635c4
2025-03-31 22:28:38,564 [INFO] App with name 'Llama 3.2' found with id 1af6aa21-b58a-4794-a134-b80764456d25
2025-03-31 22:28:38,908 [INFO] App with name 'Llama 3.1' found with id 45c7e5b8-2c12-45c3-aecc-713dded73b8f
2025-03-31 22:28:39,369 [INFO] App with name 'Llama 3' found with id ad39e323-9878-4914-8e29-82c9f2939475
2025-03-31 22:28:39,723 [INFO] Dataset with name 'publichealth'

'5129f5de-57cf-471b-a04a-7fcf389b3bc4'

In [27]:
# check the dataset is now in SambaStudio environment
sambastudio_client.list_datasets()[-1]

{'id': '7198371d-0f70-4287-8645-8e3807782d30',
 'dataset_name': 'Coding_Generative_Inference_Dataset'}

## Streamlined Execution

The dataset upload can be done in a streamlined way setting all the dataset parameters in a config file like in the [dataset_config.yaml](../dataset_config.yaml) example, and executing:

In [28]:
config_file = os.path.join(kit_dir, 'dataset_config.yaml')
sambastudio_client = SnsdkWrapper(config_file)
sambastudio_client.create_dataset()
sambastudio_client.list_datasets()[-1]

2025-03-31 22:28:40,177 [INFO] Using config file located in /Users/francescar/Documents/ai-starter-kit/e2e_draft_model_training/dataset_config.yaml
2025-03-31 22:28:40,178 [INFO] Using variables from .snapi config to set up Snsdk.
2025-03-31 22:28:41,262 [INFO] App with name 'Llama 3' found with id ad39e323-9878-4914-8e29-82c9f2939475
2025-03-31 22:28:41,505 [INFO] App with name 'Samba1 Llama3 Experts' found with id 61fa0993-04a2-42ca-9db1-1eff693ea978
2025-03-31 22:28:41,761 [INFO] App with name 'Samba1 Llama3.1 Experts' found with id eb0aaad1-694f-41b6-958a-b974737635c4
2025-03-31 22:28:42,071 [INFO] App with name 'Samba1 Llama3.2 Experts' found with id 49683c7f-3e42-4217-96dd-6f975d17c393
2025-03-31 22:28:42,496 [INFO] Dataset with name 'publichealth' found with id 5129f5de-57cf-471b-a04a-7fcf389b3bc4
2025-03-31 22:28:42,497 [INFO] Dataset with name 'publichealth' already exists with id '5129f5de-57cf-471b-a04a-7fcf389b3bc4', using it


{'id': '7198371d-0f70-4287-8645-8e3807782d30',
 'dataset_name': 'Coding_Generative_Inference_Dataset'}