# Dataset preparation for Fine-Tuning

In [2]:
import os
import sys
from datasets import load_dataset

# Resolve the kit directory (parent of the notebook's working directory) and the
# repository root (parent of the kit directory) so the repo-level `utils`
# package can be imported below
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))
# Extend sys.path only once so re-running this cell does not pile up duplicates
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

from utils.fine_tuning.src import sambastudio_utils
from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

# Step by Step / Manual setting

First, instantiate the SambaStudio client.

In [20]:
# Create the SambaStudio client without passing an explicit config file
# NOTE(review): per the log line printed below, the wrapper then takes its settings from the Snapi config — confirm
sambastudio_client = SnsdkWrapper()

2024-11-25 10:16:32,411 [INFO] Using variables from Snapi config to set up Snsdk.


### Download datasets from HuggingFace (Optional)
You can use your own dataset (see the [synthetic data generation util](../synthetic_data_gen/notebooks/quickstart_synthetic_data_gen.ipynb)), or alternatively you can download and use an existing dataset such as the ones in [Hugging Face datasets](https://huggingface.co/datasets?modality=modality:text&sort=trending).

In [2]:
# Hugging Face Hub id of the dataset to download
hf_dataset='xhluca/publichealth-qa'
# Passed to load_dataset as `data_dir` (not the local output directory)
data_dir = "data"
# Local directory under the kit where the exported and prepared files are written
target_dir = os.path.join(kit_dir, "data","datasets")  

In [None]:
# Create the target directory (and any missing parents); exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race
os.makedirs(target_dir, exist_ok=True)

# Load the five per-language CSV files of the Hub dataset as one "train" split
dataset = load_dataset(
    hf_dataset,
    data_dir=data_dir,
    data_files=['english.csv', 'spanish.csv', 'french.csv', 'russian.csv', 'chinese.csv'],
    split="train",
)
dataset

Generating train split: 647 examples [00:00, 8264.90 examples/s]


Dataset({
    features: ['question', 'answer', 'url', 'source', 'section'],
    num_rows: 647
})

In [None]:
# Save the dataset as a JSONL file that only contains 'prompt' and 'completion'
# columns. The renamed view is bound to a new name so `dataset` keeps its
# original columns and this cell can be re-run without a missing-column error.
prompt_completion_dataset = dataset.rename_columns(
    {'question': 'prompt', 'answer': 'completion'}
).select_columns(['prompt', 'completion'])
# The file is named after the last segment of the Hub dataset id
prompt_completion_dataset.to_json(os.path.join(target_dir, f'{hf_dataset.split("/")[-1]}.jsonl'))

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 103.23ba/s]


1041854

### Prepare Dataset

To upload a dataset to SambaStudio, we first need to convert it to a suitable format (hdf5). For this we will use the generative data prep utility.

In [None]:
# Derive the file names from the Hub dataset id so this step always reads the
# JSONL file written by the export cell, even when `hf_dataset` is changed
dataset_basename = hf_dataset.split("/")[-1]

# Run the generative data prep pipeline and keep the value it returns, which is
# used later as the dataset path for the upload
hdf5_dataset_path = sambastudio_utils.gen_data_prep_pipeline(
    input_files = os.path.join(target_dir, f"{dataset_basename}.jsonl"),
    output_path = os.path.join(target_dir, f"fine_tuning-{dataset_basename}"),
    tokenizer = "lightblue/suzume-llama-3-8B-multilingual", # use the tokenizer of the model to train with
    max_seq_length = 8192,
    shuffle = 'on_RAM',
    input_packing_config = 'single::truncate_right', 
    prompt_keyword = 'prompt',  # column names produced by the JSONL export
    completion_keyword = 'completion',
    num_training_splits = 8,
    apply_chat_template = False,
    )

Find more details about the gen data prep parameters [here](https://github.com/sambanova/generative_data_prep?tab=readme-ov-file#flags)

### Set dataset configs

Some parameters must be provided to upload the previously prepared dataset; we will keep these parameters in a `dataset` dict.

In [19]:
# Parameters describing the dataset upload, gathered in one dict so the
# following cells can read them by key
dataset = dict(
    dataset_path=hdf5_dataset_path,
    dataset_name="publichealth",
    dataset_description='This dataset contains question and answer pairs sourced from Q&A pages and FAQs from CDC and WHO pertaining to COVID-19',
    dataset_job_types=["evaluation", "train"],
    dataset_source_type='localMachine',
    dataset_language='english',
    dataset_filetype='hdf5',
    dataset_url="https://huggingface.co/datasets/xhluca/publichealth-qa",
    dataset_metadata={},
)

You should indicate for which apps the uploaded dataset will be available. If you are not sure, you can list all the apps in SambaStudio and select those you want.

In [26]:
# Fetch the apps known to SambaStudio; the output below shows each entry as a dict with 'id' and 'name'
avaliable_apps = sambastudio_client.list_apps()
avaliable_apps

[{'id': '89fbfbe6-ee77-4f5c-9ff6-56e2ab69f6ee', 'name': 'Text Embedding'},
 {'id': 'a0547cc1-bf29-4774-abd0-5f1b2ac87eb2', 'name': 'Symphony CoE App'},
 {'id': 'ecf84906-0924-4ce1-a1a2-c008f5334820', 'name': 'Speech Recognition'},
 {'id': 'e580a1b7-0f23-4644-8959-98eba2dae86e', 'name': 'Spec Decoding'},
 {'id': 'cbba6d31-104a-4295-ac21-7e91da09ab9b', 'name': 'Speaker Diarization'},
 {'id': '44b2a732-c15a-441b-84f2-6efeea287d91',
  'name': 'Simple Text Classifier Generative'},
 {'id': '3aaf5b6b-6d17-45ce-bb1e-543bed912f7b',
  'name': 'Simple Text Classifier'},
 {'id': 'f67c5390-da52-4105-ae17-12434fa7d03b', 'name': 'Sentence Detection'},
 {'id': 'c3178605-2f2a-409d-9dbd-e4efebd2ade5',
  'name': 'Samba 1 Turbo Spec Decoding'},
 {'id': '6c24ed43-f150-4fa6-a76c-38ff5e1372bc', 'name': 'Samba 1 Turbo App'},
 {'id': '99f31849-8911-4118-bd2c-587ac843e85c',
  'name': 'Samba1 Solar Experts'},
 {'id': 'd2afdaf6-bbf0-484f-abb1-3e93db210246',
  'name': 'Samba1 Qwen2 Experts'},
 {'id': '34c9f9ae-4b6

In [None]:
# We are going to train a llama3 model, so keep every app whose name contains
# "llama3" once spaces are removed and case is ignored
llama3_apps = [
    app_info['name']
    for app_info in avaliable_apps
    if 'llama3' in app_info['name'].lower().replace(' ', '')
]
# Record the selected apps alongside the other upload parameters
dataset.update(dataset_apps_availability=llama3_apps)

In [None]:
# Display the full set of parameters that will be passed when uploading the dataset
dataset

{'dataset_path': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/datasets/fine_tuning-publichealth-qa',
 'dataset_name': 'publichealth',
 'dataset_description': 'This dataset contains question and answer pairs sourced from Q&A pages and FAQs from CDC and WHO pertaining to COVID-19',
 'dataset_job_types': ['evaluation', 'train'],
 'dataset_source_type': 'localMachine',
 'dataset_language': 'english',
 'dataset_filetype': 'hdf5',
 'dataset_url': 'https://huggingface.co/datasets/xhluca/publichealth-qa',
 'dataset_metadata': {},
 'dataset_apps_availability': ['Samba1 Llama3 Experts',
  'Samba1 Llama3.2 Experts',
  'Samba1 Llama3.1 Experts',
  'Llama 3']}

### Upload Dataset to SambaStudio

In [None]:
# Upload the dataset: each stored parameter is forwarded to the client's
# create_dataset method as the keyword argument of the same name
# (this can take a while)
upload_param_names = (
    'dataset_path',
    'dataset_name',
    'dataset_description',
    'dataset_job_types',
    'dataset_source_type',
    'dataset_language',
    'dataset_url',
    'dataset_apps_availability',
    'dataset_filetype',
    'dataset_metadata',
)
sambastudio_client.create_dataset(**{name: dataset[name] for name in upload_param_names})

2024-11-25 10:16:36,555 [INFO] App with name 'Samba1 Llama3 Experts' found with id 61fa0993-04a2-42ca-9db1-1eff693ea978
2024-11-25 10:16:36,751 [INFO] App with name 'Samba1 Llama3.2 Experts' found with id 49683c7f-3e42-4217-96dd-6f975d17c393
2024-11-25 10:16:36,964 [INFO] App with name 'Samba1 Llama3.1 Experts' found with id eb0aaad1-694f-41b6-958a-b974737635c4
2024-11-25 10:16:37,160 [INFO] App with name 'Llama 3' found with id ad39e323-9878-4914-8e29-82c9f2939475
2024-11-25 10:16:37,446 [INFO] Dataset with name 'publichealth' not found
2024-11-25 10:17:38,095 [INFO] Dataset with name 'publichealth' found with id 6ac585ad-107c-45f5-a2de-129dd1a69279
2024-11-25 10:17:38,096 [INFO] Dataset with name 'publichealth' created: 'Uploading files
Completed Folder upload: /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/e2e_fine_tuning/data/datasets/fine_tuning-publichealth-qa
Dataset added successfully.
Time taken to upload the dataset: 58.665268898010254 seconds
'


'6ac585ad-107c-45f5-a2de-129dd1a69279'

In [None]:
# Check the uploaded dataset is now in the SambaStudio environment by looking
# it up by name instead of relying on its position in the listing
uploaded_datasets = [
    listed
    for listed in sambastudio_client.list_datasets()
    if listed['dataset_name'] == dataset['dataset_name']
]
# Fail loudly if the upload is not visible, rather than showing an unrelated dataset
assert uploaded_datasets, f"Dataset '{dataset['dataset_name']}' not found in SambaStudio"
uploaded_datasets[-1]

{'id': '6ac585ad-107c-45f5-a2de-129dd1a69279', 'dataset_name': 'publichealth'}

## Streamlined Execution

The dataset upload can be done in a streamlined way by setting all the dataset parameters in a config file, as in the [dataset_config.yaml](../dataset_config.yaml) example, and executing:

In [None]:
# Path to the YAML file in the kit directory that holds the dataset parameters
config_file = os.path.join(kit_dir, 'dataset_config.yaml')
# Re-create the client, this time passing it the config file
sambastudio_client = SnsdkWrapper(config_file)
# No arguments are passed here; the dataset parameters are expected to come from the config file
sambastudio_client.create_dataset()
# Show the last entry of the dataset listing
# NOTE(review): assumes the newly created dataset is listed last — confirm
sambastudio_client.list_datasets()[-1]