# 03. Training Data Preparation

## Setup

In [70]:
import json
from pprint import pprint
import os
import sys
import yaml

from langchain_sambanova import ChatSambaStudio

current_dir = os.getcwd()
kit_dir =  os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))
sys.path.append(repo_dir)

from utils.fine_tuning.src.snsdk_wrapper import SnsdkWrapper

In [71]:
# Instantiate the SambaNova SDK SambaStudio client
sambastudio_client = SnsdkWrapper()

2025-04-01 23:17:15,893 [INFO] Using variables from .snapi config to set up Snsdk.


In [None]:
# Load the target model config
config_target_yaml = '../01_config_target.yaml'

# Open and load the YAML file into a dictionary
with open(config_target_yaml, 'r') as file:
    config_target = yaml.safe_load(file)['target']
pprint('Target model:')
pprint(config_target)

# Load the training data preparation config
config_training_data_generation_yaml = '../04_config_data_preparation.yaml'

# Open and load the YAML file into a dictionary
with open(config_training_data_generation_yaml, 'r') as file:
    config_dataset_creation = yaml.safe_load(file)['dataset_creation']
pprint('Dataset creation:')
pprint(config_dataset_creation)

'Target model:'
{'model': {'description': 'The Meta Llama 3.1 collection of multilingual large '
                          'language models (LLMs) is a collection of '
                          'pretrained and instruction tuned generative models '
                          'in 8B, 70B and 405B sizes (text in/text out).',
           'model_name': 'Llama-3.1-8B-Instruct',
           'param_count': 8,
           'publisher': 'meta-llama'},
 'sambastudio': {'rdu_arch': 'SN40L-8', 'snapi_path': ''}}
'Dataset creation:'
{'endpoint': {'endpoint_description': 'Endpoint for Llama-3.1-8B-Instruct.',
              'endpoint_instances': 1,
              'endpoint_name': 'Llama-3.1-8B-Instruct',
              'hyperparams': None},
 'files': {'input_filename': '../data/datasets/bio_train_completions/bio_train_formatted.jsonl',
           'output_filename': '../data/datasets/bio_train_completions/bio_train_completions.jsonl'},
 'project': {'project_description': 'This project will be used to test the

### Create Project

#### Set Project configs

In [73]:
project = {
    'project_name': config_dataset_creation['project']['project_name'],
    'project_description': config_dataset_creation['project']['project_description'],
}

In [74]:
# Execute the create project method from client with project parameters
sambastudio_client.create_project(
    project_name = project['project_name'],
    project_description = project['project_description']
)

2025-04-01 23:17:16,887 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-01 23:17:16,888 [INFO] Project with name 'e2e-draft-model-training-project' already exists with id '0b4bf4f5-5e29-409c-8a40-7a2e4183ec95', using it


'0b4bf4f5-5e29-409c-8a40-7a2e4183ec95'

### Create Endpoint

In [75]:
# Set endpoint config 
endpoint = {
  'endpoint_name': config_target['model']['model_name'].lower(),
  'endpoint_description': f'Endpoint for {config_target["model"]["model_name"]}',
  'endpoint_instances': 1,
  'hyperparams': {}
}

In [76]:
# Execute the create endpoint method from client with endpoint parameters
sambastudio_client.create_endpoint(
    project_name=project['project_name'],
    endpoint_name=endpoint['endpoint_name'],
    endpoint_description=endpoint['endpoint_description'],
    model_name=config_target['model']['model_name'],
    model_version=1,
    instances=endpoint['endpoint_instances'],
    hyperparams=endpoint['hyperparams'],
    rdu_arch=config_dataset_creation['sambastudio']['rdu_arch'],
)

2025-04-01 23:17:17,171 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95
2025-04-01 23:17:17,443 [INFO] Model with name 'Llama-3.1-8B-Instruct' found with id b11219b8-84ba-4c5a-833b-edd6ffd5c0d6
2025-04-01 23:17:17,679 [INFO] Endpoint with name 'llama-3.1-8b-instruct' not created it already exist with id 54e113d6-495f-44e9-b6b0-01bc43852806


'54e113d6-495f-44e9-b6b0-01bc43852806'

In [77]:
#get endpoint details, including api key and envs
sambastudio_client.get_endpoint_details(
    project_name=project['project_name'],
    endpoint_name=endpoint['endpoint_name'],
)

2025-04-01 23:17:17,935 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95


{'status': 'Live',
 'url': '/api/predict/generic/0b4bf4f5-5e29-409c-8a40-7a2e4183ec95/54e113d6-495f-44e9-b6b0-01bc43852806',
 'langchain_wrapper_env': {'SAMBASTUDIO_URL': 'https://sjc3-demo2.sambanova.net/api/predict/generic/0b4bf4f5-5e29-409c-8a40-7a2e4183ec95/54e113d6-495f-44e9-b6b0-01bc43852806',
  'SAMBASTUDIO_API_KEY': '509c194e-6026-47b7-9c73-dca8ffed6994'}}

### Inference

#### Get endpoint details

In [78]:
# store envs
endpoint_env = sambastudio_client.get_endpoint_details(
    project_name=project['project_name'],
    endpoint_name=endpoint['endpoint_name']
    )['langchain_wrapper_env']

pprint(endpoint_env)

2025-04-01 23:17:18,407 [INFO] Project with name 'e2e-draft-model-training-project' found with id 0b4bf4f5-5e29-409c-8a40-7a2e4183ec95


{'SAMBASTUDIO_API_KEY': '509c194e-6026-47b7-9c73-dca8ffed6994',
 'SAMBASTUDIO_URL': 'https://sjc3-demo2.sambanova.net/api/predict/generic/0b4bf4f5-5e29-409c-8a40-7a2e4183ec95/54e113d6-495f-44e9-b6b0-01bc43852806'}


#### Streamlined Execution (Alternative)

The bundle model creation and endpoint deployment can be done in a streamlined way setting all the composite model and endpoint parameters in a config file like in the [deploy_config.yaml](../deploy_config.yaml) example, and executing:

In [None]:
config_file = os.path.join(kit_dir, 'deploy_config.yaml')
sambastudio_client = SnsdkWrapper(config_file)
sambastudio_client.create_project()
sambastudio_client.create_endpoint()
endpoint_env = sambastudio_client.get_endpoint_details()

In [80]:
# Instantiate langchain chat models to test inference 
llm = ChatSambaStudio(
    sambastudio_url=endpoint_env.get("SAMBASTUDIO_URL"),
    sambastudio_api_key=endpoint_env.get("SAMBASTUDIO_API_KEY"),
    temperature = 0.01,
    max_tokens = 1024,
    top_p = 0.1,
    do_sample = False
)

In [81]:
messages = [
    ("system", "You are an expert and experienced from the healthcare and biomedical domain with extensive medical knowledge and practical experience. Your name is OpenBioLLM, and you were developed by Saama AI Labs. who's willing to help answer the user's query with explanation. In your explanation, leverage your deep medical expertise such as relevant anatomical structures, physiological processes, diagnostic criteria, treatment guidelines, or other pertinent medical concepts. Use precise medical terminology while still aiming to make the explanation clear and accessible to a general audience."),
    ("human", "What are the morphological characteristics of a particular organism that determine its correct genus classification in the Taxonomy system? Identify the key features that indicate the proper classification for a given species and explain how they are used in Taxonomy")
]

In [82]:
llm.invoke(messages).content

'As OpenBioLLM, I\'d be happy to explain the morphological characteristics that determine the correct genus classification in the Taxonomy system.\n\nIn Taxonomy, the classification of organisms is based on their shared characteristics, which are used to group them into hierarchical categories. The genus classification is a critical level in this hierarchy, as it represents a group of related species that share common characteristics.\n\nTo determine the correct genus classification, taxonomists examine the following key morphological characteristics:\n\n1. **Body shape and size**: The overall shape and size of the organism, including its proportions, can be indicative of its genus classification. For example, the shape of the body, the presence of appendages, and the size of the organism can be used to distinguish between different genera.\n2. **Skeletal system**: The structure and composition of the skeletal system, including the presence of bones, cartilage, or other supporting tiss

In [None]:
# Path to the input JSONL file
input_filename = config_dataset_creation['files']['input_filename']
# Path to the output JSONL file
output_filename = config_dataset_creation['files']['output_filename']

output_lines = []
with open(input_filename, "r", encoding="utf-8") as infile:
    for line in infile:
        # Each line should be a valid JSON string
        data = json.loads(line)
        
        # Extract the system prompt and the human instruction
        system_msg = data["system_prompt"]
        human_msg = data["instruction"]

        # Construct the messages list as required
        messages = [
            ("system", system_msg),
            ("human", human_msg)
        ]

        # Invoke the LLM with the messages list and get the content of the response
        # Here we call the llm.invoke() and access its 'content' attribute.
        response = llm.invoke(messages)
        completion = response.content

        # Create a new dictionary (triple) with the keys system, prompt, and completion.
        new_entry = {
            "system": system_msg,
            "prompt": human_msg,
            "completion": completion
        }
        output_lines.append(new_entry)

# Write the transformed entries into the output JSONL file
with open(output_filename, "w", encoding="utf-8") as outfile:
    for entry in output_lines:
        json.dump(entry, outfile)
        outfile.write("\n")

print(f"Successfully processed {len(output_lines)} entries and saved to {output_filename}.")

Successfully processed 3 entries and saved to ../data/datasets/bio_train_completions/bio_train_completions.jsonl.
