# GenAI model tuning prep data

This notebook attempts to transform free form data to JSONL format so it can be used by OpenAI model tuning API.

* 20231118 first set of train and test code.
* 20231116 Second set of actual asm code.
* 20231109 Uses first set of actual asm code.



In [1]:
import datetime
import pathlib
import json

import pandas as pd
import os
import os.path

### Setup dirs 

In [2]:
# Get the current date
current_date = datetime.datetime.now()

# Format the date as YYYYMMDD
formatted_date = current_date.strftime('%Y%m%d')
formatted_date

'20231118'

In [4]:
# This can be varied to point to different files.
OUT_TRAIN_FILE_NAME = 'train' + formatted_date + '.jsonl'
OUT_TEST_FILE_NAME = 'test' + formatted_date + '.jsonl'
os.environ['OUT_TRAIN_FILE_NAME'] = OUT_TRAIN_FILE_NAME
os.environ['OUT_TEST_FILE_NAME'] = OUT_TEST_FILE_NAME
print("OUT_TRAIN_FILE_NAME: ", OUT_TRAIN_FILE_NAME)
print("OUT_TEST_FILE_NAME: ", OUT_TEST_FILE_NAME)

OUT_TRAIN_FILE_NAME:  train20231118.jsonl
OUT_TEST_FILE_NAME:  test20231118.jsonl


In [5]:
# The current directory will be where this src file is located.
# Which is in the notebooks dir of the project
dirpath = os.getcwd()
print("current directory is : " + dirpath)

current directory is : /workspaces/BALSA/notebooks


In [13]:
# Use pathlib to find the root dir of the git repo
root_path = pathlib.PurePath(dirpath).parents[0]
data_path = root_path / 'data'
train_path = data_path / 'try2' / 'train'
test_path = data_path / 'try2' / 'test'
print("root directory is: ", root_path)
print("data directory is: ",  data_path)
print("train directory is: ",  train_path)
print("test directory is: ", test_path)

root directory is:  /workspaces/BALSA
data directory is:  /workspaces/BALSA/data
train directory is:  /workspaces/BALSA/data/try2/train
test directory is:  /workspaces/BALSA/data/try2/test


In [14]:
# Create equivalent dir names in the environment
# Logs
LOGS_DIR_NAME = logs_path.as_posix()
print("LOGS_DIR_NAME: ", LOGS_DIR_NAME)
os.environ['LOGS_DIR_NAME'] = LOGS_DIR_NAME
# Data
DATA_DIR_NAME = data_path.as_posix()
print("DATA_DIR_NAME: ", DATA_DIR_NAME)
os.environ['DATA_DIR_NAME'] = DATA_DIR_NAME

LOGS_DIR_NAME:  /workspaces/BALSA/logs
DATA_DIR_NAME:  /workspaces/BALSA/data


In [15]:
%%bash
# Verify env variables are set
echo ${DATA_DIR_NAME}
echo ${LOGS_DIR_NAME}
echo ${CSV_FILE_NAME}

/workspaces/BALSA/data
/workspaces/BALSA/logs



# Routine to build our tuning jsonl file from txt files.

In [42]:
# Function to read the input text file and convert it to JSONL format
def convert_text_to_jsonl(input_file, output_file):
    lines = []
    messages = []
    # stub vars
    sample_code = []
    commentary = []
    prompt = []

    with open(input_file, 'r') as file:
        lines = file.readlines()


    # 
    # find delimiters
    #

    posn = []
    line_nbr = 0
    for a_line in lines:
        #print(a_line)
        if (a_line == "RESULT\n"):
            #print(a_line, " ", line_nbr )
            # save that position
            posn.append(line_nbr) 
        if (a_line == "PROMPT\n"):
            #print(a_line)
            posn.append(line_nbr) 
        
        line_nbr = line_nbr + 1

    # Record the last line in file
    posn.append(line_nbr) 

    #print("posn: ", posn)

    # separate out the parts
    result_lines = lines[1+1:posn[1]]
    prompt_lines = lines[posn[1]+1:posn[2]]

    # dump the parts
    #print("===result_lines:===\n", result_lines)
    #print("===prompt_lines:===\n", prompt_lines)

    a_dict = {}
    a_dict['messages'] = []
    
    a_dict['messages'].append({'role':'system',
                              'content': 'you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)'})
    a_dict['messages'].append({'role':'user',
                               'content': ' '.join(prompt_lines)})
    a_dict['messages'].append({'role':'assistant',
                               'content': ' '.join(result_lines)})


    #print(a_dict)

    # append to output file
    # modify with w to write a new one
    with open(output_file, 'a') as jsonl_file:
        jsonl_file.write(json.dumps(a_dict) + '\n')

In [43]:
train_files = os.listdir(train_path)
test_files = os.listdir(test_path)
#train_files

# build train file

In [44]:
OUT_TRAIN_FQPN = train_path /  pathlib.Path(OUT_TRAIN_FILE_NAME).as_posix()
#print(OUT_FQPN)

# remove any existing output
try:
    os.remove(OUT_TRAIN_FQPN)
except OSError:
    pass

for a_file in train_files:
    #print("a file name: ", a_file)
    IN_FQPN = train_path /  pathlib.PurePath(a_file).as_posix()
    #print(IN_FQPN)
    convert_text_to_jsonl(IN_FQPN, OUT_TRAIN_FQPN)

# build test file

In [45]:
OUT_TEST_FQPN = test_path /  pathlib.Path(OUT_TEST_FILE_NAME).as_posix()
#print(OUT_FQPN)

# remove any existing output
try:
    os.remove(OUT_TEST_FQPN)
except OSError:
    pass

for a_file in test_files:
    #print("a file name: ", a_file)
    IN_FQPN = test_path /  pathlib.PurePath(a_file).as_posix()
    #print(IN_FQPN)
    convert_text_to_jsonl(IN_FQPN, OUT_TEST_FQPN)

In [52]:
os.environ['OUT_TRAIN_FQPN'] = OUT_TRAIN_FQPN.as_posix()
os.environ['OUT_TEST_FQPN'] = OUT_TRAIN_FQPN.as_posix()

In [53]:
%%bash
head ${OUT_TRAIN_FQPN}


{"messages": [{"role": "system", "content": "you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)"}, {"role": "user", "content": "\n Write some code in IBM BAL Assembly to subtract two fullword registers using integers and store the result in memory with a field name of DIFF.  \n \n Your answer should be brief and in markdown format using a code section using this format:\n \n ```\n put code here\n ```\n \n Do not include any commentary.  Just produce the code and comments.\n \n Assume the values to add are already in registers R1 and R2.  \n \n R1 will contain the minuend.\n \n R2 will contain the subtrahend.\n \n Use `*` to identify a comment and obey column restrictions for the code.\n \n Do not show any subroutine standard entry and exit code.\n \n Do not show how the field name should be declared."}, {"role": "assistant", "content": "\n ```\n          SR   R1,R2\n          ST   R1,DIFF\n ```\n \n"}]}
{"messages": [{"role": "system", "content": "you a

In [54]:
%%bash
head ${OUT_TEST_FQPN}


{"messages": [{"role": "system", "content": "you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)"}, {"role": "user", "content": "\n Write some code in IBM BAL Assembly to subtract two fullword registers using integers and store the result in memory with a field name of DIFF.  \n \n Your answer should be brief and in markdown format using a code section using this format:\n \n ```\n put code here\n ```\n \n Do not include any commentary.  Just produce the code and comments.\n \n Assume the values to add are already in registers R1 and R2.  \n \n R1 will contain the minuend.\n \n R2 will contain the subtrahend.\n \n Use `*` to identify a comment and obey column restrictions for the code.\n \n Do not show any subroutine standard entry and exit code.\n \n Do not show how the field name should be declared."}, {"role": "assistant", "content": "\n ```\n          SR   R1,R2\n          ST   R1,DIFF\n ```\n \n"}]}
{"messages": [{"role": "system", "content": "you a