# GenAI model tuning prep data

This notebook attempts to transform free form data to JSONL format so it can be used by OpenAI model tuning API.

* 20231116 Second set of actual asm code.
* 20231109 Uses first set of actual asm code.



In [4]:
import datetime
import pathlib
import json

import pandas as pd
import os
import os.path

### Setup dirs 

In [13]:
# Get the current date
current_date = datetime.datetime.now()

# Format the date as YYYYMMDD
formatted_date = current_date.strftime('%Y%m%d')
formatted_date

'20231116'

In [14]:
# This can be varied to point to different files.
IN_FILE_NAME = "raw.txt"
OUT_FILE_NAME = 'tuning' + formatted_date + '.jsonl'
os.environ['IN_FILE_NAME'] = IN_FILE_NAME
os.environ['OUT_FILE_NAME'] = OUT_FILE_NAME
print("OUT_FILE_NAME: ", OUT_FILE_NAME)

OUT_FILE_NAME:  tuning20231116.jsonl


In [15]:
# The current directory will be where this src file is located.
# Which is in the notebooks dir of the project
dirpath = os.getcwd()
print("current directory is : " + dirpath)

current directory is : /workspaces/BALSA/notebooks


In [16]:
# Use pathlib to find the root dir of the git repo
root_path = pathlib.PurePath(dirpath).parents[0]
data_path = root_path / 'data'
logs_path = root_path / 'logs'
print("root directory is: ", root_path)
print("data directory is: ",  data_path)
print("logs directory is: ", logs_path)

root directory is:  /workspaces/BALSA
data directory is:  /workspaces/BALSA/data
logs directory is:  /workspaces/BALSA/logs


In [17]:
# Create equivalent dir names in the environment
# Logs
LOGS_DIR_NAME = logs_path.as_posix()
print("LOGS_DIR_NAME: ", LOGS_DIR_NAME)
os.environ['LOGS_DIR_NAME'] = LOGS_DIR_NAME
# Data
DATA_DIR_NAME = data_path.as_posix()
print("DATA_DIR_NAME: ", DATA_DIR_NAME)
os.environ['DATA_DIR_NAME'] = DATA_DIR_NAME

LOGS_DIR_NAME:  /workspaces/BALSA/logs
DATA_DIR_NAME:  /workspaces/BALSA/data


In [18]:
%%bash
# Verify env variables are set
echo ${DATA_DIR_NAME}
echo ${LOGS_DIR_NAME}
echo ${CSV_FILE_NAME}

/workspaces/BALSA/data
/workspaces/BALSA/logs



In [19]:
%%bash
head -n 5 "${DATA_DIR_NAME}/${IN_FILE_NAME}"


SAMPLE_CODE

This is sample code line 1
This is sample code line 2


# Routine to build our tuning jsonl file from txt files.

In [20]:
# Function to read the input text file and convert it to JSONL format
def convert_text_to_jsonl(input_file, output_file):
    lines = []
    messages = []
    # stub vars
    sample_code = []
    commentary = []
    prompt = []

    with open(input_file, 'r') as file:
        lines = file.readlines()


    # 
    # find delimiters
    #

    posn = []
    line_nbr = 0
    for a_line in lines:
        #print(a_line)
        if (a_line == "SAMPLE_CODE\n"):
            #print(a_line, " ", line_nbr )
            # save that position
            posn.append(line_nbr) 
        if (a_line == "COMMENTARY\n"):
            #print(a_line)
            posn.append(line_nbr) 
        if (a_line == "PROMPT\n"):
            #print(a_line)
            posn.append(line_nbr) 
        
        line_nbr = line_nbr + 1

    # Record the last line in file
    posn.append(line_nbr) 

    #print("posn: ", posn)

    # separate out the parts
    code_lines = lines[1+1:posn[1]]
    commentary_lines = lines[posn[1]+1:posn[2]]
    prompt_lines = lines[posn[2]+1:posn[3]]

    # dump the parts
    #print("===code_lines:===\n", code_lines)
    #print("===commentary_lines:===\n", commentary_lines)
    #print("===prompt_lines:===\n", prompt_lines)

    a_dict = {}
    a_dict['messages'] = []
    # a_dict['messages'] = [ 
    #     {'role':'system',
    #      'content': 'you are a helpful assistant'},
    #     {'role':'user',
    #      'content': prompt_lines},
    #     {'role':'assistant',
    #      'content': code_lines + ' with commentary: ' + commentary_lines}
    # ]

    code_commentary_lines = code_lines
    code_commentary_lines.append('with commentary: \n')
    [code_commentary_lines.append(e) for e in commentary_lines]
    
    a_dict['messages'].append({'role':'system',
                              'content': 'you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)'})
    a_dict['messages'].append({'role':'user',
                               'content': ' '.join(prompt_lines)})
    a_dict['messages'].append({'role':'assistant',
                               'content': ' '.join(code_commentary_lines)})


    print(a_dict)

    # append to output file
    # modify with w to write a new one
    with open(output_file, 'a') as jsonl_file:
        jsonl_file.write(json.dumps(a_dict) + '\n')

In [21]:
IN_FQPN = data_path /  pathlib.PurePath(IN_FILE_NAME).as_posix()
OUT_FQPN = data_path /  pathlib.PurePath(OUT_FILE_NAME).as_posix()

convert_text_to_jsonl(IN_FQPN, OUT_FQPN)

{'messages': [{'role': 'system', 'content': 'you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)'}, {'role': 'user', 'content': '\n This is sample prompt line 1\n This is sample prompt line 2\n This is sample prompt line 3\n'}, {'role': 'assistant', 'content': '\n This is sample code line 1\n This is sample code line 2\n This is sample code line 3\n \n with commentary: \n \n This is sample commentary line 1\n This is sample commentary line 2\n This is sample commentary line 3\n \n'}]}


In [22]:
# TODO: Redo with glob pattern
files = ["explicitaddressing.txt",  
         "raw_add2.txt",           
         "raw_divide.txt",   
         "raw_divide4.txt",  
         "raw_move_immediate.txt",   
         "registerandindexedstorage.txt",
         "storageandstorage1.txt",
         "raw_add_register.txt",   
         "raw_divide2.txt",  
         "raw_divide5.txt",  
         "raw_move_immediate2.txt",  
         "registerandregister.txt",        
         "storageandstorage2.txt",
         "raw_add.txt",             
         "raw_add_register2.txt",  
         "raw_divide3.txt",  
         "raw_move.txt",     
         "raw_move_immediate3.txt",  
         "registerandstorage.txt",         
         "storageimmediate.txt"]

In [23]:
# BUG: failure to rm existing file using pathlib
# TODO: PathLib is builtin to python 3.x
# I had installed it seprately, and perhaps the older version does not
# have unlink().  I erased my .venv, removed the entry
# in requirements.txt and the error persisted.  The code is here in 
# case I have time to try again later. 8-/

OUT_FQPN_PL = data_path /  pathlib.Path(OUT_FILE_NAME)
print(OUT_FQPN_PL)
#OUT_FQPN_PL.unlink(missing_ok=True)


/workspaces/BALSA/data/tuning20231116.jsonl


In [25]:
OUT_FQPN = data_path /  pathlib.Path(OUT_FILE_NAME).as_posix()
#print(OUT_FQPN)

# remove any existing output
try:
    os.remove(OUT_FQPN)
except OSError:
    pass

for a_file in files:
#    print("a file name: ", a_file)
    IN_FQPN = data_path /  pathlib.PurePath(a_file).as_posix()
    convert_text_to_jsonl(IN_FQPN, OUT_FQPN)

{'messages': [{'role': 'system', 'content': 'you are a helpful assistant who understands IBM BAL (IBM Basic Assembler Language)'}, {'role': 'user', 'content': '\n Can you explain the concept of explicit addressing when using IBM BAL?\n'}, {'role': 'assistant', 'content': '\n \n with commentary: \n \n The term explicit addressing refers to a programming practice of coding\n base registers, index registers and displacement to specify an address\n instead of using symbolic names.  It can also involve coding an \n explicit length.  If possible we would prefer to represent a byte in\n memory by a symbolic name rathr by using an explicit address.\n We also want to avoid using explict lengths.  Usually this is possible.\n For instance, consider the following declaration:\n \n ```\n XFIELD      DS  CL5\n ```\n \n While this field does specify a length attribute of five, the symbol\n `XFIELD` represents the address of the **first** byte of that field.\n This is important because the assembler w