# GenAI model tuning prep data

This notebook attempts to transform free form data to JSONL format so it can be used by OpenAI model tuning API.

* 20231206 Using try5 data and the context rule comes from the summary1k.md
* 20231205 Using try4 data and the context rule hardcoded
* 20231125 Using try3 data and the context rule hardcoded
* 20231118 first set of train and test code.
* 20231116 Second set of actual asm code.
* 20231109 Uses first set of actual asm code.



In [18]:
import datetime
import pathlib
import json

import pandas as pd
import os
import os.path

### Setup dirs 

In [19]:
# Get the current date
current_date = datetime.datetime.now()

# Format the date as YYYYMMDD
formatted_date = current_date.strftime('%Y%m%d')
formatted_date

'20231206'

In [20]:
# This can be varied to point to different files.
OUT_TRAIN_FILE_NAME = 'train' + formatted_date + '.jsonl'
OUT_TEST_FILE_NAME = 'test' + formatted_date + '.jsonl'
os.environ['OUT_TRAIN_FILE_NAME'] = OUT_TRAIN_FILE_NAME
os.environ['OUT_TEST_FILE_NAME'] = OUT_TEST_FILE_NAME
print("OUT_TRAIN_FILE_NAME: ", OUT_TRAIN_FILE_NAME)
print("OUT_TEST_FILE_NAME: ", OUT_TEST_FILE_NAME)

OUT_TRAIN_FILE_NAME:  train20231206.jsonl
OUT_TEST_FILE_NAME:  test20231206.jsonl


In [21]:
# The current directory will be where this src file is located.
# Which is in the notebooks dir of the project
dirpath = os.getcwd()
print("current directory is : " + dirpath)

current directory is : /workspaces/BALSA/notebooks


In [43]:
# Use pathlib to find the root dir of the git repo
root_path = pathlib.PurePath(dirpath).parents[0]
data_path = root_path / 'data'
train_path = data_path / 'try5' / 'train'
test_path = data_path / 'try5' / 'test'
print("root directory is: ", root_path)
print("data directory is: ",  data_path)
print("train directory is: ",  train_path)
print("test directory is: ", test_path)

root directory is:  /workspaces/BALSA
data directory is:  /workspaces/BALSA/data
train directory is:  /workspaces/BALSA/data/try5/train
test directory is:  /workspaces/BALSA/data/try5/test


In [44]:
# Create equivalent dir names in the environment
# Data
DATA_DIR_NAME = data_path.as_posix()
print("DATA_DIR_NAME: ", DATA_DIR_NAME)
os.environ['DATA_DIR_NAME'] = DATA_DIR_NAME

DATA_DIR_NAME:  /workspaces/BALSA/data


In [45]:
%%bash
# Verify env variables are set
echo ${DATA_DIR_NAME}
echo ${LOGS_DIR_NAME}
echo ${CSV_FILE_NAME}

/workspaces/BALSA/data




# Read the context file into a string


In [46]:
input_file = data_path / 'try5' / 'context' / 'summary1k.md'
with open(input_file, 'r') as file:
        lines = file.readlines()
print(lines)
a_context_str = '\n'.join(lines)

['When writing code, obey these rules:\n', '\n', '* `NAME` corresponds to a `LABEL` and is always in column 1.\n', '* `NAME` is at most 8 characters long.\n', '* `NAME` begins with characters `A-Z`, `a-z`, `$`, `#` or `@`. \n', '* `OPERATION` or `OPCODE` or `OP CODE` corresponds to an instruction (mnemonic) and starts in column 10.\n', '* `OPERANDS` corresponds to instruction argumennts or parameters and starts in column 15.\n', '* Multiple `OPERANDS` are separated by a comma `,` without a space ` ` between operands.\n', '* `COMMENT` corresponds are identified with a asterisk `*` in column 1 making the entire line non fuctional.\n', '\n', 'All code should be output in markdown code blocks like so:\n', '\n', '```\n', 'code here\n', '```\n', '\n', 'Unless explictly told to do so, do not include any commentary.\n', '\n', 'When specifiying registers be explicit.  For example when referring to register one, use R1 rather than 1.\n', '\n', 'Do not show any subroutine standard entry and exit 

# Routine to build our tuning jsonl file from txt files.

In [47]:
# Function to read the input text file and convert it to JSONL format
def convert_text_to_jsonl(context_str, input_file, output_file):
    lines = []
    messages = []
    # stub vars
    sample_code = []
    commentary = []
    prompt = []

    with open(input_file, 'r') as file:
        lines = file.readlines()


    # 
    # find delimiters
    #

    posn = []
    line_nbr = 0
    for a_line in lines:
        #print(a_line)
        if (a_line == "RESULT\n"):
            #print(a_line, " ", line_nbr )
            # save that position
            posn.append(line_nbr) 
        if (a_line == "PROMPT\n"):
            #print(a_line)
            posn.append(line_nbr) 
        
        line_nbr = line_nbr + 1

    # Record the last line in file
    posn.append(line_nbr) 

    #print("posn: ", posn)

    # separate out the parts
    result_lines = lines[1+1:posn[1]]
    prompt_lines = lines[posn[1]+1:posn[2]]

    # dump the parts
    #print("===result_lines:===\n", result_lines)
    #print("===prompt_lines:===\n", prompt_lines)

    hardcoded_context=context_str

    a_dict = {}
    a_dict['messages'] = []
    
    a_dict['messages'].append({'role':'system',
                              'content': hardcoded_context})
    a_dict['messages'].append({'role':'user',
                               'content': ' '.join(prompt_lines)})
    a_dict['messages'].append({'role':'assistant',
                               'content': ' '.join(result_lines)})


    #print(a_dict)
    #print(output_file)

    # append to output file
    # modify with w to write a new one
    with open(output_file, 'a') as jsonl_file:
        jsonl_file.write(json.dumps(a_dict) + '\n')

In [48]:


#train_files

# build train file

In [49]:
OUT_TRAIN_FQPN = train_path /  pathlib.Path(OUT_TRAIN_FILE_NAME).as_posix()
#print(OUT_TRAIN_FQPN)

# remove any existing output
try:
    os.remove(OUT_TRAIN_FQPN)
    print("removed")
except OSError:
    print("did not remove the existing training file.")

train_files = os.listdir(train_path)
#print(train_files)

for a_file in train_files:
    #print("a file name: ", a_file)
    IN_FQPN = train_path /  pathlib.PurePath(a_file).as_posix()
    #print(IN_FQPN)
    #print(OUT_TRAIN_FQPN)
    convert_text_to_jsonl(a_context_str, IN_FQPN, OUT_TRAIN_FQPN)


removed


# build test file

In [51]:
OUT_TEST_FQPN = test_path /  pathlib.Path(OUT_TEST_FILE_NAME).as_posix()
#print(OUT_FQPN)

# remove any existing output
try:
    os.remove(OUT_TEST_FQPN)
    print("removed")
except OSError:
    print("did not remove the existing testing file.")

#print("test_path",test_path)
test_files = os.listdir(test_path)
#print(test_files)
for a_file in test_files:
    #print("a file name: ", a_file)
    IN_FQPN = test_path /  pathlib.PurePath(a_file).as_posix()
    #print(IN_FQPN)
    #print(a_context_str)
    convert_text_to_jsonl(a_context_str, IN_FQPN, OUT_TEST_FQPN)

removed


In [52]:
os.environ['OUT_TRAIN_FQPN'] = OUT_TRAIN_FQPN.as_posix()
os.environ['OUT_TEST_FQPN'] = OUT_TEST_FQPN.as_posix()

In [53]:
%%bash
head ${OUT_TRAIN_FQPN}

{"messages": [{"role": "system", "content": "When writing code, obey these rules:\n\n\n\n* `NAME` corresponds to a `LABEL` and is always in column 1.\n\n* `NAME` is at most 8 characters long.\n\n* `NAME` begins with characters `A-Z`, `a-z`, `$`, `#` or `@`. \n\n* `OPERATION` or `OPCODE` or `OP CODE` corresponds to an instruction (mnemonic) and starts in column 10.\n\n* `OPERANDS` corresponds to instruction argumennts or parameters and starts in column 15.\n\n* Multiple `OPERANDS` are separated by a comma `,` without a space ` ` between operands.\n\n* `COMMENT` corresponds are identified with a asterisk `*` in column 1 making the entire line non fuctional.\n\n\n\nAll code should be output in markdown code blocks like so:\n\n\n\n```\n\ncode here\n\n```\n\n\n\nUnless explictly told to do so, do not include any commentary.\n\n\n\nWhen specifiying registers be explicit.  For example when referring to register one, use R1 rather than 1.\n\n\n\nDo not show any subroutine standard entry and ex

In [54]:
%%bash
head ${OUT_TEST_FQPN}

{"messages": [{"role": "system", "content": "When writing code, obey these rules:\n\n\n\n* `NAME` corresponds to a `LABEL` and is always in column 1.\n\n* `NAME` is at most 8 characters long.\n\n* `NAME` begins with characters `A-Z`, `a-z`, `$`, `#` or `@`. \n\n* `OPERATION` or `OPCODE` or `OP CODE` corresponds to an instruction (mnemonic) and starts in column 10.\n\n* `OPERANDS` corresponds to instruction argumennts or parameters and starts in column 15.\n\n* Multiple `OPERANDS` are separated by a comma `,` without a space ` ` between operands.\n\n* `COMMENT` corresponds are identified with a asterisk `*` in column 1 making the entire line non fuctional.\n\n\n\nAll code should be output in markdown code blocks like so:\n\n\n\n```\n\ncode here\n\n```\n\n\n\nUnless explictly told to do so, do not include any commentary.\n\n\n\nWhen specifiying registers be explicit.  For example when referring to register one, use R1 rather than 1.\n\n\n\nDo not show any subroutine standard entry and ex