In [109]:
import json
import os
import random

In [106]:
# Chapter numbers handled by each agent; the list at index i belongs to agent i.
# Each number is later formatted as the directory name f"ch{chapter:02d}"
# underneath DATASET_PATH.
# NOTE(review): the numbers 2, 17, 18 and 19 appear in none of the lists —
# confirm that leaving them out is intentional.
agent_chapters = [
    [0, 1, 5, 6, 10, 11, 12, 13, 14, 15],
    [3, 4, 8, 9, 25, 26],
    [7, 16, 27, 28, 29, 20, 21, 22, 23, 24]
]

# System prompt for each agent, index-aligned with agent_chapters.
# NOTE(review): the second and third prompts are a single space, which looks
# like a placeholder. The prompt is copied verbatim into every fine-tuning
# record as the "system" message, so fill these in before generating data
# for those agents.
agent_system_prompts = [
    """You are a specialized assistant, focused exclusively on providing expert answers about General Vehicle Registration and Licensing.
You offer precise, detailed guidance on topics like general registration information, licensee requirements, odometer mileage reporting, 
and the sale of new vehicles by California dealers. You specialize in helping with registration renewals, ownership transfers, commercial
vehicle regulations, nonresident vehicle registration, and the Permanent Trailer Identification (PTI) program. Your expertise also extends
to off-highway vehicles and all other specific registration-related inquiries.""",
    """ """,
    """ """,
]

# Root directory containing one sub-directory per chapter ("ch00", "ch01", ...),
# each holding the chapter's Q&A .json files. The path is relative, so it is
# resolved against the notebook's current working directory.
DATASET_PATH = "../qa_pairs"

In [51]:
def count_qa_json(json_path):
    """Return the number of top-level entries in a JSON file.

    Args:
        json_path: Path of the JSON file to inspect.

    Returns:
        ``len()`` of the decoded document: the element count for a JSON
        array, or the key count for a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If the top-level JSON value has no length (e.g. a number).
    """
    # JSON text is UTF-8 by specification; pin the encoding so decoding does
    # not depend on the platform's locale default.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return len(data)

In [65]:
def count_qa_chapter(chapters_list):
    """Total the Q&A entries stored for the given chapters.

    For every chapter number, the directory ``ch<NN>`` (two-digit, zero-padded)
    under ``DATASET_PATH`` is listed, and every file whose name ends in
    ``.json`` is counted with ``count_qa_json``.

    Args:
        chapters_list: Iterable of integer chapter numbers.

    Returns:
        The sum of the per-file counts across all listed chapters.
    """
    total = 0
    for chapter_no in chapters_list:
        chapter_dir = os.path.join(DATASET_PATH, f"ch{chapter_no:02d}")
        json_names = [name for name in os.listdir(chapter_dir) if name.endswith('.json')]
        total += sum(count_qa_json(os.path.join(chapter_dir, name)) for name in json_names)
    return total

In [66]:
# Report how many Q&A entries each agent's chapters contain.
for i, chapters in enumerate(agent_chapters):
    print(f"Agent {i}, qa count:", count_qa_chapter(chapters))

Agent 0, qa count: 2157
Agent 1, qa count: 864
Agent 2, qa count: 1745


In [89]:
# Reference example of the record shape the OpenAI chat fine-tuning API expects:
# a dict with a "messages" list of {"role", "content"} entries (system, user,
# assistant). The cells below emit one such record per line of a .jsonl file.
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
{"role": "user", "content": "What's the capital of France?"}, 
{"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}

''

''

In [100]:
def train_test_split(arr, test_size=0.2, shuffle=True, seed=None):
    """Split a collection of items into a train list and a test list.

    Args:
        arr: Items to split. They are copied into a new list first, so the
            caller's object is never reordered; any iterable is accepted.
        test_size: Fraction of the items, between 0 and 1 inclusive, that
            goes to the test list.
        shuffle: If true, randomly reorder the copy before splitting.
        seed: Optional seed. When given, shuffling uses a private
            ``random.Random(seed)`` so the split is reproducible and the
            global ``random`` state is left untouched. When ``None`` the
            global ``random`` generator is used.

    Returns:
        A ``(train_data, test_data)`` tuple of lists. The train list holds
        the first ``int(len(items) * (1 - test_size))`` items of the
        (optionally shuffled) copy and the test list holds the rest.

    Raises:
        ValueError: If ``test_size`` is not within ``[0, 1]``.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    data = list(arr)

    if shuffle:
        if seed is None:
            random.shuffle(data)
        else:
            random.Random(seed).shuffle(data)

    split_index = int(len(data) * (1 - test_size))
    return data[:split_index], data[split_index:]

In [111]:
def shuffled_chapter_qa(chapters_list, system_prompt, shuffle=True, save=False, savepath=None):
    """Load the Q&A entries of several chapters and split them into train/test records.

    Every ``*.json`` file in each chapter directory (``ch<NN>`` under
    ``DATASET_PATH``) is loaded, each entry is prefixed with a system message
    carrying ``system_prompt`` and wrapped as ``{"messages": [...]}``, and the
    resulting records are passed to ``train_test_split`` with ``test_size=0.2``.

    Args:
        chapters_list: Iterable of integer chapter numbers to load.
        system_prompt: Text placed in the leading ``"system"`` message of
            every record.
        shuffle: Forwarded to ``train_test_split``; if true the records are
            shuffled before splitting.
        save: Accepted but not used by this function; kept so existing call
            sites stay valid.
        savepath: Accepted but not used by this function; kept so existing
            call sites stay valid.

    Returns:
        The ``(train, test)`` pair returned by ``train_test_split``. Both are
        empty when ``chapters_list`` is empty or no entries were found.

    Raises:
        OSError: If a chapter directory or file cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    all_chapter_questions = []
    for chapter in chapters_list:
        chapter_path = os.path.join(DATASET_PATH, f"ch{chapter:02d}")
        # os.listdir returns names in arbitrary order; sort them so the record
        # order (and therefore an unshuffled split) is the same on every run.
        for filename in sorted(os.listdir(chapter_path)):
            if filename.endswith('.json'):
                file_path = os.path.join(chapter_path, filename)
                # JSON is UTF-8 by specification; do not rely on the locale default.
                with open(file_path, 'r', encoding='utf-8') as file:
                    all_chapter_questions.extend(json.load(file))

    # Build the records once, after every chapter has been loaded. Doing this
    # inside the chapter loop rebuilt the whole list per chapter and left the
    # name undefined when chapters_list was empty.
    # Assumes each loaded entry is a list of message dicts (it is concatenated
    # onto a list) — TODO confirm against the files in DATASET_PATH.
    gpt_formatqa = [
        {"messages": [{"role": "system", "content": system_prompt}] + qapair}
        for qapair in all_chapter_questions
    ]

    return train_test_split(gpt_formatqa, test_size=0.2, shuffle=shuffle)

In [112]:
# Output root; one sub-directory "agentNN" is created per agent.
FINETUNE_AGENT_DATADIR = "../openai_finetune_data"

# Seed the global generator so that re-running this cell repeats the same
# shuffle, provided the records are loaded in the same order. Without it every
# run produces a different train/test split.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)


def _write_jsonl(path, records):
    """Write each record to ``path`` as one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            json.dump(record, f)
            f.write('\n')


for i, chapters in enumerate(agent_chapters):
    # Index the prompts explicitly so a length mismatch between the two
    # lists still raises IndexError instead of being silently truncated.
    train, test = shuffled_chapter_qa(chapters, agent_system_prompts[i])

    agent_dir = os.path.join(FINETUNE_AGENT_DATADIR, f"agent{i:02d}")
    os.makedirs(agent_dir, exist_ok=True)

    train_file = os.path.join(agent_dir, "train.jsonl")
    _write_jsonl(train_file, train)

    test_file = os.path.join(agent_dir, "test.jsonl")
    _write_jsonl(test_file, test)

    print(f"Created data for agent{i:02d}")
    print(f"Train samples: {len(train)}")
    print(f"Test samples: {len(test)}")
    print()
    

Created data for agent00
Train samples: 1725
Test samples: 432

Created data for agent01
Train samples: 691
Test samples: 173

Created data for agent02
Train samples: 1396
Test samples: 349

