# Dependencies and Environment

In [1]:
import os
import openai
from dotenv import load_dotenv
import json

In [2]:
load_dotenv()
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# LLM Completion Function

In [3]:
def get_completion(prompt, temperature, model='gpt-3.5-turbo-1106'):
    """
    Returns a text completion from OPENAI LLM based on user prompt.
    
    Args:
        prompt: string containing user instructions
        temperature: a parameter between 0 and 1 for controlling LLM's creativity in generating new data
        model: a string of model type, currently only supports OPENAI models
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

# Data Generation Logic

In [16]:
def generate_data(specs="", sample=[], num_data=10, temperature=0, model='gpt-3.5-turbo-1106'):
    """
    Creates a JSON file of generated data.
    
    Args:
        specs: string containing user's specification for the kind of text data to be generated
        sample: list containing JSON-style elements of data samples user wants the LLM to generate
        num_data: number of data to be generated
        temperature: a parameter between 0 and 1 for controlling LLM's creativity in generating new data
        model: a string of model type, currently only supports OPENAI models
    """
    # Convert JSON samples to string that can be placed in prompt
    json_str = json.dumps(sample, indent=4)
    sample_str = f"sample = {json_str}"
    
    # Define data generation loop parameters
    iterations = num_data // 20
    remaining_gen = num_data % 20
    one_time_gen_count = num_data
    
    # list of JSON-style data
    data_list = []
    
    if num_data >= 20:
        one_time_gen_count = 20
    
    for counter in range(iterations):
        # Define prompt
        print(f'Generating batch {counter + 1} of data')
        prompt = f"""
        {specs}

        Here is the dataset for training an LLM. Create {one_time_gen_count} variations of this dataset for training, keep it
        consistent with the current format:


        {sample_str}


        Make sure to produce a string of a list of valid JSON elements enclosed inside a list. Each JSON element must be 
        followed by a comma delimiter.
        """
        response = get_completion(prompt, temperature=temperature, model=model)
        lst = json.loads(response)
        data_list += lst
        print(f'Generated {(counter + 1) * 20}/{num_data} data...')
        
    if remaining_gen > 0:
        # Define prompt
        prompt = f"""
        {specs}

        Here is the dataset for training an LLM. Create {remaining_gen} variations of this dataset for training, keep it
        consistent with the current format:


        {sample_str}


        Make sure to produce a string of a list of valid JSON elements. Remember to close each JSON elements with {{ and 
        }}
        """
        response = get_completion(prompt, temperature=temperature, model=model)
        lst = json.loads(response)
        data_list += lst

    print(f'Finished generating')
    
    # Save to JSON file
    file_path = './data.json'

    with open(file_path, 'w') as f:
        json.dump(data_list, f, indent=4)
    print(f"JSON file has been created at {file_path}")

In [17]:
sample = [
    {"inputs": "Instruction: What's your name? \n\n### Response: Bob"},
    {"inputs": "Instruction: What's your passion? \n\n### Response: selling cigarettes"},
    {"inputs": "Instruction: Do you have a girlfriend? \n\n### Response: i got ten"},
    {"inputs": "Instruction: What books do you read? \n\n### Response: books? nah, I don't read these"},
]

specs = """
I am generating data for fine tuning a LLM chatbot that speaks like a frivolous teenager.
"""

generate_data(specs=specs, sample=sample, num_data=50, temperature=1, model='gpt-3.5-turbo-1106')

Generating batch 1 of data
Generated 20/50...
Generating batch 2 of data
Generated 40/50...
Finished generating
JSON file has been created at ./data.json
