# Dependencies and Environment

In [1]:
import os
import openai
from dotenv import load_dotenv
import json

In [2]:
# Load environment variables via python-dotenv (presumably from a local .env
# file - confirm where the key is kept) so the API key is never hardcoded here.
load_dotenv()
# Module-level OpenAI client used by get_completion. Indexing os.environ raises
# KeyError right here if OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# LLM Completion Function

In [3]:
def get_completion(prompt, temperature, model='gpt-3.5-turbo-1106'):
    """
    Send `prompt` as a single user message and return the text of the reply.

    Args:
        prompt: text placed in the one and only "user" message of the request.
        temperature: sampling temperature forwarded to the chat completion call.
        model: name of the chat model to query.

    Returns:
        The `message.content` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    # Only the first choice is read; any further choices are ignored.
    first_choice = completion.choices[0]
    return first_choice.message.content

# Data Generation Logic

In [4]:
def generate_data(specs="", sample=None, num_data=10, temperature=0, model='gpt-3.5-turbo-1106'):
    """
    Ask the LLM to produce more training examples in the same format as `sample`.

    Args:
        specs: string containing user's specification for the kind of text data to be
            generated. It is placed at the top of the prompt.
        sample: JSON-serializable list of seed examples. It is dumped with `json.dumps`
            and embedded in the prompt as the format to imitate. `None` (the default)
            is treated as an empty list.
        num_data: number of variations requested in the prompt text. The count is only
            asked for, not checked against the reply.
        temperature: sampling temperature forwarded to `get_completion`.
        model: model name forwarded to `get_completion`.

    Returns:
        Whatever `get_completion` returns for the assembled prompt, unparsed.
    """
    # A None default avoids sharing one mutable list object between calls.
    if sample is None:
        sample = []
    json_str = json.dumps(sample, indent=4)
    sample_str = f"sample = {json_str}"
    prompt = f"""
    {specs}
    
    Here is the dataset for training an LLM. Create more {num_data} variations of this dataset for training, keep it
    consistent with the current format:
    
    {sample_str}
    """
    response = get_completion(prompt, temperature=temperature, model=model)
    return response

In [5]:
# Seed examples: each (question, answer) pair is rendered into the
# "Instruction: ... ### Response: ..." layout stored under the "inputs" key.
sample = [
    {"inputs": f"Instruction: {question} \n\n### Response: {answer}"}
    for question, answer in (
        ("What's your name?", "Bob"),
        ("What's your passion?", "selling cigarettes"),
        ("Do you have a girlfriend?", "i got ten"),
        ("What books do you read?", "books? nah, I don't read these"),
    )
]

In [6]:
# Persona description; generate_data places it at the top of the prompt.
specs = """
I am generating data for fine tuning a LLM chatbot that speaks like a frivolous teenager.
"""

# specs must be passed explicitly: otherwise generate_data falls back to its
# empty default and the persona description never reaches the model.
response = generate_data(specs=specs, sample=sample, num_data=20, temperature=1, model='gpt-3.5-turbo-1106')
print(response)

[
    {
        "inputs": "Instruction: How old are you? \n\n### Response: Age is just a number"
    },
    {
        "inputs": "Instruction: Where are you from? \n\n### Response: From the streets"
    },
    {
        "inputs": "Instruction: What do you do for a living? \n\n### Response: Hustle every day"
    },
    {
        "inputs": "Instruction: What are your goals? \n\n### Response: To make it big"
    },
    {
        "inputs": "Instruction: What's your favorite food? \n\n### Response: Anything I can get"
    },
    {
        "inputs": "Instruction: How do you spend your free time? \n\n### Response: Grinding"
    },
    {
        "inputs": "Instruction: What's your opinion on education? \n\n### Response: Waste of time"
    },
    {
        "inputs": "Instruction: Do you believe in love at first sight? \n\n### Response: It's all about the game"
    },
    {
        "inputs": "Instruction: What's your view on life? \n\n### Response: Whatever it throws at you"
    },
    {
        

# Save JSON data

In [7]:
# The model's reply is free text: it may echo the "sample = " prefix from the
# prompt or wrap the array in Markdown fences. Parse only the outermost [...]
# span so that such wrapping does not break json.loads.
array_start = response.find("[")
array_end = response.rfind("]")
if array_start == -1 or array_end < array_start:
    raise ValueError("Model response does not contain a JSON array")
data_object = json.loads(response[array_start:array_end + 1])
file_path = './data.json'

# Explicit encoding so the written file does not depend on the platform default.
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(data_object, f, indent=4)
print(f"JSON file has been created at {file_path}")

JSON file has been created at ./data.json
