### Generate sample dataset

In [None]:
from random import randrange
from datetime import timedelta
import os
import openai
import random
import time
import tqdm
import pandas as pd
import tiktoken

# Take the API key from the environment so it never has to be pasted into
# (and committed with) the notebook; falls back to the original empty string
# when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
prev_examples = []  # passed to generate_example() by the generation loop
# NOTE(review): store_examples and save_task are not referenced in the
# visible code — confirm whether they are still needed.
store_examples = []
save_task = []
# Running totals updated by generate_example() and the generation loop.
total_token = 0  # raw token count (system prompt + generated example)
total_cost = 0   # weighted token count: generated-example tokens count double

def random_date(start, end):
    """
    Pick a random datetime in the half-open interval [start, end).

    The result is offset from ``start`` by a whole number of seconds, so any
    sub-second part of the interval is ignored. ``randrange`` raises
    ``ValueError`` when the interval is shorter than one second.
    """
    # Floor-dividing by one second gives the span length in whole seconds.
    span_seconds = (end - start) // timedelta(seconds=1)
    offset = timedelta(seconds=randrange(span_seconds))
    return start + offset

def count_token(text="", model="gpt-4"):
    """Return how many tokens `text` occupies under the tiktoken encoding for `model`."""
    tokens = tiktoken.encoding_for_model(model).encode(text)
    return len(tokens)

def _append_jsonl(frame, path):
    """Append the rows of `frame` to the JSON-lines file at `path`, creating the file if it is missing."""
    if os.path.exists(path):
        existing = pd.read_json(path, orient='records', lines=True)
        frame = pd.concat([existing, frame], ignore_index=True)
    frame.to_json(path, orient='records', lines=True)


def write2json(whole_example=pd.DataFrame([]), gt_task=[]):
    """
    Parse one generated sample and append it to the train or test split.

    The sample is expected to contain a prompt section and a response section
    delimited by '-----------' lines. The extracted prompt/response pair is
    written, together with `gt_task`, to './output/test.jsonl' with
    probability 1/10 and to './output/train.jsonl' otherwise.

    Args:
        whole_example: the generated sample text. (The default is kept only
            for signature compatibility; a string must be supplied.)
        gt_task: ground-truth task label stored in the 'ground truth' column.

    Raises:
        TypeError: if `whole_example` is not a string.
        Exception: if the text does not split into the expected 4 parts.
    """
    if not isinstance(whole_example, str):
        raise TypeError("whole_example must be a str, got {}.".format(type(whole_example).__name__))

    # Parse the argument itself. The previous code read the global `example`
    # here, so the text passed by the caller was silently ignored.
    split_example = whole_example.split('-----------')
    if len(split_example) != 4:
        # A well-formed sample ends with a separator, which leaves one
        # trailing fragment after the response; drop it.
        split_example = split_example[:-1]
    if len(split_example) != 4:
        raise Exception("Output format is not assert, expected length 4 but got length of {}.".format(len(split_example)))

    # Index 1 is the prompt body and index 3 the response body; indices 0 and
    # 2 hold the "prompt" / "response" labels.
    df = pd.DataFrame({
        'prompt': [split_example[1].strip()],
        'response': [split_example[3].strip()],
        'ground truth': gt_task
    })

    # Remove duplicates
    df = df.drop_duplicates()

    # Roughly one sample in ten goes to the test split.
    if random.randint(0, 9) == 5:
        _append_jsonl(df, './output/test.jsonl')
    else:
        _append_jsonl(df, './output/train.jsonl')


def normalGPT(prompt=""):
    """Send `prompt` to the "text-davinci-003" completion endpoint and return the text of the first choice."""
    request = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0.2,
        "max_tokens": 30,
    }
    completion = openai.Completion.create(**request)
    return completion['choices'][0]['text']

# Make sure the directory that write2json() writes its .jsonl files into exists.
os.makedirs('./output', exist_ok=True)

08/11/2012


#### Dataset creator using prompting and synonym adaptation

In [None]:
# Short labels for the supported tasks.
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
small = ["calculate NDVI", "tree counting", "cloud free", "change of building/land/water body detection", "land use/land cover in segmentation", "aircraft detection", "car detection"]
def generate_example(prompt, prev_examples, temperature=.5, max_retries=None):
    """
    Ask GPT-4 for one prompt/response training sample.

    Args:
        prompt: high-level description of the model to train; it is embedded
            in the system message.
        prev_examples: earlier samples. Currently unused: the code that
            replayed them as assistant messages is disabled (kept below).
        temperature: sampling temperature forwarded to the API.
        max_retries: maximum number of failed requests to tolerate before
            re-raising the last error. ``None`` (the default) retries
            indefinitely, as before.

    Returns:
        The content of the first choice's message.

    Side effects:
        Adds the system message's token count to the module-level
        ``total_token`` and ``total_cost`` counters.
    """
    global total_token, total_cost

    messages=[
        {
            "role": "system",
            "content": f"You are generating data which will be used to train an AI model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex and different context than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nYou must generate long and complete new context than before.\n\nHere is the type of model we want to train:\n`{prompt}`"
        }
    ]

    # Tokenize the system message once and reuse the count for both totals.
    system_tokens = count_token(messages[0]["content"])
    total_token += system_tokens
    total_cost += system_tokens

    # Disabled: replaying up to 10 previous examples as assistant turns.
    # if len(prev_examples) > 0:
    #     if len(prev_examples) > 10:
    #         prev_examples = random.sample(prev_examples, 10)
    #     for example in prev_examples:
    #         messages.append({
    #             "role": "assistant",
    #             "content": example
    #         })

    failures = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=messages,
                temperature=temperature,
                max_tokens=1000,
            )
            break
        except Exception as err:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit can still stop the retry loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Report the failure so a persistent error (e.g. a bad API key)
            # is visible instead of looping silently.
            print(f"ChatCompletion request failed ({err!r}); retrying in 1s")
            time.sleep(1)
    return response.choices[0].message['content']

# Generate examples
temperature = .6
number_of_examples = 100

# Task labels the generated response must be classified into. The trailing
# 'none' is a valid label but is never picked as the task to generate
# (see all_task[:-1] below). Built once because it is the same every iteration.
all_task = [
    "calculate Normalized difference vegetation index (NDVI)",
    "tree counting/detection",
    "cloud removal from aerial image",
    "change of building/land/water body detection",
    "land use/land cover in segmentation",
    "aircraft category object detection",
    "car/automobile/four-wheeler/motorcar vehicle like object counting/detection",
    'none',
]

for i in tqdm.tqdm(range(number_of_examples)):
    # print(f'Generating example {i}')
    # NOTE(review): d1 and d2 (the bounds of the date range) are not defined
    # in the visible code — they must be set before this cell runs.
    rand_date = random_date(d1, d2).strftime('%d/%m/%Y')
    # print("Today is", rand_date)

    ## Choose a task and ask the completion model for a synonym of it, so the
    ## generated requests do not all use the same wording.
    chosen_task = random.choice(all_task[:-1])
    task_prompt = f'Return synonym of "{chosen_task}" that still keep the origin meaning of object need to identify'
    use_task = normalGPT(prompt=task_prompt).replace('\n','').lower()
    # print("TASK:",chosen_task, '---' , use_task)

    # Correct
    prompt = f'A model that takes in a long request with description to perform only "{use_task}" task, tasks require location (administrative area ward/commune/subdistrict/town/village level), time (able convert to dd/mm/yyyy or dd/mm/yyyy-dd/mm/yyyy format, consider today is {rand_date}) to perform. Response will have 3 things: task need to do, location, time and then classify task to one of these {all_task}, location following format: ward/district/city/province/country, time following format: dd/mm/yyyy or dd/mm/yyyy-dd/mm/yyyy. Give response in format "Task:...\nLocation:...\nTime:..."'
    # Incorrect location - checked
    # prompt = f'A model that takes in a long question with description to perform "{use_task}", tasks require location (administrative area ward/commune/subdistrict/town/village level), time (able convert to format dd/mm/yyyy or dd/mm/yyyy-dd/mm/yyyy, consider today is {rand_date}) to perform. Relative time is accepted for example: today, last month, previous summer,..., but location will not given or not following format so ask for specific location in response to retrieve it, do not say anything else'
    # Incorrect time
    # prompt = f'A model that takes in a long request with description to perform "{use_task}" task , tasks require location (administrative area ward/commune/subdistrict/town/village level, most likely in India, Vietnam, Thailand, Singapore, Indonesia), time (able convert to format dd/mm/yyyy or dd/mm/yyyy-dd/mm/yyyy, consider today is {rand_date}) to perform. Location is provide but time will not given or not able convert to right format in input prompt so ask in response to retrieve it'

    example = generate_example(prompt, prev_examples, temperature)
    # Prefix the generated prompt with the reference date so relative times
    # in the request can be resolved.
    new_example = example.replace("prompt\n-----------\n", "prompt\n-----------\nToday is " + rand_date + ". ")

    # Tokenize the completion once; it counts double in total_cost because
    # completion tokens are billed at twice the prompt-token rate.
    example_tokens = count_token(example)
    total_token += example_tokens
    total_cost += example_tokens*2

    # Send datetime to input prompt ??
    # write2json(example, chosen_task)
    write2json(new_example, chosen_task)
    # print(new_example+"\n\n")

print("\n\nFinish generate {} samples. Summary Report:\n--> Total token accumulate: {}\n--> Average per sample:{}\n--> Total cost ($0.03/1K tokens - $0.06/1K tokens): {} USD".format(number_of_examples, total_token, float(total_token/number_of_examples), total_cost*0.03*0.001))

We also need to generate a system message.

Pricing used for the cost estimate: 0.03 USD/1K prompt tokens - 0.06 USD/1K completion tokens


In [None]:
# NOTE(review): appears to be a deliberate stop marker so that running the
# cells in sequence halts here instead of continuing into the cells below.
raise Exception("Finished code!!")

Convert JSON to CSV

In [None]:
import pandas as pd

# Source JSON dataset and the CSV file it is converted to.
# NOTE(review): these are machine-specific absolute paths — adjust before
# running elsewhere. Plain string literals: the previous f-prefix had no
# placeholders to interpolate.
json_f = '/home/skymap/BIG_DATA/Chatbot_Model_Workspace/text-generation-webui/training/datasets/train_2.json'
out = '/home/skymap/BIG_DATA/Chatbot_Model_Workspace/text-generation-webui/training/datasets/train_2.csv'

# NOTE(review): if train_2.json is in JSON-lines format (like the .jsonl
# files written above), read_json needs lines=True — confirm the file format.
df = pd.read_json(json_f)
df.to_csv(out, index=False)