In [1]:
import os
import ast
import csv
import sys
import pandas as pd

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

sys.path.append('../src') 

from dataset_utils import Static_dataGen, Dynamic_dataGen, Bonus_dataGen, Preprocessing

# OpenAI API key taken from the process environment (populated by the
# load_dotenv call above); evaluates to None when OPEN_AI_KEY is not set.
key = os.getenv("OPEN_AI_KEY")

# Static Dataset Creation
Aim: To generate a set of query-output pairs using the original set of 9 tools

Method: 
1. A set of 3-4 tools is sampled every iteration for query generation
2. The sampled set of tools is passed to an LLM agent for query generation
3. Each generated query is then passed to a second agent, along with the descriptions of the sampled tools, to generate its completion


In [3]:
# Generate static query/completion pairs and save them as a CSV under
# ../datasets/Generated/raw_data.
staticDatagen = Static_dataGen(key)

no_of_StaticQuery_CompletionPairs2beGen = 10

# Presumably a list of dicts, one per generated pair (it is indexed with [0]
# and handed to DictWriter.writerows below) -- TODO confirm against dataset_utils.
data_dict = staticDatagen.genQuery(no_of_StaticQuery_CompletionPairs2beGen)

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

field_names= ['Query','Output']

# Column order still follows the first generated record. When nothing was
# generated, fall back to the expected schema so a header-only file is written
# instead of failing with IndexError on data_dict[0].
static_header = list(data_dict[0].keys()) if data_dict else field_names

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters in the
# generated text serialise identically on every platform.
with open('../datasets/Generated/raw_data/saveStaticdataset.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=static_header)
    csv_writer.writeheader()
    csv_writer.writerows(data_dict)

# Dynamic Dataset Creation
Aim: To generate a dynamic toolset and combine it with the original toolset to obtain a set of query-output pairs

Method (Dynamic Toolset Creation): 
1. 4 tools are sampled from the original toolset every iteration
2. These tools are then passed to an agent, to generate similar tools

Method (Query-Output Pair Generation): 
1. 10 randomly selected tools, along with the original 9, are passed to the agent at a time for query generation. The model is free to use any number of these tools in each query.
2. Another agent then generates the completions for the query list
(The query list is cleaned by code and manual intervention before passing to the second agent, and a similar process is followed for the final CSV creation)

In [None]:
# Generate additional ("dynamic") tools, then query/completion pairs that use
# them, and save the pairs as a CSV under ../datasets/Generated/raw_data.
dynamicDatagen = Dynamic_dataGen(key)

no_of_newTool2beAdded = 10

no_of_DynamicQuery_CompletionPairs2beGen = 10

# Created before any generation call, as in the original cell order.
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

dynamicDatagen.genDynamicTools(no_of_newTool2beAdded)

# Presumably a list of dicts keyed by the field names below -- TODO confirm
# against dataset_utils.
data_dict = dynamicDatagen.genDynamicQueryOutputPair(no_of_DynamicQuery_CompletionPairs2beGen)

field_names= ['Added_Tools','Query','Output']

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters in the
# generated text serialise identically on every platform.
with open('../datasets/Generated/raw_data/saveDynamicData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

# Bonus Dataset Creation
Aim: To generate a set of query-output pairs which involves usage of conditional and iterative operators

Method: A list of 5 such query-output pairs is created manually. These examples, along with a few relevant dynamic tools combined with the original toolset, are fed into the query-generating agent, and the resulting list of queries is then passed to the completion agent. At every step, the model's output is cleaned before being saved and passed on to subsequent agents.

In [2]:
# Generate bonus query/completion pairs and save them as a CSV under
# ../datasets/Generated/raw_data.
bonusDatagen = Bonus_dataGen(key)

no_of_BonusQuery_CompletionPairs2beGen = 10

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

# Presumably a list of dicts keyed by the field names below -- TODO confirm
# against dataset_utils.
data_dict = bonusDatagen.genBonusQueryOutputPair(no_of_BonusQuery_CompletionPairs2beGen)

field_names= ['Query','Output']

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters in the
# generated text serialise identically on every platform.
with open('../datasets/Generated/raw_data/saveBonusData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

## Restructuring Dataset For Different Pipelines

### Dataset formation for P1 Pipeline

Since the P1 pipeline does not require a training set, the following code generates only an evaluation dataset for it. The docstring is created by combining the tools used in the query with some random tools from the tool list. Because this data is used for inference and the model has no prior knowledge of the tools, the prompt must include the docstrings of the allowed tools, along with a few examples (few-shot), to produce good results.

In [6]:
# Load the pre-generated raw datasets that every restructuring cell below uses.
static_df = pd.read_csv("../datasets/Pre-Generated/raw_data/static_dataset.csv")
dynamic_df = pd.read_csv("../datasets/Pre-Generated/raw_data/dynamic_dataset.csv")
bonus_df = pd.read_csv("../datasets/Pre-Generated/raw_data/bonus_dataset.csv")

# First column of every row in the bonus toolset CSV. The file is opened in a
# `with` block so the handle is closed deterministically (it used to be left
# open), and blank lines -- which csv.reader yields as empty lists -- are
# skipped instead of raising IndexError on row[0].
with open('../resources/Tool_list/final-bonus-toolset.csv', 'r', newline='') as bonus_tool_file:
    bonusTool_list = [row[0] for row in csv.reader(bonus_tool_file) if row]

# Helper from dataset_utils that builds the docstrings and prompts used below.
datasetForm = Preprocessing()

#Static
# P1 evaluation set for the static queries: each record pairs a query and its
# reference output with the docstring returned by datasetForm.p1_static().

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs('../datasets/Generated/P1_datasets', exist_ok=True)

# p1_static() is deliberately called once per row rather than hoisted: it may
# return a different docstring on each call (presumably random tool sampling --
# TODO confirm against dataset_utils).
staticDictP1 = [
    {'Query': query, 'Output': output, 'Docstring': datasetForm.p1_static()}
    for query, output in zip(static_df['Query'], static_df['Output'])
]

field_names= ['Query', 'Output', 'Docstring']

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('../datasets/Generated/P1_datasets/StaticP1dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(staticDictP1)

#Dynamic
# P1 evaluation set for the dynamic queries: the docstring is built from the
# tools listed in each row's 'Added_Tools' column.

# Ensure the output directory exists even if this part is run on its own.
os.makedirs('../datasets/Generated/P1_datasets', exist_ok=True)

dynamicDictP1 = []

for query, output, added_tools_repr in zip(dynamic_df['Query'], dynamic_df['Output'], dynamic_df['Added_Tools']):
    # 'Added_Tools' holds the string form of a Python list. The two replace()
    # calls rewrite every "['" and "']" into triple-quoted delimiters before
    # parsing -- presumably so descriptions containing newlines or quotes still
    # parse; TODO confirm. ast.literal_eval only evaluates literals, so no
    # arbitrary code from the CSV is executed.
    additional_tools = ast.literal_eval(added_tools_repr.replace("['", "['''").replace("']", "''']"))
    added_tools = datasetForm.p1_dynamic(additional_tools)
    dynamicDictP1.append({'Query': query,'Output' : output, 'Docstring': added_tools})

field_names= ['Query', 'Output', 'Docstring']

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('../datasets/Generated/P1_datasets/DynamicP1dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(dynamicDictP1)

# Bonus
# P1 evaluation set for the bonus queries: every row's docstring is built by
# datasetForm.p1_bonus() from the full bonus tool list.

# Ensure the output directory exists even if this part is run on its own.
os.makedirs('../datasets/Generated/P1_datasets', exist_ok=True)

# p1_bonus() is called once per row, as before, in case its result varies
# between calls (TODO confirm against dataset_utils).
bonusDictP1 = [
    {'Query': query, 'Output': output, 'Docstring': datasetForm.p1_bonus(bonusTool_list)}
    for query, output in zip(bonus_df['Query'], bonus_df['Output'])
]

field_names= ['Query', 'Output', 'Docstring']

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('../datasets/Generated/P1_datasets/BonusP1dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(bonusDictP1)

### Prompt formation for P2 Pipeline

In [4]:
# Build one P2 prompt per static query/output pair and save them as a
# single-column CSV.
staticDictP2 = [
    {'Prompt': datasetForm.prompt_p2_pipeline(query, output)}
    for query, output in zip(static_df['Query'], static_df['Output'])
]

field_names= ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 matches the default encoding of
# pd.read_csv, which reads this file back for the train/validation/test split.
with open('./datasetForm/StaticP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(staticDictP2)

### Prompt formation for P3 Pipeline

In [5]:
# Build one P3 prompt per static query/output pair and save them as a
# single-column CSV.
staticDictP3 = [
    {'Prompt': datasetForm.prompt_p3_pipeline(query, output)}
    for query, output in zip(static_df['Query'], static_df['Output'])
]

field_names = ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 matches the default encoding of
# pd.read_csv, which reads this file back for the train/validation/test split.
with open('./datasetForm/StaticP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(staticDictP3)

# Dynamic Dataset Reading

### Prompt formation for P2 Pipeline

In [7]:
# Build one P2 prompt per dynamic query/output pair, passing the row's added
# tools to the prompt builder, and save them as a single-column CSV.
dynamicDictP2 = []
for query, output, added_tools_repr in zip(dynamic_df['Query'], dynamic_df['Output'], dynamic_df['Added_Tools']):
    # 'Added_Tools' holds the string form of a Python list. The two replace()
    # calls rewrite every "['" and "']" into triple-quoted delimiters before
    # parsing -- presumably so descriptions containing newlines or quotes still
    # parse; TODO confirm. ast.literal_eval only evaluates literals.
    additional_tools = ast.literal_eval(added_tools_repr.replace("['", "['''").replace("']", "''']"))
    prompt = datasetForm.prompt_p2_pipeline(query,output,additional_tools)
    dynamicDictP2.append({'Prompt':prompt})

field_names= ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('./datasetForm/DynamicP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(dynamicDictP2)

### Prompt formation for P3 Pipeline

In [8]:
# Build one P3 prompt per dynamic query/output pair, passing the row's added
# tools to the prompt builder, and save them as a single-column CSV.
dynamicDictP3 = []
for query, output, added_tools_repr in zip(dynamic_df['Query'], dynamic_df['Output'], dynamic_df['Added_Tools']):
    # 'Added_Tools' holds the string form of a Python list. The two replace()
    # calls rewrite every "['" and "']" into triple-quoted delimiters before
    # parsing -- presumably so descriptions containing newlines or quotes still
    # parse; TODO confirm. ast.literal_eval only evaluates literals.
    additional_tools = ast.literal_eval(added_tools_repr.replace("['", "['''").replace("']", "''']"))
    prompt = datasetForm.prompt_p3_pipeline(query,output,additional_tools)
    dynamicDictP3.append({'Prompt':prompt})

field_names= ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('./datasetForm/DynamicP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(dynamicDictP3)

# Bonus Dataset Reading

Unnamed: 0,Query,Output
0,"Create tasks from the text ""WeeklyUpdate"" and ...",var_1 = who_am_i()\nvar_2 = create_actionable_...
1,"Retrieve work items with type ""task"" and sever...","for loop_var in range(0,10):\n temp_1 = wor..."
2,"Find work items with priority ""p1"" and type ""i...","var_1 = works_list(issue.priority=[""p1""], type..."
3,"Extract tasks from the text ""ReleaseNotes"", pr...",var_1 = create_actionable_tasks_from_text(text...
4,"Fetch tasks for user ""USER-999"", prioritize th...","for loop_var in range(0,2):\n temp_1 = fetc..."


### Dataset formation for P1 Pipeline

### Prompt formation for P2 Pipeline

In [11]:
# Build one P2 prompt per bonus query/output pair, passing the full bonus tool
# list to the prompt builder, and save them as a single-column CSV.
bonusDictP2 = [
    {'Prompt': datasetForm.prompt_p2_pipeline(query, output, bonusTool_list)}
    for query, output in zip(bonus_df['Query'], bonus_df['Output'])
]

field_names= ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('./datasetForm/BonusP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(bonusDictP2)

### Prompt formation for P3 Pipeline

In [12]:
# Build one P3 prompt per bonus query/output pair, passing the full bonus tool
# list to the prompt builder, and save them as a single-column CSV.
bonusDictP3 = [
    {'Prompt': datasetForm.prompt_p3_pipeline(query, output, bonusTool_list)}
    for query, output in zip(bonus_df['Query'], bonus_df['Output'])
]

field_names= ['Prompt']

# ./datasetForm is not created anywhere else in the notebook; without this the
# open() below fails with FileNotFoundError on a fresh checkout.
os.makedirs('./datasetForm', exist_ok=True)

# newline='' is what the csv module requires of files it writes to: without it,
# rows end in \r\r\n on Windows. utf-8 makes non-ASCII characters serialise
# identically on every platform.
with open('./datasetForm/BonusP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(bonusDictP3)

# Train Validation Test For Static Data

### P2 Pipeline

In [21]:
# Split the static P2 prompts into train/validation/test CSVs by row position.
df = pd.read_csv("./datasetForm/StaticP2prompt.csv")

# Positional split boundaries: rows [0, 1700) train, [1700, 1900) validation,
# the remainder test. If the file has fewer rows, the later splits come out
# short or empty -- no error is raised.
train_end = 1700
validation_end = 1900

train_df = df.iloc[:train_end]
validation_df = df.iloc[train_end:validation_end]
test_df = df.iloc[validation_end:]

# DataFrame.to_csv does not create missing parent directories, and no other
# cell creates this one.
os.makedirs("./finetuning_P2dataset", exist_ok=True)

train_df.to_csv("./finetuning_P2dataset/train.csv", index=False)
validation_df.to_csv("./finetuning_P2dataset/validation.csv", index=False)
test_df.to_csv("./finetuning_P2dataset/test.csv", index=False)

### P3 Pipeline

In [None]:
# Split the static P3 prompts into train/validation/test CSVs by row position.
df = pd.read_csv("./datasetForm/StaticP3prompt.csv")

# Positional split boundaries: rows [0, 1700) train, [1700, 1900) validation,
# the remainder test. If the file has fewer rows, the later splits come out
# short or empty -- no error is raised.
train_end = 1700
validation_end = 1900

train_df = df.iloc[:train_end]
validation_df = df.iloc[train_end:validation_end]
test_df = df.iloc[validation_end:]

# DataFrame.to_csv does not create missing parent directories, and no other
# cell creates this one.
os.makedirs("./finetuning_P3dataset", exist_ok=True)

train_df.to_csv("./finetuning_P3dataset/train.csv", index=False)
validation_df.to_csv("./finetuning_P3dataset/validation.csv", index=False)
test_df.to_csv("./finetuning_P3dataset/test.csv", index=False)