# SPICE Finetune Dataset
The required structure of the dataset is as follows:
```
[
    {
        "id": "identity_0",
        "conversations": [
            {
                "from": "human",
                "value": "..."
            },
            {
                "from": "gpt",
                "value": "..."
            },
            ...
        ]
    },
    ...
]
```

#### Import Dependencies

In [1]:
from typing import List
import random
import os
import json
import pandas as pd
import numpy as np
from Prompts import create_conversation_history
from Import import get_file_paths

#### Create Finetune Dataset from SPICE Dataset

In [2]:
def create_finetune_dataset(base_path: str, conversations: int) -> List[dict]:
    """Build a chat-style finetune dataset from preprocessed SPICE CSV files.

    Each conversation gets a randomly drawn number of turns between 1 and 4.
    Every turn is taken from a different, randomly chosen file: a row at an
    even index supplies the "human" message and the row directly after it
    supplies the SPARQL query used as the "gpt" answer. A file is tried at
    most once per conversation; if the available files run out, the
    conversation is kept with fewer turns than drawn.

    Args:
        base_path: Folder handed to ``get_file_paths`` to list the input files.
        conversations: Number of conversations to generate.

    Returns:
        A list of ``{"id": "identity_<i>", "conversations": [...]}`` dicts.

    Raises:
        ValueError: If not a single turn could be produced for a conversation
            because every available file was tried without success.
    """
    finetune_dataset: List[dict] = []
    file_paths = get_file_paths(base_path)
    # Once every distinct file was tried there is nothing left to sample, so
    # this bound keeps the rejection loop below from spinning forever.
    distinct_file_count = len(set(file_paths))

    for i in range(conversations):
        target_turns = random.randint(1, 4)
        messages: List[dict] = []
        used_files = set()
        completed_turns = 0

        while (completed_turns < target_turns
               and len(used_files) < distinct_file_count):
            # select a random file that was not used in this conversation yet
            file_path = random.choice(file_paths)
            if file_path in used_files:
                continue
            used_files.add(file_path)

            spice_df = pd.read_csv(file_path)

            # a turn needs a row pair (question row + answer row)
            if len(spice_df) < 2:
                continue

            # pick a random even index that is guaranteed to have a successor
            # (the ``- 1`` excludes a trailing unpaired row of odd-length files)
            df_index = random.randrange(0, len(spice_df) - 1, 2)
            current_row = spice_df.iloc[df_index]
            next_row = spice_df.iloc[df_index + 1]
            conversation_history = create_conversation_history(spice_df, df_index)

            # an empty CSV cell is read as NaN; pd.isna catches every NaN
            # flavour, unlike an identity comparison against np.nan
            if pd.isna(next_row['sparql_query']):
                continue

            human_value = f"""{conversation_history}Input question: {current_row['utterance']}
Entities: {current_row['entities_in_utterance']}
Relations: {current_row['relations']}
Types: {current_row['type_list']}"""

            messages.append({"from": "human", "value": human_value})
            messages.append({
                "from": "gpt",
                "value": f"SPARQL query: {next_row['sparql_query']}",
            })

            completed_turns += 1

        if not messages:
            raise ValueError(
                f"Could not build conversation {i}: none of the "
                f"{distinct_file_count} files under {base_path!r} yielded a "
                "usable row pair."
            )

        finetune_dataset.append({
            "id": f"identity_{i}",
            "conversations": messages,
        })

    return finetune_dataset

In [3]:
# Folder containing the preprocessed SPICE training files (adjust as needed)
base_path = "./SPICE_dataset_pp/train"
# Destination path of the exported JSON dataset
output_path = "./spice_finetune_dataset_chat_30000_v02.json"
# How many conversations to generate
conversations = 30000

finetune_dataset = create_finetune_dataset(
    base_path=base_path,
    conversations=conversations,
)

#### Export finetune dataset

In [4]:
def export_dataset_to_json_file(dataset: List[dict], file_path: str) -> None:
    """Write ``dataset`` to ``file_path`` as JSON indented by four spaces.

    Args:
        dataset: JSON-serialisable list of conversation dicts.
        file_path: Destination file; an existing file is overwritten.
    """
    # Pin the encoding so the output does not depend on the platform locale.
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(dataset, file, indent=4)

In [5]:

# Persist the generated conversations to the configured JSON file
export_dataset_to_json_file(dataset=finetune_dataset, file_path=output_path)