# WebNLG Finetune Dataset

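The dataset generated by this notebook is a list of multi-turn conversations with the following structure:
```
[{
        "id": "identity_0",
        "conversations": [
                {"from": "human", "value": "Input triples: ..."},
                {"from": "gpt", "value": "Output text: ..."}
        ]
}]
```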

#### Import Dependencies

In [None]:
import json
import random
from typing import Dict, List, Optional

import pandas as pd

from Import import create_webnlg_df
from Prompts import get_finetune_instruction

#### Create Dataframe from Webnlg Dataset

In [None]:
# Path to the WebNLG v3.0 English training split (JSON release), relative to the working directory.
file_path = "./webnlg-dataset/release_v3.0/en/json/train/v3.0_train_set.json"
# Passed to create_webnlg_df and later used as the upper bound when drawing random row indices.
# NOTE(review): presumably the number of entries in the training file — confirm against create_webnlg_df.
train_set_size = 13211

# Build the dataframe with the project-local helper; later cells read its
# "lexicalisations" and "modifiedtripleset" columns.
webnlg_train_df = create_webnlg_df(file_path, train_set_size)
# Last expression of the cell: shows the first rows as a quick sanity check.
webnlg_train_df.head()

#### Create Finetune Dataset

In [None]:
def get_lixicalisation(lexicalisations: List[Dict[str, str]]) -> Optional[str]:
    """Pick a random lexicalisation text among those rated "good".

    Args:
        lexicalisations: Lexicalisation entries. Each entry must provide a
            "comment" key (the quality label); entries labelled "good" must
            also provide a "lex" key (the text).

    Returns:
        The "lex" value of a randomly chosen entry whose "comment" equals
        "good", or None when no entry qualifies (including an empty list).

    Raises:
        KeyError: If an entry lacks the "comment" key, or a "good" entry
            lacks the "lex" key.
    """
    good_lexes = [
        lexicalisation["lex"]
        for lexicalisation in lexicalisations
        if lexicalisation["comment"] == "good"
    ]

    # random.choice raises IndexError on an empty sequence, so guard first.
    if not good_lexes:
        return None

    return random.choice(good_lexes)

In [None]:
def create_finetune_dataset(webnlg_train_df: pd.DataFrame, train_set_size: int, conversations: int) -> List[dict]:
    """Build a chat-style finetune dataset from randomly sampled WebNLG rows.

    Every conversation consists of 1 to 4 turns (chosen at random). A turn is
    a "human" message holding the row's "modifiedtripleset" followed by a
    "gpt" message holding one randomly chosen "good" lexicalisation of that
    row.

    Args:
        webnlg_train_df: Dataframe with the columns "lexicalisations" and
            "modifiedtripleset", indexable by the integers
            0 .. train_set_size - 1.
        train_set_size: Number of rows to sample from; row indices are drawn
            uniformly from 0 .. train_set_size - 1.
        conversations: Number of conversations to generate.

    Returns:
        A list of dicts of the form
        {"id": "identity_<n>", "conversations": [{"from": ..., "value": ...}, ...]}.
    """
    # Rows without a "good" lexicalisation are resampled instead of dropped.
    # The bound keeps the loop finite if (almost) no row has one; in that case
    # the turn is skipped, as a best effort.
    max_attempts_per_turn = 100

    # Hoist the column lookups out of the sampling loops.
    lexicalisations = webnlg_train_df["lexicalisations"]
    triplesets = webnlg_train_df["modifiedtripleset"]

    finetune_dataset: List[dict] = []

    for i in range(conversations):
        conversation_turns = random.randint(1, 4)
        turns: List[Dict[str, str]] = []

        for _ in range(conversation_turns):
            for _attempt in range(max_attempts_per_turn):
                # select a random index between 0 and train_set_size - 1
                df_index = random.randint(0, train_set_size - 1)
                lex = get_lixicalisation(lexicalisations[df_index])
                if lex is not None:
                    break
            else:
                # No usable row found within the attempt budget: skip this turn.
                continue

            turns.append(
                {
                    "from": "human",
                    "value": f"Input triples: {triplesets[df_index]}",
                }
            )
            turns.append(
                {
                    "from": "gpt",
                    "value": f"Output text: {lex}",
                }
            )

        finetune_dataset.append({
            "id": f"identity_{i}",
            "conversations": turns,
        })

    return finetune_dataset


In [None]:
# Each conversation has 1-4 turns (2.5 on average) and every turn draws one random sample with a
# randomly chosen lexicalisation. Using train_set_size conversations would therefore draw every
# sample about two and a half times on average; doubling it here gives roughly five draws per sample.
conversations = train_set_size * 2
finetune_dataset = create_finetune_dataset(webnlg_train_df, train_set_size, conversations)
# NOTE(review): this prints the complete dataset, which makes the cell output very large.
print(finetune_dataset)

#### Export finetune dataset

In [None]:
def export_dataset_to_json_file(dataset: List[dict], file_path: str):
    """Write dataset to file_path as JSON indented by four spaces.

    The target file is opened in "w" mode, so an existing file is overwritten.
    """
    with open(file_path, "w") as out_file:
        out_file.write(json.dumps(dataset, indent=4))

In [None]:
# Write the generated conversations to a JSON file in the current working directory.
export_dataset_to_json_file(finetune_dataset, "./webnlg_finetune_dataset_chat.json")