In [None]:
# coding=utf-8
#
# Copyright 2024
# Heinrich Heine University Dusseldorf,
# Faculty of Mathematics and Natural Sciences,
# Computer Science Department
#
# Authors:
# Renato Vukovic (renato.vukovic@hhu.de)
#
# This code was generated with the help of AI writing assistants
# including GitHub Copilot, ChatGPT, Bing Chat.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# # # # # # # # # # # # # # # # # # # # # # # # # # # # #

### Generate, from the ontology relation prediction data, the instruction/prompt inputs for fine-tuning and the relational triplets as the target outputs

In [1]:
import json
from pathlib import Path
import datasets
from datasets import Dataset, DatasetDict, load_from_disk
from tqdm import tqdm
from transformers import AutoTokenizer
import numpy as np

In [2]:
# Load the prompt dictionary; the printed output shows the keys 'task_description',
# 'dialogue', 'term_list', 'relations_so_far' and 'output_instruction'.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform's
# locale-dependent default encoding.
with Path("../experiments/prompts/no_memory/zero_shot_no_memory_reframed_prompt_dict.json").open("r", encoding="utf-8") as f:
    prompt_dict = json.load(f)

print(prompt_dict)

{'task_description': 'You are an expert in ontology construction from a set of domain (general topic), slot (information type about entities in a domain) or value (concrete instances of information in slots) candidate terms in task-oriented dialogue.', 'dialogue': 'Here is the dialogue:', 'term_list': 'Here is the list of terms:', 'relations_so_far': '', 'output_instruction': "Predict relations in the dialogue between the given terms, that can be domains, slots or values. Predict the 'has slot' relation between domains and slots in the form [domainname, has slot, slotname]. Predict the 'has value' relation between slots and values in the form [slotname, has value, valuename]. Predict the 'has domain' relation between values and domains in the form [valuename, has domain, domainname]. Predict the 'refers to same concept as' relation between terms from the same category with the same meaning [term, refers to same concept as, term]. Make sure to put brackets around each relational triplet

First: the MultiWOZ 2.1 data

In [3]:
# Load the MultiWOZ 2.1 dialogue/term dictionary. The cells below index it as
# data[split][dialogue_id] and read the fields 'text', 'terms' and 'relational triplets'.
# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with Path("./multiwoz21_dialogue_term_dict.json").open("r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Peek at the fields stored for a single training dialogue.
example_dialogue = data["train"]["multiwoz21-train-0"]
print(example_dialogue.keys())

dict_keys(['text', 'terms', 'relational triplets'])


In [5]:
# Pull the individual prompt building blocks out of the prompt dictionary.
# NOTE(review): task_description is not added to the instruction text in the cells
# visible here - presumably it is supplied separately (e.g. as a system prompt); confirm.
task_description, dialogue_input = prompt_dict["task_description"], prompt_dict["dialogue"]
term_input, output_instruction = prompt_dict["term_list"], prompt_dict["output_instruction"]
			

In [6]:
# Build the fine-tuning columns for every split: each record pairs an instruction
# (dialogue text, term list and output instruction, one section per line) with the
# dialogue's relational triplets as output, plus the dialogue id.
sft_data_dict = {}
for split in data.keys():
    split_columns = {"instruction": [], "output": [], "dialogue_id": []}
    sft_data_dict[split] = split_columns
    for dial_id, dialogue in tqdm(data[split].items()):
        dialogue_text = dialogue["text"]
        candidate_terms = dialogue["terms"]
        target_triplets = dialogue["relational triplets"]

        # The term list is embedded through str(); every section ends with a newline.
        prompt_sections = [
            dialogue_input,
            dialogue_text,
            term_input,
            str(candidate_terms),
            output_instruction,
        ]
        instruction = "\n".join(prompt_sections) + "\n"

        split_columns["instruction"].append(instruction)
        split_columns["output"].append(target_triplets)
        split_columns["dialogue_id"].append(dial_id)
        

100%|██████████| 8438/8438 [00:00<00:00, 185597.24it/s]
100%|██████████| 1000/1000 [00:00<00:00, 169754.90it/s]
100%|██████████| 1000/1000 [00:00<00:00, 167450.65it/s]


In [7]:
# Turn each split's column dictionary into its own Dataset object.
sft_datasets = {
    split_name: Dataset.from_dict(split_columns)
    for split_name, split_columns in sft_data_dict.items()
}


In [8]:
# Bundle the per-split Dataset objects into a single Hugging Face DatasetDict
# (keyed by split name) and print it to show the features and row counts per split.
sft_dataset = DatasetDict(sft_datasets)

print(sft_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 8438
    })
    validation: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 1000
    })
})


In [9]:
# Write the MultiWOZ fine-tuning dataset to a directory next to this notebook.
multiwoz_output_dir = "./multiwoz21_ontology_relation_sft_dataset"
sft_dataset.save_to_disk(multiwoz_output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/8438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [2]:
# Sanity check: read the saved MultiWOZ dataset back from disk and print its layout.
multiwoz_saved_dir = "./multiwoz21_ontology_relation_sft_dataset"
sft_dataset = load_from_disk(multiwoz_saved_dir)

print(sft_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 8438
    })
    validation: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 1000
    })
})


Next: the SGD data

In [11]:
# Load the SGD dialogue/term dictionary. The next cell indexes it as
# data[split][dialogue_id] and reads the fields 'text', 'terms' and 'relational triplets'.
# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
# Note: this rebinds `data`, replacing the MultiWOZ dictionary loaded earlier.
with Path("./sgd_dialogue_term_dict.json").open("r", encoding="utf-8") as f:
    data = json.load(f)

In [12]:
# Build the fine-tuning columns for every SGD split: each record pairs an instruction
# (dialogue text, term list and output instruction, one section per line) with the
# dialogue's relational triplets as output, plus the dialogue id.
sft_data_dict = {}
for split in data.keys():
    split_columns = {"instruction": [], "output": [], "dialogue_id": []}
    sft_data_dict[split] = split_columns
    for dial_id, dialogue in tqdm(data[split].items()):
        dialogue_text = dialogue["text"]
        candidate_terms = dialogue["terms"]
        target_triplets = dialogue["relational triplets"]

        # The term list is embedded through str(); every section ends with a newline.
        prompt_sections = [
            dialogue_input,
            dialogue_text,
            term_input,
            str(candidate_terms),
            output_instruction,
        ]
        instruction = "\n".join(prompt_sections) + "\n"

        split_columns["instruction"].append(instruction)
        split_columns["output"].append(target_triplets)
        split_columns["dialogue_id"].append(dial_id)
        

100%|██████████| 16142/16142 [00:00<00:00, 326404.51it/s]
100%|██████████| 2482/2482 [00:00<00:00, 250626.25it/s]
100%|██████████| 4201/4201 [00:00<00:00, 278198.70it/s]


In [13]:
# Turn each SGD split's column dictionary into its own Dataset object.
sft_datasets = {
    split_name: Dataset.from_dict(split_columns)
    for split_name, split_columns in sft_data_dict.items()
}


In [14]:
# Bundle the per-split SGD Dataset objects into a single Hugging Face DatasetDict
# (keyed by split name) and print it to show the features and row counts per split.
sft_dataset = DatasetDict(sft_datasets)

print(sft_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 16142
    })
    validation: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 2482
    })
    test: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 4201
    })
})


In [15]:
# Write the SGD fine-tuning dataset to a directory next to this notebook.
sgd_output_dir = "./sgd_ontology_relation_sft_dataset"
sft_dataset.save_to_disk(sgd_output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/16142 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2482 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4201 [00:00<?, ? examples/s]

In [29]:
# Sanity check: read the saved SGD dataset back from disk and print its layout.
sgd_saved_dir = "./sgd_ontology_relation_sft_dataset"
sft_dataset = load_from_disk(sgd_saved_dir)

print(sft_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 16142
    })
    validation: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 2482
    })
    test: Dataset({
        features: ['instruction', 'output', 'dialogue_id'],
        num_rows: 4201
    })
})
