In [None]:
# coding=utf-8
#
# Copyright 2024
# Heinrich Heine University Dusseldorf,
# Faculty of Mathematics and Natural Sciences,
# Computer Science Department
#
# Authors:
# Renato Vukovic (renato.vukovic@hhu.de)
#
# This code was generated with the help of AI writing assistants
# including GitHub Copilot, ChatGPT, Bing Chat.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# # # # # # # # # # # # # # # # # # # # # # # # # # # # #

### Prepare data for DORE experiments on MultiWOZ and SGD on inferring ontology relations based on all the dialogues in the data-set

In [3]:
from convlab.util import load_dataset, load_ontology, load_database
import re
import json
from pathlib import Path
from tqdm import tqdm
import random
import copy
import importlib
import sys
import torch
from collections import Counter

In [4]:
def tokenize(utt):
    utt_lower = utt.lower()
    #utt_lower = utt_lower.replace("\n", " ")
    utt_lower = utt_lower.replace("\t", " ")
    utt_tok = utt_to_token(utt_lower)
    return utt_tok

def utt_to_token(utt):
    return [tok for tok in map(lambda x: re.sub(" ", "", x), re.split("(\W+)", utt)) if len(tok) > 0]


In [5]:
def present_in_utterance(term, utterance):
    if term in utterance:
        #now check that each token of the splitted term occurs behind each other in the utterance
        splitted_term = term.split()
        splitted_utterance = utterance.split()
        for i in range(len(splitted_utterance)-len(splitted_term)+1):
            if all([splitted_utterance[i+j] == splitted_term[j] for j in range(len(splitted_term))]):
                return True
        return False
    else:
        return False
    
def present_in_other_terms(term, other_term):
    if present_in_utterance(term, other_term) and term != other_term:
        return True
    else:
        return False

### load and prepare the dialogues with convlab

In [4]:
dataset = load_dataset("multiwoz21")
ontology = load_ontology("multiwoz21")

### load the multiwoz label maps from TripPy

In [6]:
with Path("./multiwoz21_trippy_labelmaps.json").open("r") as f:
	labelmaps = json.load(f)



labelmaps = labelmaps["label_maps"]

#tokenize all the label maps
tokenized_labelmaps = {}
for key, values in labelmaps.items():
	tokenized_key = " ".join(tokenize(key))
	tokenized_labelmaps[tokenized_key] = [" ".join(tokenize(value)) for value in values]

labelmaps = tokenized_labelmaps

In [6]:
splits = ["train", "validation", "test"]

In [7]:
#make a dict that maps the domain-slot pairs to their values
slot_value_dict = {}
ontology_dict = {}
for domain, content in ontology["domains"].items():
    ontology_dict[domain] = {}
    for slot, values in content["slots"].items():
        if "possible_values" in values:
            ontology_dict[domain][slot] = set(values["possible_values"])
            slot_value_dict[domain + '-' + slot] = set(values['possible_values'])
        else:
            ontology_dict[domain][slot] = set()
            slot_value_dict[domain + '-' + slot] = set()

In [40]:
#go through the validation set and extract the slot types and values that appear in the user turns to get the validation ontology
ontology = {}
#also make a dict that maps the values to the utterances they appear in
slot_value_utterance_dict = {}
dialogue_turn_slot_value_dict = {}
for split in splits:
    dialogue_turn_slot_value_dict[split] = {}
    for dialog in dataset[split]:
        dial_id = dialog["dialogue_id"]
        dialogue_turn_slot_value_dict[split][dial_id] = {}
        for turn in dialog['turns']:
            turn_id = turn["utt_idx"]
            dialogue_turn_slot_value_dict[split][dial_id][turn_id] = {}

            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["speaker"] = turn["speaker"]
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["utterance"] = " ".join(tokenize(turn["utterance"]))
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"] = []
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"] = []
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"] = []

            for cat, values in turn["dialogue_acts"].items():
                if cat != "binary":
                    if turn["speaker"] == "user" or turn["speaker"] == "system":
                        for val in values:
                            ontology_dict[val["domain"]][val["slot"]].add(val["value"])
                            if val["domain"] + ' ' + val["slot"] not in ontology:
                                ontology[val["domain"] + ' ' + val["slot"]] = set()
                                slot_value_utterance_dict[val["domain"] + ' ' + val["slot"]] = {}
                            tokenized_value = " ".join(tokenize(val["value"]))
                            ontology[val["domain"] + ' ' + val["slot"]].add(tokenized_value)
                            if tokenized_value not in slot_value_utterance_dict[val["domain"] + ' ' + val["slot"]]:
                                slot_value_utterance_dict[val["domain"] + ' ' + val["slot"]][tokenized_value] = []
                            slot_value_utterance_dict[val["domain"] + ' ' + val["slot"]][tokenized_value].append((turn["speaker"], (" ".join(tokenize(turn["utterance"])), dial_id, turn_id, turn["dialogue_acts"])))
                        
                    #add all the utterances with domains, slots and values to the dialogue_turn_slot_value_dict
                    for val in values:
                        dom = " ".join(tokenize(val["domain"]))
                        slot = " ".join(tokenize(val["slot"]))
                        value = " ".join(tokenize(val["value"]))
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"].extend([dom, slot, value])
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"].extend([dom + " " + slot] * 3)
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"].append((dom, slot, value))
                
                else: #also include binary dialogue acts as they containt domain-slot relations
                    if turn["speaker"] == "user" or turn["speaker"] == "system":
                        for val in values:
                            if val["domain"] == "general":
                                continue
                            if val["domain"] + ' ' + val["slot"] not in ontology:
                                ontology[val["domain"] + ' ' + val["slot"]] = set()
                                slot_value_utterance_dict[val["domain"] + ' ' + val["slot"]] = {}
                        
                    #add all the utterances with domains, slots and values to the dialogue_turn_slot_value_dict
                    for val in values:
                        dom = " ".join(tokenize(val["domain"]))
                        if dom == "general":
                            continue
                        slot = " ".join(tokenize(val["slot"]))
                        value = ""
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"].extend([dom, slot, value])
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"].extend([dom + " " + slot] * 3)
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"].append((dom, slot, value))



                


In [47]:
###accumulate the list of all terms that appear in a dialogue and their relations to other terms
dialogue_term_dict = {}
for split in splits:
    dialogue_term_dict[split] = {}
    for dial_id, turns in tqdm(dialogue_turn_slot_value_dict[split].items()):
        current_dialogue_text = ""
        current_dialogue_terms = []
        current_dialogue_relational_triplets = []
        for turn_id, turn in turns.items():
            turn_text = '"' + turn['speaker'] + '": "' + turn["utterance"] + '"\n'
            current_dialogue_text += turn_text
            if turn["speaker"] == "user" or turn["speaker"] == "system":
                current_dialogue_terms.extend([val for val in turn["slotvalues"] if val])
                for slotval_triplet in turn["slotvalue pairs"]:
                    if slotval_triplet[1]: #some cases where the slot is empty in binary dialogue act
                        current_dialogue_relational_triplets.append((slotval_triplet[0], "has slot", slotval_triplet[1]))
                    if slotval_triplet[2]: #it is empty for the triples from the binary slots where only the domain-slot relation is used
                        current_dialogue_relational_triplets.append((slotval_triplet[1], "has value", slotval_triplet[2]))
                        current_dialogue_relational_triplets.append((slotval_triplet[2], "has domain", slotval_triplet[0]))
        dialogue_term_dict[split][dial_id] = {}
        dialogue_term_dict[split][dial_id]["text"] = current_dialogue_text

        #check those values that are not present in the text whether there is a label map present
        triplet_copy = copy.deepcopy(current_dialogue_relational_triplets)
        for head, relation, tail in triplet_copy:
            #if head not in current_dialogue_text and tail not in current_dialogue_text:
            if head in labelmaps and tail in labelmaps:
                head_label_map = ""
                tail_label_map = ""
                for labelmap in labelmaps[head]:
                    if labelmap in current_dialogue_text:
                        head_label_map = labelmap
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((head, "refers to same concept as", labelmap))
                        break
                for labelmap in labelmaps[tail]:
                    if labelmap in current_dialogue_text:
                        tail_label_map = labelmap
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((tail, "refers to same concept as", labelmap))
                        break
            elif head in labelmaps:
                for labelmap in labelmaps[head]:
                    if labelmap in current_dialogue_text:
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((head, "refers to same concept as", labelmap))
                        break
            elif tail in labelmaps:
                for labelmap in labelmaps[tail]:
                    if labelmap in current_dialogue_text:
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((tail, "refers to same concept as", labelmap))
                        break
            

        dialogue_term_dict[split][dial_id]["terms"] = list(set([term for term in current_dialogue_terms if present_in_utterance(term, current_dialogue_text)]))
        
        dialogue_term_dict[split][dial_id]["relational triplets"] = list(set(current_dialogue_relational_triplets))


100%|██████████| 8438/8438 [00:19<00:00, 425.80it/s]
100%|██████████| 1000/1000 [00:02<00:00, 364.06it/s]
100%|██████████| 1000/1000 [00:02<00:00, 361.14it/s]


In [49]:
#save the dialogue_term_dict
with Path("./multiwoz21_dialogue_term_dict.json").open("w") as f:
	json.dump(dialogue_term_dict, f, indent=4)


### Now define replacements of slot names in order for them to be more likely to be present in the data

In [17]:
slotname_replacements ={
    "ref": "reference number",
    "book day": "day",
    "book people": "people",
    "book time": "time",
    "book stay": "stay",
}

In [23]:
dialogue_term_dict_with_replaced_slotnames = {}
#go through the data and replace the slotnames with the replacements in the relation triplets, if they are then present in dialogues, then add them to the list as well
for split in splits:
	dialogue_term_dict_with_replaced_slotnames[split] = {}
	for dial_id, dialogue in tqdm(dialogue_term_dict[split].items()):
		dialogue_term_dict_with_replaced_slotnames[split][dial_id] = {}
		text = dialogue['text']
		terms = dialogue['terms']
		relations = dialogue['relational triplets']
		relations_with_replaced_slotnames = []
		replaced_slotname_terms = set()
		for rel in relations:
			#has value relations
			relation = copy.deepcopy(rel)
			if relation[0] in slotname_replacements:
				relation = (slotname_replacements[relation[0]], relation[1], relation[2])
				#check that the term is present in the dialogue text
				if present_in_utterance(relation[0], text):
					replaced_slotname_terms.add(relation[0])
			#has slot relations
			if relation[2] in slotname_replacements:
				relation = (relation[0], relation[1], slotname_replacements[relation[2]])
				#check that the term is present in the dialogue text
				if present_in_utterance(relation[2], text):
					replaced_slotname_terms.add(relation[2])
			relations_with_replaced_slotnames.append(relation)

		dialogue_term_dict_with_replaced_slotnames[split][dial_id]['text'] = text
		dialogue_term_dict_with_replaced_slotnames[split][dial_id]['terms'] = list(set(terms + list(replaced_slotname_terms)))
		dialogue_term_dict_with_replaced_slotnames[split][dial_id]['relational triplets'] = list(set(tuple(x) for x in relations_with_replaced_slotnames))






100%|██████████| 8438/8438 [00:04<00:00, 2030.83it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1740.09it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1779.44it/s]


### Next add label maps for slots

In [27]:
#label maps for the slots

slot_labelmaps = {
    "people": ["ticket", "tickets"],
    "stay": ["night", "nights"],
    "reference number": ["ref", "reference"],

    "leave at": ["leave", "leaves", "departs", "departing", "leaving", "depart"],
    "arrive by": ["arrive", "arrives", "arriving", "arrival"],
    "stars": ["star", "rating"],
    "area": ["part of town", "part of the city"],
    "internet": ["wifi"],
    "phone": ["telephone", "number"],
    "price range": ["price", "priced"],
    "price": ["cost", "costs", "fee"],
    "entrance fee": ["fee", "cost", "costs"],
    "departure": ["departs", "depart", "departing", "leaving", "leave", "leaves" ],
    "destination": ["arrives", "arriving", "arrival", "arrive"],
    "postcode": ["postal"],   
    "food": ["cuisine"],

    
}

In [28]:
#add them as refers to same relations to the dict
dialogue_term_dict_with_replaced_slotnames_with_labelmaps = {}
for split in splits:
	dialogue_term_dict_with_replaced_slotnames_with_labelmaps[split] = {}
	for dial_id, dialogue in tqdm(dialogue_term_dict_with_replaced_slotnames[split].items()):
		dialogue_term_dict_with_replaced_slotnames_with_labelmaps[split][dial_id] = {}
		text = dialogue['text']
		terms = dialogue['terms']
		relations = dialogue['relational triplets']
		relations_with_labelmaps = []
		additional_terms_with_labelmaps = set()
		for head, rel, tail in relations:
			#check if the head or tail is in the labelmaps
			if head in slot_labelmaps:
				for term in slot_labelmaps[head]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((head, "refers to same concept as", term))
			if tail in slot_labelmaps:
				for term in slot_labelmaps[tail]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((tail, "refers to same concept as", term))
			

		dialogue_term_dict_with_replaced_slotnames_with_labelmaps[split][dial_id]['text'] = text
		dialogue_term_dict_with_replaced_slotnames_with_labelmaps[split][dial_id]['terms'] = list(set(list(terms) + list(additional_terms_with_labelmaps)))
		dialogue_term_dict_with_replaced_slotnames_with_labelmaps[split][dial_id]['relational triplets'] = list(set(tuple(x) for x in relations_with_labelmaps + relations))

100%|██████████| 8438/8438 [00:12<00:00, 687.53it/s]
100%|██████████| 1000/1000 [00:01<00:00, 540.63it/s]
100%|██████████| 1000/1000 [00:01<00:00, 545.26it/s]


In [32]:
#save the new multiwoz term dict
with open("./multiwoz21_term_dict_with_replaced_slotnames_with_labelmaps.json", "w") as f:
	json.dump(dialogue_term_dict_with_replaced_slotnames_with_labelmaps, f, indent=4)

### Next prepare the SGD data

In [4]:
dataset = load_dataset("sgd")
ontology = load_ontology("sgd")

In [5]:
splits = ["train", "validation", "test"]

In [11]:
#make a dict that maps the domain-slot pairs to their values
slot_value_dict = {}
ontology_dict = {}
for dom, content in ontology["domains"].items():
    domain = dom.split("_")[0]
    ontology_dict[domain] = {}
    for sl, values in content["slots"].items():
        slot = " ".join(sl.split("_"))
        if "possible_values" in values:
            ontology_dict[domain][slot] = set(values["possible_values"])
            slot_value_dict[domain + '-' + slot] = set(values['possible_values'])
        else:
            ontology_dict[domain][slot] = set()
            slot_value_dict[domain + '-' + slot] = set()

### load the SGD label maps

In [6]:
with Path("./sgd_label_maps.json").open("r") as f:
	labelmaps = json.load(f)



#tokenize all the label maps
tokenized_labelmaps = {}
for key, values in labelmaps.items():
	tokenized_key = " ".join(tokenize(key))
	tokenized_labelmaps[tokenized_key] = [" ".join(tokenize(value)) for value in values]

labelmaps = tokenized_labelmaps

#for each label map make the list into a set
for key, values in labelmaps.items():
	labelmaps[key] = list(set(values))
	#if the key itself is in the values, remove it
	if key in labelmaps[key]:
		labelmaps[key].remove(key)



In [23]:
#go through the validation set and extract the slot types and values that appear in the user turns to get the validation ontology
ontology = {}
#also make a dict that maps the values to the utterances they appear in
slot_value_utterance_dict = {}
dialogue_turn_slot_value_dict = {}
for split in splits:
    dialogue_turn_slot_value_dict[split] = {}
    for dialog in tqdm(dataset[split]):
        dial_id = dialog["dialogue_id"]
        dialogue_turn_slot_value_dict[split][dial_id] = {}
        for turn in dialog['turns']:
            turn_id = turn["utt_idx"]
            dialogue_turn_slot_value_dict[split][dial_id][turn_id] = {}

            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["speaker"] = turn["speaker"]
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["utterance"] = " ".join(tokenize(turn["utterance"]))
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["dialogue_acts"] = turn["dialogue_acts"]
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"] = []
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"] = []
            dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"] = []

            for cat, values in turn["dialogue_acts"].items():
                if cat != "binary":
                    if turn["speaker"] == "user" or turn["speaker"] == "system":
                        for val in values:
                            domain = val["domain"].split("_")[0]
                            slot = " ".join(val["slot"].split("_"))
                            if slot not in ontology_dict[domain]:
                                ontology_dict[domain][slot] = set()
                            ontology_dict[domain][slot].add(val["value"])
                            if domain + ' ' + slot not in ontology:
                                ontology[domain + ' ' + slot] = set()
                                slot_value_utterance_dict[domain + ' ' + slot] = {}
                            tokenized_value = " ".join(tokenize(val["value"]))
                            ontology[domain + ' ' + slot].add(tokenized_value)
                            if tokenized_value not in slot_value_utterance_dict[domain + ' ' + slot]:
                                slot_value_utterance_dict[domain + ' ' + slot][tokenized_value] = []
                            slot_value_utterance_dict[domain + ' ' + slot][tokenized_value].append((turn["speaker"], (" ".join(tokenize(turn["utterance"])), dial_id, turn_id, turn["dialogue_acts"])))
                        
                    #add all the utterances with domains, slots and values to the dialogue_turn_slot_value_dict
                    for val in values:
                        domain = val["domain"].split("_")[0]
                        slot = " ".join(val["slot"].split("_"))
                        dom = " ".join(tokenize(domain))
                        slot = " ".join(tokenize(slot))
                        value = " ".join(tokenize(val["value"]))
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"].extend([dom, slot, value])
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"].extend([dom + " " + slot] * 3)
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"].append((dom, slot, value))

                else: #also include binary dialogue acts as they containt domain-slot relations
                    if turn["speaker"] == "user" or turn["speaker"] == "system":
                        for val in values:
                            domain = val["domain"].split("_")[0]
                            if domain == "general" or not domain:
                                continue
                            slot = " ".join(val["slot"].split("_"))
                            if slot not in ontology_dict[domain]:
                                ontology_dict[domain][slot] = set()
                            if domain + ' ' + slot not in ontology:
                                ontology[domain + ' ' + slot] = set()
                                slot_value_utterance_dict[domain + ' ' + slot] = {}
                        
                    #add all the utterances with domains, slots and values to the dialogue_turn_slot_value_dict
                    for val in values:
                        domain = val["domain"].split("_")[0]
                        slot = " ".join(val["slot"].split("_"))
                        dom = " ".join(tokenize(domain))
                        if dom == "general" or not domain:
                            continue
                        slot = " ".join(tokenize(slot))
                        value = ""
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalues"].extend([dom, slot, value])
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotlabels"].extend([dom + " " + slot] * 3)
                        dialogue_turn_slot_value_dict[split][dial_id][turn_id]["slotvalue pairs"].append((dom, slot, value))
                


100%|██████████| 16142/16142 [00:29<00:00, 549.69it/s]
100%|██████████| 2482/2482 [00:03<00:00, 638.02it/s]
100%|██████████| 4201/4201 [00:09<00:00, 439.31it/s] 


In [28]:
###accumulate the list of all terms that appear in a dialogue and their relations to other terms
dialogue_term_dict = {}
for split in splits:
    dialogue_term_dict[split] = {}
    for dial_id, turns in tqdm(dialogue_turn_slot_value_dict[split].items()):
        current_dialogue_text = ""
        current_dialogue_terms = []
        current_dialogue_relational_triplets = []
        for turn_id, turn in turns.items():
            turn_text = '"' + turn['speaker'] + '": "' + turn["utterance"] + '"\n'
            current_dialogue_text += turn_text
            if turn["speaker"] == "user" or turn["speaker"] == "system":
                current_dialogue_terms.extend([val for val in turn["slotvalues"] if val])
                for slotval_triplet in turn["slotvalue pairs"]:
                    if slotval_triplet[1]: #some cases where the slot is empty in binary dialogue act
                        current_dialogue_relational_triplets.append((slotval_triplet[0], "has slot", slotval_triplet[1]))
                    if slotval_triplet[2]: #it is empty for the triples from the binary slots where only the domain-slot relation is used
                        current_dialogue_relational_triplets.append((slotval_triplet[1], "has value", slotval_triplet[2]))
                        current_dialogue_relational_triplets.append((slotval_triplet[2], "has domain", slotval_triplet[0]))
        dialogue_term_dict[split][dial_id] = {}
        dialogue_term_dict[split][dial_id]["text"] = current_dialogue_text

        #check those values that are not present in the text whether there is a label map present
        triplet_copy = copy.deepcopy(current_dialogue_relational_triplets)
        for head, relation, tail in triplet_copy:
            #if head not in current_dialogue_text and tail not in current_dialogue_text:
            if head in labelmaps and tail in labelmaps:
                head_label_map = ""
                tail_label_map = ""
                for labelmap in labelmaps[head]:
                    if labelmap in current_dialogue_text:
                        head_label_map = labelmap
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((head, "refers to same concept as", labelmap))
                        break
                for labelmap in labelmaps[tail]:
                    if labelmap in current_dialogue_text:
                        tail_label_map = labelmap
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((tail, "refers to same concept as", labelmap))
                        break
            elif head in labelmaps:
                for labelmap in labelmaps[head]:
                    if labelmap in current_dialogue_text:
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((head, "refers to same concept as", labelmap))
                        break
            elif tail in labelmaps:
                for labelmap in labelmaps[tail]:
                    if labelmap in current_dialogue_text:
                        current_dialogue_terms.append(labelmap)
                        current_dialogue_relational_triplets.append((tail, "refers to same concept as", labelmap))
                        break


        dialogue_term_dict[split][dial_id]["terms"] = list(set([term for term in current_dialogue_terms if present_in_utterance(term, current_dialogue_text)]))
        
        dialogue_term_dict[split][dial_id]["relational triplets"] = list(set(current_dialogue_relational_triplets))


100%|██████████| 16142/16142 [00:58<00:00, 274.90it/s]
100%|██████████| 2482/2482 [00:07<00:00, 311.96it/s]
100%|██████████| 4201/4201 [00:15<00:00, 276.43it/s]


In [31]:
#save the dialogue_term_dict
with Path("./sgd_dialogue_term_dict.json").open("w") as f:
	json.dump(dialogue_term_dict, f, indent=4)


### Get label maps for domains and slots

In [10]:
#get the has slot relations of a domain
def get_has_slot_relations_of_domain(domain, dialogue_term_relation_dict):
	has_slot_relations = set()
	for split in dialogue_term_relation_dict.keys():    
		for dialogue_id, dialogue in dialogue_term_relation_dict[split].items():
			dialogue_text = dialogue["text"]
			dialogue_terms = dialogue["terms"]
			dialogue_relations = dialogue["relational triplets"]
			for relation in dialogue_relations:
				if relation[0] == domain and relation[1] == 'has slot':
					has_slot_relations.add(tuple(relation))
                
				
				
	return has_slot_relations
             
        
        

In [13]:
print(get_has_slot_relations_of_domain("media", dialogue_term_dict))

{('media', 'has slot', 'subtitles'), ('media', 'has slot', 'count'), ('media', 'has slot', 'director'), ('media', 'has slot', 'actors'), ('media', 'has slot', 'directed by'), ('media', 'has slot', 'price'), ('media', 'has slot', 'findmovies'), ('media', 'has slot', 'starring'), ('media', 'has slot', 'genre'), ('media', 'has slot', 'title'), ('media', 'has slot', 'playmovie'), ('media', 'has slot', 'rentmovie'), ('media', 'has slot', 'movie name'), ('media', 'has slot', 'subtitle language')}


In [14]:
#check dialogues that contain the problematic domains, but also use it for slots later on
def check_dialogues_with_problematic_terms(term, dialogue_term_relation_dict):
    #print the first few dialogues that contain the problematic term in a relation
	for dialogue_id, dialogue in dialogue_term_relation_dict.items():
		dialogue_text = dialogue["text"]
		dialogue_terms = dialogue["terms"]
		dialogue_relations = dialogue["relational triplets"]
		for relation in dialogue_relations:
			if term in relation:
				print(f"dialogue id: {dialogue_id}")
				print(f"dialogue: {dialogue_text}")
				print(f"terms: {dialogue_terms}")
				print(f"relations: {dialogue_relations}\n")
				break


In [15]:
check_dialogues_with_problematic_terms("media", dialogue_term_dict["test"])

dialogue id: sgd-test-1160
dialogue: "user": "i have some free time and i like to watch a movie like close encounters with english subtitles for which i need your help ."
"system": "confirm to play the movie close encounters of the third kind with english subtitles ."
"user": "yes , that is confirmed to proceed ."
"system": "movie was started playing for you now ."
"user": "thanks and let me know the type of movie ."
"system": "it is sci - fi movie ."
"user": "who are the cast for this movie ?"
"system": "stephen powers acted in lead role ."
"user": "thanks for the information ."
"system": "anything else you need ?"
"user": "no , thanks a lot ."
"system": "have a great day ."

terms: ['close encounters', 'close encounters of the third kind', 'english', 'sci - fi']
relations: [['media', 'has slot', 'title'], ['title', 'has value', 'close encounters'], ['starring', 'has value', 'stephen powers'], ['media', 'has slot', 'subtitle language'], ['title', 'has value', 'close encounters of the 

In [16]:
#label maps for the domains

domain_labelmaps = {
    "restaurants": ["restaurant"],
    "hotels": ["hotel"],
    "homes": ["home", "house", "apartment", "property"],
    "flights": ["flight"],
    "trains": ["train"],
    "buses": ["bus"],
    "banks": ["bank", "balance", "account"],
    "movies": ["movie"],
    "services": ["service", "dentist", "practitioner", "physician", "stylist", "provider"],
    "ridesharing": ["ride"],
    "rentalcars": ["rental car", "car rental", "car", "rentalcar"],
    "travel": ["travel", "trip", "vacation", "holiday", "event", "events", "attraction", "attractions"],
    "messaging": ["message", "share", "send"],
    "media": ["movie", "movies"]
}

In [17]:
splits = ["train", "validation", "test"]

In [18]:
#add them as refers to same relations to the dict
dialogue_term_dict_with_domain_labelmaps = {}
for split in splits:
	dialogue_term_dict_with_domain_labelmaps[split] = {}
	for dial_id, dialogue in tqdm(dialogue_term_dict[split].items()):
		dialogue_term_dict_with_domain_labelmaps[split][dial_id] = {}
		text = dialogue['text']
		terms = dialogue['terms']
		relations = dialogue['relational triplets']
		relations_with_labelmaps = []
		additional_terms_with_labelmaps = set()
		for head, rel, tail in relations:
			#check if the head or tail is in the labelmaps
			if head in domain_labelmaps:
				for term in domain_labelmaps[head]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((head, "refers to same concept as", term))
			if tail in domain_labelmaps:
				for term in domain_labelmaps[tail]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((tail, "refers to same concept as", term))
			

		dialogue_term_dict_with_domain_labelmaps[split][dial_id]['text'] = text
		dialogue_term_dict_with_domain_labelmaps[split][dial_id]['terms'] = list(set(list(terms) + list(additional_terms_with_labelmaps)))
		dialogue_term_dict_with_domain_labelmaps[split][dial_id]['relational triplets'] = list(set(tuple(x) for x in relations_with_labelmaps + relations))

100%|██████████| 16142/16142 [00:27<00:00, 586.02it/s] 
100%|██████████| 2482/2482 [00:04<00:00, 611.66it/s] 
100%|██████████| 4201/4201 [00:08<00:00, 521.39it/s] 


In [None]:
#save the new multiwoz term dict
with open("./sgd_term_dict_with_domain_labelmaps.json", "w") as f:
	json.dump(dialogue_term_dict_with_domain_labelmaps, f, indent=4)

### Next add label maps for slots

In [69]:
check_dialogues_with_problematic_terms("from location", dialogue_term_dict["validation"])

dialogue id: sgd-validation-207
dialogue: "user": "i need a bus for my planned trip ."
"system": "what is your destination ? where are you leaving from and when ?"
"user": "the date is march 14th and the destination is sacramento . the departure is from las vegas ."
"system": "the bus leaves at 7 : 20 am and the ticket is $ 83 . it has 1 transfers ."
"user": "what is the departure bus station ?"
"system": "it ' s union plaza ."
"user": "it may work for me ."
"system": "may i offer you to buy the tickets on your behalf ?"
"user": "yes please , that would be awesome ."
"system": "how many do you need ?"
"user": "i need tickets for two people ."
"system": "read carefully the information in my possession and tell me if you agree with them . you want me to buy 2 tickets for a bus leaving from las vegas to sacramento . the date is march 14th at 7 : 20 am ."
"user": "yes , it is what i need ."
"system": "tickets are yours ."
"user": "i ' m thankful for your services , i don ' t need you anymo

In [70]:
#label maps for the slots

slot_labelmaps = {'restaurant name': ["name"], 
                  'destination': [], 
                  'street address': ["address", "location", "located"], 
                  'ride fare': ["cost", "costs", "price"], 
                  'event name': ["name"], 
                  'hotel name': ["name"], 
                  'phone number': ["phone", "number"], 
                  'address': ["location", "located"], 
                  'song name': ["name", "song", "songs"], 
                  'event location': ["location", "located", "address"], 
                  'outbound arrival time': ["time", "times"], 
                  'property name': ["name"], 
                  'doctor name': ["name"],
                  'attraction name': ["name"],
                  'fare': ["cost", "costs", "price"],
                  'outbound departure time': ["time", "times"],
                  'inbound departure time': ["time", "times"],
                  'amount': [],
                  'artist': [],
                  'price': ["cost", "costs"],
                  'inbound arrival time': ["time", "times"],
                  'balance': [],
                  'account balance': ["account", "balance"],
                  'leaving time': ["time", "times"],
                  'starring': ["cast", "crew"],
                  'track': [],
                  'departure time': ["time", "times"],
                  'transfer amount': ["transfer", "amount"],
                  'pickup time': ["time", "times"],
                  'total price': ["cost", "costs"],
                  'place name': ["name"],
                  'event time': ["time", "times"],
                  'stylist name': ["name"],
                  'contact name': ["name"],
                  'theater name': ["name"],
                  'dentist name': ["name"],
                  'movie name': ["name"],
                  'therapist name': ["name"],
                  'title': [],
                  'appointment time': ["time", "times"],
                  'venue address': ["address", "location", "located"],
                  'cast': [],
                  'category': ["type of food", "kind of food"],
                  'address of location': ["address", "location", "located"],
                  'actors': [], 
                  'price per night': ["cost", "costs"], 
                  'show time': ["time", "times"], 
                  'date of journey': ["date"], 
                  'show date': ["date"], 
                  'total': [], 
                  'start date': ["date"], 
                  'car name': ["name"], 
                  'pickup location': [], 
                  'pickup city': ["city"], 
                  'dropoff date': ["date"], 
                  'event date': ["date"], 
                  'visit date': ["date"], 
                  'end date': ["date"], 
                  'leaving date': ["date"], 
                  'city of event': ["city"], 
                  'where to': [], 
                  'origin airport': [], 
                  'available start time': ["time", "times"], 
                  'destination airport': [], 
                  'new alarm time': ["time", "times"], 
                  'appointment date': ["date"], 
                  'movie title': [], 
                  'journey start time': ["time", "times"], 
                  'destination city': ["destination", "city"], 
                  'origin city': ["city"], 
                  'location': [], 
                  'from location': ["departure", "from"], 
                  'cuisine': [], 
                  'to location': ["destination", "to"], 
                  'origin': [], 
                  'from city': ["from", "departure", "city"], 
                  'available end time': ["time", "times"], 
                  'to city': ["to", "city", "destination"], 
                  'aggregate rating': [], 
                  'from station': [], 
                  'to station': [], 
                  'pickup date': ["date"], 
                  'director': ["directed by"], 
                  'approximate ride duration': [], 
                  'stay length': [], 
                  'venue': [], 
                  'precipitation': [], 
                  'rent': [], 
                  'price per day': ["cost", "costs"], 
                  'check out date': ["date"], 
                  'alarm name': ["name"], 
                  'percent rating': ["name"], 
                  'genre': [], 
                  'origin airport name': ["name"], 
                  'date': ["date"], 
                  'city': [], 
                  'return date': ["date"], 
                  'subcategory': [], 
                  'alarm time': ["time", "times"], 
                  'check in date': ["date"], 
                  'recipient name': ["name"], 
                  'recipient account name': ["name"], 
                  'destination airport name': ["name"], 
                  'origin station name': ["name"], 
                  'destination station name': ["name"], 
                  'album': [], 
                  'departure date': ["date"], 
                  'area': [], 
                  'new alarm name': ["name"], 
                  'number of days': [], 
                  'receiver': [], 
                  'subtitle language': [], 
                  'party size': [], 
                  'directed by': ["director"], 
                  'number of adults': [], 
                  'flight class': [], 
                  'average rating': [], 
                  'num passengers': [], 
                  'playback device': [], 
                  'fare type': [], 
                  'group size': [], 
                  'count': [], 
                  'number of riders': [], 
                  'recipient account type': [], 
                  'time': ["times"], 
                  'wait time': ["time", "times"], 
                  'airlines': [], 
                  'payment method': [], 
                  'number checked bags': [], 
                  'price per ticket': ["cost", "costs"], 
                  'seating class': [], 
                  'has live music': [], 
                  'car type': [], 
                  'private visibility': [], 
                  'in unit laundry': [], 
                  'temperature': [], 
                  'number of beds': [], 
                  'travelers': [], 
                  'transfer time': ["time", "times"], 
                  'has garage': [], 
                  'serves alcohol': [], 
                  'number stops': [], 
                  'pets allowed': [], 
                  'intent': [], 
                  'free entry': [], 
                  'is nonstop': [], 
                  'add insurance': [], 
                  'has wifi': [], 
                  'refundable': [], 
                  'has seating outdoors': [], 
                  'furnished': [], 
                  'pets welcome': [], 
                  'account type': [], 
                  'offers cosmetic services': [], 
                  'arrives next day': [], 
                  'additional luggage': [], 
                  'is redeye': [], 
                  'good for kids': [], 
                  'has vegetarian options': [], 
                  'type': [], 
                  'rating': [], 
                  'show type': [], 
                  'device': [], 
                  'price range': [], 
                  'class': [], 
                  'trip protection': [], 
                  'number of baths': [], 
                  'has laundry service': [], 
                  'smoking allowed': [], 
                  'subtitles': [], 
                  'is unisex': [], 
                  'event type': [],
                  }
                  

In [77]:
#print slot labelmaps that are not empty
for key, value in slot_labelmaps.items():
	if value:
		print(f"'{key}': {value},")

'restaurant name': ['name'],
'street address': ['address', 'location', 'located'],
'ride fare': ['cost', 'costs', 'price'],
'event name': ['name'],
'hotel name': ['name'],
'phone number': ['phone', 'number'],
'address': ['location', 'located'],
'song name': ['name', 'song', 'songs'],
'event location': ['location', 'located', 'address'],
'outbound arrival time': ['time', 'times'],
'property name': ['name'],
'doctor name': ['name'],
'attraction name': ['name'],
'fare': ['cost', 'costs', 'price'],
'outbound departure time': ['time', 'times'],
'inbound departure time': ['time', 'times'],
'price': ['cost', 'costs'],
'inbound arrival time': ['time', 'times'],
'account balance': ['account', 'balance'],
'leaving time': ['time', 'times'],
'starring': ['cast', 'crew'],
'departure time': ['time', 'times'],
'transfer amount': ['transfer', 'amount'],
'pickup time': ['time', 'times'],
'total price': ['cost', 'costs'],
'place name': ['name'],
'event time': ['time', 'times'],
'stylist name': ['name'],

In [71]:
#add them as refers to same relations to the dict
dialogue_term_dict_with_domain_and_slot_labelmaps = {}
for split in splits:
	dialogue_term_dict_with_domain_and_slot_labelmaps[split] = {}
	for dial_id, dialogue in tqdm(dialogue_term_dict_with_domain_labelmaps[split].items()):
		dialogue_term_dict_with_domain_and_slot_labelmaps[split][dial_id] = {}
		text = dialogue['text']
		terms = dialogue['terms']
		relations = dialogue['relational triplets']
		relations_with_labelmaps = []
		additional_terms_with_labelmaps = set()
		for head, rel, tail in relations:
			#check if the head or tail is in the labelmaps
			if head in slot_labelmaps:
				for term in slot_labelmaps[head]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((head, "refers to same concept as", term))
			if tail in slot_labelmaps:
				for term in slot_labelmaps[tail]:
					if present_in_utterance(term, text):
						additional_terms_with_labelmaps.add(term)
						relations_with_labelmaps.append((tail, "refers to same concept as", term))
			

		dialogue_term_dict_with_domain_and_slot_labelmaps[split][dial_id]['text'] = text
		dialogue_term_dict_with_domain_and_slot_labelmaps[split][dial_id]['terms'] = list(set(list(terms) + list(additional_terms_with_labelmaps)))
		dialogue_term_dict_with_domain_and_slot_labelmaps[split][dial_id]['relational triplets'] = list(set(tuple(x) for x in relations_with_labelmaps + relations))

100%|██████████| 16142/16142 [00:06<00:00, 2316.97it/s]
100%|██████████| 2482/2482 [00:00<00:00, 2508.48it/s]
100%|██████████| 4201/4201 [00:01<00:00, 2301.24it/s]


In [74]:
#save the new sgd term dict
with open("./sgd_term_dict_with_domain_and_slot_labelmaps.json", "w") as f:
	json.dump(dialogue_term_dict_with_domain_and_slot_labelmaps, f, indent=4)

print an example

In [79]:
dial_id = "sgd-train-0"
print(dialogue_term_dict_with_domain_and_slot_labelmaps["train"][dial_id]["text"])
print(f"termlist: \n{dialogue_term_dict_with_domain_and_slot_labelmaps['train'][dial_id]['terms']} \n")
print(f"relation triplets: \n{dialogue_term_dict_with_domain_and_slot_labelmaps['train'][dial_id]['relational triplets']} \n")

"user": "i am feeling hungry so i would like to find a place to eat ."
"system": "do you have a specific which you want the eating place to be located at ?"
"user": "i would like for it to be in san jose ."
"system": "is there a specific cuisine type you enjoy , such as mexican , italian or something else ?"
"user": "i usually like eating the american type of food ."
"system": "i see that at 71 saint peter there is a good restaurant which is in san jose ."
"user": "can you give me the address of this restaurant ."
"system": "if you want to go to this restaurant you can find it at 71 north san pedro street ."
"user": "can you give me the phone number that i can contact them with ?"
"system": "if you want to phone them you can at 408 - 971 - 8523 ."
"user": "is there some other restaurant which you can suggest ?"
"system": "how would you like bazille restaurant which is situated in san jose ."
"user": "do you have another restaurant matching my needs ? for example a restaurant which is e