In [1]:
from tqdm import tqdm
import numpy as np
import os
import pickle

In [2]:
from utils.utils_general import *
from utils.utils_multiwoz import *

In [3]:
def evaluate_and_dump(args, model, loader, dump_keys=[], add_name="multiwoz"):
    f_w = open(os.path.join(args["output_dir"], "tst_results.txt"), "w")
    # Start evaluating on the test set
    prediction_output = []
    test_loss = 0
    preds, labels = [], []
    pbar = tqdm(loader)
    for d in pbar:
        with torch.no_grad():
            outputs = model(d)
        test_loss += outputs["loss"]
        d_out = {}
        for key in dump_keys:
             d_out[key] = d[key]
        d_out["preds"] = [item for item in outputs["pred"]]
        d_out["labels"] = [item for item in outputs["label"]]
    
        d_out_reshape = []
        for i in range(len(d_out["preds"])):
            item = {}
            for k, v in d_out.items():
                item[k] = v[i]
            d_out_reshape.append(item)
            
        prediction_output += d_out_reshape

        preds += [item for item in outputs["pred"]]
        labels += [item for item in outputs["label"]] 

    with open(os.path.join(args["output_dir"], "prediction-{}.pkl".format(add_name)), "wb") as f_out:
        pickle.dump(prediction_output, f_out)

    test_loss = test_loss / len(tst_loader)
    results = model.evaluation(preds, labels)
    f_w.write(str(results))
    f_w.close()

# Dialogue State Tracking

In [4]:
args = {
    "task":"dst",
    "task_name":"dst",
    "output_dir": "save/dst/",
    "load_path": "/export/share/jason/ToD-BERT/ToD-BERT-jnt-V1-DST/pytorch_model.bin",
    
    "model_type": "bert",
    "model_name_or_path": "TODBERT/TOD-BERT-JNT-V1",
    "data_path": "/export/share/datasets/dialogues/", #"/export/home/dialog_datasets/",
    "max_line": None,
    "ontology_version": "",
    "only_last_turn":False,
    "batch_size":6,
    "max_seq_length":512,
    "usr_token":"[USR]",
    "sys_token":"[SYS]",
    "example_type":"turn"
}

In [5]:
# Reading data and create data loaderz
datasets = {}
data_trn, data_dev, data_tst, data_meta = prepare_data_multiwoz(args)
datasets["multiwoz"] = {"train": data_trn, "dev":data_dev, "test": data_tst, "meta":data_meta}

unified_meta = get_unified_meta(datasets)  
args["unified_meta"] = unified_meta

[Info] Using Version 2.1
[Info] Load from old complete ontology from version ...
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/train_dials.json for read_langs_turn
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/dev_dials.json for read_langs_turn
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/test_dials.json for read_langs_turn
Read 56668 pairs train from MultiWOZ
Read 7374 pairs valid from MultiWOZ
Read 7368 pairs test from MultiWOZ
args["task_name"] dst


In [6]:
from models.BERT_DST_Picklist import *

model_class, tokenizer_class, config_class = BertModel, BertTokenizer, BertConfig
tokenizer = tokenizer_class.from_pretrained(args["model_name_or_path"])
config = config_class.from_pretrained(args["model_name_or_path"]) if args["model_name_or_path"] else config_class()
args["model_class"] = model_class
args["tokenizer"] = tokenizer
args["config"] = config

args["num_labels"] = unified_meta["num_labels"]
model = BeliefTracker(args)
    
if args["load_path"]:
    print("MODEL {} LOADED".format(args["load_path"]))
    if torch.cuda.is_available(): 
        model.load_state_dict(torch.load(args["load_path"]))
    else:
        model.load_state_dict(torch.load(args["load_path"], lambda storage, loc: storage))

if torch.cuda.is_available():
    model = model.cuda()

[Info] SV Encoder does not requires grad...
Complete initialization of slot and value lookup
MODEL /export/share/jason/ToD-BERT/ToD-BERT-jnt-V1-DST/pytorch_model.bin LOADED


In [7]:
print("[Info] Start Evaluation on test set...")
tst_loader = get_loader(args, "test" , tokenizer, datasets, unified_meta, shuffle=args["task_name"]=="rs")
model.eval()

if not os.path.exists(args["output_dir"]): os.makedirs(args["output_dir"])
    
evaluate_and_dump(args, model, tst_loader, dump_keys=["context_plain", "ID", "turn_id", "belief"])

  0%|          | 0/1228 [00:00<?, ?it/s]

[Info] Start Evaluation on test set...


100%|██████████| 1228/1228 [01:33<00:00, 13.10it/s]


Results 1:  {'joint_acc': 0.4799131378935939, 'slot_acc': 0.96900108577633}


## Test on Slices

In [8]:
prediction = pickle.load(open(os.path.join(args["output_dir"], "prediction-multiwoz.pkl"), "rb"))

In [9]:
SLOTS = ['hotel-pricerange','hotel-type','hotel-parking','hotel-book stay','hotel-book day','hotel-book people',
         'hotel-area','hotel-stars','hotel-internet','train-destination','train-day','train-departure','train-arriveby',
         'train-book people','train-leaveat','attraction-area','restaurant-food','restaurant-pricerange','restaurant-area',
         'attraction-name','restaurant-name','attraction-type','hotel-name','taxi-leaveat','taxi-destination', 'taxi-departure',
         'restaurant-book time','restaurant-book day','restaurant-book people','taxi-arriveby']

prediction[:5]

[{'context_plain': '[SYS]  [USR] i would like a taxi from saint john s college to pizza hut fen ditton . [SYS] what time do you want to leave and what time do you want to arrive by ? [USR] i want to leave after 17:15 . [SYS] booking completed ! your taxi will be blue honda contact number is 07218068540 [USR] thank you for all the help ! i appreciate it . [SEP]  [SYS] you are welcome . is there anything else i can help you with today ? [USR] no , i am all set . have a nice day . bye .',
  'ID': 'SNG0073.json',
  'turn_id': 3,
  'belief': {'taxi-leaveat': '17:15',
   'taxi-destination': 'pizza hut fenditton',
   'taxi-departure': 'saint johns college'},
  'preds': array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  23, 297, 300,
           0,   0,   0,   0]),
  'labels': array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  23, 297,

### Domain Slice (whether the true belief label contains a certain domain)

In [10]:
for domain in ["hotel", "train", "restaurant", "attraction", "taxi"]:
    preds, labels = [], []
    total = 0
    for data in prediction:
        total += 1
        if domain in [k.split("-")[0] for k in data["belief"].keys()]:
            preds += [data["preds"]]
            labels += [data["labels"]]
    print("Domain [{}] Ratio: {:.4f}".format(domain, len(labels)/total))
    if len(preds) != 0:
        results = model.evaluation(preds, labels) 


Domain [hotel] Ratio: 0.3534
Results 1:  {'joint_acc': 0.3302611367127496, 'slot_acc': 0.954889912954429}
Domain [train] Ratio: 0.4007
Results 1:  {'joint_acc': 0.49220867208672087, 'slot_acc': 0.9720415537488708}
Domain [restaurant] Ratio: 0.3920
Results 1:  {'joint_acc': 0.41481994459833793, 'slot_acc': 0.9623037857802401}
Domain [attraction] Ratio: 0.3313
Results 1:  {'joint_acc': 0.40925850061450225, 'slot_acc': 0.9628704083026082}
Domain [taxi] Ratio: 0.0871
Results 1:  {'joint_acc': 0.16510903426791276, 'slot_acc': 0.9235202492211838}


### Slot Slice (whether the true belief label contains a certain slot)

In [11]:
for slot in ['leaveat', 'type', 'stars', 'book day', 'day', 'book time', 'book people', 'arriveby', 'destination', 'book stay', 'food', 'pricerange', 'name', 'parking', 'departure', 'area', 'internet']:
    preds, labels = [], []
    total = 0
    for data in prediction:
        total += 1
        if slot in [k.split("-")[1] for k in data["belief"].keys()]:
            preds += [data["preds"]]
            labels += [data["labels"]]
    print("Slot [{}] Ratio: {:.4f}".format(slot, len(labels)/total))
    if len(preds) != 0:
        results = model.evaluation(preds, labels) 


Slot [leaveat] Ratio: 0.2189
Results 1:  {'joint_acc': 0.33477991320520767, 'slot_acc': 0.9553006819590825}
Slot [type] Ratio: 0.3825
Results 1:  {'joint_acc': 0.3388928317955997, 'slot_acc': 0.957049917198959}
Slot [stars] Ratio: 0.1623
Results 1:  {'joint_acc': 0.24665551839464883, 'slot_acc': 0.9455964325529543}
Slot [book day] Ratio: 0.3077
Results 1:  {'joint_acc': 0.28010586678429644, 'slot_acc': 0.9499926481399794}
Slot [day] Ratio: 0.3591
Results 1:  {'joint_acc': 0.47430083144368856, 'slot_acc': 0.9709120685311161}
Slot [book time] Ratio: 0.1781
Results 1:  {'joint_acc': 0.3201219512195122, 'slot_acc': 0.9553607723577235}
Slot [book people] Ratio: 0.4248
Results 1:  {'joint_acc': 0.3156549520766773, 'slot_acc': 0.9549520766773163}
Slot [arriveby] Ratio: 0.2499
Results 1:  {'joint_acc': 0.4111895708853884, 'slot_acc': 0.9614521093608546}
Slot [destination] Ratio: 0.4442
Results 1:  {'joint_acc': 0.4268255423159181, 'slot_acc': 0.9629901211936043}
Slot [book stay] Ratio: 0.1429


# Response Selection

In [12]:
args = {
    "task":"nlg",
    "task_name":"rs",
    "output_dir": "save/rs/",
    "load_path": "/export/share/jason/ToD-BERT/ToD-BERT-jnt-V1-RS/pytorch_model.bin",
    
    "model_type": "bert",
    "model_name_or_path": "TODBERT/TOD-BERT-JNT-V1",
    "data_path": "/export/share/datasets/dialogues/",
    "max_line": None,
    "ontology_version": "",
    "only_last_turn":False,
    "batch_size":100,
    "max_seq_length":256,
    "usr_token":"[USR]",
    "sys_token":"[SYS]",
    "example_type":"turn",
    "nb_neg_sample_rs":0,
}

In [13]:
# Reading data and create data loaderz
datasets = {}
data_trn, data_dev, data_tst, data_meta = prepare_data_multiwoz(args)
datasets["multiwoz"] = {"train": data_trn, "dev":data_dev, "test": data_tst, "meta":data_meta}

unified_meta = get_unified_meta(datasets)  
args["unified_meta"] = unified_meta

[Info] Using Version 2.1
[Info] Load from old complete ontology from version ...
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/train_dials.json for read_langs_turn
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/dev_dials.json for read_langs_turn
Reading from /export/share/datasets/dialogues/MultiWOZ-2.1/test_dials.json for read_langs_turn
Read 56668 pairs train from MultiWOZ
Read 7374 pairs valid from MultiWOZ
Read 7368 pairs test from MultiWOZ
args["task_name"] rs
resp_cand_trn 45299
resp_cand_dev 6244
resp_cand_tst 6218


In [14]:

from models.dual_encoder_ranking import *

model_class, tokenizer_class, config_class = BertModel, BertTokenizer, BertConfig
tokenizer = tokenizer_class.from_pretrained(args["model_name_or_path"])
config = config_class.from_pretrained(args["model_name_or_path"]) if args["model_name_or_path"] else config_class()
args["model_class"] = model_class
args["tokenizer"] = tokenizer
args["config"] = config

args["num_labels"] = unified_meta["num_labels"]
model = dual_encoder_ranking(args)
    
if args["load_path"]:
    print("MODEL {} LOADED".format(args["load_path"]))
    if torch.cuda.is_available(): 
        model.load_state_dict(torch.load(args["load_path"]))
    else:
        model.load_state_dict(torch.load(args["load_path"], lambda storage, loc: storage))

if torch.cuda.is_available():
    model = model.cuda()

MODEL /export/share/jason/ToD-BERT/ToD-BERT-jnt-V1-RS/pytorch_model.bin LOADED


In [15]:
print("[Info] Start Evaluation on test set...")
tst_loader = get_loader(args, "test" , tokenizer, datasets, unified_meta, shuffle=args["task_name"]=="rs")
model.eval()
if not os.path.exists(args["output_dir"]): os.makedirs(args["output_dir"])    
evaluate_and_dump(args, model, tst_loader, dump_keys=["context_plain", "ID", "turn_id", "response_plain", "utterance_plain"])

  0%|          | 0/64 [00:00<?, ?it/s]

[Info] Start Evaluation on test set...
[Info] Remove turns with empty system response...
[Info] Remove turn=0 system response...


100%|██████████| 64/64 [01:07<00:00,  1.04it/s]


{'top-1': 0.6509656146961846, 'top-3': 0.8662270372114932, 'top-5': 0.9324854765269273, 'top-10': 0.9825718323127649}


## Test on Slices

In [17]:
prediction = pickle.load(open(os.path.join(args["output_dir"], "prediction-multiwoz.pkl"), "rb"))

In [18]:
prediction[:5]

[{'context_plain': '[SYS]  [USR] hello , i am looking for an expensive place to eat in the centre of cambridge . [SYS] there are many results in that area for expensive restaurant -s , is there a specific type of cuisine you would like to try ? [USR] it should serve chinese food . [SYS] we have 4 entries that match that criteria . do you have a further preference ? [USR] ok , i want to book a table for 5 people at 16:30 on wednesday . [SYS] i am sorry , but i attempted reservations at your specified time at all 4 restaurant -s , and none have availability at that time . would you like to try a different time ? [USR] how about 15:30 ? [SYS] i have a reservation for you at 15:30 for 5 . your reference number is jugxyqop and they will only hold your table for 15 minutes just an fyi . anything else ? [USR] yes , i also need a train for wednesday i need to arrive by 12:30 . [SYS] what are your departure and arrival stations ? [USR] i am leaving from london kings cross and going to cambridge

### String Matching

In [19]:
for speaker in ["sys", "usr"]:
    for keywords in ["i/my/mine/we/our/ours", "you/your/yours", "what", "when", "where", "?"]:
        print("speaker [{}] keyword [{}]".format(speaker, keywords))
        
        preds, labels = [], []
        total = 0
        for data in prediction: 
            plain_name = "response_plain" if speaker == "sys" else "utterance_plain"
            total += 1
            for keyword in keywords.split("/"):
                if keyword in data[plain_name]:
                    preds += [data["preds"]]
                    labels += [data["labels"]]
                    break
        print("Ratio: {:.4f}".format(len(labels)/total))
        if len(preds) != 0:
            results = model.evaluation(preds, labels)

speaker [sys] keyword [i/my/mine/we/our/ours]
Ratio: 0.9827
{'top-1': 0.6529797092187251, 'top-3': 0.8678702668157853, 'top-5': 0.9336954785109443, 'top-10': 0.9832241572136124}
speaker [sys] keyword [you/your/yours]
Ratio: 0.7475
{'top-1': 0.6299096828397396, 'top-3': 0.8548624238605335, 'top-5': 0.9252257929006511, 'top-10': 0.9808863684099979}
speaker [sys] keyword [what]
Ratio: 0.1240
{'top-1': 0.5620253164556962, 'top-3': 0.8215189873417722, 'top-5': 0.9139240506329114, 'top-10': 0.9759493670886076}
speaker [sys] keyword [when]
Ratio: 0.0143
{'top-1': 0.5494505494505495, 'top-3': 0.7472527472527473, 'top-5': 0.8901098901098901, 'top-10': 0.989010989010989}
speaker [sys] keyword [where]
Ratio: 0.0385
{'top-1': 0.5061224489795918, 'top-3': 0.7877551020408163, 'top-5': 0.8979591836734694, 'top-10': 0.9918367346938776}
speaker [sys] keyword [?]
Ratio: 0.7262
{'top-1': 0.6464864864864864, 'top-3': 0.8663783783783784, 'top-5': 0.9338378378378378, 'top-10': 0.9824864864864865}
speaker [u

### Eval Set 1 - TOD: Schema (https://arxiv.org/pdf/2002.01359.pdf)

In [20]:
import importlib
import utils.utils_schema as utils_schema
importlib.reload(utils_schema)

# Reading data and create data loaderz
datasets = {}
data_trn, data_dev, data_tst, data_meta = utils_schema.prepare_data_schema(args)
datasets["schema"] = {"train": data_trn, "dev":data_dev, "test": data_tst, "meta":data_meta}

unified_meta = get_unified_meta(datasets)  
args["unified_meta"] = unified_meta

Reading from Schema for read_langs_turn
Reading from Schema for read_langs_turn
Reading from Schema for read_langs_turn
Read 164982 pairs train from Schema
Read 24363 pairs valid from Schema
Read 42297 pairs test  from Schema


In [21]:
print("[Info] Start Evaluation on test set...")
tst_loader = get_loader(args, "test" , tokenizer, datasets, unified_meta, shuffle=args["task_name"]=="rs")
model.eval()
if not os.path.exists(args["output_dir"]): os.makedirs(args["output_dir"])    
evaluate_and_dump(args, 
                  model, 
                  tst_loader, 
                  dump_keys=["context_plain", "ID", "turn_id", "response_plain", "utterance_plain"],
                  add_name="schema")


  0%|          | 0/381 [00:00<?, ?it/s]

[Info] Start Evaluation on test set...
[Info] Remove turns with empty system response...
[Info] Remove turn=0 system response...


100%|██████████| 381/381 [05:14<00:00,  1.23it/s]


{'top-1': 0.5139384712305753, 'top-3': 0.7700808483830324, 'top-5': 0.8634502309953801, 'top-10': 0.9452173456530869}


### Eval Set 2 - ChitChat: DailyDialog (https://arxiv.org/abs/1710.03957)

In [22]:
import importlib
import utils.utils_dailydialog as utils_dailydialog
importlib.reload(utils_dailydialog)

# Reading data and create data loaderz
datasets = {}
data_trn, data_dev, data_tst, data_meta = utils_dailydialog.prepare_data_dailydialog(args)
datasets["schema"] = {"train": data_trn, "dev":data_dev, "test": data_tst, "meta":data_meta}

unified_meta = get_unified_meta(datasets)  
args["unified_meta"] = unified_meta

Reading from dailydialog for read_langs_turn
Reading from dailydialog for read_langs_turn
Reading from dailydialog for read_langs_turn
Read 41637 pairs train from dailydialog
Read 3851 pairs valid from dailydialog
Read 3700 pairs test  from dailydialog


In [23]:
print("[Info] Start Evaluation on test set...")
tst_loader = get_loader(args, "test" , tokenizer, datasets, unified_meta, shuffle=args["task_name"]=="rs")
model.eval()
if not os.path.exists(args["output_dir"]): os.makedirs(args["output_dir"])    
evaluate_and_dump(args, 
                  model, 
                  tst_loader, 
                  dump_keys=["context_plain", "ID", "turn_id", "response_plain", "utterance_plain"],
                  add_name="dailydialog")


  0%|          | 0/27 [00:00<?, ?it/s]

[Info] Start Evaluation on test set...
[Info] Remove turns with empty system response...
[Info] Remove turn=0 system response...


100%|██████████| 27/27 [00:19<00:00,  1.39it/s]


{'top-1': 0.17296296296296296, 'top-3': 0.3025925925925926, 'top-5': 0.3725925925925926, 'top-10': 0.49925925925925924}
