In [94]:
import attrs
import bisect
import cattrs
import datasets
import enum
import itertools
import json
import multiprocessing as mp
import numpy as np
import pandas as pd
import pathlib
import random
import sklearn.model_selection
import transformers
import tqdm.auto
import typing as tp

In [2]:
tm1_path = pathlib.Path("../datasets/Taskmaster/TM-1-2019")

In [3]:
taskmaster_path = tm1_path

In [4]:
ontology_path = taskmaster_path.joinpath("ontology.json")
sample_path = taskmaster_path.joinpath("sample.json")
self_dialogues_path = taskmaster_path.joinpath("self-dialogs.json")
woz_dialogues_path = taskmaster_path.joinpath("woz-dialogs.json")

In [7]:
with sample_path.open() as f:
    sample_file = json.load(f)
with self_dialogues_path.open() as f:
    self_file = json.load(f)
with woz_dialogues_path.open() as f:
    woz_file = json.load(f)

In [15]:
sample_convo = cattrs.structure(sample_file, TaskmasterConvo)
self_convos = cattrs.structure(self_file, tp.List[TaskmasterConvo])

In [16]:
def check_alternating_speaker(convo: TaskmasterConvo) -> bool:
    for i, utt in enumerate(convo.utterances):
        if i % 2 == 0 and utt.speaker == TaskmasterSpeaker.ASSISTANT or i % 1 and utt.speaker == TaskmasterSpeaker.USER:
            return False
        return True

In [17]:
all(check_alternating_speaker(convo) for convo in self_convos)

True

In [18]:
all(check_alternating_speaker(convo) for convo in woz_convos)

False

In [53]:
self_convos[0].utterances

[TaskmasterUtterance(index=0, speaker=<TaskmasterSpeaker.USER: 'USER'>, text="Hi, I'm looking to book a table for Korean fod."),
 TaskmasterUtterance(index=1, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text='Ok, what area are you thinking about?'),
 TaskmasterUtterance(index=2, speaker=<TaskmasterSpeaker.USER: 'USER'>, text='Somewhere in Southern NYC, maybe the East Village?'),
 TaskmasterUtterance(index=3, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text="Ok, great.  There's Thursday Kitchen, it has great reviews."),
 TaskmasterUtterance(index=4, speaker=<TaskmasterSpeaker.USER: 'USER'>, text="That's great. So I need a table for tonight at 7 pm for 8 people. We don't want to sit at the bar, but anywhere else is fine."),
 TaskmasterUtterance(index=5, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text="They don't have any availability for 7 pm."),
 TaskmasterUtterance(index=6, speaker=<TaskmasterSpeaker.USER: 'USER'>, text='What times are available?'),
 Taskmaster

In [11]:
class TaskmasterSpeaker(enum.Enum):
    USER = "USER"
    ASSISTANT = "ASSISTANT"

@attrs.define
class TaskmasterUtterance:
    index: int
    speaker: TaskmasterSpeaker
    text: str

@attrs.define
class TaskmasterConvo:
    conversation_id: str
    instruction_id: str
    utterances: tp.List[TaskmasterUtterance]

class TaskmasterSpeaker(enum.Enum):
    USER = "USER"
    ASSISTANT = "ASSISTANT"

    @classmethod
    def _missing_(cls, value):
        for member in cls:
            if member.value.upper() == value.upper():
                return member

@attrs.define
class TaskmasterUtterance:
    index: int
    speaker: TaskmasterSpeaker
    text: str

@attrs.define
class Taskmaster3Convo:
    conversation_id: str
    vertical: str
    scenario: str
    instructions: str
    utterances: tp.List[TaskmasterUtterance]

In [12]:
def try_structure_file_tm(file):
    convos = []
    for i, x in enumerate(file):
        try:
            convos.append(cattrs.structure(x, TaskmasterConvo))
        except:
            print(i)
    return convos

In [13]:
def try_structure_file_tm3(file):
    convos = []
    for i, x in enumerate(file):
        try:
            convos.append(cattrs.structure(x, Taskmaster3Convo))
        except:
            print(i)
    return convos

In [19]:
woz_convos = try_structure_file_tm(woz_file)

1441
1934
2256
3416
3864
3946
4270
4309
5453
5455
5456
5457
5458


In [45]:
random.shuffle(woz_convos)

In [46]:
woz_lengths = np.array([len(convo.utterances) for convo in woz_convos]).cumsum()

In [53]:
woz_bisection_threshold = bisect.bisect(woz_lengths, 0.9*woz_lengths[-1])

In [57]:
woz_train, woz_test = woz_convos[:woz_bisection_threshold], woz_convos[woz_bisection_threshold:]

In [62]:
len(woz_train)

4952

In [63]:
len(woz_test)

542

In [64]:
len(woz_train)/(len(woz_train)+len(woz_test))

0.9013469239170003

In [56]:
len(woz_convos[0].utterances)

29

In [59]:
woz_file[1934]

{'conversation_id': 'dlg-5720be0d-6cb4-41c5-b63e-c51ca5534704',
 'instruction_id': 'auto-repair-1',
 'utterances': [{'index': 0,
   'speaker': 'ASSISTANT',
   'text': 'Hi, how can I help you?'},
  {'index': 1, 'speaker': 'ASSISTANT', 'text': 'Hi, how can I help you?'},
  {'index': 2, 'speaker': 'USER', 'text': 'Hello. Come and get it.'},
  {'index': 3, 'speaker': 'USER', 'text': 'Hello. Come and get it.'},
  {'index': 4,
   'speaker': 'ASSISTANT',
   'text': 'I think you said you wanted to book an appointment?'},
  {'index': 5,
   'speaker': 'ASSISTANT',
   'text': 'I think you said you wanted to book an appointment?'},
  {'index': 6, 'speaker': 'USER', 'text': "Yeah, that's right."},
  {'index': 7, 'speaker': 'USER', 'text': "Yeah, that's right."},
  {'index': 8,
   'speaker': 'ASSISTANT',
   'text': 'Can you please tell me your name and number?'},
  {'index': 9,
   'speaker': 'ASSISTANT',
   'text': 'Can you please tell me your name and number?'},
  {'index': 10,
   'speaker': 'ASSIS

In [86]:
cattrs.structure(woz_file[3], TaskmasterConvo)

TaskmasterConvo(conversation_id='dlg-0018566e-5d1f-4ea2-9416-2e19da7d8a91', utterances=[TaskmasterUtterance(index=0, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text='hi there.'), TaskmasterUtterance(index=1, speaker=<TaskmasterSpeaker.USER: 'USER'>, text="Hi, I'd like to schedule an appointment at Intelligent Auto Repairs for a repair of my 2014 Acura RDX."), TaskmasterUtterance(index=2, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text='sure thing.'), TaskmasterUtterance(index=3, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text="let's start with some details."), TaskmasterUtterance(index=4, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text="what's your name?"), TaskmasterUtterance(index=5, speaker=<TaskmasterSpeaker.USER: 'USER'>, text='My name is Jack Jones.'), TaskmasterUtterance(index=6, speaker=<TaskmasterSpeaker.ASSISTANT: 'ASSISTANT'>, text='hi jack.'), TaskmasterUtterance(index=7, speaker=<TaskmasterSpeaker.USER: 'USER'>, text='My phone number is 

In [65]:
woz_file[0]

{'conversation_id': 'dlg-00028478-84a9-4ca7-a3e2-be514a3b8c9d',
 'instruction_id': 'movie-tickets-2',
 'utterances': [{'index': 0,
   'speaker': 'ASSISTANT',
   'text': 'Hi there! How can I help?'},
  {'index': 1,
   'speaker': 'USER',
   'text': "Oh well, I've tried to go see Aquaman in Reno, Nevada.",
   'segments': [{'start_index': 30,
     'end_index': 37,
     'text': 'Aquaman',
     'annotations': [{'name': 'movie_ticket.name.movie'}]},
    {'start_index': 41,
     'end_index': 54,
     'text': 'Reno, Nevada.',
     'annotations': [{'name': 'movie_ticket.location.theater.accept'}]}]},
  {'index': 2,
   'speaker': 'ASSISTANT',
   'text': 'Okay! let me see what we got.'},
  {'index': 3, 'speaker': 'ASSISTANT', 'text': 'I found 2 options.'},
  {'index': 4, 'speaker': 'USER', 'text': 'Okay.'},
  {'index': 5,
   'speaker': 'ASSISTANT',
   'text': 'Century Summit Sierra is the first.',
   'segments': [{'start_index': 0,
     'end_index': 21,
     'text': 'Century Summit Sierra',
     '

In [63]:
for i, x in enumerate(woz_file[:5]):
    try:
        c = cattrs.structure(x, TaskmasterConvo)
        if not check_alternating_speaker(c):
            print(i)
    except Exception as e:
        print("E", i)
        # print(x)
        # print(e)
        print()

0
1
2
3
4


In [20]:
tm2_path = pathlib.Path("../datasets/Taskmaster/TM-2-2020")

In [21]:
tm2_convos = dict()
for x in tm2_path.joinpath("data").glob("*.json"):
    with x.open() as f:
        tm2_convos[x.stem] = cattrs.structure(json.load(f), tp.List[TaskmasterConvo])

In [22]:
tm3_path = pathlib.Path("../datasets/Taskmaster/TM-3-2020")

In [5]:
with tm3_path.joinpath("data/data_00.json").open() as f:
    tm3_dfile00 = json.load(f)

In [6]:
tm3_dfile00[0]["utterances"][0]

{'index': 0,
 'speaker': 'user',
 'text': 'hi....am buying a ticket tonight so we go and see a movie at AMC mountain 16',
 'segments': [{'start_index': 25,
   'end_index': 32,
   'text': 'tonight',
   'annotations': [{'name': 'date.showing'}]},
  {'start_index': 61,
   'end_index': 76,
   'text': 'AMC mountain 16',
   'annotations': [{'name': 'name.theater'}]}]}

In [11]:
tm3_dfile00[0]["utterances"][0]

{'index': 0,
 'speaker': 'user',
 'text': 'hi....am buying a ticket tonight so we go and see a movie at AMC mountain 16',
 'segments': [{'start_index': 25,
   'end_index': 32,
   'text': 'tonight',
   'annotations': [{'name': 'date.showing'}]},
  {'start_index': 61,
   'end_index': 76,
   'text': 'AMC mountain 16',
   'annotations': [{'name': 'name.theater'}]}]}

In [24]:
tm3_dfile00[0]

{'conversation_id': 'dlg-bca5ce0a-056f-446e-be94-3ba77b32a84f',
 'vertical': 'Movie Tickets',
 'scenario': 'Auto template 1 with theater name error',
 'instructions': 'SCENARIO: In the conversation below, a customer is talking to a booking agent to purchase movie tickets. However, the customer’s turns are missing. The details mentioned (theater names, cities, etc.) do NOT necessarily reflect real life, but you should pretend they are real for the purposes of this exercise.\n\nYOUR TASK: {{LIST *Pretend you are the customer*., Fill in the *missing turns* (in green) so that the conversation makes sense and sounds natural., Make sure you *follow the instructions* provided in parentheses for each turn. (Start typing to make the instructions move outside the box.), Do not copy the wording. *Use your own words*., *Use your normal speaking style* for this scenario but don’t be overly chatty., *Do NOT* call the booking agent by any name including “sir” “madam” etc.}}\n\n{{HIDE movie_1 name.mov

In [25]:
cattrs.structure(tm3_dfile00[0], Taskmaster3Convo)

Taskmaster3Convo(conversation_id='dlg-bca5ce0a-056f-446e-be94-3ba77b32a84f', vertical='Movie Tickets', scenario='Auto template 1 with theater name error', instructions='SCENARIO: In the conversation below, a customer is talking to a booking agent to purchase movie tickets. However, the customer’s turns are missing. The details mentioned (theater names, cities, etc.) do NOT necessarily reflect real life, but you should pretend they are real for the purposes of this exercise.\n\nYOUR TASK: {{LIST *Pretend you are the customer*., Fill in the *missing turns* (in green) so that the conversation makes sense and sounds natural., Make sure you *follow the instructions* provided in parentheses for each turn. (Start typing to make the instructions move outside the box.), Do not copy the wording. *Use your own words*., *Use your normal speaking style* for this scenario but don’t be overly chatty., *Do NOT* call the booking agent by any name including “sir” “madam” etc.}}\n\n{{HIDE movie_1 name.mo

In [None]:
tm3_convos = dict()
for x in tm3_path.joinpath("data").glob("*.json"):
    with x.open() as f:
        tm3_convos[x.stem] = try_structure_file_tm3(json.load(f))

In [72]:
def convo_to_dataset(convo):
    convo_id = convo.conversation_id
    utterances: tp.List[TaskmasterUtterance] = convo.utterances
    history: str = ''
    inputs = []
    labels = []
    instruction_prefixes = []
    for utt in utterances:
        if utt.speaker.value == "ASSISTANT" and not history:
            continue
        instr_prefix = f"Produce the next utterance in the conversation, as said by [{utt.speaker.value}]:\nConversation history:"
        if history:
            history = f'{history}\n[{utt.speaker.value}]'
        else:
            history = f'[{utt.speaker.value}]'
        inputs.append(history)
        labels.append(utt.text)
        instruction_prefixes.append(instr_prefix)
        history = f'{history} {utt.text}'
    return datasets.Dataset.from_dict(dict(inputs=inputs, labels=labels, instruction_prefixes=instruction_prefixes))

In [67]:
def interleave_datasets_by_size(dsets: tp.Iterable[datasets.Dataset]) -> datasets.Dataset:
    probabilities = np.array([ds.num_rows for ds in dsets], dtype=np.float64)
    probabilities /= probabilities.sum()
    return datasets.interleave_datasets(dsets, probabilities=probabilities)

In [92]:
def format_dialogue_dataset(convos: list):
    dsets = [convo_to_dataset(convo).shuffle() for convo in tqdm.auto.tqdm(convos)]
    return interleave_datasets_by_size(dsets)

In [95]:
woz_train_ds = format_dialogue_dataset(woz_train)
woz_test_ds = format_dialogue_dataset(woz_test)

  0%|          | 0/4952 [00:00<?, ?it/s]

  0%|          | 0/542 [00:00<?, ?it/s]

In [97]:
woz_train_ds.save_to_disk("./woz_train.dataset")

Flattening the indices:   0%|          | 0/4 [00:00<?, ?ba/s]

In [98]:
woz_train_ds.description

''

In [40]:
next(iter(tm3_convos.keys()))

'data_00'

In [84]:
convos_to_dataset(tm3_convos['data_00'])

Dataset({
    features: ['inputs', 'labels', 'instruction_prefixes'],
    num_rows: 10752
})

In [59]:
import collections

In [60]:
convo_count = collections.Counter()
for v in tm3_convos.values():
    for c in v:
        convo_count[c.vertical] += 1

In [61]:
convo_count

Counter({'Movie Tickets': 23757})

In [82]:
convo_to_dataset(tm3_convos['data_00'][0])

18

In [56]:
seed = 42
probabilities = np.array([0.6, 1.0, 0.4])
probabilities /= probabilities.sum()
d1 = datasets.Dataset.from_dict({"a": [0, 1, 2]})
d2 = datasets.Dataset.from_dict({"a": [10, 11, 12, 13]})
d3 = datasets.Dataset.from_dict({"a": [20, 21, 22]})
dataset = datasets.interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)
dataset["a"]

[10, 11, 20, 12, 0, 21, 13]

In [92]:
tm3_test_df = pd.read_csv(tm3_path.joinpath("splits/test/test.tsv"), delimiter="\t", header=None)

In [98]:
tm3_test_df[2].duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
35184     True
35185    False
35186     True
35187     True
35188    False
Name: 2, Length: 35189, dtype: bool