In [27]:
import collections
import json
import logging
import typing as tp
import pathlib
import attrs
import cattrs
import pandas as pd
import tqdm.auto
import sdt_prompts
import sgd_utils

In [13]:
def _normalize_prompt_indices(value: tp.Union[None, int, tp.Iterable[int]]) -> tp.Optional[tp.Tuple[int, ...]]:
    """Coerces a `prompt_indices` input into `None` or a tuple of ints.

    attrs does not validate annotations, so without this a bare `0` would be
    stored as-is even though the field is declared as a tuple.

    Args:
        value: `None`, a single index, or an iterable of indices.

    Returns:
        `None` when `value` is `None`; a one-element tuple for a single int;
        otherwise `tuple(value)`.
    """
    if value is None:
        return None
    if isinstance(value, int):
        return (value,)
    return tuple(value)


@attrs.define
class SGDConversionOptions:
    """Options for creating Show, Don't Tell data from the SGD dataset.
    
    Attributes:
        sgd_dir (str, optional): Path to SGD data directory
        output_path (str, optional): Path for output file.
        sgdx_dir (str, optional): If set, create dialogue examples using SGD-X variants from this path. e.g. /path/to/sgdx/v1/
        subdirs (tuple of str): A tuple of dataset subdirectories to process
        prompt_format (one of "separated"): Format of the prompt for priming.
            "separated" means a dialogue followed by a separate string of slots.
        prompt_indices (tuple of int, optional): Indices of the prompts for each service to be used for generating examples. Specify one or more numeric indices (starting from 0), or `None` to use all prompts for a given service. A single int or any iterable of ints is also accepted and is stored as a tuple.
        context_format (one of "dialogue"): Format of the context. "dialogue" is the only value the type allows.
        target_format (tuple of one of "all", "active"): Format of the target. "all" and "active" respectively refer to all and only active slots being present in the target.
        add_intents (bool): Whether to add intents.
        lowercase (bool): Whether to lowercase the generated example.
        mcq_cat_vals (bool): Whether to enumerate categorical values in the form of a multiple choice question in the prompt string.
        mcq_intents (bool): Whether to enumerate intents in the form of a multiple choice question in the prompt string. Only use if flag `add_intents` is `True`.
        randomize_slots (bool): Whether to randomize slot order of the prompt.
        randomize_cat_vals (bool): Whether to randomize order of categorical values in prompt.
        randomize_intents (bool): Whether to randomize order of intents in prompt. Only used if flag `add_intents` is `True`.
        use_slot_ids (bool): Whether to use numeric slot IDs in place of slot names in the input and output strings.
        data_percent (float): If not 0.0, only write this proportion of data and discard the rest of the examples. For data efficiency experiments. Not compatible with `k_shot`.
        k_shot (int): If not 0, sample this many examples from each service. For data efficiency experiments. Not compatible with `data_percent`.
        use_intent_slot_descs (bool): Whether to add D3ST descriptions to prompt.

    """
    sgd_dir: tp.Optional[str] = None
    output_path: tp.Optional[str] = None
    sgdx_dir: tp.Optional[str] = None
    subdirs: tp.Tuple[str, ...] = ("train", "dev", "test")
    prompt_format: tp.Optional[tp.Literal["separated"]] = "separated"
    # The converter keeps the stored value in line with the declared type even
    # when a caller passes a bare int or a list.
    prompt_indices: tp.Optional[tp.Tuple[int, ...]] = attrs.field(default=None, converter=_normalize_prompt_indices)
    context_format: tp.Literal["dialogue"] = "dialogue"
    target_format: tp.Tuple[tp.Literal["all", "active"], ...] = ("all",)
    add_intents: bool = False
    lowercase: bool = True
    mcq_cat_vals: bool = False
    mcq_intents: bool = False
    randomize_slots: bool = True
    randomize_cat_vals: bool = True
    randomize_intents: bool = True
    use_slot_ids: bool = False
    data_percent: float = 0.0
    k_shot: int = 0
    use_intent_slot_descs: bool = False
    
    @property
    def input_dir(self) -> tp.Optional[str]:
        """Read-only alias that returns `sgd_dir`."""
        return self.sgd_dir

In [32]:
@attrs.define
class Example:
    """A single Show, Don't Tell (SDT) example and the services it covers.

    Attributes:
        example_str (str): The example string. When produced by
            `build_example`, this is the input, the target and any additional
            strings joined by the tab separator.
        services (list of str): The services this example belongs to.

    """
    example_str: str
    services: tp.List[str]

In [15]:
# Location of the DSTC8 Schema-Guided Dialogue dataset, resolved once against
# the working directory this cell runs in.
PATH_TO_DSTC_STUFF = pathlib.Path("../datasets/dstc8-schema-guided-dialogue").resolve()
# Fail fast, and report which path was checked, when the dataset is missing.
assert PATH_TO_DSTC_STUFF.is_dir(), f"SGD dataset directory not found: {PATH_TO_DSTC_STUFF}"

In [16]:
# Conversion settings for this run: read SGD from the dataset directory checked
# above and point the output at a TSV file next to the notebook.
config = SGDConversionOptions(
    sgd_dir=str(PATH_TO_DSTC_STUFF),
    output_path=str(pathlib.Path("./sgd_sdt_port_v0.tsv")),
    # `prompt_indices` is declared as a tuple of ints. A bare `0` is also falsy,
    # so truthiness checks could not tell it apart from "not set".
    prompt_indices=(0,),
    mcq_cat_vals=True,
)

In [29]:
# Short aliases for types that live in the project helper modules.
DialoguesDict = sgd_utils.DialoguesDict
Schemas = sgd_utils.Schemas
Prompt = sdt_prompts.Prompt

# Seed value -- presumably for the randomize_* options; TODO confirm where it
# is consumed.
RAND_SEED = 123

# Speaker labels as found in a turn's "speaker" field, and the tokens that
# prefix the corresponding utterances.
USER_SPEAKER, SYSTEM_SPEAKER = "USER", "SYSTEM"
USER_TOK, SYS_TOK = "[user]", "[system]"

# Delimiter and column separator. The tab splits the input from the target
# (and any extra columns) in each example string.
INTENT_SLOT_VALUE_DELIMITER = "="
INPUT_TARGET_SEP = "\t"

In [30]:
# Prompt collections keyed by prompt format name. "separated" is the only
# format registered here.
_PROMPTS_MAP = dict(
    separated=sdt_prompts.SGD_SEPARATED_ANNOTATION_PROMPTS,
)

In [33]:
def speaker_to_tok(speaker: str) -> str:
    """Maps a speaker label to the token that prefixes its utterances.

    Args:
        speaker: Either `USER_SPEAKER` or `SYSTEM_SPEAKER`.

    Returns:
        `USER_TOK` for the user speaker, `SYS_TOK` for the system speaker.

    Raises:
        ValueError: If `speaker` is neither of the two known labels.
    """
    # Checked in the same order as before: user first, then system.
    label_token_pairs = (
        (USER_SPEAKER, USER_TOK),
        (SYSTEM_SPEAKER, SYS_TOK),
    )
    for label, token in label_token_pairs:
        if speaker == label:
            return token
    raise ValueError(f"Speaker must be one of {USER_SPEAKER} or {SYSTEM_SPEAKER}. Found {speaker}.")

In [34]:
def generate_utt_str(utterance: str, speaker: str) -> str:
    """Formats one utterance as "<speaker token> <utterance>".

    Args:
        utterance: The raw utterance text.
        speaker: The speaker label, passed to `speaker_to_tok`.

    Returns:
        The speaker token, a single space, and the utterance with every
        newline replaced by a space.
    """
    speaker_tok = speaker_to_tok(speaker)
    # Some utterances contain embedded newlines; flatten them to spaces.
    single_line = utterance.replace("\n", " ")
    return f"{speaker_tok} {single_line}"

In [35]:
def build_example(input_strs: tp.Sequence[str], target_str: str, additional_strs: tp.Sequence[str], services: tp.Sequence[str], lowercase: bool) -> Example:
    """Builds a single example in TSV format.

    Args:
        input_strs: Pieces of the input, joined with single spaces to form the
            first column.
        target_str: The target, written as the second column.
        additional_strs: Extra columns appended after the target; skipped when
            empty.
        services: The services the example belongs to.
        lowercase: Whether to lowercase the whole example string.

    Returns:
        An `Example` holding the stripped, tab-separated string and a list copy
        of `services`.
    """
    columns = [" ".join(input_strs), target_str]
    if additional_strs:
        columns.extend(additional_strs)

    row = INPUT_TARGET_SEP.join(columns)
    row = row.lower() if lowercase else row

    return Example(example_str=row.strip(), services=list(services))

In [None]:
def create_examples_from_dialogue(dialogue: tp.Mapping[str, tp.Any], service_to_prompts: tp.Optional[tp.Dict[str, tp.List[Prompt]]], service_to_schema: tp.Mapping[str, sgd_utils.Schema], options: SGDConversionOptions) -> tp.List[Example]:
    """Returns example strings created from a dialogue.

    Args:
        dialogue: A single dialogue containing multiple turns and frames.
        service_to_prompts: A map from SGD service to a list of prompts
        service_to_schema: A map from SGD service to schema
        options: An object containing various options related to example generation.

    Returns:
        The list of examples collected for the dialogue. Example construction
        is not implemented yet (see the TODO below), so the list is currently
        always empty.
    """
    utt_strs = []
    example_strs = []

    for turn_idx, turn in enumerate(dialogue["turns"]):

        # Format utterances
        utt_strs.append(
            generate_utt_str(utterance=turn["utterance"], speaker=turn["speaker"])
        )

        # Don't create examples out of system turns for DST
        if turn["speaker"] != USER_SPEAKER:
            continue

        for frame_idx, frame in enumerate(turn["frames"]):

            # Create prompt
            # NOTE(review): `sdt_utils` is not among this notebook's imports;
            # it has to be imported in the imports cell before this can run.
            prompt_str, ordered_slots, slot_to_cat_val_to_id, intent_to_id = sdt_utils.generate_prompt_str(
                keys=[frame["service"]],
                key_to_prompts=service_to_prompts,
                prompt_indices=options.prompt_indices,
                add_intents=options.add_intents,
                mcq_cat_vals=options.mcq_cat_vals,
                mcq_intents=options.mcq_intents,
                randomize_slots=options.randomize_slots,
                randomize_cat_vals=options.randomize_cat_vals,
                randomize_intents=options.randomize_intents,
                use_slot_ids=options.use_slot_ids,
                key_to_schema=service_to_schema
            )

            # TODO: build the context/target strings for this frame and append
            # the resulting example to `example_strs`.

    return example_strs