In [1]:
import csv
from collections import defaultdict

import datasets
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
data_path = "../assets/data/scone/rlong"
splits = ["train", "dev", "test"]
tasks = ["alchemy", "scene", "tangrams"]


def tsv_to_dict_of_lists(file_path):
    with open(file_path, "r", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")

        # Read the first row to determine the number of columns
        first_row = next(reader)
        num_columns = len(first_row)

        # Generate headers
        headers = ["ID", "WORLD_0"]
        for i in range(1, (num_columns - 2) // 2 + 1):
            headers.extend([f"UTTERANCE_{i}", f"WORLD_{i}"])

        # Create a dictionary to store the lists
        result_dict = {header: [] for header in headers}

        # Reset the file pointer to the beginning
        tsv_file.seek(0)

        # Process each row
        for row in reader:
            for i, value in enumerate(row):
                if i < len(headers):
                    result_dict[headers[i]].append(value)

    return result_dict


task_datasets = defaultdict(list)

for split in splits:
    for task in tasks:
        ds = datasets.Dataset.from_dict(
            tsv_to_dict_of_lists(f"{data_path}/{task}-{split}.tsv")
        )
        ds = ds.add_column("task", [task] * len(ds))
        task_datasets[split].append(ds)

for split, ds_list in task_datasets.items():
    task_datasets[split] = datasets.concatenate_datasets(ds_list)

scone_dataset = datasets.DatasetDict(task_datasets)

In [3]:
# mapping from number to word
num2word = {
    1: "first",
    2: "second",
    3: "third",
    4: "fourth",
    5: "fifth",
    6: "sixth",
    7: "seventh",
}
color = {
    "g": "green",
    "b": "blue",
    "r": "red",
    "y": "yellow",
    "p": "purple",
    "o": "orange",
}


def alchemy_state_to_nl(state: str):
    beakers = list(map(lambda x: (x[0], x[-1]), state.split(" ")))

    def to_nl(x):
        i, s = x
        if s[1] == "_":
            return f"the {num2word[i + 1]} beaker is empty"
        return f"the {num2word[i + 1]} beaker has {s[0]} {color[s[1]]}"

    return ", ".join(map(to_nl, enumerate(beakers)))


def alchemy_sequence_to_instruction(example: dict, turn_limit: int):
    assert turn_limit <= 5, "Alchemy only has 5 turns"
    world_states = [alchemy_state_to_nl(example[f"WORLD_{i}"]) for i in range(0, 6)]
    utterances = [example[f"UTTERANCE_{i}"] for i in range(1, 6)]
    utterances.insert(0, "")
    utterances.append("")

    instructions = []
    output = []

    for i, state in enumerate(world_states):
        utterance = utterances[i + 1]
        if i + 1 <= turn_limit:
            instructions.append(f"{state}\n{utterance}".strip())
        else:
            output = state
            break

    return "\n".join(instructions), output

In [4]:
scone_dataset["train"][0]

{'ID': 'train-A9164',
 'WORLD_0': '1:ggg 2:_ 3:_ 4:_ 5:o 6:ooo 7:gggg',
 'UTTERANCE_1': 'throw out two units of first beaker',
 'WORLD_1': '1:g 2:_ 3:_ 4:_ 5:o 6:ooo 7:gggg',
 'UTTERANCE_2': 'throw out fifth beaker',
 'WORLD_2': '1:g 2:_ 3:_ 4:_ 5:_ 6:ooo 7:gggg',
 'UTTERANCE_3': 'throw out first one',
 'WORLD_3': '1:_ 2:_ 3:_ 4:_ 5:_ 6:ooo 7:gggg',
 'UTTERANCE_4': 'throw out orange beaker',
 'WORLD_4': '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:gggg',
 'UTTERANCE_5': 'throw out one unit of green',
 'WORLD_5': '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:ggg',
 'task': 'alchemy'}

In [5]:
instr, output = alchemy_sequence_to_instruction(scone_dataset["train"][0], 3)

print(instr)
print("=" * 80)
print(output)

the first beaker has 1 green, the second beaker is empty, the third beaker is empty, the fourth beaker is empty, the fifth beaker has 5 orange, the sixth beaker has 6 orange, the seventh beaker has 7 green
throw out two units of first beaker
the first beaker has 1 green, the second beaker is empty, the third beaker is empty, the fourth beaker is empty, the fifth beaker has 5 orange, the sixth beaker has 6 orange, the seventh beaker has 7 green
throw out fifth beaker
the first beaker has 1 green, the second beaker is empty, the third beaker is empty, the fourth beaker is empty, the fifth beaker is empty, the sixth beaker has 6 orange, the seventh beaker has 7 green
throw out first one
the first beaker is empty, the second beaker is empty, the third beaker is empty, the fourth beaker is empty, the fifth beaker is empty, the sixth beaker has 6 orange, the seventh beaker has 7 green


In [6]:
scone_dataset.filter(lambda x: x["task"] == "alchemy")

Filter:   0%|          | 0/11198 [00:00<?, ? examples/s]

Filter:   0%|          | 0/642 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2734 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'WORLD_0', 'UTTERANCE_1', 'WORLD_1', 'UTTERANCE_2', 'WORLD_2', 'UTTERANCE_3', 'WORLD_3', 'UTTERANCE_4', 'WORLD_4', 'UTTERANCE_5', 'WORLD_5', 'task'],
        num_rows: 3657
    })
    dev: Dataset({
        features: ['ID', 'WORLD_0', 'UTTERANCE_1', 'WORLD_1', 'UTTERANCE_2', 'WORLD_2', 'UTTERANCE_3', 'WORLD_3', 'UTTERANCE_4', 'WORLD_4', 'UTTERANCE_5', 'WORLD_5', 'task'],
        num_rows: 245
    })
    test: Dataset({
        features: ['ID', 'WORLD_0', 'UTTERANCE_1', 'WORLD_1', 'UTTERANCE_2', 'WORLD_2', 'UTTERANCE_3', 'WORLD_3', 'UTTERANCE_4', 'WORLD_4', 'UTTERANCE_5', 'WORLD_5', 'task'],
        num_rows: 899
    })
})

In [7]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


def generate_text(prompt, max_new_tokens=100):
    # Load pre-trained model and tokenizer
    # Encode the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")

    # Generate text
    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode the generated text
    generated_text = tokenizer.decode(
        output[0][input_ids.shape[1] :], skip_special_tokens=True
    )

    return generated_text

In [8]:
instr, out = alchemy_sequence_to_instruction(scone_dataset["train"][0], 3)
result = generate_text(instr, 100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [9]:
print(result)

 unit of second unit
The first unit has 2 green and the other one has 3 orange.
Throw out one of the units with the first green. The second one with 2 orange and one orange has 4 green but the orange with 3 green is not green so the green with 4 orange is green instead of orange
If the unit with green has a green unit, throw out the one that has orange but not orange because the Orange with orange unit is orange instead. If the Green with Orange unit
