In [28]:
import random
from names import PERSON, OBJECTS, PLACES
import numpy as np

In [3]:
len(PERSON), len(OBJECTS), len(PLACES)

(504, 20, 20)

This notebook is part of an effort to reproduce the paper _Interpretability in the Wild_, which introduced the Indirect Object Identification task (IOI).

In the IOI task, there is a subject S and an indirect object IO. Here these are denoted by A and B. These are the templates obtained from the paper.

`baba_templates` is copied from the paper, while `abba_templates` is the same pattern, with B and A swapped.

In [4]:
baba_templates = [
    "Then, {B} and {A} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {B} and {A} had a lot of fun at the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {B} and {A} were working at the {PLACE}. {B} decided to give a {OBJECT} to {A}",
    "Then, {B} and {A} were thinking about going to the {PLACE}. {B} wanted to give a {OBJECT} to {A}",
    "Then, {B} and {A} had a long argument, and afterwards {B} said to {A}",
    "After {B} and {A} went to the {PLACE}, {B} gave a {OBJECT} to {A}",
    "When {B} and {A} got a {OBJECT} at the {PLACE}, {B} decided to give it to {A}",
    "When {B} and {A} got a {OBJECT} at the {PLACE}, {B} decided to give the {OBJECT} to {A}",
    "While {B} and {A} were working at the {PLACE}, {B} gave a {OBJECT} to {A}",
    "While {B} and {A} were commuting to the {PLACE}, {B} gave a {OBJECT} to {A}",
    "After the lunch, {B} and {A} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Afterwards, {B} and {A} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {B} and {A} had a long argument. Afterwards {B} said to {A}",
    "The {PLACE} {B} and {A} went to had a {OBJECT}. {B} gave it to {A}",
    "Friends {B} and {A} found a {OBJECT} at the {PLACE}. {B} gave it to {A}",
]
abba_templates = [
    "Then, {A} and {B} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {A} and {B} had a lot of fun at the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {A} and {B} were working at the {PLACE}. {B} decided to give a {OBJECT} to {A}",
    "Then, {A} and {B} were thinking about going to the {PLACE}. {B} wanted to give a {OBJECT} to {A}",
    "Then, {A} and {B} had a long argument, and afterwards {B} said to {A}",
    "After {A} and {B} went to the {PLACE}, {B} gave a {OBJECT} to {A}",
    "When {A} and {B} got a {OBJECT} at the {PLACE}, {B} decided to give it to {A}",
    "When {A} and {B} got a {OBJECT} at the {PLACE}, {B} decided to give the {OBJECT} to {A}",
    "While {A} and {B} were working at the {PLACE}, {B} gave a {OBJECT} to {A}",
    "While {A} and {B} were commuting to the {PLACE}, {B} gave a {OBJECT} to {A}",
    "After the lunch, {A} and {B} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Afterwards, {A} and {B} went to the {PLACE}. {B} gave a {OBJECT} to {A}",
    "Then, {A} and {B} had a long argument. Afterwards {B} said to {A}",
    "The {PLACE} {A} and {B} went to had a {OBJECT}. {B} gave it to {A}",
    "Friends {A} and {B} found a {OBJECT} at the {PLACE}. {B} gave it to {A}",
]


In [5]:

def gen_person_a() -> list[str]:
    """randomly select 50 names from PERSON and put them in variable A, without repeats
    """
    A = []
    while len(A) < 50:
        name = random.choice(PERSON)
        if name not in A:
            A.append(name)
    return A

def gen_person_b(A: list[str]) -> list[str]:
    """randomly select 50 names from PERSON and put them in variable B, without repeats in A
    """
    B = []
    while len(B) < 50:
        name = random.choice(PERSON)
        if name not in A and name not in B:
            B.append(name)
    return B

In [6]:
person_a_names = gen_person_a()
person_b_names = gen_person_b(person_a_names)

In [7]:
len(person_a_names)

50

In [8]:
len(person_b_names)

50

In [9]:
random.shuffle(OBJECTS)

In [10]:
# BABA pattern
for template in baba_templates:
    print(template.format(A=random.choice(person_a_names),
                          B=random.choice(person_b_names),
                          OBJECT=random.choice(OBJECTS),
                          PLACE=random.choice(PLACES))
         )

Then, Jackie and Carmen went to the museum. Jackie gave a pear to Carmen
Then, Gwen and Juana had a lot of fun at the park. Gwen gave a cantaloupe to Juana
Then, Leigh and Ella were working at the beach. Leigh decided to give a grape to Ella
Then, Eleanor and Stacy were thinking about going to the market. Eleanor wanted to give a mango to Stacy
Then, Robin and Harriet had a long argument, and afterwards Robin said to Harriet
After Lula and Mark went to the beach, Lula gave a strawberry to Mark
When Jody and Brandy got a banana at the cave, Jody decided to give it to Brandy
When Inez and Juana got a plum at the museum, Inez decided to give the plum to Juana
While Marsha and Dianne were working at the museum, Marsha gave a peach to Dianne
While Marsha and Jacqueline were commuting to the store, Marsha gave a mango to Jacqueline
After the lunch, Pat and Harriet went to the museum. Pat gave a cantaloupe to Harriet
Afterwards, Inez and Anne went to the market. Inez gave a grape to Anne
Then

In [11]:
# ABBA pattern
for template in abba_templates:
    print(template.format(A=random.choice(person_b_names),
                          B=random.choice(person_a_names),
                          OBJECT=random.choice(OBJECTS),
                          PLACE=random.choice(PLACES))
         )

Then, Gwen and Anita went to the lake. Anita gave a strawberry to Gwen
Then, Richard and Rosa had a lot of fun at the theater. Rosa gave a watermelon to Richard
Then, Bernadette and Brandy were working at the forest. Brandy decided to give a pear to Bernadette
Then, Bernadette and Mark were thinking about going to the shop. Mark wanted to give a banana to Bernadette
Then, Rebecca and Jerry had a long argument, and afterwards Jerry said to Rebecca
After Alice and Ella went to the desert, Ella gave a blackberry to Alice
When Faye and Lorraine got a plum at the market, Lorraine decided to give it to Faye
When Teresa and Anita got a orange at the cave, Anita decided to give the orange to Teresa
While Kevin and Wilma were working at the park, Wilma gave a honeydew to Kevin
While Marsha and Carmen were commuting to the cave, Carmen gave a pear to Marsha
After the lunch, Bernadette and Rosemarie went to the forest. Rosemarie gave a peach to Bernadette
Afterwards, Grace and Paula went to the p

In [12]:
def generate_dataset():
    As = gen_person_a()
    random.shuffle(As)
    Bs = gen_person_b(person_a_names)
    random.shuffle(Bs)
    random.shuffle(OBJECTS)
    random.shuffle(PLACES)
    sentences = []
    for template in abba_templates:
        sentences.append(template.format(A=random.choice(As),
                                         B=random.choice(Bs),
                                         OBJECT=random.choice(OBJECTS),
                                         PLACE=random.choice(PLACES)))
    return sentences

In [13]:
generate_dataset()

['Then, Diane and Brittany went to the zoo. Brittany gave a raspberry to Diane',
 'Then, Peggy and Ellen had a lot of fun at the desert. Ellen gave a mango to Peggy',
 'Then, Willie and Irene were working at the store. Irene decided to give a orange to Willie',
 'Then, Ellen and Joy were thinking about going to the cave. Joy wanted to give a peach to Ellen',
 'Then, Virginia and Barbara had a long argument, and afterwards Barbara said to Virginia',
 'After Bonnie and Gloria went to the zoo, Gloria gave a orange to Bonnie',
 'When Virginia and Karla got a pear at the river, Karla decided to give it to Virginia',
 'When Natalie and Meredith got a honeydew at the forest, Meredith decided to give the honeydew to Natalie',
 'While Vera and Constance were working at the museum, Constance gave a apple to Vera',
 'While Claire and Joan were commuting to the desert, Joan gave a raspberry to Claire',
 'After the lunch, Peggy and Debra went to the school. Debra gave a pear to Peggy',
 'Afterwards

In [20]:
data = []
for template in abba_templates:
    for a, b, o, p in [(a, b, o, p) 
                       for p in PLACES
                       for o in OBJECTS
                       for b in person_b_names
                       for a in person_a_names]:
        data.append(template.format(A=a, B=b, OBJECT=o, PLACE=p))

In [15]:
baba_data = []
for template in baba_templates:
    for a, b, o, p in [(a, b, o, p) 
                       for p in PLACES
                       for o in OBJECTS
                       for b in person_b_names
                       for a in person_a_names]:
        baba_data.append(template.format(A=a, B=b, OBJECT=o, PLACE=p))

In [21]:
len(data)

15000000

In [22]:
len(set(data))

13005000

In [23]:
dedup = list(set(data))

In [16]:
len(baba_data)

15000000

In [26]:
len(dedup)

13005000

In [18]:
len(set(baba_data))

13005000

In [24]:
dedup_baba = list(set(baba_data))

In [25]:
len(dedup_baba)

13005000

In [27]:
ioi_data = []
while len(ioi_data) < 100000:
    ioi_data.append(random.choice(data))

In [28]:
len(ioi_data)

100000

In [29]:
# https://stackoverflow.com/questions/63467352/randomly-sampling-from-multiple-lists
sample_n = 100000
full_data = np.column_stack((dedup, dedup_baba))
idx = np.random.randint(0, len(full_data), size = sample_n) 
# idx = np.reshape(idx, (sample_size, 1))
#sample_data[idx,:]=data[idx,:]

In [33]:
sample_data = full_data[idx, :]

In [34]:
sample_data[0]

array(['Friends Juana and Kristi found a mango at the bar. Kristi gave it to Juana',
       'Then, Yvette and Angie were working at the mountain. Yvette decided to give a banana to Angie'],
      dtype='<U127')

In [36]:
len(sample_data.flatten())

200000

In [38]:
import pickle

In [39]:
with open("ioi_200k_baba_abba.pkl", "wb") as fobj:
    pickle.dump(sample_data.flatten(), fobj)

In [40]:
with open("ioi_26m_baba_abba.pkl", "wb") as fobj:
    pickle.dump(full_data.flatten(), fobj)

In [33]:
with open("pkl_full_data.pkl", "wb") as fobj:
    pickle.dump(data, fobj)

In [31]:
with open("pkl_ioi_data.pkl", "wb") as fobj:
    pickle.dump(ioi_data, fobj)

In [32]:
ioi_data[:20]

['Then, Tracey and Jacqueline went to the home. Jacqueline gave a orange to Tracey',
 'When Kristy and Jacqueline got a honeydew at the shop, Jacqueline decided to give it to Kristy',
 'While Robyn and Katrina were working at the mountain, Katrina gave a apple to Robyn',
 'Then, Karla and Dolores were working at the bar. Dolores decided to give a peach to Karla',
 'The shop Tracey and Francis went to had a coconut. Francis gave it to Tracey',
 'When Karla and Anna got a pineapple at the beach, Anna decided to give it to Karla',
 'The desert Angelina and Dawn went to had a strawberry. Dawn gave it to Angelina',
 'When Tracey and Harriet got a papaya at the forest, Harriet decided to give the papaya to Tracey',
 'Then, Jessica and Eula were working at the museum. Eula decided to give a orange to Jessica',
 'The forest Vivian and Mercedes went to had a apple. Mercedes gave it to Vivian',
 'Then, Jill and Pamela had a lot of fun at the desert. Pamela gave a strawberry to Jill',
 'Friends G