In [68]:
from pathlib import Path 
import json
import os 
from tqdm import tqdm
import random
import copy

In [69]:
alfred_dir = Path('/home/ksmehrab/EmbAIProject/data/json_2.1.0')
train_data_dir = alfred_dir / 'train'

In [70]:
task_dirs = os.listdir(train_data_dir)

In [71]:
task_dirs

['pick_and_place_with_movable_recep-DishSponge-Pan-DiningTable-20',
 'look_at_obj_in_light-BaseballBat-None-DeskLamp-316',
 'pick_and_place_simple-Knife-None-SinkBasin-28',
 'pick_cool_then_place_in_recep-BreadSliced-None-Microwave-24',
 'pick_clean_then_place_in_recep-Fork-None-Drawer-8',
 'pick_and_place_with_movable_recep-Vase-Box-DiningTable-227',
 'pick_heat_then_place_in_recep-Apple-None-SinkBasin-19',
 'pick_cool_then_place_in_recep-Pan-None-CounterTop-12',
 'look_at_obj_in_light-Laptop-None-DeskLamp-216',
 'pick_and_place_with_movable_recep-Watch-Bowl-SideTable-326',
 'pick_and_place_simple-Kettle-None-Cabinet-18',
 'pick_cool_then_place_in_recep-BreadSliced-None-GarbageCan-30',
 'pick_and_place_with_movable_recep-KeyChain-Plate-CoffeeTable-228',
 'pick_cool_then_place_in_recep-Bowl-None-Cabinet-25',
 'pick_heat_then_place_in_recep-TomatoSliced-None-GarbageCan-6',
 'pick_two_obj_and_place-Candle-None-Toilet-419',
 'pick_two_obj_and_place-BreadSliced-None-Fridge-14',
 'pick_cool

In [72]:
# Analyze one task_dir 

In [73]:
for task_dir in task_dirs:
    print(task_dir)
    trials = os.listdir(train_data_dir / task_dir)
    for trial in trials:
        json_file = os.listdir(train_data_dir / task_dir / trial)
        assert len(json_file) == 1, "Found more than one json file inside trial"
        json_file = json_file[0]
        assert json_file.endswith(".json"), f"{json_file} does not end with .json"
        
        #open json
        json_file_path = train_data_dir / task_dir / trial / json_file
        trial_data = json.load(json_file_path.open('r'))
    break

pick_and_place_with_movable_recep-DishSponge-Pan-DiningTable-20


In [54]:
idx = 2
instructions = copy.deepcopy(trial_data['turk_annotations']['anns'][idx]['high_descs'])

In [55]:
instructions

['Turn around and walk between the island and the fridge to the counter with the green sponge on it.',
 'Pick up the green sponge from the counter.',
 'Turn to your left and walk to the front of the stove.',
 'Place the sponge in the pan located on the back left burner of the stove.',
 'Pick up the pan with the sponge in it.',
 'Turn to your right and walk in the direction of the fridge, hang a right to stand at the long side of the island with stool in front of it.',
 'Place the pan with the sponge in it behind the left-most loaf of bread and to the left of the apple.']

In [53]:
# shuffle
random.shuffle(instructions)
instructions

['Turn around and walk between the island and the fridge to the counter with the green sponge on it.',
 'Pick up the pan with the sponge in it.',
 'Place the sponge in the pan located on the back left burner of the stove.',
 'Place the pan with the sponge in it behind the left-most loaf of bread and to the left of the apple.',
 'Turn to your right and walk in the direction of the fridge, hang a right to stand at the long side of the island with stool in front of it.',
 'Pick up the green sponge from the counter.',
 'Turn to your left and walk to the front of the stove.']

In [74]:
"""
Loop through to get all the data in a list of dict
Format: 
        [
            {'
                task': "...",
                'instructions: ["...", "..."]'
            }, 
            {'
                task': "...",
                'instructions: ["...", "..."]'
            }
        ]
"""

data_list = []
for task_dir in tqdm(task_dirs):
    #print(task_dir)
    trials = os.listdir(train_data_dir / task_dir)
    for trial in trials:
        json_file = os.listdir(train_data_dir / task_dir / trial)
        assert len(json_file) == 1, "Found more than one json file inside trial"
        json_file = json_file[0]
        assert json_file.endswith(".json"), f"{json_file} does not end with .json"
        
        #open json
        json_file_path = train_data_dir / task_dir / trial / json_file
        trial_data = json.load(json_file_path.open('r'))
        
        anns = trial_data['turk_annotations']['anns']
        for ann in anns:
            data = {}
            data['task'] = ann['task_desc']
            data['instructions'] = ann['high_descs']
            data_list.append(data)
    

100%|██████████| 2435/2435 [00:32<00:00, 74.63it/s]


In [75]:
%pwd

'/home/ksmehrab/EmbAIProject'

In [76]:
data_list_save_file = Path("./data/all_data.json")
json.dump(data_list, data_list_save_file.open('w'))

## Begin shuffling with all_data.json

In [77]:
# Shuffle
data_list_save_file = Path("./data/all_data.json")
data_list = json.load(data_list_save_file.open('r'))

In [78]:
len(data_list)

21025

In [79]:
"""
Create original sequences and shuffled sequences
Create this in two formats:

Format1 :

Two files. Each containing a list of instructions.
alfred_original_sequences.json -> list of original sequences. 
    Where each sequence is a list containing the task and instructions
alfred_shuffled_sequences.json -> list of shuffles sequences.
    Where each sequence is a list containing the task followed by the shuffled instructions
    
The original_sequences and shuffled_sequences need to map to each other. 
This means the orignal_sequences need to be repeated num_shuffles number of times.
"""

# Format 1
# alfred_original_sequences.json
num_shuffles = 4 
alfred_original_sequences = []
for data in tqdm(data_list):
    sequence = copy.deepcopy([data['task']])
    sequence.extend(data['instructions'])
    for i in range(num_shuffles):
        alfred_original_sequences.append(sequence)
    

100%|██████████| 21025/21025 [00:00<00:00, 302362.53it/s]


In [81]:
alfred_original_sequences[:5]

[['Put a pan containing a sponge on the white table.',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it.',
  'Pick up the green sponge on the left of the sink.',
  'Turn left, and walk to the stove.',
  'Place the sponge in the pan on the back left eye.',
  'Pick up the pan with the sponge in it.',
  'Turn right and walk until you have to turn right again and go to the white table.',
  'Place the pan with the sponge inside on the upper left corner of the table.'],
 ['Put a pan containing a sponge on the white table.',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it.',
  'Pick up the green sponge on the left of the sink.',
  'Turn left, and walk to the stove.',
  'Place the sponge in the pan on the back left eye.',
  'Pick up the pan with the sponge in it.',
  'Turn right and walk until you have to turn right again and go to the white table.',
  'Place the pan with the sponge inside on the upper l

In [82]:
alfred_original_sequences_filepath = Path("./data/alfred_original_sequences.json")
json.dump(alfred_original_sequences, alfred_original_sequences_filepath.open('w'), indent=4)

In [92]:
# Format 2
# alfred_shuffled_sequences.json
alfred_shuffled_sequences = []

for data in tqdm(data_list):
    for i in range(num_shuffles):
        shuffled_sequence = copy.deepcopy([data['task']])
        shuffled_instructions = copy.deepcopy(data['instructions'])
        random.shuffle(shuffled_instructions)
        shuffled_sequence.extend(shuffled_instructions)
        alfred_shuffled_sequences.append(shuffled_sequence)
    

100%|██████████| 21025/21025 [00:00<00:00, 24138.72it/s]


In [94]:
alfred_shuffled_sequences[:5]

[['Put a pan containing a sponge on the white table.',
  'Turn left, and walk to the stove.',
  'Pick up the green sponge on the left of the sink.',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it.',
  'Pick up the pan with the sponge in it.',
  'Place the sponge in the pan on the back left eye.',
  'Turn right and walk until you have to turn right again and go to the white table.',
  'Place the pan with the sponge inside on the upper left corner of the table.'],
 ['Put a pan containing a sponge on the white table.',
  'Pick up the green sponge on the left of the sink.',
  'Place the sponge in the pan on the back left eye.',
  'Turn left, and walk to the stove.',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it.',
  'Turn right and walk until you have to turn right again and go to the white table.',
  'Place the pan with the sponge inside on the upper left corner of the table.',
  'Pick up the pa

In [97]:
assert len(alfred_original_sequences) == len(alfred_shuffled_sequences) == len(data_list) * num_shuffles

In [98]:
alfred_shuffled_sequences_filepath = Path("./data/alfred_shuffled_sequences.json")
json.dump(alfred_shuffled_sequences, alfred_shuffled_sequences_filepath.open('w'), indent=4)

### Create data files for finetuning NSGI format 

In [102]:
"""
Create two files
    finetuning_alfred_original_sequences.json
    finetuning_alfred_shuffled_sequences.json

Format:

Convert each sequence from the original and shuffled lists to a string
 string format: "sequence[0] <-> sequence[1] ..."
"""

ft_orig_sequences = []
for sequences in alfred_original_sequences:
    # sequences is a list
    formatted_string = ' <-> '.join(sequences)
    ft_orig_sequences.append(formatted_string)

In [104]:
ft_orig_sequences[:5]

['Put a pan containing a sponge on the white table. <-> Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> Pick up the green sponge on the left of the sink. <-> Turn left, and walk to the stove. <-> Place the sponge in the pan on the back left eye. <-> Pick up the pan with the sponge in it. <-> Turn right and walk until you have to turn right again and go to the white table. <-> Place the pan with the sponge inside on the upper left corner of the table.',
 'Put a pan containing a sponge on the white table. <-> Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> Pick up the green sponge on the left of the sink. <-> Turn left, and walk to the stove. <-> Place the sponge in the pan on the back left eye. <-> Pick up the pan with the sponge in it. <-> Turn right and walk until you have to turn right again and go to the white table. <-> Place the pan with the sponge inside on the upper left corner of the

In [109]:
ft_orig_sequences_filepath = Path("./data/finetuning_alfred_original_sequences.json")
json.dump(ft_orig_sequences, ft_orig_sequences_filepath.open('w'), indent=4)

In [106]:
ft_shuffled_sequences = []
for shuff_seqs in alfred_shuffled_sequences:
    # shuff_seqs is a list
    formatted_string = ' <-> '.join(shuff_seqs)
    ft_shuffled_sequences.append(formatted_string)

In [107]:
ft_shuffled_sequences[:5]

['Put a pan containing a sponge on the white table. <-> Turn left, and walk to the stove. <-> Pick up the green sponge on the left of the sink. <-> Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> Pick up the pan with the sponge in it. <-> Place the sponge in the pan on the back left eye. <-> Turn right and walk until you have to turn right again and go to the white table. <-> Place the pan with the sponge inside on the upper left corner of the table.',
 'Put a pan containing a sponge on the white table. <-> Pick up the green sponge on the left of the sink. <-> Place the sponge in the pan on the back left eye. <-> Turn left, and walk to the stove. <-> Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> Turn right and walk until you have to turn right again and go to the white table. <-> Place the pan with the sponge inside on the upper left corner of the table. <-> Pick up the pan with the sponge

In [108]:
ft_shuff_seqs_filepath = Path("./data/finetuning_alfred_shuffled_sequences.json")
json.dump(ft_shuffled_sequences, ft_shuff_seqs_filepath.open('w'), indent=4)

### Prepare data in reference format

In [11]:
# Start with the original sequences 
from pathlib import Path
import json
alfred_original_sequences_filepath = Path("./data/alfred_original_sequences.json")
alfred_original_sequences = json.load(alfred_original_sequences_filepath.open('r'))

# Format it with the tokens
ref_sequences = []
for sequences in alfred_original_sequences:
    # sequences is a list
    formatted_sequences = []
    for instruction in sequences:
        formatted_sequences.append(instruction + " <-> ")
    ref_sequences.append(formatted_sequences)
    

In [12]:
ref_sequences_filepath = Path("./data/alfred_reference_step_libraries.json")
json

[['Put a pan containing a sponge on the white table. <-> ',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> ',
  'Pick up the green sponge on the left of the sink. <-> ',
  'Turn left, and walk to the stove. <-> ',
  'Place the sponge in the pan on the back left eye. <-> ',
  'Pick up the pan with the sponge in it. <-> ',
  'Turn right and walk until you have to turn right again and go to the white table. <-> ',
  'Place the pan with the sponge inside on the upper left corner of the table. <-> '],
 ['Put a pan containing a sponge on the white table. <-> ',
  'Turn around and walk to the wall and turn left and walk to the counter with the sink on it. <-> ',
  'Pick up the green sponge on the left of the sink. <-> ',
  'Turn left, and walk to the stove. <-> ',
  'Place the sponge in the pan on the back left eye. <-> ',
  'Pick up the pan with the sponge in it. <-> ',
  'Turn right and walk until you have to turn right again and go to the

### Preparea data for finetuning BARTforConditionalGeneration using huggingface scripts 