## ALFRED DATASET



In [52]:
import os, sys, platform
import os.path as osp
import json
import numpy as np
import networkx as nx
from typing import *

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# import spacy
# from spacy import displacy
# import scipy.sparse as sp
# import sympy; sympy.init_printing()

# import torch
# torch.manual_seed(42)

from IPython.core.display import display, Image, HTML
from IPython.core.debugger import set_trace

import json
import glob
import pprint

### Env Setup

In [53]:
DATA_DIR = f"/mnt/sda4/DATA/ALFRED/data"
PROJ_DIR = f"{osp.expanduser('~')}/Research/projects/embodied-ai"
SPLITS_DIR = f"{PROJ_DIR}/alfred/data/splits"
node = platform.node()
if (node == 'v') or ('gpu' in node):
    print("On Vector cluster")
    %env MUJOCO_GL=egl
    DATA_DIR = "/scratch/ssd004/datasets/alfred/data"
    
nb_dir = osp.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.insert(0, nb_dir)

# print(f"{os.name}.{platform.system()}.{platform.release()}.{platform.node()}")
# print(f'os.cwd() -> {os.getcwd()}')

In [4]:
# Shell escape: print the notebook's current working directory.
!pwd

/home/raeidsaqur/Research/projects/embodied-ai/alfred/nb-playground


### Splits 
We want to create a split that contains just one kind of task, for quick debugging and similar experiments.


In [7]:

# Fail early when the splits folder is missing.
if not os.path.exists(SPLITS_DIR):
    raise FileNotFoundError("Splits folder not found")


def print_count(x):
    """Pretty-print the number of entries stored under each key of the mapping `x`."""
    pprint.pprint({k: len(v) for k, v in x.items()})


# args.splits = 'splits/oct21.json'
# Build the path from SPLITS_DIR so that the folder validated above is the one actually read.
splits_path = os.path.join(SPLITS_DIR, "oct21.json")

with open(splits_path, 'r') as f:
    splits = json.load(f)

# Per-split sample counts.
print_count(splits)
    

    

{'tests_seen': 1533,
 'tests_unseen': 1529,
 'train': 21023,
 'valid_seen': 820,
 'valid_unseen': 821}


In [29]:
# Names of the splits (the same keys as in the per-split counts printed for `splits` above).
SPLIT_TYPES = [
    'tests_seen',
    'tests_unseen',
    'train',
    'valid_seen',
    'valid_unseen',
]

# Task-type names; the cells below select samples whose 'task' string contains one of these.
TASK_TYPES = [
    'pick_and_place_simple',
    'pick_clean_then_place_in_recep',
    'pick_heat_then_place_in_recep',
    'pick_cool_then_place_in_recep',
    'pick_two_obj_and_place',
    'look_at_obj_in_light',
    'pick_and_place_with_movable_recep',
]

# pick_and_place_simple: 3487 samples over train + valid_seen + valid_unseen
# (3245 + 142 + 100, see the filter output below), i.e. roughly 3487/3 trials (unique task_ids).
# NOTE(review): the original note attributed the 3487 to train alone - confirm.


In [None]:
# Preview the first few entries of every split instead of echoing all 25,726 records
# (1533 + 1529 + 21023 + 820 + 821 per the counts printed above).
{name: entries[:3] for name, entries in splits.items()}

In [49]:
task_name = 'pick_and_place_simple'
splits_pps_fn = 'oct21-pps.json'

# Sanity check: number of train samples whose 'task' string mentions the task type.
pps = [d for d in splits['train'] if task_name in d['task']]
print(len(pps))

# Keep only the matching samples of every non-test split.
splits_pps = {}
for k, v in splits.items():
    if 'tests' not in k:
        print(f"len(v) = {len(v)}")
        pps = [d for d in v if task_name in d['task']]
        print(f"No. of {task_name} samples in {k} = {len(pps)}")
        splits_pps[k] = pps

3245
len(v) = 21023
No. of pick_and_place_simple samples in train = 3245
len(v) = 820
No. of pick_and_place_simple samples in valid_seen = 142
len(v) = 821
No. of pick_and_place_simple samples in valid_unseen = 100


In [50]:
print_count(splits_pps)

# Example of a full 'task' value: "<task spec>/<trial id>".
t = 'pick_cool_then_place_in_recep-LettuceSliced-None-DiningTable-17/trial_T20190909_070538_437648'


def get_task_id(t):
    """Return the part of a task string after its last '/' (the trial id)."""
    return t.split("/")[-1]


## Unique task ids of every sample kept in splits_pps (train and validation sets for the pps task)
taskids_pps = set()
for samples in splits_pps.values():
    taskids_pps.update(get_task_id(d['task']) for d in samples)


{'train': 3245, 'valid_seen': 142, 'valid_unseen': 100}


In [47]:
print(len(taskids_pps))
import random

# random.sample() needs a sequence: sampling straight from a set was deprecated in
# Python 3.9 and raises TypeError from 3.11 on, so sort the ids into a list first.
# The sample size is capped so that a set with fewer than 5 ids does not raise ValueError.
random.sample(sorted(taskids_pps), min(5, len(taskids_pps)))

1110


['trial_T20190909_091246_807206',
 'trial_T20190906_213417_095299',
 'trial_T20190907_092230_655414',
 'trial_T20190909_061144_774666',
 'trial_T20190907_173922_758000']

In [40]:
# Per-split sample counts of the full (unfiltered) splits, for comparison with splits_pps.
print_count(splits)

{'tests_seen': 1533,
 'tests_unseen': 1529,
 'train': 21023,
 'valid_seen': 820,
 'valid_unseen': 821}


In [41]:
## Any overlapping task_ids in 'tests' splits from train and valid? No
# taskids_pps holds bare trial ids (last '/'-separated component of 'task'), so compare on
# that component: a 'task' value carrying a "<task spec>/" prefix could never match as a whole.
for k, v in splits.items():
    if 'tests' not in k:
        continue
    print(f"len(v) = {len(v)}")
    pps = [d for d in v if d['task'].split("/")[-1] in taskids_pps]
    print(f"No. of {task_name} samples in {k} = {len(pps)}")
    splits_pps[k] = pps

print_count(splits_pps)

len(v) = 1533
No. of pick_and_place_simple samples in tests_seen = 0
len(v) = 1529
No. of pick_and_place_simple samples in tests_unseen = 0
{'tests_seen': 0,
 'tests_unseen': 0,
 'train': 3245,
 'valid_seen': 142,
 'valid_unseen': 100}


For `tests_[seen|unseen]`, only the `traj_data.json` file is provided. No additional image info is provided, so we can't curate the test splits by task type.


In [51]:
# Coalesced function

def get_splits_by_task_type(task_type:str,
                            proj_dir=None,
                            proj_name='alf', default_split="splits/oct21.json",
                            save_json=False,
                            debug=False) -> Dict:
    """Restrict the non-test splits of a splits file to the samples of one task type.

    Args:
        task_type: Substring looked for in each sample's 'task' string.
        proj_dir: Project directory; the splits file is read from
            `<proj_dir>/data/<default_split>`. When falsy, it defaults to
            `~/Research/projects/embodied-ai/<proj_name>`.
        proj_name: Folder name used only to build the default `proj_dir`.
        default_split: Path of the splits JSON file relative to `<proj_dir>/data`.
        save_json: When true, also write the result to
            `<proj_dir>/data/splits/<split file stem>-<task_type>.json`.
        debug: When true, print per-split sample counts along the way.

    Returns:
        Dict mapping split name to the list of matching samples. Splits whose name
        contains 'tests' are not filtered; every name in the module-level SPLIT_TYPES
        that has no entry yet (i.e. the test splits) is mapped to an empty list.

    Raises:
        FileNotFoundError: If the default project directory does not exist, or
            (from `open`) if the splits file is missing.
    """
    def _show_counts(mapping):
        # Self-contained equivalent of the notebook's `print_count` helper, so the
        # function does not depend on a lambda defined in another cell.
        import pprint
        pprint.pprint({name: len(items) for name, items in mapping.items()})

    print(f"Getting splits by task: {task_type}")
    if not proj_dir:
        proj_dir = f"{os.path.expanduser('~')}/Research/projects/embodied-ai/{proj_name}"
        if not os.path.exists(proj_dir):
            raise FileNotFoundError("Project dir does not exist")

    splits_path = os.path.join(f"{proj_dir}/data", default_split)
    with open(splits_path, 'r') as f:
        splits = json.load(f)
    if debug: _show_counts(splits)

    splits_task = {}
    for k, v in splits.items():
        if 'tests' in k:
            continue
        tvs = [d for d in v if task_type in d['task']]
        if debug: print(f"No. of {task_type} samples in {k} = {len(tvs)}")
        splits_task[k] = tvs

    for stype in SPLIT_TYPES:
        # Set empty list for 'tests_[seen|unseen]'
        if stype not in splits_task:
            splits_task[stype] = []

    if debug: _show_counts(splits_task)

    if save_json:
        fn_prefix = default_split.split("/")[-1].split('.')[0]
        fn = "{}-{}.json".format(fn_prefix, task_type)
        save_dir = os.path.join(proj_dir, "data/splits")
        # The output folder need not exist yet (e.g. when default_split lives directly
        # under data/), so create it instead of failing inside open().
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, fn)
        print(f"\tSaving {task_type} split to: {save_path}")
        with open(save_path, 'w') as fp:
            json.dump(splits_task, fp, sort_keys=True, indent=4)

    return splits_task
    
