# Get relgrams(relational triples, <subj, verb, obj>) from the deptrees generated from goals and step headlines.
01.05.2023

In [1]:
import dill as pickle
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Get tuples from goals.
# For loading the deptrees.
deptree_path = "/mount/studenten/arbeitsdaten-studenten1/shencg/BA/V2/event_representations/relational/dep_parsing/"
# For loading fid2goal and sid2step.
relgram_path = "/mount/studenten/arbeitsdaten-studenten1/shencg/BA/V2/event_representations/relational/relgram/"

# Load fid2goal.
fid2goal = pickle.load(open(relgram_path + 'fid2goal.p', 'rb'))
# Load deptrees of goals.
goal_parses = pickle.load(open(deptree_path + 'goal_parsing/goal_parses.p', 'rb'))

## Goal.

In [3]:
def get_subtree(deptree, arc2dep:dict, idx_node:int):
    """
    Get the subtree of a node in a dependency tree in form of a python dictionary.
    
    @param deptree:supar.utils.Dataset  Dependency tree object generated by supar.
    @param arc2dep:dict A mapping from arc to a [word, idx, rel]-list representing 
                        the whole deptree of a sentence.
    @param idx_node:str  Idex of the word corresponding to the node in the deptree.
    @returns subtree:dict  A mapping from arc to a [word, idx, rel]-list representing 
                           the subtree of the passed node.
    """
    
    if idx_node in range(1, len(deptree.texts)+1):
        subtree = defaultdict(list)
        if idx_node in arc2dep:
            subtree[idx_node] = arc2dep[idx_node]
            subtree_words = [item[0] for item in arc2dep[idx_node]]
            subtree_idxs = [item[1] for item in arc2dep[idx_node]]
            for idx_subtree_node in subtree_idxs:
                subtree.update(get_subtree(deptree, arc2dep, idx_subtree_node))
        subtree = dict(sorted(subtree.items()))
        return subtree
    else:
        raise ValueError("The word is not in any node of the deptree! Check your argument!")

In [4]:
def get_quadruple(deptree):
    """
    Get <subj, verb, obj, prep_phrase>-tuples ('quadruple') from the dependency trees(deptree).
    
    @param deptree: supar.utils.Dataset  Dependency tree object generated by supar.
    @returns tuple: list[str]  The quadruple extracted from each deptree.
    """
    
    # Create a mapping from the arcs of a deptree to the [word, index, relation] of the deptree.
    arc2dep = defaultdict(list)
    for idx, w, arc, rel in zip(range(1, len(deptree.texts)+1), deptree.texts, deptree.arcs, deptree.rels):
        tmp = []
        if [w, idx, rel] not in arc2dep[arc]:
            tmp.append([w, idx, rel])
        arc2dep[arc] += tmp
    arc2dep = dict(sorted(arc2dep.items()))
    
    # Initialize 4-tuple with the root word, i.e. the predicate verb.
    root = arc2dep[0][0][0]
    # Get the index of the root word in the sentence.
    idx_root = arc2dep[0][0][1]
    
    quadruple = [root]
    indices = [[idx_root-2]]
    
    
    idx_dobj = None
    dobj = ""
    prep = ""
    idx_pobj = None
    for arc, deps in get_subtree(deptree, arc2dep, idx_root).items():
        for dep in deps:
            if dep[2] == 'dobj':
                idx_dobj = dep[1]
                if arc == idx_root:
                    dobj = dobj + dep[0] + ' '
#                     indices.append([idx_dobj-2])
                    if len(indices) == 1:
                        indices.append([idx_dobj-2])
                    elif len(indices) > 1:
                        indices[-1].append(idx_dobj-2)
                    else:
                        break
            if dep[2] == 'prep':
                idx_prep = dep[1]
                prep = prep + ' ' + dep[0]
                prep.strip()
                indices.append([idx_prep-2])
            if dep[2] == 'pobj':
                idx_pobj = dep[1]
                prep = prep + ' ' + dep[0]
                prep.strip()
                indices[-1] += [idx_pobj-2]
                
                
        # Check if the predicate consists of multiple verbs, e.g. "make and use", and 
        # if the direct object and the object of the prep. phrase is a phrase consisting of mulitple tokens, 
        # e.g. "heel pain and plantar fasciitis".
        if len(deps) > 1:
            for dep1, dep2 in zip(deps[:-1], deps[1:]):
                if dep1[2] == 'cc' and dep2[2] == 'conj':
                    # If the predicate consists of multiple verbs
                    if arc == idx_root:
                        quadruple[0] = quadruple[0] + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_root.
                        indices[0] += [dep1[1]-2, dep2[1]-2]
                    # If the direct object consists of multiple tokens
                    if arc == idx_dobj:
                        dobj = dobj + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_dobj.
                        if len(indices) > 1:
                            indices[1] += [dep1[1]-2, dep2[1]-2]
                    # If the object of the prep. phrase consists of multiple tokens
                    if arc == idx_pobj:
                        prep = prep + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_pobj.
                        if len(indices) > 1:
                            indices[-1] += [dep1[1]-2, dep2[1]-2]
                        
    # Remove the whitespace at the end of dobj, especially for the dobj with multiple words.
    dobj = dobj.strip()
    # Append dobj and prep to quadruple and add subj('person') at the beginning of quadruple.
    quadruple.append(dobj)
    quadruple.append(prep)
    quadruple = ['<|startoftext|>'] + quadruple
    indices = [[0]] + indices
    
    # Delete whitespace at the beginning of each item in quadruple.
    quadruple = [item[1:] if item.startswith(' ') else item for item in quadruple]
    
    # Unify the quadruple length by padding with a '[PAD]' token, so that the BERT-embeddings will be of the same length.
#     quadruple = ['[PAD]' if item == '' else item for item in quadruple]
    for idx, item in enumerate(quadruple):
        if item == "":
            quadruple[idx] = "[PAD]"
#             indices = indices + [[-1]]
            indices.insert(idx, [-1])
        else:
            item = item
            
    return [quadruple, indices]

### Start Test.

In [5]:
list(fid2goal.items())[1]

('10000832', ('how', 'to', 'plant', 'lavender', 'in', 'pots'))

In [6]:
deptree = list(fid2parses.items())[1][1].sentences[0]
deptree

1	how	_	_	_	_	3	advmod	_	_
2	to	_	_	_	_	3	aux	_	_
3	plant	_	_	_	_	0	root	_	_
4	lavender	_	_	_	_	3	dobj	_	_
5	in	_	_	_	_	3	prep	_	_
6	pots	_	_	_	_	5	pobj	_	_

In [7]:
result = get_quadruple(deptree)
result

[['<|startoftext|>', 'plant', 'lavender', 'in pots'], [[0], [1], [2], [3, 4]]]

### End Test.

In [25]:
for i, w in enumerate(["<|startoftext|>"] + list(fid2goal['10058376'][2:])):
    print(i, w)

0 <|startoftext|>
1 fly
2 to
3 new
4 zealand
5 for
6 cheap


In [4]:
# Create a mapping from file_id to quadruples extracted from that goal.
fid2quadruple = dict()
for fid in tqdm(goal_parses):
    deptree = goal_parses[fid].sentences[0]
    if fid not in fid2quadruple:
        fid2quadruple[fid] = get_quadruple(deptree)
        
for fid, values in list(fid2quadruple.items()):
    print("BEFORE:", fid, values)
    values[1] = [item for sublist in values[1] for item in sublist]
    print("AFTER:", fid, values, "\n")
        
# for fid, values in list(fid2quadruple.items()):
#     if len(values[1]) == 1 and type(values[1][0]) == list:
# #         print("BEFORE:", fid, values)
#         values[1] = values[1][0]
#         for word in values[0]:
#             if word == '[PAD]':
# #                 values[1][values[0].index(word)] = -1
#                 pad_idx = values[0].index(word)
#                 l1, l2 = values[1][1:pad_idx+1], values[1][pad_idx+1:]
#                 print("l1:", l1, "l2:", l2)
#                 values[1] = l1 + [-1] + l2
# #         print("AFTER:", fid, values, "\n")

  0%|                                                                                                   | 0/53186 [00:00<?, ?it/s]


NameError: name 'get_quadruple' is not defined

In [16]:
for i, values in list(fid2quadruple.items()):
    items = " ".join(values[0]).split()
    if len(items) != len(values[1]):
        print(i, values)
        l1, l2 = values[0][:2], values[0][2:]
        l3, l4 = values[1][:2], values[1][2:]
        for i, item in enumerate(l2):
            if item != '[PAD]':
                l2[i] = '[PAD]'
        l4 = [-1, -1]
        values[0] = l1 + l2
        values[1] = l3 + l4
        print("l1:", l1, "l2:", l2, "l3:", l3, "l4:", l4)
        print("AFTER:", i, values, '\n')

11720166 [['<|startoftext|>', 'stop', 'and cheeks', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'stop'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'stop', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

1773985 [['<|startoftext|>', 'react', 'or cat', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'react'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'react', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

1797701 [['<|startoftext|>', 'girls', 'and routine', '[PAD]'], [0, 9, -1]]
l1: ['<|startoftext|>', 'girls'] l2: ['[PAD]', '[PAD]'] l3: [0, 9] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'girls', '[PAD]', '[PAD]'], [0, 9, -1, -1]] 

18294 [['<|startoftext|>', 'know', 'or her', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'know'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'know', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

389518 [['<|startoftext|>', 'get', 'and fruits', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>',

In [21]:
for i, values in list(fid2quadruple.items()):
    items = " ".join(values[0]).split()
    assert len(items) == len(values[1])

In [22]:
print(fid2goal['11720166'])
print(fid2quadruple['11720166'])

('how', 'to', 'stop', 'biting', 'your', 'lips', 'and', 'cheeks')
[['<|startoftext|>', 'stop', '[PAD]', '[PAD]'], [0, 1, -1, -1]]


In [23]:
print(fid2goal['469743'])
print(fid2quadruple['469743'])

('how', 'to', 'sleep', 'longer', '(', 'for', 'kids', 'and', 'teens', ')')
[['<|startoftext|>', 'sleep', '[PAD]', '[PAD]'], [0, 1, -1, -1]]


In [25]:
# # Write fid2quadruple to fid2quadruple.p file.
# with open('fid2quadruple.p', 'wb') as file:
#     pickle.dump(fid2quadruple, file)

## Step Headline.
Since step headlines do not start with "how to", the indices do not need to be -2. Overwrite the get_quadruple() function.

In [4]:
def get_quadruple(deptree):
    """
    Get <subj, verb, obj, prep_phrase>-tuples ('quadruple') from the dependency trees(deptree).
    
    @param deptree: supar.utils.Dataset  Dependency tree object generated by supar.
    @returns tuple: list[str]  The quadruple extracted from each deptree.
    """
    
    # Create a mapping from the arcs of a deptree to the [word, index, relation] of the deptree.
    arc2dep = defaultdict(list)
    for idx, w, arc, rel in zip(range(1, len(deptree.texts)+1), deptree.texts, deptree.arcs, deptree.rels):
        tmp = []
        if [w, idx, rel] not in arc2dep[arc]:
            tmp.append([w, idx, rel])
        arc2dep[arc] += tmp
    arc2dep = dict(sorted(arc2dep.items()))
    
    # Initialize 4-tuple with the root word, i.e. the predicate verb.
    root = arc2dep[0][0][0]
    # Get the index of the root word in the sentence.
    idx_root = arc2dep[0][0][1]
    
    quadruple = [root]
    indices = [[idx_root]]
    
    
    idx_dobj = None
    dobj = ""
    prep = ""
    idx_pobj = None
    for arc, deps in get_subtree(deptree, arc2dep, idx_root).items():
        for dep in deps:
            if dep[2] == 'dobj':
                idx_dobj = dep[1]
                if arc == idx_root:
                    dobj = dobj + dep[0] + ' '
#                     indices.append([idx_dobj])
                    if len(indices) == 1:
                        indices.append([idx_dobj])
                    elif len(indices) > 1:
                        indices[-1].append(idx_dobj)
                    else:
                        break
            if dep[2] == 'prep':
                idx_prep = dep[1]
                prep = prep + ' ' + dep[0]
                prep.strip()
                indices.append([idx_prep])
            if dep[2] == 'pobj':
                idx_pobj = dep[1]
                prep = prep + ' ' + dep[0]
                prep.strip()
                indices[-1] += [idx_pobj]
                
        # Check if the predicate consists of multiple verbs, e.g. "make and use", and 
        # if the direct object and the object of the prep. phrase is a phrase consisting of mulitple tokens, 
        # e.g. "heel pain and plantar fasciitis".
        if len(deps) > 1:
            for dep1, dep2 in zip(deps[:-1], deps[1:]):
                if dep1[2] == 'cc' and dep2[2] == 'conj':
                    # If the predicate consists of multiple verbs
                    if arc == idx_root:
                        quadruple[0] = quadruple[0] + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_root.
                        indices[0] += [dep1[1], dep2[1]]
                    # If the direct object consists of multiple tokens
                    if arc == idx_dobj:
                        dobj = dobj + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_dobj.
                        if len(indices) > 1:
                            indices[1] += [dep1[1], dep2[1]]
                    # If the object of the prep. phrase consists of multiple tokens
                    if arc == idx_pobj:
                        prep = prep + ' ' + dep1[0] + ' ' + dep2[0]
                        # Add the indices of the tokens into the internal list for the idx_pobj.
                        if len(indices) > 1:
                            indices[-1] += [dep1[1], dep2[1]]
                        
    # Remove the whitespace at the end of dobj, especially for the dobj with multiple words.
    dobj = dobj.strip()
    # Append dobj and prep to quadruple and add subj('person') at the beginning of quadruple.
    quadruple.append(dobj)
    quadruple.append(prep)
    quadruple = ['<|startoftext|>'] + quadruple
    indices = [[0]] + indices
    
    # Delete whitespace at the beginning of each item in quadruple.
#     quadruple = [item[1:] if item.startswith(' ') else item for item in quadruple]
    for idx, item in enumerate(quadruple):
        if item == "":
            quadruple[idx] = "[PAD]"
#             indices = indices + [[-1]]
            indices.insert(idx, [-1])
        else:
            item = item
    
    # Unify the quadruple length by padding with a '[PAD]' token, so that the BERT-embeddings will be of the same length.
#     quadruple = ['[PAD]' if item == '' else item for item in quadruple]
    for idx, item in enumerate(quadruple):
        if item == "":
            quadruple[idx] = "[PAD]"
#             indices = indices + [[-1]]
            indices.insert(idx, [-1])
        else:
            item = item
            
    return [quadruple, indices]

In [5]:
# Load sid2step
sid2step = pickle.load(open(relgram_path + "sid2step.p", "rb"))
print(len(sid2step))
sid2step

772221


{'10000798_0_0': ('find',
  'a',
  'door',
  'to',
  'hang',
  'your',
  'ironing',
  'board',
  '.'),
 '10000798_0_1': ('ensure',
  'the',
  'board',
  'will',
  'fit',
  'on',
  'the',
  'back',
  'of',
  'the',
  'chosen',
  'door',
  '.'),
 '10000798_0_2': ('buy',
  'a',
  'hanger',
  'with',
  '2',
  'hooks',
  'for',
  'a',
  'board',
  'with',
  'a',
  't',
  '-',
  'shaped',
  'base',
  '.'),
 '10000798_0_3': ('choose',
  'bendable',
  'hooks',
  'for',
  'a',
  'board',
  'with',
  'a',
  'u-',
  'or',
  'v-shaped',
  'base',
  '.'),
 '10000798_0_4': ('place',
  'the',
  'ironing',
  'board',
  'hanger',
  'over',
  'the',
  'door',
  '.'),
 '10000798_0_5': ('hang',
  'your',
  'ironing',
  'board',
  'on',
  'the',
  'hanger',
  '.'),
 '10000798_1_0': ('choose',
  'a',
  'load',
  '-',
  'bearing',
  'wall',
  'to',
  'hang',
  'the',
  'ironing',
  'board',
  '.'),
 '10000798_1_1': ('confirm',
  'that',
  'the',
  'board',
  'will',
  'fit',
  'your',
  'chosen',
  'wall',
 

In [6]:
### Get quadruples from the dependency trees generated from step headlines. ###
# Load the merged step_parses_all.p file.
step_parses = pickle.load(open(deptree_path + '/step_parsing/step_parsing_results/step_parses_all.p', 'rb'))

In [7]:
# Create a mapping from step_id to quadruple extracted from the headline of that step.
sid2quadruple = dict()
for sid in tqdm(step_parses):
    deptree = step_parses[sid].sentences[0]
    if sid not in sid2quadruple:
        sid2quadruple[sid] = get_quadruple(deptree)
        
for sid, values in list(sid2quadruple.items()):
#     print("BEFORE:", sid, values)
    values[1] = [item for sublist in values[1] for item in sublist]
#     print("AFTER:", sid, values, "\n")
        
# for sid in sid2quadruple:
#     sid2quadruple[sid] = list(sid2quadruple[sid])
#     for i, word in enumerate(sid2quadruple[sid][0]):
#         sid2quadruple[sid][0][i] = word.strip()
        
# for fid, values in list(sid2quadruple.items()):
#     print("BEFORE:", fid, values)
#     tmp = []
#     for sublist in values[1]:
#         tmp += sublist
#     values[1] = tmp
#     print("AFTER:", fid, values, "\n")
#     sid2quadruple[fid][1] = values[1]

100%|██████████████████████████████████████████████████████████████████████████████████| 772221/772221 [00:33<00:00, 22948.94it/s]


In [8]:
for i, values in list(sid2quadruple.items()):
    items = " ".join(values[0]).split()
#     print("ITEMS:", items)
    if len(items) != len(values[1]):
        print(i, values)
        l1, l2 = values[0][:2], values[0][-2:]
        l3, l4 = values[1][:2], values[1][-2:]
        for i, item in enumerate(l2):
            if item != '[PAD]':
                l2[i] = '[PAD]'
        l4 = [-1, -1]
        # words
        values[0] = l1 + l2
        # indices
        values[1] = l3 + l4
        print("l1:", l1, "l2:", l2, "l3:", l3, "l4:", l4)
        print("AFTER:", i, values, '\n')

10002000_1_2 [['<|startoftext|>', 'let', 'or activity', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'let'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'let', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

1000676_1_3 [['<|startoftext|>', 'continue', 'and vegetables', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'continue'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'continue', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

10024314_2_2 [['<|startoftext|>', 'continue', 'and activities', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'continue'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'continue', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

10030143_0_2 [['<|startoftext|>', 'stop', 'or herb', ' by doctor'], [0, 1, 12, 14]]
l1: ['<|startoftext|>', 'stop'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'stop', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

100340_1_1 [['<|startoftext|>', 'avoi

1754860_0_4 [['<|startoftext|>', 'know', 'or strain', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'know'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'know', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

1754860_1_2 [['<|startoftext|>', 'pretend', 'or tingling', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'pretend'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'pretend', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

1757182_1_1 [['<|startoftext|>', 'remember', 'and dates', ' on time of conception'], [0, 1, 14, 16, 17, 18]]
l1: ['<|startoftext|>', 'remember'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'remember', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

176276_2_2 [['<|startoftext|>', 'make', 'or alcohol', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'make'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'make', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

176306_1_7 [['<|startoftext|

4562872_1_0 [['<|startoftext|>', 'add', 'tablespoons', ' wok or pan in of oil or oil'], [0, 7, 3, 1, 10, 15, 17, 18, 21]]
l1: ['<|startoftext|>', 'add'] l2: ['[PAD]', '[PAD]'] l3: [0, 7] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'add', '[PAD]', '[PAD]'], [0, 7, -1, -1]] 

45651_0_2 [['<|startoftext|>', 'try', 'and cards', ' by bank'], [0, 1, 11, 14]]
l1: ['<|startoftext|>', 'try'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'try', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

45651_2_1 [['<|startoftext|>', 'forget', 'and money', '[PAD]'], [0, 3, -1]]
l1: ['<|startoftext|>', 'forget'] l2: ['[PAD]', '[PAD]'] l3: [0, 3] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'forget', '[PAD]', '[PAD]'], [0, 3, -1, -1]] 

45682_0_4 [['<|startoftext|>', 'option', 'and settings', '[PAD]'], [0, 2, -1]]
l1: ['<|startoftext|>', 'option'] l2: ['[PAD]', '[PAD]'] l3: [0, 2] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'option', '[PAD]', '[PAD]'], [0, 2, -1, -1]] 

4571515_1_7 [['<|starto

8272783_0_4 [['<|startoftext|>', 'consider', 'or lathe', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'consider'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'consider', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

8272926_2_0 [['<|startoftext|>', 'determine', 'or female', '[PAD]'], [0, 1, -1]]
l1: ['<|startoftext|>', 'determine'] l2: ['[PAD]', '[PAD]'] l3: [0, 1] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'determine', '[PAD]', '[PAD]'], [0, 1, -1, -1]] 

8273078_0_6 [['<|startoftext|>', 'drop', 'or tablespoondough', ' onto sheet'], [0, 10, 13, 14, 17]]
l1: ['<|startoftext|>', 'drop'] l2: ['[PAD]', '[PAD]'] l3: [0, 10] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'drop', '[PAD]', '[PAD]'], [0, 10, -1, -1]] 

8273078_1_5 [['<|startoftext|>', 'drop', 'or tablespoondough', ' onto sheet'], [0, 10, 13, 14, 17]]
l1: ['<|startoftext|>', 'drop'] l2: ['[PAD]', '[PAD]'] l3: [0, 10] l4: [-1, -1]
AFTER: 1 [['<|startoftext|>', 'drop', '[PAD]', '[PAD]'], [0, 10, -1, -1]] 

82

In [39]:
print(sid2step['10044817_0_6'])
print(sid2quadruple['10044817_0_6'])

('pack', 'lightly', 'but', 'do', 'n’t', 'forget', 'your', 'rain', 'gear', 'and', 'layers', '.')
[['<|startoftext|>', 'pack but forget', '[PAD]', '[PAD]'], [0, 1, -1, -1]]


In [30]:
# for i, values in list(sid2quadruple.items())[:100]:
#     if len(values[0]) < len(values[1]):
#         print("BEFORE:", i, values)
#         values[0][-2] = '[PAD]'
#         values[1] += [[-1]]
#         print("AFTER:", i, values, '\n')

BEFORE: 10000798_0_1 [['<|startoftext|>', 'ensure', '[PAD]', ' on back of door'], [[0], [1], [-1], [6, 8], [9, 12]]]
AFTER: 10000798_0_1 [['<|startoftext|>', 'ensure', '[PAD]', ' on back of door'], [[0], [1], [-1], [6, 8], [9, 12], [-1]]] 

BEFORE: 10000798_0_2 [['<|startoftext|>', 'buy', 'hanger', ' with for hooks board with base'], [[0], [1], [3], [4], [7, 6, 9], [10, 15]]]
AFTER: 10000798_0_2 [['<|startoftext|>', 'buy', '[PAD]', ' with for hooks board with base'], [[0], [1], [3], [4], [7, 6, 9], [10, 15], [-1]]] 

BEFORE: 10000798_0_3 [['<|startoftext|>', 'choose', 'hooks', ' for board with base'], [[0], [1], [3], [4, 6], [7, 12]]]
AFTER: 10000798_0_3 [['<|startoftext|>', 'choose', '[PAD]', ' for board with base'], [[0], [1], [3], [4, 6], [7, 12], [-1]]] 

BEFORE: 10000832_0_2 [['<|startoftext|>', 'fill', 'inches', ' with of pot peanuts or gravel'], [[0], [1], [7], [11], [8, 10, 14, 15, 16]]]
AFTER: 10000832_0_2 [['<|startoftext|>', 'fill', '[PAD]', ' with of pot peanuts or gravel']

In [10]:
# # Write file.
# with open("sid2quadruple.p", "wb") as file:
#     pickle.dump(sid2quadruple, file)