In [1]:
import benepar 
parser = benepar.Parser("benepar_en2")
def parse_line(line):
    tree = parser.parse(line)
    item = None
    for tr in tree.subtrees():
        if tr.label() == 'VP':
            item = tr
    return item 

# Process queries by constituency parser, extract main verb (verb not in subordinating conjunction)
# Main objects are also extracted 
t = "person turns off the light as they're leaving"

from nltk.tree import ParentedTree
def is_sbar(tr, v_cnt):
    ptr = tr.parent()
    flag = False 
    while ptr:
        if ptr.label() == 'SBAR' and v_cnt !=0:
            flag = True 
            break 
        else:
            ptr = ptr.parent() 
    return flag 

verbs_tags = ['VBZ', 'VBP', 'VB', 'VBD','VBG', 'VBN']
nouns_tags = ['NNS', 'NN']
def find_verb(item):
    verb = None 
    for st in item.subtrees():
        if st.label() in verbs_tags:
            verb = st.leaves()
            return verb, st  
    return verb, st  

def find_noun(item):
    noun = None 
    for st in item.subtrees():
        if st.label() in nouns_tags:
            noun = st.leaves()
            return noun
    return None 

# Extract main verbs and nouns as translation phrase 
def process_line(t):
    line = t.split("##")[-1]
    tree = parser.parse(line)
    newtree = ParentedTree.convert(tree)
    output= [] 
    verbs = [] 
    nouns = [] 
    v_cnt = 0 
    out = ""
    for tr in newtree.subtrees():
        if tr.label() == 'VP':
            #flag = is_sbar(tr, v_cnt)
            flag = False 
            if not flag:
                verb, st = find_verb(tr)
                if verb not in verbs:
                    verbs.append(verb)
                    v_cnt +=1 
                    if verb:
                        out += " "+ " ".join(verb)
                noun = find_noun(tr)
                if noun not in nouns: 
                    nouns.append(noun)
                    #print(noun)
                    if noun:
                        out += " "+ " ".join(noun)
    return verbs, nouns, out   


In [2]:
verbs, nouns, out  = process_line(t)

In [3]:
stem_queries_verb = {}
from nltk.stem.wordnet import WordNetLemmatizer

out.split()
words = [WordNetLemmatizer().lemmatize(w,'v') for w in out.split()]

def process_pip(line):
    verbs, nouns, out  = process_line(line)
    words = [WordNetLemmatizer().lemmatize(w,'v') for w in out.split()]
    return words 
    
    

In [4]:
test_words = {}


test_lines = open("/Users/yanjungao/Desktop/VPMT/data/charades/annotations/charades_sta_test.txt").readlines()
train_lines = open("/Users/yanjungao/Desktop/VPMT/data/charades/annotations/charades_sta_train.txt").readlines()

vocabs = [] 
for l in test_lines:
    _id = test_lines.index(l)
    words = process_pip(l)
    test_words[_id] = words
    vocabs.append(words)
    

    
    

In [5]:
test_words

{0: ['turn', 'light'],
 1: ['flip', 'light'],
 2: ['turn', 'light'],
 3: ['be', 'switch', 'play'],
 4: ['be', 'picture', 'put'],
 5: ['put', 'bag'],
 6: ['walk', 'doorway'],
 7: ['open', 'door'],
 8: ['close', 'door'],
 9: ['close', 'door'],
 10: ['run', 'stairs'],
 11: ['watch', 'stairs', 'run'],
 12: ['run', 'window', 'look'],
 13: ['sit', 'chair'],
 14: ['run', 'window'],
 15: ['run', 'door'],
 16: ['sit', 'chair'],
 17: ['sit', 'chair'],
 18: ['stand', 'bathroom', 'hold', 'glass'],
 19: ['take', 'bag'],
 20: ['sit', 'chair'],
 21: ['sit', 'chair'],
 22: ['throw', 'blanket'],
 23: ['throw', 'clothe'],
 24: ['stand'],
 25: ['awaken', 'sofa'],
 26: ['take', 'medicine'],
 27: ['sit', 'chair'],
 28: ['take', 'cup'],
 29: ['open', 'door'],
 30: ['open', 'door', 'lead'],
 31: ['take', 'cup'],
 32: ['walk', 'doorway', 'drink', 'coffee'],
 33: ['drink', 'glass'],
 34: ['put', 'shoe'],
 35: ['hold', 'glass'],
 36: ['eat', 'food'],
 37: ['be', 'entryway', 'eat', 'sandwich'],
 38: ['hold', 'gl

In [6]:
train_words = {} 
for l in train_lines:
    _id = train_lines.index(l)
    words = process_pip(l)
    train_words[_id] = words
    vocabs.append(words)

In [7]:
vocabs.extend(list(test_words.values()))

In [8]:
vocabs_all = [j for i in vocabs for j in i]
vocabs = set(vocabs_all)

In [9]:
vocab_idx = {k:list(vocabs).index(k)+1 for k in vocabs}

In [10]:
vocab_idx['PAD'] = 0 
vocab_idx['<sos>'] = len(vocab_idx) 
vocab_idx['<eos>'] = len(vocab_idx) 

In [19]:
vocab_idx

{'belong': 1,
 'lollipop': 2,
 'snack': 3,
 'selfie': 4,
 'mirror': 5,
 'cell': 6,
 'crouch': 7,
 'stairs': 8,
 'guy': 9,
 'fall': 10,
 'puzzle': 11,
 'pills': 12,
 'pantry': 13,
 'entry': 14,
 'paper': 15,
 'pause': 16,
 'opend': 17,
 'form': 18,
 'feet': 19,
 'do': 20,
 'hit': 21,
 'head': 22,
 'jiggle': 23,
 'dog': 24,
 'photo': 25,
 'sock': 26,
 'soda': 27,
 'wash': 28,
 'load': 29,
 'type': 30,
 'products': 31,
 'game': 32,
 'pose': 33,
 'say': 34,
 'cook': 35,
 'enter': 36,
 'turn': 37,
 'cap': 38,
 'faucet': 39,
 'be': 40,
 'blind': 41,
 'eat': 42,
 'closet': 43,
 'tap': 44,
 'linen': 45,
 'entryway': 46,
 'stairwell': 47,
 'switch': 48,
 "'s": 49,
 'blanket/pillow': 50,
 'boot': 51,
 'program': 52,
 'toss': 53,
 'string': 54,
 'pot': 55,
 'photograph': 56,
 'doorways': 57,
 'plug': 58,
 'selfies': 59,
 'finger': 60,
 'others': 61,
 'plat': 62,
 'redress': 63,
 'seem': 64,
 'nightstand': 65,
 'view': 66,
 'pitcher': 67,
 'lightswitch': 68,
 'adjust': 69,
 'winter': 70,
 'doors':

In [12]:
idx_vocab = {v:k for k,v in vocab_idx.items()}

In [13]:
idx_vocab

{1: 'belong',
 2: 'lollipop',
 3: 'snack',
 4: 'selfie',
 5: 'mirror',
 6: 'cell',
 7: 'crouch',
 8: 'stairs',
 9: 'guy',
 10: 'fall',
 11: 'puzzle',
 12: 'pills',
 13: 'pantry',
 14: 'entry',
 15: 'paper',
 16: 'pause',
 17: 'opend',
 18: 'form',
 19: 'feet',
 20: 'do',
 21: 'hit',
 22: 'head',
 23: 'jiggle',
 24: 'dog',
 25: 'photo',
 26: 'sock',
 27: 'soda',
 28: 'wash',
 29: 'load',
 30: 'type',
 31: 'products',
 32: 'game',
 33: 'pose',
 34: 'say',
 35: 'cook',
 36: 'enter',
 37: 'turn',
 38: 'cap',
 39: 'faucet',
 40: 'be',
 41: 'blind',
 42: 'eat',
 43: 'closet',
 44: 'tap',
 45: 'linen',
 46: 'entryway',
 47: 'stairwell',
 48: 'switch',
 49: "'s",
 50: 'blanket/pillow',
 51: 'boot',
 52: 'program',
 53: 'toss',
 54: 'string',
 55: 'pot',
 56: 'photograph',
 57: 'doorways',
 58: 'plug',
 59: 'selfies',
 60: 'finger',
 61: 'others',
 62: 'plat',
 63: 'redress',
 64: 'seem',
 65: 'nightstand',
 66: 'view',
 67: 'pitcher',
 68: 'lightswitch',
 69: 'adjust',
 70: 'winter',
 71: 'doo

In [14]:
def label_index(queries, vocab_idx):
    out_idx = {}
    for k,v in queries.items():
        v.insert(0, '<sos>')
        v.append('<eos>')
        val = [vocab_idx[i] for i in v]
        out_idx[k] = val 
    return out_idx 

train_idx = label_index(train_words, vocab_idx)

In [15]:
len(train_idx)

12408

In [16]:
test_idx = label_index(test_words, vocab_idx)

In [20]:
test_idx

{0: [558, 37, 534, 559],
 1: [558, 458, 534, 559],
 2: [558, 37, 534, 559],
 3: [558, 40, 48, 96, 559],
 4: [558, 40, 269, 307, 559],
 5: [558, 307, 179, 559],
 6: [558, 418, 512, 559],
 7: [558, 431, 366, 559],
 8: [558, 112, 366, 559],
 9: [558, 112, 366, 559],
 10: [558, 459, 8, 559],
 11: [558, 312, 8, 459, 559],
 12: [558, 459, 272, 440, 559],
 13: [558, 97, 104, 559],
 14: [558, 459, 272, 559],
 15: [558, 459, 366, 559],
 16: [558, 97, 104, 559],
 17: [558, 97, 104, 559],
 18: [558, 121, 380, 482, 358, 559],
 19: [558, 100, 179, 559],
 20: [558, 97, 104, 559],
 21: [558, 97, 104, 559],
 22: [558, 204, 203, 559],
 23: [558, 204, 555, 559],
 24: [558, 121, 559],
 25: [558, 336, 218, 559],
 26: [558, 100, 164, 559],
 27: [558, 97, 104, 559],
 28: [558, 100, 448, 559],
 29: [558, 431, 366, 559],
 30: [558, 431, 366, 183, 559],
 31: [558, 100, 448, 559],
 32: [558, 418, 512, 355, 239, 559],
 33: [558, 355, 358, 559],
 34: [558, 307, 536, 559],
 35: [558, 482, 358, 559],
 36: [558, 42,

In [18]:
import json 
with open('train_translate.json', 'w') as f:
    json.dump([train_idx, train_words], f)

with open('test_translate.json', 'w') as f:
    json.dump([test_idx, test_words], f)
    
with open('vocab_translate.json', 'w') as f:
    json.dump([vocab_idx, idx_vocab], f)