In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [20]:
doc = nlp(u'A man and a man are playing computer and sitting on the ground in the wild. A woman is waving under the tree. A man is lying on the sofa playing games. There are some trees and plants in the park. An alien is lying crying.')
# parse(doc)
for token in doc:
    print("{2}({3}-{6}, {0}:{1}-{5})".format(token.text, token.tag_, token.dep_, token.head.text, token.head.tag_, token.i+1, token.head.i+1))

det(man-2, A:DT-1)
nsubj(playing-7, man:NN-2)
cc(man-2, and:CC-3)
det(man-5, a:DT-4)
conj(man-2, man:NN-5)
aux(playing-7, are:VBP-6)
ROOT(playing-7, playing:VBG-7)
dobj(playing-7, computer:NN-8)
cc(playing-7, and:CC-9)
conj(playing-7, sitting:VBG-10)
prep(sitting-10, on:IN-11)
det(ground-13, the:DT-12)
pobj(on-11, ground:NN-13)
prep(sitting-10, in:IN-14)
det(wild-16, the:DT-15)
pobj(in-14, wild:NN-16)
punct(playing-7, .:.-17)
det(woman-19, A:DT-18)
nsubj(waving-21, woman:NN-19)
aux(waving-21, is:VBZ-20)
ROOT(waving-21, waving:VBG-21)
prep(waving-21, under:IN-22)
det(tree-24, the:DT-23)
pobj(under-22, tree:NN-24)
punct(waving-21, .:.-25)
det(man-27, A:DT-26)
nsubj(lying-29, man:NN-27)
aux(lying-29, is:VBZ-28)
ROOT(lying-29, lying:VBG-29)
prep(lying-29, on:IN-30)
det(games-34, the:DT-31)
compound(games-34, sofa:NN-32)
compound(games-34, playing:NN-33)
pobj(on-30, games:NNS-34)
punct(lying-29, .:.-35)
expl(are-37, There:EX-36)
ROOT(are-37, are:VBP-37)
det(trees-39, some:DT-38)
attr(are-37

In [101]:
from rules.labels import subjects
from collections import defaultdict

def incre_name(s, dic):
    count = 0
    for k in dic:
        # remove any tail digits
        sub = re.sub(r'(?<=\w)\d+$','', k)
        if re.match(r'%s\d*' % sub, s):
            # print(count, r'%s\d*' % sub, s)
            count += 1
    if count > 0:
        return '%s%i' % (s, count)
    return s

def get_tokens(doc):
    return [t for t in doc if not t.is_stop and not t.is_punct]
    
def ground_subj(tokens_, nested_):
    
    ## query the base to identify a subject
    tokens_copy = tokens_.copy()
    for token in tokens_copy:
        for type_ in subjects:
            if token.lemma_ in subjects[type_]:
                tokens_.remove(token)
                nested_[token] = defaultdict(set)
#                 if token.lemma_ not in nested_:
#                     map_dict[token] = token.lemma_
#                     nested_[token.lemma_] = defaultdict(set)
#                 else:
#                     lemma_i = incre_name(token.lemma_, nested_)
#                     map_dict[token] = lemma_i
#                     nested_[lemma_i] = defaultdict(set)

def ground_act(tokens_, nested_):
    
    ## ---- syntactical parency
    tokens_copy = tokens_.copy()
    for token in tokens_copy:
        for key in nested_:
            if key.head == token:
                assert(key.dep_ == 'nsubj')
                tokens_.remove(token)
                nested_[key][token] = set()
    
    ## ---- conjuncted verbs
    tokens_copy = tokens_.copy()
    # cannot pickle a spacy.token
    ## nested_copy = deepcopy(nested_)
    # cannnot modify a dict during iteration, thus save keys
    saved_tups = []
    for token in tokens_copy:
        for subj in nested_:
            for act in nested_[subj]:
                if token.head == act and token.pos_ == 'VERB':
                    assert(token.dep_ in ['conj', 'xcomp', 'advcl']), token.dep_
                    tokens_.remove(token)
                    saved_tups.append((subj, token))
    for subj, token in saved_tups:
        nested_[subj][token] = set()
    
    ## ---- other verbs
    ### Finally,
    ### if the verb has the common root with any of the subjects, bind it. This cause confusion when a sentence contains two or more subjects
    tokens_copy = tokens_.copy()
    saved_tups = []
    for token in tokens_copy:
        if token.pos_ == 'VERB':
            for subj in nested_:
                if token.sent.root == subj.sent.root:
                    tokens_.remove(token)
                    saved_tups.append(subj, token)
    for subj, token in saved_tups:
        nested_[subj][token] = set()    
    
# static var stuff
# related dict needs to transformed to dict
def query_related(t, k):
    import dill
    with open('relateDict.pkl', 'rb') as f:
        relateDict = dill.load(f) 
    if k in relateDict and t in relateDict[k]:
        return relateDict[k][t]
    
    import requests
    return requests.get('http://api.conceptnet.io/relatedness?node1=/c/en/%s&node2=/c/en/%s' % (k, t)).json()['value']
        
def get_simi_obj(token, keywords, thresh=0.5):
    tups = [(k, query_related(token, k)) for k in keywords]
    simi_obj, simi_ = sorted(tups, key=lambda x: x[1])[-1]
    if simi_ >= thresh:
        return simi_obj
#     return None
#     for keyword in keywords:
#         if  > thresh:
#             return keyword
    return None
    
def ground_obj(tokens_, nested_):
    
    ## sytactically ground
    tokens_copy = tokens_.copy()
    saved_tups = []
    for token in tokens_copy:
        for subj in nested_:
            for act in nested_[subj]:
                if token.head == act and token.pos_ == 'NOUN':
                    assert(token.dep_ in ['dobj'])
                    tokens_.remove(token)
                    saved_tups.append((subj, act, token))
    for subj, act, token in saved_tups:
        nested_[subj][act].add(token)
        
    ## query the base knowledge for surrounding objects
    tokens_copy = tokens_.copy()
    saved_tups = []
    for token in tokens_copy:
        for subj in nested_:
            if subj.lemma_ in subjects['surrounding']:
                assert(subj.lemma_ in layerbase.layer_merge_.nested_entities_)
                if token.lemma_ in layerbase.layer_merge_.nested_entities_[subj.lemma_]['have']:            
                    print(subj, token)
                # get the most similar obj in the vocabulary under this subject
                obj_ = get_simi_obj(token.lemma_,
                                    layerbase.layer_merge_.nested_entities_[subj.lemma_]['have'],
                                    thresh=0.2)
                print(token.lemma_, obj_)
                if obj_:
                    tokens_.remove(token)
                    saved_tups.append((subj, token))
                    # prevent other subjs containing the same object emerging
                    # first come, first serve
                    break 
    for subj, token in saved_tups:
        nested_[subj]['have'].add(token)
        
#     ## other objects, just query the surroundings in knowledge and see which it belongs to
#     tokens_copy = tokens_.copy()
#     for token in tokens_copy:    
#         for subj in layerbase.layer_merge_.nested_entities_:
#             if subj not in subjects['surrounding']:
#                 continue
#             for act in layerbase.layer_merge_.nested_entities_[subj]:
#                 obj_ = get_simi_obj(token.lemma_,
#                                     layerbase.layer_merge_.nested_entities_[subj][act],
#                                     thresh=0.3)
#                 if obj_:
#                     assert(act == 'have'), (act, token)
#                     # assert(subj not in [s.lemma_ for s in nested_])
#                     ## Attention! here the key type is string now
#                     print(token, subj, obj_)
#                     tokens_.remove(token)
#                     nested_[subj][act].add(token)
#                     break
                    

# def comb_obj(tokens_, nested_):
    
#     ## layers built from clusters
#     tokens_copy = tokens_.copy()
#     for token in tokens_copy:
    
    

####
# technically all entities can be grounded based on similarity, no need to exactly same
####
nested = defaultdict(lambda: defaultdict(set))
tokens = get_tokens(doc)
# map_dict = defaultdict(str)
ground_subj(tokens, nested)
# print(tokens)
ground_act(tokens, nested)
## no verbs will be left
assert(all([t.pos_ != 'VERB' for t in tokens]))
print(tokens)
ground_obj(tokens, nested)
print(tokens)
# print(map_dict)
nested

## why don't we use scapy token as dict keys, such that no overlapping

[computer, ground, tree, sofa, playing, games, trees, plants]
ground None
ground None
wild tree
tree tree
sofa None
sofa None


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [99]:
relateDict['plant']['plant']

0.0

In [97]:
layerbase.layer_merge_.nested_entities_['wild']['have']

{'balloon',
 'beach',
 'boat',
 'cloud',
 'firework',
 'fish',
 'fish_tank',
 'guideboard',
 'hill',
 'lake',
 'leaf',
 'plant',
 'rock',
 'sign',
 'sun',
 'sunflower',
 'tree',
 'water',
 'wind'}

In [8]:
from tools.instance import Node
from tools.containers import Picture, Description
from tools.knowledge import LayerBase, TextBase

In [14]:
import glob
from tools.image_process import LayerName
from tools.common import ravel

class LayerBase():
    """
    layer Base knowledge, show only built on train set!
    """
    def __init__(self, filenames=[],
                 img_dir='images',
                 ext='.svg'):

        """
        need other dictionary to save the layer frequency
        """
        if not filenames:
            filenames = glob.glob('%s/*%s' % (img_dir, ext))
        else:
            filenames = ['%s/%s%s' % (img_dir, name, ext) for name in filenames]

        self.layer_merge_ = LayerName()
        self.pictures_ = []
        for svg in filenames:
            picture = Picture(svg)
            self.layer_merge_.absorb(picture.layer_merge_)
            self.pictures_.append(picture)
        self.entities_ = self.layer_merge_.entities_

        # picture vocab contains no dupicates
        self.pic_vocab_ = set(self.pictures_)

        # layer vocab contains no dupicates
        self.layer_vocab_ = set([layer for picture in self.pictures_ for layer in picture.layers_])

        # keyword vocab contains no dupicates
        # here we explicitly need the order the make sure results reproducable
        ## such as index and line up features
        self.vocab_ = sorted(ravel(self.layer_merge_.entities_))

    def index(self, keyword):
        assert(isinstance(keyword, Node))
        return self.vocab_.index(keyword)

    def __len__(self):
        return len(self.vocab_)

In [38]:
layerbase = LayerBase()

In [64]:
import dill
relateDict['sofa']['sofa']
# mirror_dict = defaultdict(lambda: defaultdict(float))
# for k1 in relateDict:
#     for k2 in relateDict[k1]:
#         mirror_dict[k2][k1] = relateDict[k1][k2]
# mirror_dict

1.0

In [54]:
import requests
requests.get('http://api.conceptnet.io/relatedness?node1=/c/en/%s&node2=/c/en/%s' % ('fishing_rod', 'fish')).json()['value']

0.415

In [19]:
layerbase.layer_merge_.nested_entities_

defaultdict(<function tools.image_process.LayerName.collapse_subj.<locals>.<lambda>()>,
            {'background': {},
             'accessory': {'have': {'leaf', 'plant'}},
             'other': {'have': {'bulletin',
               'camera',
               'circle',
               'cloud',
               'drone',
               'gamepad',
               'plant',
               'square',
               'sun',
               'tree',
               'triangle',
               'webpage',
               'website',
               'windmill'}},
             'man': {'point_to': set(),
              'stand': set(),
              'sit': set(),
              'hold': {'cube', 'fishing_rod', 'paper', 'phone'},
              'fishing': set(),
              'raise': {'arm'},
              'drink': {'beer'},
              'eat': {'chips'},
              'touch': set(),
              'play': {'computer'},
              'think': set(),
              'walk': set(),
              'lie_on': {'stone'},
    