In [2]:
from collections import defaultdict
import dill
import glob
import warnings
warnings.filterwarnings('always')

from tools.text_process import LemmaTokenizer
from tools.image_process import LayerName, getLayerNames
from tools.instance import Node
from tools.knowledge import LayerBase, TextBase
from tools.common import ravel

In [8]:
# Smoke test: build the layer knowledge base and print the index it assigns
# to the node Node('man', 'subj').
layerbase = LayerBase()
print(layerbase.index(Node('man','subj')))

# Build the text knowledge base and display how many entries `vocab_` holds.
textbase = TextBase()
len(textbase.vocab_)  # earlier probe kept for reference: index('man','n')
# textbase.vocab_

50


85

In [3]:
# with open('relateDict.pkl', 'rb') as f:
#     relateDict = dill.load(f)
# sum([len(relateDict[key]) for key in relateDict])

In [25]:
class Picture:
    """An image described by the names of its layers.

    Attributes set by the constructor:
        img_name     -- the path/name passed in.
        layernames_  -- whatever getLayerNames(img_name) returns.
        layers_      -- tuple with one LayerName per entry of layernames_.
        layer_merge_ -- a LayerName that has absorbed every layer in layers_.
        plot         -- alias of layer_merge_.plot.
        keywords_    -- ravel() applied to the merged layer's entities_.
    """
    def __init__(self, img_name):
        self.img_name = img_name
        self.layernames_ = getLayerNames(img_name)

        # Parse each layer name; storing them in a tuple keeps the collection immutable.
        self.layers_ = tuple(LayerName(raw_name) for raw_name in self.layernames_)

        # Fold every parsed layer into a single merged LayerName.
        merged = LayerName()
        for parsed in self.layers_:
            merged.absorb(parsed)
        self.layer_merge_ = merged

        self.plot = merged.plot
        self.keywords_ = ravel(merged.entities_)

    def __repr__(self):
        """Return the layer names joined with '; ' (shown when the object is printed or displayed)."""
        return '; '.join(self.layernames_)

class Description:
    """
    A textual description: keeps the original text and its tokens.

    Attributes set by the constructor:
        txt_name -- path of the text file that was read.
        text     -- full content of that file.
        tokens_  -- result of applying LemmaTokenizer() to `text`.
    """
    def __init__(self, txt_name):
        """Read the file at `txt_name` and tokenize its content.

        The path argument itself is opened, so the class no longer depends on
        the notebook-level globals `txt_dir` and `name` being defined.
        """
        self.txt_name = txt_name

        tokenizer = LemmaTokenizer()
        with open(txt_name) as f:
            self.text = f.read()
        self.tokens_ = tokenizer(self.text)

    def __repr__(self):
        """Return the original text."""
        return self.text

# picture = Picture('images/Firmware.svg')
# [n.t for n in picture.ravel_]
# picture.plot()

In [26]:
# Temporary placeholder for the relatedness table: it is read as
# relateDict[keyword][token], and any pair not stored yet yields 0.0.
relateDict = defaultdict(lambda: defaultdict(float))

In [27]:
# Directories and base name of the sample to load.
img_dir, txt_dir = 'images', 'text'
name = 'Firmware'

# Load the description first, then the matching picture.
description = Description('{}/{}.txt'.format(txt_dir, name))
picture = Picture('{}/{}.svg'.format(img_dir, name))
print(picture)
description

#background; #alien(have[robot],stand); #man(exercise,step_on[stone]); #man(work,play[computer],sit)


A man played the computer. A man stepped on the stone and exercised. They had a robot.

In [28]:
from scipy import sparse
import numpy as np

class Encoder():
    """Turn (keyword, token) relatedness scores into one histogram per keyword row.

    Rows follow `self.layerbase` indices and columns follow `self.textbase`
    indices; scores are read from the notebook-level `relateDict`.
    """
    def __init__(self):
        self.layerbase = LayerBase()
        self.textbase = TextBase()

    def encode(self, tokens, keywords, length=5):
        """Build the keyword-by-token score matrix and histogram each row.

        tokens   -- iterable of items accepted by `self.textbase.index`, each with a `.t` attribute.
        keywords -- iterable of items accepted by `self.layerbase.index`, each with a `.t` attribute.
        length   -- number of histogram bins; must be smaller than len(self.textbase).

        Returns an array of shape (len(self.layerbase), length).
        """
        assert(length < len(self.textbase)), (length,
                                              len(self.textbase))
        # Key the scores by (row, col): csr_matrix SUMS entries that share a
        # coordinate, so a token or keyword that occurs more than once would
        # otherwise inflate its cell (possibly above the 1.0 limit checked in
        # to_hists). Storing each coordinate once keeps the plain score.
        cells = {}
        for token in tokens:
            for keyword in keywords:
                coord = (self.layerbase.index(keyword),
                         self.textbase.index(token))
                cells[coord] = relateDict[keyword.t][token.t]

        shape = (len(self.layerbase), len(self.textbase))
        if cells:
            row, col = zip(*cells)
            matrix = sparse.csr_matrix((list(cells.values()), (row, col)),
                                       shape=shape)
        else:
            # No (keyword, token) pair at all: use an all-zero matrix instead
            # of failing while unpacking an empty zip.
            matrix = sparse.csr_matrix(shape, dtype=float)
        return self.to_hists(matrix, length)

    def to_hists(self, matrix, num=5):
        """Histogram every row of `matrix` into `num` equal-width bins over [0, 1].

        matrix -- scipy.sparse.csr_matrix whose values all lie in [0, 1].
        num    -- number of bins.

        Returns an array with one row of bin counts per matrix row. Implicit
        zeros of the sparse matrix are counted too (they fall in the first bin).
        """
        assert(isinstance(matrix, sparse.csr_matrix))
        assert(matrix.min()>=0)
        assert(matrix.max()<=1)

        arr = matrix.toarray()
        bins = np.linspace(0, 1, num+1)
        hists = []
        for row in range(arr.shape[0]):
            hists.append(np.histogram(arr[row], bins)[0])
        return np.vstack(hists)
    

In [29]:
# Encode the loaded description against the picture's keywords and show the result's shape.
encoder = Encoder()
arr = encoder.encode(tokens=description.tokens_, keywords=picture.keywords_)
arr.shape

(101, 5)

In [30]:
class LayerBase():
    """Keyword index built from the layers of every picture matching a glob pattern.

    Note: defining this class in the notebook rebinds the name `LayerBase`
    that is imported from tools.knowledge at the top of the notebook.

    Attributes set by the constructor:
        layer_merge_ -- a LayerName that has absorbed each picture's merged layer.
        pictures_    -- list of Picture objects, in sorted path order.
        entities_    -- `entities_` of the merged layer.
        keywords_    -- sorted list of ravel(entities_); defines the index order.
    """
    def __init__(self, pattern='images/*.svg'):
        """Load every picture whose path matches `pattern` (default: 'images/*.svg')."""
        self.layer_merge_ = LayerName()
        self.pictures_ = []
        # glob.glob returns matches in arbitrary order; sorting makes
        # `pictures_` (and anything derived from its order) reproducible.
        for svg in sorted(glob.glob(pattern)):
            picture = Picture(svg)
            self.layer_merge_.absorb(picture.layer_merge_)
            self.pictures_.append(picture)
        self.entities_ = self.layer_merge_.entities_
        self.keywords_ = sorted(ravel(self.layer_merge_.entities_))

    def index(self, keyword):
        """Return the position of `keyword` (a Node) in `keywords_`.

        Raises ValueError (from list.index) when the keyword is not present.
        """
        assert(isinstance(keyword, Node))
        return self.keywords_.index(keyword)

    def __len__(self):
        """Return the number of keywords."""
        return len(self.keywords_)

In [31]:
# Rebuild the layer base and flatten the layers of all its pictures into one list.
layerbase = LayerBase()
alllayers = [
    lyr
    for pic in layerbase.pictures_
    for lyr in pic.layers_
]

In [33]:
from collections import Counter
# Tally identical layers; the number of distinct layers equals the number of counter keys.
c = Counter(alllayers)
print(len(c))
# most_common() yields counts in descending order, so stop at the first non-repeated layer.
for layer, count in c.most_common():
    if count > 1:
        print(ravel(layer.entities_), count)
    else:
        break
"""
layer keywords
then fetch these contain the tokens in the transformed(to keyword) description

how?
for each keyword, fetch a layer?
"""

82
{background(subj)} 29
{leaf(obj), accessory(subj), have(act)} 3
{man(subj), stand(act)} 3
{plant(obj), wild(subj), have(act)} 3
{tree(obj), plant(obj), wild(subj), have(act)} 2


'\nlayer keywords\nthen fetch these contain the tokens in the transformed(to keyword) description\n\nhow?\nfor each keyword, fetch a layer?\n'

In [47]:
import spacy
from spacy.pipeline import Tagger
# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")
# Stand-alone tagger component created from the pipeline's factory. It is not
# used below; the tags printed in the loop come from `nlp` itself. The earlier
# `Tagger(nlp.vocab)` construction was removed: its result was overwritten
# immediately by this line.
tagger = nlp.create_pipe("tagger")
# Print every token of the sample sentence next to its `tag_` value.
doc = nlp(u'A joyful man plays computer frequently and sits on the ground')
for token in doc:
    print(token.text, token.tag_)


A DT
joyful JJ
man NN
plays VBZ
computer NN
frequently RB
and CC
sits VBZ
on IN
the DT
ground NN


In [44]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u'A man plays computer and sits on the ground. A woman is waving.')
# One line per token: relation(head-position, token-position), positions 1-based.
for token in doc:
    print("{}({}-{}, {}-{})".format(token.dep_,
                                    token.head.text, token.head.i + 1,
                                    token.text, token.i + 1))

det(man-2, A-1)
nsubj(plays-3, man-2)
ROOT(plays-3, plays-3)
dobj(plays-3, computer-4)
cc(plays-3, and-5)
conj(plays-3, sits-6)
prep(sits-6, on-7)
det(ground-9, the-8)
pobj(on-7, ground-9)
punct(plays-3, .-10)
det(woman-12, A-11)
nsubj(waving-14, woman-12)
aux(waving-14, is-13)
ROOT(waving-14, waving-14)
punct(waving-14, .-15)


In [27]:
def generator(description):
    """
    Propose several candidate layers for the given description.

    Not implemented yet: for now the function only checks that
    `description` is a Description instance and returns None.
    """
    assert isinstance(description, Description)
    

In [16]:
# Check that nested_entities_ compares equal when the same actions and objects
# are written in a different order inside the layer name.
LayerName('#man(have[pant,woof],do[shit])').nested_entities_ == LayerName('#man(do[shit],have[woof,pant])').nested_entities_

True

In [21]:
# Inspect the nested structure parsed from a layer name as (key, value) pairs.
LayerName('#man(have[pant,woof],do[shit])').nested_entities_.items()

dict_items([('man', {'have': {'woof', 'pant'}, 'do': {'shit'}})])

In [77]:
# Scratch check of set semantics: union() returns a NEW set and leaves `a`
# unchanged, so the value displayed by the last line is still {1}.
# (Use a.update(...) or `a |= ...` to modify the set in place.)
a = set()
a.add(1)
a
a.union({1,2})
a

{1}

In [22]:
# Scratch: a tuple of two one-element tuples (the trailing commas make each inner value a tuple).
((1,),(2,))

((1,), (2,))