In [42]:
from collections import defaultdict, namedtuple

In [93]:
import networkx as nx
import stanfordnlp
# Build the stanfordnlp pipeline once at module level; print_ reuses this `nlp` object for every sample.
nlp = stanfordnlp.Pipeline() 

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pr

In [143]:
def governor2idx(old_idx):
    """Shift a governor index down by one, mapping 0 to None.

    Args:
        old_idx: Governor index as stored on a parsed word. The value 0 is
            treated specially (presumably the artificial root of the parse).

    Returns:
        None when `old_idx` is 0, otherwise `old_idx - 1`.
    """
    return None if old_idx == 0 else old_idx - 1

Node = namedtuple("Node", ["idx", "token", "pos", "dep", "governor"])
def parse_sentence(sentence):
    """Flatten a parsed sentence into a token index and a list of Node records.

    Args:
        sentence: Object exposing `dependencies`, a sequence whose entries
            carry the dependent word at position 2 (with `text`, `xpos`,
            `dependency_relation` and `governor` attributes).

    Returns:
        A `(token2idx, nodes)` pair:
        - token2idx: defaultdict mapping a word's text to the list of
          positions in `sentence.dependencies` where that text occurs.
        - nodes: one Node per dependency, in the same order, with the
          governor converted through `governor2idx`.
    """
    token2idx = defaultdict(list)
    nodes = []
    for position, dependency in enumerate(sentence.dependencies):
        word = dependency[2]
        token2idx[word.text].append(position)
        nodes.append(
            Node(
                idx=position,
                token=word.text,
                pos=word.xpos,
                dep=word.dependency_relation,
                governor=governor2idx(word.governor),
            )
        )
    return token2idx, nodes

NN = ['NN', 'NNS', 'NNP']
def compound(new_targets, token2idx, nodes):
    """Replace targets by "<modifier> <target>" phrases for their noun compound modifiers.

    For every target in `new_targets`, looks for nodes whose governor is one
    of the target's positions, whose relation is 'compound' and whose POS tag
    is listed in `NN`. If any exist, the bare target is removed and one
    "<modifier token> <target>" string per modifier is added instead.

    Args:
        new_targets: Mutable set of target strings; modified in place.
        token2idx: Mapping from token text to the list of node positions
            where that token occurs. It is only read, never modified.
        nodes: Sequence of Node-like records exposing `governor`, `dep`,
            `pos` and `token`.

    Returns:
        None. The result is the in-place change to `new_targets`.
    """
    to_be_deleted, to_be_added = set(), set()
    for target in new_targets:
        # Look the target up once per target (not once per node), and use
        # .get() so that an unknown target does not insert an empty entry
        # into a defaultdict-based token2idx.
        target_positions = token2idx.get(target, ())
        comp_child_nodes = [
            node for node in nodes
            if node.governor in target_positions
            and node.dep == 'compound'
            and node.pos in NN
        ]
        if comp_child_nodes:
            to_be_deleted.add(target)
            for child_node in comp_child_nodes:
                to_be_added.add(' '.join([child_node.token, target]))
    # Mutate only after the loop: the set must not change while being iterated.
    for item in to_be_deleted:
        new_targets.remove(item)
    for item in to_be_added:
        new_targets.add(item)

def handle_hyphen(hyphenated_word, token2idx, nodes):
    """Return the component of a hyphenated word that is governed from outside it.

    The word is split on '-', the node positions of every component are
    collected via `token2idx`, and the first component whose governor is not
    itself one of those positions is returned.

    Args:
        hyphenated_word: Word such as 'easy-to-use'.
        token2idx: Mapping from token text to a list of node positions.
        nodes: Sequence of Node-like records exposing `token` and `governor`.

    Returns:
        The token text of the first qualifying component.

    Raises:
        IndexError: If no component qualifies.
    """
    positions = [
        position
        for part in hyphenated_word.split('-')
        for position in token2idx[part]
    ]
    outward_heads = [
        nodes[position].token
        for position in positions
        if nodes[position].governor not in positions
    ]
    return outward_heads[0]
    
def print_(sample, o_word, t_word):
    """Parse `sample`, print the dependency path between two words, and return expanded targets.

    Only the first sentence of the parsed document is analysed. An undirected
    graph is built over token texts from the dependency edges, the shortest
    path from `o_word` to `t_word` is printed together with the (xpos,
    relation) pair of every token on it, and finally `t_word` is expanded
    with its compound modifiers via `compound`.

    Args:
        sample: Raw text passed to the module-level `nlp` pipeline.
        o_word: Path source. If it is not a node of the graph (e.g. a
            hyphenated word that the parse split into several tokens), it is
            replaced by the result of `handle_hyphen`.
        t_word: Path target and the seed of the returned target set.

    Returns:
        The set of target strings left after `compound` has processed
        `{t_word}`.
    """
    doc = nlp(sample)
    sentence = doc.sentences[0]
    token2idx, nodes = parse_sentence(sentence)

    edges = []
    token2tagdep = {}
    for dependency in sentence.dependencies:
        head_word, dep_word = dependency[0], dependency[2]
        token2tagdep[dep_word.text] = (dep_word.xpos, dep_word.dependency_relation)
        # Governor index 0 marks the artificial root (the same convention
        # governor2idx relies on). Testing the index rather than the head's
        # text keeps edges whose real head word happens to be "root".
        if dep_word.governor != 0:
            edges.append((head_word.text, dep_word.text))
    # NOTE: nodes are keyed by token text, so repeated tokens share one node.
    graph = nx.Graph(edges)

    print('\n===\nedges:', edges)

    entity1, entity2 = o_word, t_word
    # Fall back to the hyphen handler only when the source really is not a
    # node of the graph, instead of catching every possible exception.
    if entity1 not in graph:
        entity1 = handle_hyphen(o_word, token2idx, nodes)

    # The graph is unweighted, so the path length is the number of hops.
    shortest_path = nx.shortest_path(graph, source=entity1, target=entity2)
    print('shortest path length:', len(shortest_path) - 1)
    print('shortest path:', shortest_path)
    print('token2tagdep edges:', token2tagdep)
    print('\n', [token2tagdep[token] for token in shortest_path])

    new_targets = set([t_word])
    compound(new_targets, token2idx, nodes)
    print(new_targets)
    return new_targets

In [144]:
# Hyphenated opinion word: the parse splits 'easy-to-use' into several tokens, so print_ resolves it to one component via handle_hyphen.
print_('This is easy-to-use', 'easy-to-use', 'This')


===
edges: [('easy', 'This'), ('easy', 'is'), ('easy', '-'), ('use', 'to'), ('use', '-'), ('easy', 'use')]
shortest path length: 1
shortest path: ['easy', 'This']
token2tagdep edges: {'This': ('DT', 'nsubj'), 'is': ('VBZ', 'cop'), 'easy': ('JJ', 'root'), '-': ('HYPH', 'punct'), 'to': ('IN', 'compound'), 'use': ('VB', 'parataxis')}

 [('JJ', 'root'), ('DT', 'nsubj')]
{'This'}




{'This'}

# Rule 1

In [35]:
# Opinion word directly modifies the target ('good' -amod-> 'screen'): expected path length is 1.
# NOTE(review): the saved output also shows a 'pleased'/'picture' run that this cell does not produce — presumably stale; re-run to refresh.
print_('The phone has a good screen.', 'good', 'screen')


===
edges: [('phone', 'The'), ('has', 'phone'), ('screen', 'a'), ('screen', 'good'), ('has', 'screen'), ('has', '.')]
shortest path length: 1
shortest path: ['good', 'screen']
token2tagdep edges: {'The': ('DT', 'det'), 'phone': ('NN', 'nsubj'), 'has': ('VBZ', 'root'), 'a': ('DT', 'det'), 'good': ('JJ', 'amod'), 'screen': ('NN', 'obj'), '.': ('.', 'punct')}

 [('JJ', 'amod'), ('NN', 'obj')]

===
edges: [('pleased', 'I'), ('pleased', 'am'), ('pleased', 'not'), ('quality', 'with'), ('quality', 'the'), ('quality', 'picture'), ('pleased', 'quality'), ('pleased', '.')]
shortest path length: 2
shortest path: ['pleased', 'quality', 'picture']
token2tagdep edges: {'I': ('PRP', 'nsubj'), 'am': ('VBP', 'cop'), 'not': ('RB', 'advmod'), 'pleased': ('JJ', 'root'), 'with': ('IN', 'case'), 'the': ('DT', 'det'), 'picture': ('NN', 'compound'), 'quality': ('NN', 'obl'), '.': ('.', 'punct')}

 [('JJ', 'root'), ('NN', 'obl'), ('NN', 'compound')]




# Compound 처리 필요 (compound nouns need to be handled)

In [116]:
# Scratch cell: try one opinion word in a minimal copular sentence.
o_word = 'new'
print_('This is %s.' % o_word, o_word, 'This')   # presumably 'easy-to-use' is the alternative o_word to try here


===
edges: [('new', 'This'), ('new', 'is'), ('new', '.')]
shortest path length: 1
shortest path: ['new', 'This']
token2tagdep edges: {'This': ('DT', 'nsubj'), 'is': ('VBZ', 'cop'), 'new': ('JJ', 'root'), '.': ('.', 'punct')}

 [('JJ', 'root'), ('DT', 'nsubj')]
{'This'}




{'This'}

In [59]:
# Compound-noun targets: the trailing comments give the target set each call prints.
print_('The photo quality is amazing.', 'amazing', 'quality')   # target expands to 'photo quality'
print_('I am not pleased with the picture quality.', 'pleased', 'quality')   # target expands to 'picture quality'
print_('The software of the player is not easy-to-use.', 'easy', 'software')   # opinion word is the 'easy' of 'easy-to-use'; target stays 'software'


===
edges: [('quality', 'The'), ('quality', 'photo'), ('amazing', 'quality'), ('amazing', 'is'), ('amazing', '.')]
shortest path length: 1
shortest path: ['amazing', 'quality']
token2tagdep edges: {'The': ('DT', 'det'), 'photo': ('NN', 'compound'), 'quality': ('NN', 'nsubj'), 'is': ('VBZ', 'cop'), 'amazing': ('JJ', 'root'), '.': ('.', 'punct')}

 [('JJ', 'root'), ('NN', 'nsubj')]
{'photo quality'}

===
edges: [('pleased', 'I'), ('pleased', 'am'), ('pleased', 'not'), ('quality', 'with'), ('quality', 'the'), ('quality', 'picture'), ('pleased', 'quality'), ('pleased', '.')]
shortest path length: 1
shortest path: ['pleased', 'quality']
token2tagdep edges: {'I': ('PRP', 'nsubj'), 'am': ('VBP', 'cop'), 'not': ('RB', 'advmod'), 'pleased': ('JJ', 'root'), 'with': ('IN', 'case'), 'the': ('DT', 'det'), 'picture': ('NN', 'compound'), 'quality': ('NN', 'obl'), '.': ('.', 'punct')}

 [('JJ', 'root'), ('NN', 'obl')]
{'picture quality'}

===
edges: [('software', 'The'), ('easy', 'software'), ('playe



{'software'}

# Rule 2

In [31]:
# Opinion word and target are linked through a shared head ('best' - 'player' - 'iPod'): expected path length is 2.
print_('The iPod is the best mp3 player.', 'best', 'iPod')


===
edges: [('iPod', 'The'), ('player', 'iPod'), ('player', 'is'), ('player', 'the'), ('player', 'best'), ('player', 'mp3'), ('player', '.')]
shortest path length: 2
shortest path: ['best', 'player', 'iPod']
token2tagdep edges: {'The': ('DT', 'det'), 'iPod': ('NNP', 'nsubj'), 'is': ('VBZ', 'cop'), 'the': ('DT', 'det'), 'best': ('JJS', 'amod'), 'mp3': ('NN', 'compound'), 'player': ('NN', 'root'), '.': ('.', 'punct')}

 [('JJS', 'amod'), ('NN', 'root'), ('NNP', 'nsubj')]




In [27]:
# Target hangs off the noun that the opinion word modifies ('wonderful' - 'set' - 'features'): expected path length is 2.
print_('The camera has a wonderful set of features.', 'wonderful', 'features')

edges: [('camera', 'The'), ('has', 'camera'), ('set', 'a'), ('set', 'wonderful'), ('has', 'set'), ('features', 'of'), ('set', 'features'), ('has', '.')]
shortest path length: 2
shortest path: ['wonderful', 'set', 'features']
token2tagdep edges: {'The': ('DT', 'det'), 'camera': ('NN', 'nsubj'), 'has': ('VBZ', 'root'), 'a': ('DT', 'det'), 'wonderful': ('JJ', 'amod'), 'set': ('NN', 'obj'), 'of': ('IN', 'case'), 'features': ('NNS', 'nmod'), '.': ('.', 'punct')}

 [('JJ', 'amod'), ('NN', 'obj'), ('NNS', 'nmod')]


