In [89]:
import nltk
from nltk.parse.util import taggedsents_to_conll
from pymorphy2 import MorphAnalyzer
from pprint import pprint

In [2]:
from russian_tagsets import converters

In [3]:
# Shared pymorphy2 analyzer; it is passed to the MaltParser wrapper when that is instantiated.
m = MorphAnalyzer()
# Tag a sample word; [0] keeps the first element of the returned tag list.
word_tags = m.tag("Сейчас")[0]


In [4]:
# Tag converter from the 'opencorpora-int' tagset to 'ud14'.
# NOTE(review): `conv` is not referenced by any other visible cell — confirm it is still needed.
conv = converters.converter('opencorpora-int', 'ud14')

In [26]:
# NLTK's own MaltParser wrapper, pointed at a local MaltParser directory and model file.
# NOTE(review): both paths are absolute and specific to one machine, and `parser` is not
# used by any other visible cell — confirm whether this cell is still needed.
parser = nltk.parse.malt.MaltParser(
    parser_dirname="/home/artiom/Programming/NLP/maltparser/maltparser-1.9.1",
    model_filename="/home/artiom/Programming/NLP/maltparser/maltparser-1.9.1/russian_syntags.mco")

In [102]:
import os
import sys
import pymorphy2
from russian_tagsets import converters

class MaltParser:
    """Run the MaltParser jar on raw text and return its output as dicts.

    The text is sentence- and word-tokenized with NLTK, morphologically
    analysed with pymorphy2, converted to 10-column tab-separated lines,
    written to a temporary file, parsed by an external ``java -jar`` call,
    and the parser's output file is read back into Python structures.
    """

    def __init__(self, parser_jarname, model_filename, pymorphy_analyzer=None):
        """Store the parser configuration.

        Args:
            parser_jarname: path to the MaltParser ``.jar`` file.
            model_filename: model name passed to MaltParser via ``-c``.
            pymorphy_analyzer: analyzer exposing ``parse(token)``; a new
                ``pymorphy2.MorphAnalyzer`` is created when this is falsy.
        """
        self.parser_jarname = parser_jarname
        self.model_filename = model_filename
        self.pymorphy_analyzer = pymorphy_analyzer or pymorphy2.MorphAnalyzer()
        # Converts OpenCorpora tags (as produced by pymorphy2) to the 'ud20' tagset.
        self.oc2ud = converters.converter('opencorpora-int', 'ud20')

    # TODO: move this out of the class
    def text_to_conllu(self, txt):
        """Convert raw text into tab-separated, 10-column token lines.

        Every sentence is preceded by ``# sent: <index>`` and
        ``# text: <sentence>`` comment lines, and sentences are separated by
        one blank line. Token columns are: 1-based index, ``word`` from the
        analyzer, normal form, POS, ``_``, features, then four ``_``
        placeholders.

        Args:
            txt: raw input text.

        Returns:
            The whole document as a single string (empty for empty input).
        """
        out_text = []
        for sent_idx, sent in enumerate(nltk.sent_tokenize(txt)):
            # Keep each sentence on one line for the "# text:" comment. A space
            # (not an empty string) is used so that words on either side of a
            # line break are not glued together before tokenization.
            sent = sent.replace('\n', ' ')
            # The leading "\n" yields the blank line between sentences.
            out_text.append("\n# sent: {}".format(sent_idx))
            out_text.append("# text: {}".format(sent))
            for tok_idx, token in enumerate(nltk.word_tokenize(sent)):
                # Use the analyzer given to this instance (not a notebook
                # global); [0] takes the first parse variant returned.
                analysis = self.pymorphy_analyzer.parse(token)[0]
                # The converter returns "<POS> <FEATS>" separated by one space.
                pos, feats = self.oc2ud(str(analysis.tag)).split(' ')
                out_text.append(
                    "{0}\t{1}\t{2}\t{3}\t_\t{4}\t_\t_\t_\t_".format(
                        tok_idx + 1, analysis.word, analysis.normal_form,
                        pos, feats))
        # Drop the "\n" that precedes the very first "# sent:" line.
        return "\n".join(out_text)[1:]

    def output_process(self, lines):
        """Group parser output lines into sentences of token dicts.

        Blank lines end a sentence, lines starting with ``#`` are skipped,
        and every other line must contain exactly 10 tab-separated fields.

        Args:
            lines: iterable of text lines from the parser's output file.

        Returns:
            A list of sentences; each sentence is a list of dicts with the
            string-valued keys ``id``, ``token``, ``norm_form``, ``pos``,
            ``feats``, ``par`` and ``deprel``.

        Raises:
            ValueError: if a token line does not have exactly 10 fields.
        """
        res = []
        curr_sent = []
        for line in lines:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence, if any.
                if curr_sent:
                    res.append(curr_sent)
                curr_sent = []
            elif line.startswith('#'):
                continue
            else:
                c = {}
                (c['id'], c['token'], c['norm_form'], c['pos'], _,
                 c['feats'], c['par'], c['deprel'], _, _) = line.split('\t')
                curr_sent.append(c)

        # Flush a final sentence that was not followed by a blank line.
        # `curr_sent` is reset after every append above, so this can never
        # add the same sentence twice.
        if curr_sent:
            res.append(curr_sent)

        return res

    def parse(self, txt):
        """Parse ``txt`` with MaltParser and return the processed result.

        Writes the converted text to ``param.tmp`` in the current working
        directory, runs ``java -jar <parser_jarname> ... -m parse`` and reads
        ``out.tmp``.

        Args:
            txt: raw input text.

        Returns:
            The structure produced by ``output_process``.

        Raises:
            subprocess.CalledProcessError: if the java process exits with a
                non-zero status (previously this was ignored and a stale
                ``out.tmp`` could be read instead).
        """
        import subprocess  # local import keeps this block self-contained

        text_filename = "param.tmp"
        result_filename = "out.tmp"

        txt_conllu = self.text_to_conllu(txt)

        # Write with the same encoding that is used to read the result back;
        # the platform default is not guaranteed to handle Cyrillic text.
        with open(text_filename, "w", encoding="utf8") as f_txt:
            f_txt.write(txt_conllu)

        # An argument list (no shell) keeps paths with spaces intact and
        # prevents shell interpretation of the file names.
        command = ["java", "-jar", self.parser_jarname,
                   "-c", self.model_filename,
                   "-i", text_filename,
                   "-m", "parse",
                   "-o", result_filename]
        print("Running:", " ".join(command))
        subprocess.run(command, check=True)

        with open(result_filename, "r", encoding="utf8") as f_res:
            result_text = f_res.readlines()

        return self.output_process(result_text)
        
        
        

In [103]:
# Instantiate the wrapper class defined above with a jar name, a model name and the shared analyzer `m`.
# Both names are relative, so they are resolved against the current working directory.
malt_parser = MaltParser("maltparser-1.9.1.jar", "russian_syntags", m)

In [124]:
# Parse a two-sentence sample; `r` holds one list of token dicts per sentence.
r = malt_parser.parse("Привет, дамы и господа! Спойте песню!")

Running: java -jar maltparser-1.9.1.jar -c russian_syntags -i param.tmp -m parse -o out.tmp


In [126]:
# Display the parsed result.
r

[[{'deprel': 'ROOT',
   'feats': 'Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing',
   'id': '1',
   'norm_form': 'привет',
   'par': '0',
   'pos': 'NOUN',
   'token': 'привет'},
  {'deprel': 'punct',
   'feats': '_',
   'id': '2',
   'norm_form': ',',
   'par': '1',
   'pos': 'PUNCT',
   'token': ','},
  {'deprel': 'conj',
   'feats': 'Animacy=Anim|Case=Nom|Gender=Fem|Number=Plur',
   'id': '3',
   'norm_form': 'дама',
   'par': '1',
   'pos': 'NOUN',
   'token': 'дамы'},
  {'deprel': 'cc',
   'feats': '_',
   'id': '4',
   'norm_form': 'и',
   'par': '5',
   'pos': 'CCONJ',
   'token': 'и'},
  {'deprel': 'conj',
   'feats': 'Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur',
   'id': '5',
   'norm_form': 'господин',
   'par': '1',
   'pos': 'NOUN',
   'token': 'господа'},
  {'deprel': 'punct',
   'feats': '_',
   'id': '6',
   'norm_form': '!',
   'par': '5',
   'pos': 'PUNCT',
   'token': '!'}],
 [{'deprel': 'root',
   'feats': 'Aspect=Perf|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin',
 

In [122]:
def to_visjs_graph(sentence):
    """Serialize one parsed sentence as JSON node and edge lists.

    Args:
        sentence: iterable of token dicts with the keys ``id``, ``par``,
            ``deprel``, ``token`` and ``pos``; ``id`` and ``par`` must be
            convertible with ``int``.

    Returns:
        A ``(nodes_json, edges_json)`` tuple of JSON strings. Each node is
        ``{"id": <int>, "label": "<token> (<pos>)"}``; each edge is
        ``{"from": <token id>, "to": <parent id>, "label": <deprel>}``.
        Tokens whose parent id is 0 get a node but no edge.
    """
    import json

    def _as_json(payload):
        # Keep non-ASCII (e.g. Cyrillic) characters readable in the output.
        return json.dumps(payload, ensure_ascii=False)

    graph_nodes, graph_edges = [], []
    for entry in sentence:
        node_id = int(entry["id"])
        head_id = int(entry["par"])
        relation = entry["deprel"]

        # A parent id of 0 means there is no parent token to link to.
        if head_id != 0:
            graph_edges.append({"from": node_id, "to": head_id, "label": relation})

        caption = "{0} ({1})".format(entry["token"], entry["pos"])
        graph_nodes.append({"id": node_id, "label": caption})

    return _as_json(graph_nodes), _as_json(graph_edges)



In [125]:
# Build the node/edge JSON for the first parsed sentence.
to_visjs_graph(r[0])

('[{"label": "привет (NOUN)", "id": 1}, {"label": ", (PUNCT)", "id": 2}, {"label": "дамы (NOUN)", "id": 3}, {"label": "и (CCONJ)", "id": 4}, {"label": "господа (NOUN)", "id": 5}, {"label": "! (PUNCT)", "id": 6}]',
 '[{"from": 2, "label": "punct", "to": 1}, {"from": 3, "label": "conj", "to": 1}, {"from": 4, "label": "cc", "to": 5}, {"from": 5, "label": "conj", "to": 1}, {"from": 6, "label": "punct", "to": 5}]')

In [97]:
# NOTE(review): `script_template` is not defined in any visible cell; on a fresh kernel
# this cell would raise NameError — confirm where it should come from or remove the cell.
script_template 

In [98]:
# `r` is already the processed parser output (a list of sentences, each a list
# of token dicts). There is no module-level `output_process` in this notebook —
# it exists only as a MaltParser method that expects raw text lines — so the
# result is pretty-printed directly.
pprint(r)

[[{'deprel': 'ROOT',
   'feats': 'Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing',
   'id': '1',
   'norm_form': 'привет',
   'par': '0',
   'pos': 'NOUN',
   'token': 'привет'},
  {'deprel': 'punct',
   'feats': '_',
   'id': '2',
   'norm_form': ',',
   'par': '1',
   'pos': 'PUNCT',
   'token': ','},
  {'deprel': 'conj',
   'feats': 'Animacy=Anim|Case=Nom|Gender=Fem|Number=Plur',
   'id': '3',
   'norm_form': 'дама',
   'par': '1',
   'pos': 'NOUN',
   'token': 'дамы'},
  {'deprel': 'cc',
   'feats': '_',
   'id': '4',
   'norm_form': 'и',
   'par': '5',
   'pos': 'CCONJ',
   'token': 'и'},
  {'deprel': 'conj',
   'feats': 'Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur',
   'id': '5',
   'norm_form': 'господин',
   'par': '1',
   'pos': 'NOUN',
   'token': 'господа'},
  {'deprel': 'punct',
   'feats': '_',
   'id': '6',
   'norm_form': '!',
   'par': '5',
   'pos': 'PUNCT',
   'token': '!'}],
 [{'deprel': 'root',
   'feats': 'Aspect=Perf|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin',
 