In [None]:
from itertools import product
import warnings

In [None]:
import sys
import os
# Make the directory above the notebook importable so that `src.*` resolves.
sys.path.append("../") # go to parent dir
from src.Derivation import Derivation
# Shared derivation engine: later cells read its rule tables (`rules_compound`,
# `rules_dict`), its POS list (`pos_all`) and call `derive` / `tag_guesser.guess` on it.
derivator = Derivation(use_guesser=True)

In [None]:
from tqdm import tqdm

In [None]:
from src.FinateStateMachine import FSM

In [None]:
# Load one word list per part of speech from ../data/wiktionary/<pos>.txt.
# 'part' is added to the derivator's POS list first; the membership guard keeps
# this cell idempotent, so re-running it does not append a duplicate entry.
if 'part' not in derivator.pos_all:
    derivator.pos_all.append('part')

wordlist = dict()
for pos in derivator.pos_all:
    with open(f'../data/wiktionary/{pos}.txt', encoding='utf8') as f:
        # One entry per line; surrounding whitespace (including the newline) is stripped.
        wordlist_pos = [line.strip() for line in f]
    wordlist[pos] = wordlist_pos

In [None]:
from multiprocessing import Pool, TimeoutError
import os

In [None]:
# Silence every warning from here on; the filter is installed for the whole
# interpreter session, not only for this cell.
warnings.simplefilter("ignore")

In [None]:
class Part:
    """One component of a compound, with its word forms indexed in an FSM.

    Every indexed entry is the lower-cased form split into characters, followed
    by the rule id and a ``(lemma, pos)`` tuple.

    Parameters
    ----------
    name : str
        Identifier of the part; also used (with '/' replaced by '_') as the
        file name by :meth:`save`.
    pos_b : str
        Part of speech of the source words; passed to ``derivator.derive`` and
        stored with every indexed word.
    pos_a : str, optional
        Resulting part of speech; defaults to ``pos_b``.
    rule_id : str, optional
        When given, every word is run through ``derivator.derive`` with this
        rule and the derived forms are indexed. When omitted, the words
        themselves are indexed and ``name`` is used as the rule id.
    wordlist : list of str, optional
        Words to index. ``None`` (the default) means "no words".
    """

    # Number of worker processes used to derive forms in parallel.
    N_WORKERS = 30

    def __init__(self, name, pos_b, pos_a = None, rule_id = None, wordlist = None):
        # A fresh list per call instead of a shared mutable default argument.
        if wordlist is None:
            wordlist = []
        self.name = name
        self.pos_b, self.pos_a = pos_b, pos_a or pos_b
        self.rule_id = rule_id or name
        print('FSM', self.rule_id)
        self.fsm = FSM({self.rule_id})
        if not wordlist:
            # Nothing to index (this is the path taken by from_dict/load):
            # return before creating a worker pool or a progress bar.
            return
        if rule_id:
            with Pool(self.N_WORKERS) as p:
                # imap keeps the input order, so results line up with wordlist.
                results = list(tqdm(p.imap(self.get_derived, wordlist), total=len(wordlist)))
                print(len(results))
                for result_, word in zip(results, wordlist):
                    for result in result_:
                        self.fsm.add_word(list(result.lower()) + [self.rule_id, (word, self.pos_b)])
        else:
            for word in tqdm(wordlist):
                self.fsm.add_word(list(word.lower()) + [self.rule_id, (word, self.pos_b)])

    def get_derived(self, word):
        """Return what the module-level ``derivator`` derives from ``word`` under this part's rule.

        Returns an empty list when the derivator produces nothing.
        """
        derived = derivator.derive(word_b=word.lower(), pos_b=self.pos_b, rule_id=self.rule_id, use_rare=True)
        if derived:
            return derived[self.rule_id]
        return []

    def add_word(self, form, lemma=None, pos=None):
        """Index one extra ``form``; ``lemma`` defaults to ``form`` and ``pos`` to ``pos_b``."""
        self.fsm.add_word(list(form.lower()) + [self.rule_id, (lemma or form, pos or self.pos_b)])

    def analyze_word(self, word):
        """Run the lower-cased ``word`` through the FSM and return its result unchanged."""
        return self.fsm.analyze_word(word.lower())

    def to_dict(self):
        """Return the part's fields plus ``fsm.to_dict()`` as a plain dict."""
        return {
            "name": self.name,
            "pos_b": self.pos_b,
            "pos_a": self.pos_a,
            "rule_id": self.rule_id,
            "fsm": self.fsm.to_dict()
        }

    @classmethod
    def from_dict(cls, d):
        """Rebuild a part from the dict produced by :meth:`to_dict`.

        No word list is passed, so the constructor only sets the fields; the
        FSM is then replaced by the stored one.
        """
        c = cls(name=d["name"], pos_b=d["pos_b"], pos_a=d["pos_a"], rule_id=d["rule_id"])
        c.fsm = FSM.from_dict(d["fsm"])
        return c

    def save(self):
        """Pickle :meth:`to_dict` into ``<name with '/' -> '_'>.pickle`` in the working directory."""
        import pickle  # local import: the class must not depend on a later notebook cell
        with open(self.name.replace('/', '_') + ".pickle", "wb") as f:
            pickle.dump(self.to_dict(), f)

    @classmethod
    def load(cls, f):
        """Load a part from the pickle file at path ``f``.

        SECURITY: unpickling executes code embedded in the file; only load
        files written by :meth:`save` from a trusted location.
        """
        import pickle  # local import: the class must not depend on a later notebook cell
        # Context manager so the handle is closed even if unpickling fails.
        with open(f, "rb") as fh:
            d = pickle.load(fh)
        return cls.from_dict(d)

In [None]:
# Build a part for rule "ruleINTERFIX(num)": because a rule id is passed, every
# entry of wordlist['num'] is run through the derivator and the derived forms are indexed.
prt = Part("ruleINTERFIX(num)", 'num', None, "ruleINTERFIX(num)", wordlist['num'])

In [None]:
import pickle

In [None]:
prt.save()

In [None]:
# Reload the part that was just saved. Part.load opens the file itself, so no
# surrounding open() is needed, and the file name is built the same way
# Part.save builds it ('/' replaced by '_').
prt0 = Part.load(prt.name.replace('/', '_') + ".pickle")

In [None]:
prt0.fsm.analyze_word("двух")

In [None]:
def _read_rare_rows(path, n_fields):
    """Return the data rows of a ';'-separated file as lists of ``n_fields`` strings.

    The first line is treated as a header and skipped; blank lines are ignored.
    A missing file yields an empty list (the rare-form files are optional).

    Raises
    ------
    ValueError
        If a non-blank data line does not have exactly ``n_fields`` fields.
    """
    try:
        with open(path, encoding='utf8') as f:
            lines = f.readlines()[1:]
    except FileNotFoundError:
        return []
    rows = []
    for line_no, line in enumerate(lines, start=2):
        line = line.strip()
        if not line:
            # e.g. a trailing empty line at the end of the file
            continue
        fields = line.split(';')
        if len(fields) != n_fields:
            raise ValueError(f'{path}:{line_no}: expected {n_fields} fields, got {len(fields)}')
        rows.append(fields)
    return rows


# Rare (irregular) compound components, split by role: an i_arg of '0' goes
# to head_rare, any other value to mod_rare. Entries are (lemma, pos_lemma, form).
mod_rare = dict()
head_rare = dict()

# Per-POS files: keyed by part of speech.
for pos in derivator.pos_all:
    mod_rare[pos] = []
    head_rare[pos] = []
    for lemma, pos_lemma, form, i_arg in _read_rare_rows(f'../src/rules/compounds_rare_{pos}.csv', 4):
        target = head_rare if i_arg == '0' else mod_rare
        target[pos].append((lemma, pos_lemma, form))

# "Star" file: carries an extra rule_id column and is keyed by that rule id.
for lemma, pos_lemma, form, rule_id, i_arg in _read_rare_rows('../src/rules/compounds_rare_star.csv', 5):
    # Both dicts get the key, whichever role this particular row has.
    head_rare.setdefault(rule_id, [])
    mod_rare.setdefault(rule_id, [])
    target = head_rare if i_arg == '0' else mod_rare
    target[rule_id].append((lemma, pos_lemma, form))


In [None]:
import os

In [None]:
def _get_or_build_part(parts, key, build):
    """Return ``parts[key]``, filling it from the pickle cache or via ``build()``.

    Lookup order: the in-memory dict, then ``<key with '/' -> '_'>.pickle`` in
    the working directory, then ``build()`` (whose result is saved to that
    file). Checking the dict first avoids re-reading the pickle for every rule
    that shares a part.
    """
    if key in parts:
        return parts[key]
    cache_path = key.replace('/', '_') + ".pickle"
    if os.path.exists(cache_path):
        parts[key] = Part.load(cache_path)
    else:
        part = build()
        part.save()
        parts[key] = part
    return parts[key]


def _build_rule_part(simple_rule):
    """Build a part whose forms are derived with ``simple_rule`` from the words of its ``pos_b``."""
    return Part(
        simple_rule.name,
        simple_rule.pos_b,
        simple_rule.pos_a,
        simple_rule.name,
        wordlist[simple_rule.pos_b]
    )


def _build_plain_part(key, pos, words, rare_forms):
    """Build a part that indexes ``words`` as they are, plus the given rare forms."""
    part = Part(key, pos, pos, None, words)
    for lemma, pos_lemma, form in rare_forms:
        part.add_word(form, lemma, pos_lemma)
    return part


# NOTE(review): head and modifier parts keyed by the same POS map to the same
# pickle file name (Part.save uses only the part name), so whichever is built
# first is also what the other role loads, including its rare forms. Confirm
# this sharing is intended.
head_parts = dict()
mod_parts = dict()

for rule in tqdm(derivator.rules_compound):
    if rule.after_merge_rule_ids:
        # not implemented yet
        continue
    head_rules, mod_rules = rule.simple_rule_ids[0], rule.simple_rule_ids[1]
    if head_rules:
        # e.g. suffix
        head_rule = derivator.rules_dict[head_rules[0]]
        if head_rule.pos_a == "noun":
            # Skips the whole rule, including its modifier part.
            continue
        _get_or_build_part(head_parts, head_rule.name, lambda: _build_rule_part(head_rule))
    else:
        # no changes
        _get_or_build_part(
            head_parts,
            rule.pos_b,
            lambda: _build_plain_part(
                rule.pos_b, rule.pos_b, wordlist[rule.pos_b], head_rare.get(rule.pos_b, [])
            ),
        )
    if mod_rules:
        # interfix
        mod_rule = derivator.rules_dict[mod_rules[0]]
        _get_or_build_part(mod_parts, mod_rule.name, lambda: _build_rule_part(mod_rule))
    elif rule.poss_m[0] == '*':
        # star: no word list, only the rare forms registered under the rule name
        _get_or_build_part(
            mod_parts,
            rule.name,
            lambda: _build_plain_part(rule.name, '*', [], mod_rare.get(rule.name, [])),
        )
    else:
        # adv, noun, etc.
        pos_m = rule.poss_m[0]
        _get_or_build_part(
            mod_parts,
            pos_m,
            lambda: _build_plain_part(pos_m, pos_m, wordlist[pos_m], mod_rare.get(pos_m, [])),
        )

In [None]:
head_parts

In [None]:
mod_parts

In [None]:
class CompoundAnalyzer:
    """Splits a word into a left and a right component using two analyzers.

    ``left`` and ``right`` are objects exposing ``analyze_word(str)``, which
    must yield ``(consumed_length, item, rule)`` triples.
    """

    def __init__(self, name, pos, left, right):
        self.name, self.pos = name, pos
        self.left, self.right = left, right

    def analyze(self, word, pos):
        """Return every left/right split of ``word`` that covers it completely.

        An empty list is returned when ``pos`` differs from the analyzer's own
        part of speech. Each hit is the tuple
        ``(word, pos, name, left_item, left_rule, right_item, right_rule)``.
        """
        if pos != self.pos:
            return []
        total = len(word)
        hits = []
        for split_at, left_item, left_rule in self.left.analyze_word(word):
            # The left side consumed the whole word: nothing remains for the right side.
            if split_at == total:
                continue
            # A hyphen directly after the left component belongs to neither side.
            start = split_at + 1 if word[split_at] == '-' else split_at
            hits.extend(
                (word, self.pos, self.name, left_item, left_rule, right_item, right_rule)
                for consumed, right_item, right_rule in self.right.analyze_word(word[start:])
                if start + consumed == total
            )
        return hits

In [None]:
# Run every supported compound rule over the word list of its resulting POS
# and collect all complete left/right splits in `analyzed`.
analyzed = []

for rule in tqdm(derivator.rules_compound):
    if rule.after_merge_rule_ids:
        # not implemented yet
        continue
    head_rules, mod_rules = rule.simple_rule_ids[0], rule.simple_rule_ids[1]
    print(rule.name, head_rules, mod_rules)
    if head_rules:
        # e.g. suffix
        head_rule = derivator.rules_dict[head_rules[0]]
        head_key = head_rule.name
    else:
        # no changes
        head_key = rule.pos_b
    if mod_rules:
        # interfix
        mod_rule = derivator.rules_dict[mod_rules[0]]
        mod_key = mod_rule.name
    elif rule.poss_m[0] == '*':
        # star
        mod_key = rule.name
    else:
        # adv, noun, etc.
        mod_key = rule.poss_m[0]

    head_part = head_parts.get(head_key)
    mod_part = mod_parts.get(mod_key)
    if head_part is None or mod_part is None:
        # The part-building cell skips rules whose head rule produces nouns, so
        # no parts exist for them; skip the rule instead of raising KeyError.
        continue

    print(head_part, mod_part, rule.order)
    if rule.order == [0, 1]:
        ca = CompoundAnalyzer(rule.name, rule.pos_a, head_part, mod_part)
    else:
        # [1, 0]
        ca = CompoundAnalyzer(rule.name, rule.pos_a, mod_part, head_part)
    for word in wordlist[rule.pos_a]:
        analyzed.extend(ca.analyze(word, rule.pos_a))

In [None]:
head_parts['rule619*(noun + н1(ый) -> adj)'].fsm.states

In [None]:
# Dump every analysis as one tab-separated line:
# word, pos, rule name, left item (2 fields), left rule, right item (2 fields), right rule.
# The encoding is set explicitly (the inputs are read as utf8) so that the
# output does not depend on the platform's default encoding.
with open('generated_comp_adj.txt', 'w', encoding='utf8') as f:
    for l in analyzed:
        w, p, nm, (ff, pf), iff, (s, ps), ss = l
        f.write('\t'.join([w, p, nm, ff, pf, iff, s, ps, ss]) + '\n')

In [None]:
len(analyzed)

In [None]:
# Keep an analysis when the word is hyphenated, or when both component
# strings are at least three characters long.
fixed = []
for entry in analyzed:
    word, _pos, _rule, (left_form, _left_pos), _left_rule, (right_form, _right_pos), _right_rule = entry
    if '-' in word or (len(left_form) >= 3 and len(right_form) >= 3):
        fixed.append(entry)

In [None]:
# Write the filtered analyses, one tab-separated line each:
# word, pos, rule name, left item (2 fields), left rule, right item (2 fields), right rule.
# The encoding is set explicitly (the inputs are read as utf8) so that the
# output does not depend on the platform's default encoding.
with open('compo_new_cor3+.txt', 'w', encoding='utf8') as f:
    for l in fixed:
        w, p, nm, (ff, pf), iff, (s, ps), ss = l
        f.write('\t'.join([w, p, nm, ff, pf, iff, s, ps, ss]) + '\n')

In [None]:
len(fixed)

In [None]:
fixed[::10000]

In [None]:
# Names of all rules that produced at least one analysis.
rules_used = {
    rule_name
    for (_w, _p, rule_name, (_lf, _lp), _lr, (_rf, _rp), _rr) in analyzed
}

In [None]:
rules_used

In [None]:
# NOTE(review): `num_left` and `adj_right` are not defined in any cell visible
# here, so this line raises NameError on a fresh kernel. Presumably they were
# Part objects created interactively; confirm and define them before this cell.
# NOTE(review): index 52 depends on the order of `rules_compound`; verify it
# still selects the intended rule.
ca = CompoundAnalyzer(derivator.rules_compound[52].name, derivator.rules_compound[52].pos_a, num_left, adj_right)

In [None]:
ca.analyze('двухметровый', 'adj')

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# Run the current analyzer `ca` over the adjective word list.
# `adj_wordlist` was never defined in this notebook; the per-POS lists live in
# `wordlist`, so the 'adj' entry is used here.
# NOTE: this replaces the `analyzed` list built by the earlier rule loop.
analyzed = []
for word in tqdm(wordlist['adj']):
    analyzed.extend(ca.analyze(word, 'adj'))

In [None]:
len(analyzed)

In [None]:
def get_words(rule, word_b, pos_b, words_m, poss_m, pos_a, **kwargs):
    """Apply ``rule`` under every combination of guessed tags and merge the results.

    Tags are guessed with the module-level ``derivator`` for the base word and
    for each modifier word; a word with no guess contributes a single empty
    tag dict. Each combination is passed to ``rule.apply_with_tags`` as
    ``tags_dict`` and the returned sets are united. Extra keyword arguments
    are forwarded to ``rule.apply_with_tags``.
    """
    all_words = [word_b] + words_m
    all_poss = [pos_b] + poss_m

    tag_options = []
    for word_m, pos_m in zip(all_words, all_poss):
        guessed = derivator.tag_guesser.guess(word=word_m, pos=pos_m)
        tag_options.append(guessed or [{}])

    collected = set()
    for combo in product(*tag_options):
        collected |= rule.apply_with_tags(
            word_b=word_b,
            pos_b=pos_b,
            pos_a=pos_a,
            words_m=words_m,
            poss_m=poss_m,
            tags_dict=combo,
            **kwargs
        )
    return collected

In [None]:
# NOTE(review): `rule` is only ever bound as a loop variable in earlier cells,
# so this call uses whatever rule the last loop ended on. Presumably a
# specific compound rule was intended; confirm and select it explicitly.
get_words(rule, 'проходить', 'verb', ['первый'], ['adj'], 'noun', use_rare=True)