In [1]:
import sys
sys.path.append("../") # go to parent dir

In [2]:
import pandas as pd
import warnings
from tqdm import tqdm
from collections import defaultdict
from typing import List, Optional, Dict

from src.fsm_part import DerivedFSM
from src.compound_analyzer import CompoundAnalyzer

In [3]:
def build_fsm(
        wordlist: Dict[str, List[str]],
        rule_name: str,
        pos_b: str,
        pos_a: Optional[str],
        rule_id: Optional[str],
        rare_dict: dict,
        load: bool = False,
        save: bool = True
) -> DerivedFSM:
    if load:
        try:
            fsm = DerivedFSM.load(rule_name.replace('/', '_') + ".pickle")
            return fsm
        except FileNotFoundError:
            warnings.warn("File does not exist!")
    fsm = DerivedFSM(
        rule_name, pos_b, pos_a, rule_id, wordlist.get(pos_b, [])
    )
    for lemma, pos, form in rare_dict.get(rule_name, []):
        fsm.add_word(form, lemma, pos)
    if save:
        fsm.save()
    return fsm

In [4]:
def prepare_fsms(wordlist: Dict[str, List[str]]):
    derivator = DerivedFSM.derivator

    # loading rare forms
    mod_rare = defaultdict(list)
    head_rare = defaultdict(list)

    rare_star = pd.read_csv('../src/rules/compounds_rare_star.csv', sep=";")
    for i in range(len(rare_star)):
        d = dict(rare_star.iloc[i])
        if d["index"] == 0:
            head_rare[d["with_rule_id"]].append((d["lemma"], d["pos"], d["form"]))
        else:
            mod_rare[d["with_rule_id"]].append((d["lemma"], d["pos"], d["form"]))

    # processing normal words
    head_parts = defaultdict(DerivedFSM)
    mod_parts = defaultdict(DerivedFSM)

    for rule in tqdm(derivator.rules_compound):
        if rule.after_merge_rule_ids:
            # not implemented yet
            continue
        head_rules, mod_rules = rule.simple_rule_ids[:2]
        pos_m = rule.poss_m[0]

        # filling in the head FSM
        if head_rules:
            # e.g. suffix
            head_rule = derivator.rules_dict[head_rules[0]]
            head_rule_name = head_rule.name
            head_rule_id = head_rule_name
            pos_b, pos_a = head_rule.pos_b, head_rule.pos_a
        else:
            # no changes, e. g. "rule550([adj + ITFX] + noun -> noun)"
            head_rule_name = rule.pos_b
            head_rule_id = None
            pos_b, pos_a = rule.pos_b, rule.pos_b

        if head_rule_name not in head_parts:
            head_parts[head_rule_name] = build_fsm(
                wordlist, head_rule_name, pos_b, pos_a, head_rule_id, head_rare
            )
        # filling in the modifier FSM
        if mod_rules:
            # interfix
            mod_rule = derivator.rules_dict[mod_rules[0]]
            mod_rule_name = mod_rule.name
            mod_rule_id = mod_rule_name
            pos_b, pos_a = mod_rule.pos_b, mod_rule.pos_b
        elif pos_m == '*':
            # star
            mod_rule_name = rule.name
            mod_rule_id = None
            pos_b, pos_a = "*", "*"
        else:
            # adv, noun, etc.
            mod_rule_name = pos_m
            mod_rule_id = None
            pos_b, pos_a = pos_m, pos_m
        mod_parts[mod_rule_name] = build_fsm(
            wordlist, mod_rule_name, pos_b, pos_a, mod_rule_id, mod_rare
        )
    return derivator, head_parts, mod_parts, head_rare, mod_rare

In [5]:
def prepare_compound_fsms(
        wordlist: Dict[str, List[str]]
) -> Dict[str, CompoundAnalyzer]:
    derivator, head_parts, mod_parts, head_rare, mod_rare = prepare_fsms(wordlist)

    cas = dict()
    for rule in tqdm(derivator.rules_compound):
        if rule.after_merge_rule_ids:
            # not implemented yet
            continue
        head_rules, mod_rules = rule.simple_rule_ids[:2]
        pos_m = rule.poss_m[0]
        print(rule.name, head_rules, mod_rules)

        if head_rules:
            head_rule = derivator.rules_dict[head_rules[0]]
            head_rule_name = head_rule.name
        else:
            head_rule_name = rule.pos_b
        head_part = head_parts[head_rule_name]

        if mod_rules:
            mod_rule = derivator.rules_dict[mod_rules[0]]
            mod_rule_name = mod_rule.name
        elif pos_m == '*':
            mod_rule_name = rule.name
        else:
            mod_rule_name = pos_m
        mod_part = mod_parts[mod_rule_name]

        print(head_part.name, mod_part.name, rule.order)
        if rule.order == [0, 1]:
            ca = CompoundAnalyzer(rule.name, rule.pos_a, head_part, mod_part)
        else:
            ca = CompoundAnalyzer(rule.name, rule.pos_a, mod_part, head_part)
        cas[rule.name] = ca
    return cas  # derivator, head_parts, mod_parts, head_rare, mod_rare


In [6]:
wordlist = defaultdict(list)
wordlist["adv"] = ["сидя"]
wordlist["adj"] = ["сладкий", "недельный"]
wordlist["noun"] = ["монета", "неделя"]
wordlist["num"] = ["один", "два"]
wordlist["verb"] = ["ходить", "идти"]

In [7]:
derivator, head_parts, mod_parts, head_rare, mod_rare = prepare_fsms(wordlist)

  0%|          | 0/78 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 4544.21it/s]

100%|██████████| 2/2 [00:00<00:00, 199.60it/s]
  1%|▏         | 1/78 [00:00<01:00,  1.28it/s]
100%|██████████| 2/2 [00:00<00:00, 2228.64it/s]
  3%|▎         | 2/78 [00:01<00:59,  1.27it/s]
100%|██████████| 2/2 [00:00<00:00, 2739.58it/s]
  4%|▍         | 3/78 [00:02<00:54,  1.37it/s]
100%|██████████| 2/2 [00:00<00:00, 524.32it/s]

0it [00:00, ?it/s][A
  5%|▌         | 4/78 [00:02<00:53,  1.37it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00, 17.62it/s][A

100%|██████████| 2/2 [00:00<00:00, 1577.40it/s]
  6%|▋         | 5/78 [00:04<01:15,  1.04s/it]
100%|██████████| 2/2 [00:00<00:00, 2833.99it/s]
  8%|▊         | 6/78 [00:05<01:06,  1.09it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00, 17.19it/s][A

100%|██████████| 2/2 [00:00<00:00, 364.14it/s]
  9%|▉         | 7/78 [00:06<01:19,  1.12s/it]
100%|██████████| 2/2 [00:00<00:00, 321.88it/s

 82%|████████▏ | 64/78 [01:08<00:15,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 4279.90it/s]
 83%|████████▎ | 65/78 [01:08<00:13,  1.00s/it]
100%|██████████| 2/2 [00:00<00:00, 22.85it/s]

100%|██████████| 1/1 [00:00<00:00, 2057.04it/s]
 85%|████████▍ | 66/78 [01:09<00:11,  1.08it/s]
100%|██████████| 1/1 [00:00<00:00, 2248.96it/s]

100%|██████████| 1/1 [00:00<00:00, 3806.08it/s]

100%|██████████| 2/2 [00:00<00:00, 5518.82it/s]

0it [00:00, ?it/s][A

100%|██████████| 2/2 [00:00<00:00, 2662.21it/s]
 90%|████████▉ | 70/78 [01:10<00:03,  2.15it/s]
100%|██████████| 2/2 [00:00<00:00, 5622.39it/s]
 91%|█████████ | 71/78 [01:11<00:03,  1.97it/s]
0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

100%|██████████| 1/1 [00:00<00:00, 1623.18it/s]

0it [00:00, ?it/s][A
 96%|█████████▌| 75/78 [01:11<00:00,  3.81it/s]
100%|██████████| 2/2 [00:00<00:00, 20.28it/s]

0it [00:00, ?it/s][A
100%|██████████| 78/78 [01:12<00:00,  1.08it/s]


In [8]:
mod_parts['rule961([полу/само] + verb -> verb)']

<src.fsm_part.DerivedFSM at 0x7ff831523588>

In [9]:
mod_rare

defaultdict(list,
            {'rule1026([полу] + adv -> adv)': [('половина', 'noun', 'полу')],
             'rule1027([мимо/само] + verb + ом -> adv)': [('мимо',
               'adv',
               'мимо'),
              ('сам', 'pron', 'само')],
             'rule552([пол] + noun + GEN -> noun)': [('половина',
               'noun',
               'пол')],
             'rule1028(в + [пол] + noun + GEN -> adv)': [('половина',
               'noun',
               'пол')],
             'rule1029(в + [пол/три] + adj + а -> adv)': [('половина',
               'noun',
               'пол'),
              ('три', 'num', 'три')],
             'rule961([полу/само] + verb -> verb)': [('половина',
               'noun',
               'полу'),
              ('сам', 'pron', 'само')],
             'rule962(noun + [фицировать] -> verb)': [('фицировать',
               'suffixoid',
               'фицировать')],
             'adv': [('выше', 'comporative', 'выше'),
              ('ниже', 'compora

In [10]:
head_rare

defaultdict(list,
            {'adj': [('образный', 'suffixoid', 'образный'),
              ('валентный', 'suffixoid', 'валентный'),
              ('геничный', 'suffixoid', 'геничный'),
              ('генный', 'suffixoid', 'генный')]})

In [11]:
cas = dict()
for rule in tqdm(derivator.rules_compound):
    if rule.after_merge_rule_ids:
        # not implemented yet
        continue
    head_rules, mod_rules = rule.simple_rule_ids[:2]
    pos_m = rule.poss_m[0]
    print(rule.name, head_rules, mod_rules)

    if head_rules:
        head_rule = derivator.rules_dict[head_rules[0]]
        head_rule_name = head_rule.name
    else:
        head_rule_name = rule.pos_b
    head_part = head_parts[head_rule_name]

    if mod_rules:
        mod_rule = derivator.rules_dict[mod_rules[0]]
        mod_rule_name = mod_rule.name
    elif pos_m == '*':
        mod_rule_name = rule.name
    else:
        mod_rule_name = pos_m
    mod_part = mod_parts[mod_rule_name]

    print(head_part.name, mod_part.name, rule.order)
    if rule.order == [0, 1]:
        ca = CompoundAnalyzer(rule.name, rule.pos_a, head_part, mod_part)
    else:
        ca = CompoundAnalyzer(rule.name, rule.pos_a, mod_part, head_part)
    cas[rule.name] = ca

100%|██████████| 78/78 [00:00<00:00, 414.84it/s]

rule550([noun + ITFX] + noun -> noun) [] ['ruleINTERFIX(noun)']
noun ruleINTERFIX(noun) [1, 0]
rule550([adj + ITFX] + noun -> noun) [] ['ruleINTERFIX(adj)']
noun ruleINTERFIX(adj) [1, 0]
rule550([num + ITFX] + noun -> noun) [] ['ruleINTERFIX(num)']
noun ruleINTERFIX(num) [1, 0]
rule552([пол] + noun + GEN -> noun) ['rule1028*(noun + GEN -> noun)'] []
rule1028*(noun + GEN -> noun) rule552([пол] + noun + GEN -> noun) [1, 0]
rule558([noun + ITFX] + verb + тель -> noun) ['rule211(verb + тель -> noun)'] ['ruleINTERFIX(noun)']
rule211(verb + тель -> noun) ruleINTERFIX(noun) [1, 0]
rule558([adj + ITFX] + verb + тель -> noun) ['rule211(verb + тель -> noun)'] ['ruleINTERFIX(adj)']
rule211(verb + тель -> noun) ruleINTERFIX(adj) [1, 0]
rule559([noun + ITFX] + verb + ец/нец/енец/омец -> noun) ['rule216(verb + ец/нец/енец/омец -> noun)'] ['ruleINTERFIX(noun)']
rule216(verb + ец/нец/енец/омец -> noun) ruleINTERFIX(noun) [1, 0]
rule559([adj + ITFX] + verb + ец/нец/енец/омец -> noun) ['rule216(verb + е




In [12]:
def analyze_word(word, pos):
    analyzes = []
    for rule_name, ca in cas.items():
        if ca.pos != pos:
            continue        
        analyzes.extend(ca.analyze(word, pos))
    return analyzes

In [13]:
analyze_word("полусладкий", "adj")

[('полусладкий',
  'adj',
  'rule754([num + ITFX] + adj -> adj)',
  ('половина', 'noun'),
  'ruleINTERFIX(num)',
  ('сладкий', 'adj'),
  'adj')]

In [14]:
analyze_word("еженедельный", "adj")

[('еженедельный',
  'adj',
  'rule754([adj + ITFX] + adj -> adj)',
  ('еже', 'prefixoid'),
  'ruleINTERFIX(adj)',
  ('недельный', 'adj'),
  'adj'),
 ('еженедельный',
  'adj',
  'rule761([adj + ITFX] + noun + н1(ый) -> adj)',
  ('еже', 'prefixoid'),
  'ruleINTERFIX(adj)',
  ('неделя', 'noun'),
  'rule619*(noun + н1(ый) -> adj)')]

In [15]:
analyze_word("двухнедельный", "adj")

[('двухнедельный',
  'adj',
  'rule754([num + ITFX] + adj -> adj)',
  ('два', 'num'),
  'ruleINTERFIX(num)',
  ('недельный', 'adj'),
  'adj'),
 ('двухнедельный',
  'adj',
  'rule761([num + ITFX] + noun + н1(ый) -> adj)',
  ('два', 'num'),
  'ruleINTERFIX(num)',
  ('неделя', 'noun'),
  'rule619*(noun + н1(ый) -> adj)')]

In [17]:
analyze_word("полнедели", "noun")

[('полнедели',
  'noun',
  'rule552([пол] + noun + GEN -> noun)',
  ('половина', 'noun'),
  'rule552([пол] + noun + GEN -> noun)',
  ('неделя', 'noun'),
  'rule1028*(noun + GEN -> noun)')]

In [18]:
analyze_word("псевдомонета", "noun")

[('псевдомонета',
  'noun',
  'rule550([adj + ITFX] + noun -> noun)',
  ('псевдо', 'prefixoid'),
  'ruleINTERFIX(adj)',
  ('монета', 'noun'),
  'noun')]