# 1. Извлеваем все сложные слова из датасета

In [1]:
import pandas as pd

In [9]:
train_df = pd.read_csv("classification_train.csv")

In [10]:
train_df.head()

Unnamed: 0,lemma_c,morphemes,is_in_tikhonov,n_roots,pos_c,is_in_wiktionary,is_compound,is_derived,needs_analysis
0,одуматься,о:PREF/дум:ROOT/а:SUFF/ть:SUFF/ся:POSTFIX,1,1,VERB,1,0,1.0,0.0
1,врать,вр:ROOT/а:SUFF/ть:SUFF,1,1,VERB,1,0,1.0,0.0
2,хлюстовой,хлюст:ROOT/ов:SUFF/ой:END,1,1,,0,0,,0.0
3,отклеить,от:PREF/кле:ROOT/и:SUFF/ть:SUFF,1,1,VERB,1,0,1.0,0.0
4,писание,пис:ROOT/а:SUFF/ни:SUFF/е:END,1,1,NOUN,1,0,1.0,0.0


72034

In [11]:
test_df = pd.read_csv("classification_test.csv")

In [12]:
test_df.head()

Unnamed: 0,lemma_c,morphemes,is_in_tikhonov,n_roots,pos_c,is_in_wiktionary,is_compound,is_derived,needs_analysis
0,упасти,у:PREF/пас:ROOT/ти:SUFF,1,1,VERB,1,0,1.0,0.0
1,передняя,перед:ROOT/н:SUFF/яя:END,1,1,NOUN,1,0,0.0,0.0
2,воскрыляться,вос:PREF/крыл:ROOT/я:SUFF/ть:SUFF/ся:POSTFIX,1,1,VERB,1,0,1.0,0.0
3,лядунка,лядун:ROOT/к:SUFF/а:END,1,1,NOUN,1,0,0.0,0.0
4,бурить,бур:ROOT/и:SUFF/ть:SUFF,1,1,VERB,1,0,1.0,0.0


In [13]:
train_compounds = train_df[train_df["n_roots"] > 1]
test_compounds = test_df[test_df["n_roots"] > 1]

train_compounds = train_compounds[["lemma_c", "pos_c"]]
test_compounds = test_compounds[["lemma_c", "pos_c"]]

In [106]:
len(train_compounds), len(test_compounds)

(14282, 4704)

In [14]:
train_compounds.head()

Unnamed: 0,lemma_c,pos_c
11,чертыхаться,VERB
15,милливольтметр,NOUN
23,санитарно-ветеринарный,
32,морфемный,ADJ
33,богомильский,ADJ


In [15]:
test_compounds.head()

Unnamed: 0,lemma_c,pos_c
8,родословие,NOUN
10,кругорама,
13,горько-сладкий,ADJ
14,фрезерно-центровальный,
16,некредитоспособный,ADJ


# 2. Запустить DerivBase.Ru (желательно применить multiprocessing)

In [71]:
from itertools import product
import warnings

import sys
import os
sys.path.append("../") # go to parent dir
from src.Derivation import Derivation
derivator = Derivation(use_guesser=True)

In [72]:
from tqdm.notebook import tqdm

In [73]:
def get_pos_tag(word):
    try:
        tag = derivator.tag_guesser.morph.parse(word)[0].tag.POS
        if tag == "INFN":
            return "VERB"
        if tag == "NUMR":
            return "NUM"
        if tag == "ADJF":
            return "ADJ"
        if tag == "PRTF":
            return "PART"
        if tag == "GRND":
            return "TRG"
        return tag
    except:
        return None

In [74]:
from collections import defaultdict

In [75]:
words_by_pos = defaultdict(set)
derivator.pos_all.append('part')
pos_all = derivator.pos_all

for dataset in (train_compounds, test_compounds):
    for line in tqdm(dataset.values):
        lemma, pos = line
        if pos is None:
            pos = get_pos_tag(lemma)
        try:
            pos = pos.strip().lower()
        except:
            pos = None
        if pos is None:
            if any([lemma.endswith('ый'), lemma.endswith('ий'), lemma.endswith('ой')]):
                words_by_pos["adj"].add(lemma)
            elif any([lemma.endswith('ть'), lemma.endswith('ти'), lemma.endswith('ться'), lemma.endswith('тись')]):
                words_by_pos["verb"].add(lemma)
            else:
                words_by_pos["noun"].add(lemma)
        else:
            words_by_pos[pos].add(lemma)
    

  0%|          | 0/14282 [00:00<?, ?it/s]

  0%|          | 0/4704 [00:00<?, ?it/s]

In [91]:
def find_connections(word_b, pos_b, pos_a):
    connections = []
    derived = derivator.derive(word_b, pos_b=pos_b, pos_a=pos_a, is_extended=True, use_rare=True)
    for rule_id, derived_words in derived.items():
        exist_in_vocabulary = words_by_pos[pos_a] & derived_words
        connections.extend([(word_b, pos_b, word_a, pos_a, rule_id) for word_a in exist_in_vocabulary])
    return connections

def find_all_word_connections(word_b, pos_b):
    connections = []
    for pos_a in pos_all:
        connections.extend(find_connections(word_b, pos_b, pos_a))
    return connections

def find_all_pos_connections(pos_b):
    connections = []
    for word_b in tqdm(words_by_pos[pos_b], desc=pos_b):
        # NOTE: убрать этот иф!
        if len(connections) >= 10:
            break
        connections.extend(find_all_word_connections(word_b, pos_b))
    return connections

def find_all_connections():
    connections = []
    for pos_b in pos_all:
        connections.extend(find_all_pos_connections(pos_b))
    return connections

In [92]:
find_all_word_connections('морфема', 'noun')

[('морфема', 'noun', 'морфемный', 'adj', 'rule619(noun + н1(ый) -> adj)')]

In [93]:
all_connections = find_all_connections()

noun:   0%|          | 0/9929 [00:00<?, ?it/s]

adj:   0%|          | 0/8369 [00:00<?, ?it/s]

verb:   0%|          | 0/565 [00:00<?, ?it/s]

adv:   0%|          | 0/106 [00:00<?, ?it/s]

num:   0%|          | 0/12 [00:00<?, ?it/s]

part: 0it [00:00, ?it/s]

In [94]:
# NOTE: закомментировать!
all_connections

[('голодранка',
  'noun',
  'голодранка',
  'noun',
  'rule415(noun + к(а)/очк(а) -> noun)'),
 ('сандружина',
  'noun',
  'сандружинница',
  'noun',
  'rule348(noun + ниц(а) -> noun)'),
 ('политотдел', 'noun', 'политотделец', 'noun', 'rule414(noun + ец -> noun)'),
 ('шубенка', 'noun', 'шубенка', 'noun', 'rule415(noun + к(а)/очк(а) -> noun)'),
 ('парапсихология',
  'noun',
  'парапсихологический',
  'adj',
  'rule630(noun + ск(ий) -> adj)'),
 ('гелиотроп', 'noun', 'гелиотропизм', 'noun', 'rule355(noun + изм -> noun)'),
 ('гелиотроп', 'noun', 'гелиотропин', 'noun', 'rule360(noun + ин -> noun)'),
 ('короткометражка',
  'noun',
  'короткометражка',
  'noun',
  'rule415(noun + к(а)/очк(а) -> noun)'),
 ('короткометражка',
  'noun',
  'короткометражный',
  'adj',
  'rule619(noun + н1(ый) -> adj)'),
 ('мелочевка',
  'noun',
  'мелочевка',
  'noun',
  'rule415(noun + к(а)/очк(а) -> noun)'),
 ('нейролептический',
  'adj',
  'нейролептик',
  'noun',
  'rule460(adj + 0m2 -> noun)'),
 ('бомбардирск

# 3. Построить граф и найти корни

In [97]:
derived_from = defaultdict(list)

for word_b, pos_b, word_a, pos_a, rule_id in tqdm(all_connections):
    if (word_a, pos_a) == (word_b, pos_b):
        continue
    derived_from[(word_a, pos_a)].append((word_b, pos_b, rule_id))

  0%|          | 0/36 [00:00<?, ?it/s]

In [99]:
len(derived_from)

30

In [103]:
roots = []

for pos_a in pos_all:
    for word_a in words_by_pos[pos_a]:
        if not derived_from[(word_a, pos_a)]:
            roots.append((word_a, pos_a))

In [104]:
len(roots)

18951

# 4. Сохранить всё, что извлеклось

In [None]:
#NOTE: todo!