In [None]:
import pandas as pd
from tqdm.notebook import tqdm

from tg.grammar_ru.features import PyMorphyFeaturizer

In [None]:
from tg.grammar_ru import Loc

# Archive file names, resolved against Loc.corpus_path below.
# TODO: decide whether further corpora should be added.
CORPUS_NAMES = [
    "books.base.zip",
    "pub.base.zip",
    "lenta.base.zip",
]

# Full path of every archive listed above.
CORPUS_LIST = [Loc.corpus_path / archive_name for archive_name in CORPUS_NAMES]

In [None]:
from yo_fluq_ds import *
from tg.grammar_ru import Separator
from tg.grammar_ru.corpus.corpus_reader import CorpusReader

In [None]:
viewer = Separator.Viewer()

# Read the frames of all corpora (with a progress bar) and map each frame
# through viewer.to_text; the results are materialised into a list.
texts = list(
    CorpusReader
    .read_frames_from_several_corpora(CORPUS_LIST)
    .feed(fluq.with_progress_bar(console=None))
    .select(viewer.to_text)
)

In [None]:
import jsonlines

# Cache the extracted texts on disk so the corpora need not be re-read.
with jsonlines.open('texts.jsonl', 'w') as writer:
    writer.write_all(texts)

In [None]:
import jsonlines

# Load the cached texts back from the JSON Lines file.
with jsonlines.open('texts.jsonl') as reader:
    texts2 = list(reader)

In [None]:
import itertools

# Flatten the texts into a single list holding every line of every text.
sents = [line for text in texts2 for line in text.splitlines()]

In [None]:
import random

# Fixed seed: the bundle below is built from the first lines of `sents`, so an
# unseeded shuffle would select a different sample on every fresh run.
SHUFFLE_SEED = 42

# A dedicated generator leaves the global `random` state untouched.
# NOTE: the shuffle is in place; run this cell once after `sents` is built.
random.Random(SHUFFLE_SEED).shuffle(sents)

In [None]:
# Total number of text lines collected from the cached texts.
len(sents)

In [None]:
from tg.grammar_ru.features import PyMorphyFeaturizer

# Build the bundle from the first 100 000 (shuffled) lines only and run the
# pymorphy featurizer over them.
db = Separator.build_bundle(
    sents[:100_000],
    [PyMorphyFeaturizer()],
)

In [None]:
# Attach the pymorphy columns to the word table. Only columns that `db.src`
# does not have yet are joined, so re-running this cell is a no-op instead of
# raising on overlapping column names (db.src is overwritten with the result).
new_pymorphy_cols = [col for col in db.pymorphy.columns if col not in db.src.columns]
if new_pymorphy_cols:
    db.src = db.src.join(db.pymorphy[new_pymorphy_cols], on='word_id')
db.src.head()

In [None]:
# Summary of the rows tagged 'ADJS' in the pymorphy frame.
db.pymorphy.loc[db.pymorphy.POS.eq('ADJS')].info()

In [None]:
# Summary of the rows tagged 'ADJF' in the pymorphy frame.
db.pymorphy.loc[db.pymorphy.POS.eq('ADJF')].info()

In [None]:
import numpy as np

# Number of neighbouring positions taken on each side of the central word.
WINDOW_SIZE = 4

# Morphological attributes copied from every word inside the window.
features = [
    'POS',
    'gender', 'number', 'case', 'animacy',
    'aspect', 'transitivity', 'person', 'tense', 'mood', 'voice', 'involvement',
]

def get_offset_word_feats(word_row, offset, feature_names=None) -> dict:
    """Collect the features of one window word under offset-suffixed keys.

    Args:
        word_row: Row of the word table (anything indexable by feature name,
            e.g. a ``pd.Series`` or a dict).
        offset: Position of the word relative to the central word; it becomes
            the suffix of every produced key.
        feature_names: Names of the values to copy from ``word_row``. When
            omitted, the notebook-level ``features`` list is used.

    Returns:
        Dict with one ``'<feature>_<offset>'`` entry per feature (in the order
        of the feature list) followed by the constant marker entry
        ``'OFFSET_<offset>'``.
    """
    # The global is only looked up when the caller does not pass its own list.
    names = features if feature_names is None else feature_names
    result = {f'{name}_{offset}': word_row[name] for name in names}
    result[f'OFFSET_{offset}'] = f'OFFSET_{offset}'
    return result

def get_empty_feats(offset, feature_names=None) -> dict:
    """Build the placeholder features for a window position that has no word.

    Args:
        offset: Position relative to the central word; it becomes the suffix
            of every produced key.
        feature_names: Names of the features to emit placeholders for. When
            omitted, the notebook-level ``features`` list is used.

    Returns:
        Dict with one ``'<feature>_<offset>'`` entry per feature, each set to
        NaN, followed by the constant marker entry ``'OFFSET_<offset>'``.
    """
    # The global is only looked up when the caller does not pass its own list.
    names = features if feature_names is None else feature_names
    result = {f'{name}_{offset}': np.nan for name in names}
    result[f'OFFSET_{offset}'] = f'OFFSET_{offset}'
    return result


adj_dataset = []

# One record per adjective: the features of the adjective itself and of the
# words up to WINDOW_SIZE positions to its left and right in the same sentence.
for _, sentence_words in tqdm(db.src.groupby('sentence_id')):
    adjective_mask = sentence_words.POS.isin(['ADJS', 'ADJF'])
    for center_idx in sentence_words.index[adjective_mask]:
        window_record = {}
        for offset in range(-WINDOW_SIZE, WINDOW_SIZE + 1):
            neighbour_idx = center_idx + offset
            if neighbour_idx in sentence_words.index:
                window_record.update(get_offset_word_feats(sentence_words.loc[neighbour_idx], offset))
            else:
                # The position falls outside this sentence: pad with NaN features.
                window_record.update(get_empty_feats(offset))

        adj_dataset.append(window_record)
                

In [68]:
# Turn the per-adjective window records into a table: one row per adjective,
# one column per '<feature>_<offset>' key.
adj_df = pd.DataFrame.from_records(adj_dataset)
adj_df.head()

Unnamed: 0,POS_-4,gender_-4,number_-4,case_-4,animacy_-4,aspect_-4,transitivity_-4,person_-4,tense_-4,mood_-4,...,case_4,animacy_4,aspect_4,transitivity_4,person_4,tense_4,mood_4,voice_4,involvement_4,OFFSET_4
0,,,,,,,,,,,...,,,,,,,,,,OFFSET_4
1,CONJ,,,,,,,,,,...,,,,,,,,,,OFFSET_4
2,,,,,,,,,,,...,ablt,inan,,,,,,,,OFFSET_4
3,,,,,,,,,,,...,,,,,,,,,,OFFSET_4
4,NONE,,,,,,,,,,...,gent,anim,,,,,,,,OFFSET_4


In [69]:
# Replace every NaN (padding and absent attributes) with the explicit
# 'missing' category value.
adj_df = adj_df.fillna('missing')
adj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1368913 entries, 0 to 1368912
Columns: 117 entries, POS_-4 to OFFSET_4
dtypes: object(117)
memory usage: 1.2+ GB


In [71]:
# Store every column as a pandas categorical; memory_usage='deep' makes
# info() report the real memory footprint.
adj_df = adj_df.astype('category')
adj_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1368913 entries, 0 to 1368912
Columns: 117 entries, POS_-4 to OFFSET_4
dtypes: category(117)
memory usage: 152.8 MB


In [72]:
# Register the adjective table in the bundle under the 'adjectives' key and
# display it through the bundle's attribute access.
db.data_frames['adjectives'] = adj_df
db.adjectives

Unnamed: 0,POS_-4,gender_-4,number_-4,case_-4,animacy_-4,aspect_-4,transitivity_-4,person_-4,tense_-4,mood_-4,...,case_4,animacy_4,aspect_4,transitivity_4,person_4,tense_4,mood_4,voice_4,involvement_4,OFFSET_4
0,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
1,CONJ,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
2,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,ablt,inan,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
3,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
4,NONE,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,gent,anim,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368908,NOUN,femn,sing,gent,inan,missing,missing,missing,missing,missing,...,nomn,inan,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
1368909,PRCL,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
1368910,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4
1368911,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,...,missing,missing,missing,missing,missing,missing,missing,missing,missing,OFFSET_4


In [73]:
# Persist the whole bundle (including the 'adjectives' frame added above in
# this notebook) to a zip archive in the working directory.
db.save_as_zip('mytest_db.zip')

In [74]:
# Attributes of the central adjective (offset 0) that serve as prediction targets.
adj_center_predict_features = [
    'gender_0',
    'number_0',
    'case_0',
    'animacy_0',
]
# Attributes of the central adjective that remain available as model input.
adj_center_expect_features = ['POS_0']

In [75]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the same rows land in train/test on every run; without
# it the reported metrics cannot be reproduced.
SPLIT_SEED = 42

train_df, test_df = train_test_split(adj_df, test_size=0.2, random_state=SPLIT_SEED)

In [76]:
# Row counts of the train and test parts.
len(train_df), len(test_df)

(1095130, 273783)

In [77]:
# Suffix carried by every column that describes the central adjective.
# Matching the whole '_0' suffix (rather than only a trailing '0') keeps
# offsets such as '_10' / '_-10' as inputs should the window ever grow.
CENTER_SUFFIX = '_0'

# Inputs: every neighbour column, plus the center columns explicitly allowed.
input_features = [
    col for col in adj_df.columns
    if not col.endswith(CENTER_SUFFIX) or col in adj_center_expect_features
]
# Labels: the center attributes to predict, in adj_df column order.
label_features = [col for col in adj_df.columns if col in adj_center_predict_features]

In [78]:
# Class balance of every target attribute over the whole dataset.
for label_feat in label_features:
    print(label_feat, adj_df[label_feat].value_counts())

gender_0 missing    392113
masc       380459
femn       347360
neut       248981
Name: gender_0, dtype: int64
number_0 sing    976800
plur    392113
Name: number_0, dtype: int64
case_0 gent       541353
nomn       389682
accs       134703
loct       100043
ablt        98429
missing     54627
datv        50076
Name: case_0, dtype: int64
animacy_0 missing    1316083
inan         50389
anim          2441
Name: animacy_0, dtype: int64


In [118]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the target attributes into a dense matrix; the encoder is
# fitted on the training part only.
# NOTE(review): handle_unknown='infrequent_if_exist' presumably keeps
# transform() from failing on unseen values — confirm against the scikit-learn docs.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist')
ohe.fit(train_df[label_features])

In [80]:
# Encode the labels of the full dataset; the result is bound to `_` and not
# used afterwards.
# NOTE(review): looks like a leftover sanity check — confirm it is still needed.
_ = ohe.transform(adj_df[label_features])

In [81]:
import catboost

# Classifier trained on the one-hot label matrix with the 'MultiCrossEntropy'
# objective; F1 is tracked as an additional metric.
# NOTE: task_type='GPU' requests GPU training, so this cell needs a CUDA-capable machine.
model = catboost.CatBoostClassifier(
    objective='MultiCrossEntropy',
    iterations=1000,
    custom_metric='F1',
    task_type='GPU'
)

In [82]:
# Every input column is passed as a categorical feature; the labels are the
# one-hot encoded target attributes.
train_data = catboost.Pool(
    data=train_df[input_features],
    label=ohe.transform(train_df[label_features]),
    cat_features=input_features,
)
test_data = catboost.Pool(
    data=test_df[input_features],
    label=ohe.transform(test_df[label_features]),
    cat_features=input_features,
)

In [83]:
# Train on the train pool, evaluating on the test pool; verbose=50 makes the
# log report every 50th iteration.
model.fit(
    train_data,
    eval_set=test_data,
    verbose=50,
)

Learning rate set to 0.03928
0:	learn: 0.6422572	test: 0.6422182	best: 0.6422182 (0)	total: 506ms	remaining: 8m 25s
50:	learn: 0.2046018	test: 0.2047443	best: 0.2047443 (50)	total: 26.5s	remaining: 8m 13s
100:	learn: 0.1827694	test: 0.1828681	best: 0.1828681 (100)	total: 49.5s	remaining: 7m 20s
150:	learn: 0.1745207	test: 0.1746032	best: 0.1746032 (150)	total: 1m 12s	remaining: 6m 47s
200:	learn: 0.1699278	test: 0.1699888	best: 0.1699888 (200)	total: 1m 38s	remaining: 6m 31s
250:	learn: 0.1667069	test: 0.1667730	best: 0.1667730 (250)	total: 2m 3s	remaining: 6m 8s
300:	learn: 0.1641061	test: 0.1641945	best: 0.1641945 (300)	total: 2m 28s	remaining: 5m 45s
350:	learn: 0.1616468	test: 0.1617722	best: 0.1617722 (350)	total: 2m 53s	remaining: 5m 21s
400:	learn: 0.1594430	test: 0.1596328	best: 0.1596328 (400)	total: 3m 19s	remaining: 4m 57s
450:	learn: 0.1578286	test: 0.1580611	best: 0.1580611 (450)	total: 3m 45s	remaining: 4m 34s
500:	learn: 0.1565404	test: 0.1568086	best: 0.1568086 (500)	to

<catboost.core.CatBoostClassifier at 0x7fc58862d2d0>

In [84]:
# Importance of every input feature of the fitted model, as a table.
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,case_1,26.689132
1,POS_0,18.114269
2,number_1,14.181042
3,gender_1,13.770026
4,POS_-1,7.278599
...,...,...
100,tense_4,0.000000
101,mood_4,0.000000
102,voice_4,0.000000
103,involvement_4,0.000000


In [85]:
# Best metric values recorded during fit, for the learn and validation sets.
model.best_score_

{'learn': {'F1:class=10': 0.8812829239781469,
  'F1:class=12': 0.8812829239781469,
  'F1:class=2': 0.8812829239781469,
  'F1:class=0': 0.8812829239781469,
  'F1:class=9': 0.8812829239781469,
  'F1:class=11': 0.8812829239781469,
  'F1:class=8': 0.8812829239781469,
  'F1:class=3': 0.8812829239781469,
  'F1:class=14': 0.8812829239781469,
  'F1:class=15': 0.8812829239781469,
  'F1:class=13': 0.8812829239781469,
  'F1:class=7': 0.8812829239781469,
  'F1:class=4': 0.8812829239781469,
  'MultiCrossEntropy': 0.14861874275200204,
  'F1:class=6': 0.8812829239781469,
  'F1:class=5': 0.8812829239781469,
  'F1:class=1': 0.8812829239781469},
 'validation': {'F1:class=10': 0.8804150571155337,
  'F1:class=12': 0.8804150571155337,
  'F1:class=2': 0.8804150571155337,
  'F1:class=0': 0.8804150571155337,
  'F1:class=9': 0.8804150571155337,
  'F1:class=11': 0.8804150571155337,
  'F1:class=8': 0.8804150571155337,
  'F1:class=3': 0.8804150571155337,
  'F1:class=14': 0.8804150571155337,
  'F1:class=15': 0.880

синий ['femn' 'sing' 'nomn' 'missing'] синяя
красивая ['femn' 'sing' 'gent' 'missing'] красивой
необычная ['missing' 'plur' None 'missing'] необычные


In [125]:
import pymorphy2
import numpy as np
# Russian morphological analyzer; created once and reused, since it is used
# below to parse and inflect every adjective.
morph = pymorphy2.MorphAnalyzer(lang='ru')

In [131]:
def _adjective_window(words_df, center_idx) -> dict:
    """Gather the features of the word at ``center_idx`` and of its neighbours.

    Offsets run from ``-WINDOW_SIZE`` to ``WINDOW_SIZE``; positions whose index
    label is absent from ``words_df`` get the NaN placeholders.
    """
    window = {}
    for offset in range(-WINDOW_SIZE, WINDOW_SIZE + 1):
        neighbour_idx = center_idx + offset
        if neighbour_idx in words_df.index:
            window.update(get_offset_word_feats(words_df.loc[neighbour_idx], offset))
        else:
            window.update(get_empty_feats(offset))
    return window


def get_adjectives_corrections(text: str) -> list[str | float]:
    """Suggest a corrected form for every adjective of ``text``.

    The text is split into words and annotated with pymorphy features. For each
    word tagged 'ADJS' or 'ADJF' the trained model predicts the target
    attributes from the surrounding window, and the adjective is inflected to
    the predicted grammemes.

    Args:
        text: Text to check.

    Returns:
        One entry per row of the word table: the suggested word form for
        adjectives, NaN for every other word and for adjectives that pymorphy
        cannot inflect to the predicted grammemes.
    """
    text_db = Separator.build_bundle(text, [PyMorphyFeaturizer()])
    text_db.src = text_db.src.join(text_db.pymorphy, on='word_id')

    text_df = text_db.src
    # Object dtype: the column receives strings, which a float NaN column
    # cannot hold without an implicit (deprecated) upcast.
    text_df['suggestion'] = pd.Series(np.nan, index=text_df.index, dtype=object)

    for idx in text_df.index[text_df.POS.isin(['ADJS', 'ADJF'])]:
        window = _adjective_window(text_df, idx)

        # Same preprocessing as for training: fixed column set, NaN -> 'missing'.
        inp = pd.DataFrame([window])[input_features].fillna('missing')
        predicted = model.predict(catboost.Pool(inp, cat_features=input_features))

        # Decode the one-hot prediction back to attribute values; 'missing'
        # and None (nothing decoded for an attribute) are not grammemes.
        grammemes = set(ohe.inverse_transform(predicted)[0]) - {'missing', None}

        parsed = morph.parse(text_df.loc[idx, 'word'])[0]
        inflected = parsed.inflect(grammemes)
        if inflected is None:
            # No such form in the lexeme: leave the suggestion empty rather
            # than crash on `None.word`.
            continue
        text_df.loc[idx, 'suggestion'] = inflected.word

    return text_df.suggestion.tolist()

In [133]:
%%time

# Smoke test on a sentence whose adjectives are in the wrong form; the result
# has one entry per token, with a suggestion only at adjective positions.
get_adjectives_corrections('Синий машина едет по красивая дороге и делает необычная вещи!')

CPU times: user 66.6 ms, sys: 150 µs, total: 66.7 ms
Wall time: 63.8 ms


['синяя', nan, nan, nan, 'красивой', nan, nan, nan, 'необычные', nan, nan]

In [135]:
# Persist the trained model to the working directory.
# NOTE(review): no `format` argument is passed, so '.pth' is only a file name —
# confirm which format the code that loads this file expects.
model.save_model('catboost_adjectives.pth')