# Предсказание вида (аспекта) сербских глаголов
## _На материалах UD_Serbian-SET_

Основная идея: в _UD_Serbian-SET_ глагольный вид не размечен, но существующих данных и инструментов достаточно, чтобы вручную разметить небольшую выборку и обучить на ней модель, которая с высокой точностью предскажет вид (почти) для всей генеральной совокупности.

### Импорты, настройки, глобальные переменные

In [1]:
import pandas as pd
import csv
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.model_selection import FeatureImportances

import numpy as np
from gensim.models import Word2Vec

import torch
from itertools import product
from tqdm import tqdm
import re

from sklearn.model_selection import train_test_split
import numpy as np

import datetime
from nltk import word_tokenize

In [2]:
class Log:
    """Append-only, delimiter-separated log of experiment results.

    On construction the file is created with a header row built from
    ``cols`` if it does not exist yet. An existing file is left untouched,
    so rows keep being appended to it.
    """

    # Header columns, written once when the log file is first created.
    cols = ["experiment_name", "experiment_group", "model_name", "PCA", "F1_cv", "F1_holdout", "F1_unkn"]

    def __init__(self, fname: Path, sep: str):
        """Remember the target file and separator; create the file if missing.

        Args:
            fname: Location of the log file (a plain string is accepted too).
            sep: Separator placed between the values of every row.
        """
        # Coerce so that a ``str`` path works as well as a ``Path``.
        self.fname = Path(fname)
        self.sep = sep

        if not self.fname.exists():
            with open(self.fname, "w", encoding="utf-8") as file:
                file.write(self.sep.join(self.cols) + "\n")

    def write(self, vals):
        """Append one row to the log file.

        Args:
            vals: Iterable with the values of a single row. Each value is
                converted with ``str`` first, so numbers and booleans can
                be passed directly.
        """
        row = self.sep.join(str(val) for val in vals)
        with open(self.fname, "a", encoding="utf-8") as file:
            file.write(row + "\n")

In [3]:
# Placeholder string for missing values.
PAD_TOK = "[PAD]"
# Single seed shared by everything in this notebook that takes one.
RANDOM_STATE = 42

# Input data is looked up relative to the current working directory.
DATADIR = Path.cwd() / "data"
DATAPATH = DATADIR / "datasetForAnalysis.csv"

# Despite the name this is an already formatted file name, not a template.
# NOTE(review): the timestamp uses a 12-hour clock with minute resolution, so
# two runs started within the same minute get the same name.
LOGNAME_TMPL = datetime.datetime.now().strftime("%I-%M%p on %B %d %Y.csv")
# Field separator (tab).
SEP = "\t"

%matplotlib inline
# Colour map name; presumably meant for seaborn plots — confirm where it is used.
SNS_COLOR = "coolwarm"

# Seed the torch and numpy global random generators.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any a library might raise about deprecated calls or training
# problems — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [4]:
# Experiment log for this session, stored in the data directory.
LOG = Log(DATADIR / LOGNAME_TMPL, SEP)

### Загрузка и подготовка датасета

In [5]:
# Load the tab-separated dataset. Quote characters are treated as ordinary
# text (QUOTE_NONE), the first column becomes the index, and every column is
# read as a string.
df = pd.read_csv(
    DATAPATH,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    index_col=0,
    dtype=str
)

In [6]:
# Remove the "POS" and "db_id" columns.
df = df.drop(["POS", "db_id"], axis=1)

In [7]:
# Replace every missing value with the padding placeholder string.
df = df.fillna(PAD_TOK)

In [8]:
# Named groups of dataset columns.

# Label columns.
target_vars = [
    "aspect",
    "disambig",
]

# Lemma and word-form columns (both are also listed in text_vars below).
dict_vars = [
    "lemma",
    "word",
]

# Presumably graphemes from the left/right edge of the word — TODO confirm.
grapheme_vars = [
    "l_gr_1",
    "l_gr_2",
    "l_gr_3",
    "r_gr_1",
    "r_gr_2",
    "r_gr_3",
]

# Morphological categories plus *_count and polarity columns.
grammar_vars = [
    "Gender",
    "Mood",
    "Number",
    "Person",
    "Tense",
    "VerbForm",
    "Voice",
    "nsubj_count",
    "obj_count",
    "obl_count",
    "advmod_count",
    "polarity",
]

# Columns holding text: lemma, word, "text", l_tok_*/r_tok_* tokens and the
# nsubj/obj/obl/advmod/polarity_word columns.
text_vars = [
    "lemma",
    "word",
    "text",
    "l_tok_3",
    "l_tok_2",
    "l_tok_1",
    "r_tok_1",
    "r_tok_2",
    "r_tok_3",
    "nsubj",
    "obj",
    "obl",
    "advmod",
    "polarity_word",
]

In [9]:
# Sample of lemmas that the classifier will know nothing about.
# The split is made over unique lemmas, not over rows, so each lemma ends up
# in exactly one of the two arrays.
lemmas_train, lemmas_test = train_test_split(
    df["lemma"].unique(),
    test_size=0.094,
    random_state=RANDOM_STATE,
    shuffle=True
)
# Display the sizes of both lemma arrays as the cell output.
lemmas_train.shape, lemmas_test.shape

((281,), (30,))

In [10]:
# Route rows by lemma: rows whose lemma is in lemmas_train go to train_df,
# rows whose lemma is in lemmas_test form the unseen-lemma test frame.
train_df      = df[df["lemma"].isin(lemmas_train)]
test_df_unkn  = df[df["lemma"].isin(lemmas_test )]
# Display both frame shapes as the cell output.
train_df.shape, test_df_unkn.shape

((4409, 34), (550, 34))

In [11]:
# Separate label columns from the remaining columns for the unseen-lemma set.
y_test_unkn = test_df_unkn[target_vars]
X_test_unkn = test_df_unkn.drop(target_vars, axis=1)
# Display both shapes as the cell output.
X_test_unkn.shape, y_test_unkn.shape

((550, 32), (550, 2))

In [12]:
# Row-level split of the known-lemma frame into a training part and a
# hold-out part; lemmas can occur on both sides of this split.
# NOTE(review): the test_size value was presumably tuned so the hold-out has
# as many rows as the unseen-lemma set — confirm.
X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(
    train_df.drop(target_vars, axis=1),
    train_df[target_vars],
    test_size=0.1247,
    random_state=RANDOM_STATE,
    shuffle=True
)
# Display all four shapes as the cell output.
X_train.shape, X_test_holdout.shape, y_train.shape, y_test_holdout.shape

((3859, 32), (550, 32), (3859, 2), (550, 2))

### Модели

In [13]:
def get_lgbm():
    """Return a new, unfitted LightGBM classifier with fixed hyperparameters."""
    # NOTE(review): LightGBM documents that row subsampling is only applied
    # when ``subsample_freq`` is positive; it is not set here, so
    # ``subsample`` may have no effect — confirm the intent.
    params = dict(
        random_state=RANDOM_STATE,
        learning_rate=0.1,
        n_estimators=100,
        max_depth=7,
        num_leaves=30,
        subsample=0.6,
        verbose=-1,
    )
    return LGBMClassifier(**params)


def get_nn():
    """Return a new, unfitted MLP classifier with three hidden layers."""
    params = dict(
        random_state=RANDOM_STATE,
        hidden_layer_sizes=(100, 50, 25),
        alpha=0.0001,
        warm_start=False,
        max_iter=200,
    )
    return MLPClassifier(**params)


### Текстовые фичи

In [14]:
class Encoder:
    """Turn a text into one fixed-size vector by averaging Word2Vec embeddings."""

    def __init__(self, path_to_model):
        """Load the Word2Vec model stored at ``path_to_model``.

        Args:
            path_to_model: Path passed to ``Word2Vec.load``.
        """
        self.model = Word2Vec.load(path_to_model)
        # Read the dimensionality from the model itself instead of looking up
        # a specific word, which would fail if that word is not in the
        # vocabulary.
        self.ndim = self.model.wv.vector_size

    def encode(self, text: str):
        """Return the mean embedding of the in-vocabulary tokens of ``text``.

        Args:
            text: Raw text; it is split with ``word_tokenize``.

        Returns:
            A 1-D array of length ``ndim``: the element-wise mean of the
            vectors of all tokens found in the vocabulary, or all zeros when
            no token is known.
        """
        keyed_vectors = self.model.wv

        vecs = [
            keyed_vectors.get_vector(token)
            for token in word_tokenize(text)
            # Out-of-vocabulary tokens are skipped rather than raising.
            if token in keyed_vectors.key_to_index
        ]

        if not vecs:
            return np.zeros((self.ndim,))
        return np.mean(vecs, axis=0)


In [15]:
# Load the Word2Vec model from SrW2V/TeslaW2V under the working directory.
encoder = Encoder(str(Path.cwd() / "SrW2V" / "TeslaW2V"))

In [16]:
def _encode_words_and_ctx(frame, text_encoder):
    """Build one feature vector per row of ``frame``.

    Each vector is the encoding of the row's "word" followed by the encoding
    of its six context columns (l_tok_3 .. r_tok_3) joined with spaces.

    Args:
        frame: DataFrame with the "word" and l_tok_*/r_tok_* columns.
        text_encoder: Object with an ``encode(text)`` method returning a
            1-D array.

    Returns:
        A list with one 1-D array per row, in row order.
    """
    # NOTE(review): missing values were filled with the "[PAD]" placeholder
    # earlier in the notebook, so placeholders are passed to the encoder as
    # ordinary text — confirm that they do not contribute to the vectors.
    ctx_cols = ("l_tok_3", "l_tok_2", "l_tok_1", "r_tok_1", "r_tok_2", "r_tok_3")
    return [
        np.append(
            text_encoder.encode(row["word"]),
            text_encoder.encode(" ".join(row[col] for col in ctx_cols)),
        )
        for _, row in frame.iterrows()
    ]


# The three splits are encoded by the same routine so their features match.
X_train_words_and_ctx_encoded = _encode_words_and_ctx(X_train, encoder)

X_test_holdout_words_and_ctx_encoded = _encode_words_and_ctx(X_test_holdout, encoder)

X_test_unkn_words_and_ctx_encoded = _encode_words_and_ctx(X_test_unkn, encoder)

In [17]:
import timeit
from pprint import pprint
from tqdm import tqdm

In [18]:
def time_stuff(get_model, training_data, y, testing_data, reps=15):
    """Measure how long a model takes to fit and to predict.

    Args:
        get_model: Zero-argument callable returning a new model that has
            ``fit`` and ``predict`` methods.
        training_data: Features passed to ``fit``.
        y: Targets passed to ``fit``.
        testing_data: Features passed to ``predict``.
        reps: Number of fit/predict rounds to time.

    Returns:
        A tuple ``(mean fit time, total fit time, mean predict time,
        total predict time, list of fit times, list of predict times)``.
    """
    fit_seconds = []
    predict_seconds = []

    for _ in tqdm(range(reps)):
        # A new model is requested for every round.
        estimator = get_model()

        def fit_once():
            return estimator.fit(training_data, y)

        def predict_once():
            return estimator.predict(testing_data)

        # Each call is executed exactly once per round.
        fit_seconds.append(timeit.timeit(fit_once, number=1))
        predict_seconds.append(timeit.timeit(predict_once, number=1))

    mean_fit = np.mean(fit_seconds)
    mean_predict = np.mean(predict_seconds)
    return (
        mean_fit,
        sum(fit_seconds),
        mean_predict,
        sum(predict_seconds),
        fit_seconds,
        predict_seconds,
    )


In [19]:
# Time LightGBM: fit on the training and hold-out vectors concatenated into
# one list (with the matching "aspect" labels), predict on the unseen-lemma
# vectors.
time_stuff(
    get_lgbm,
    X_train_words_and_ctx_encoded + X_test_holdout_words_and_ctx_encoded,
    y_train["aspect"].to_list()  + y_test_holdout["aspect"].to_list(),
    X_test_unkn_words_and_ctx_encoded,
)

100%|██████████| 15/15 [00:23<00:00,  1.56s/it]


(1.5564234760666296,
 23.346352140999443,
 0.00498708953333941,
 0.07480634300009115,
 [0.7217053109998233,
  0.6762737249991915,
  0.6897834709998278,
  0.7088830280008551,
  0.7081631349992676,
  0.6819829989999562,
  0.6703786260004563,
  0.6680941309987247,
  1.033948424999835,
  2.1370755749994714,
  2.4300957750001544,
  3.3352648680011043,
  3.7130213950003963,
  2.6186007790001895,
  2.5530808980001893],
 [0.003052003999982844,
  0.0025176130002364516,
  0.0028608060001715785,
  0.0026571369999146555,
  0.0027851130016642855,
  0.0025215839996235445,
  0.002404286999080796,
  0.0024199540002882713,
  0.004498251999393688,
  0.005900101999941398,
  0.007587379001051886,
  0.007326885999646038,
  0.010154820000025211,
  0.008883163000064087,
  0.009237242999006412])

In [20]:
# Time the MLP on the same data: fit on the training and hold-out vectors
# concatenated into one list (with the matching "aspect" labels), predict on
# the unseen-lemma vectors.
time_stuff(
    get_nn,
    X_train_words_and_ctx_encoded + X_test_holdout_words_and_ctx_encoded,
    y_train["aspect"].to_list()  + y_test_holdout["aspect"].to_list(),
    X_test_unkn_words_and_ctx_encoded,
)

100%|██████████| 15/15 [01:11<00:00,  4.74s/it]


(4.732104733000475,
 70.98157099500713,
 0.0063410403334273726,
 0.09511560500141059,
 [12.640707497999756,
  4.109504706000735,
  3.413406706000387,
  4.324687326999992,
  3.4708761680012685,
  3.5583505110007536,
  3.5405135310011246,
  3.354182158000185,
  3.6469912600005046,
  3.7632809639999323,
  4.080342920000476,
  5.417400376001751,
  5.784636194000996,
  4.80293185499977,
  5.073758820999501],
 [0.03155961800075602,
  0.0030735479995200876,
  0.003265133000240894,
  0.0031120410003495635,
  0.0032880660000955686,
  0.004299366999475751,
  0.0039892570002848515,
  0.006010225999489194,
  0.0030599170004279586,
  0.0031263689998013433,
  0.007180803000665037,
  0.005294742999467417,
  0.002995589999045478,
  0.004972328000803827,
  0.009888599000987597])

In [21]:
# NOTE(review): unconditional raise — presumably a deliberate stop so that
# "Run All" does not execute the cells below; confirm.
raise ValueError

ValueError: 