# Предсказание вида (аспекта) сербских глаголов
## _На материалах UD_Serbian-SET_

Основная идея: в _UD_Serbian-SET_ глагольный вид не размечен, но имеющихся данных и инструментов достаточно, чтобы разметить небольшую часть данных и обучить модель, которая с высокой точностью предсказывает вид (почти) для всей генеральной совокупности.

### Импорты, настройки, глобальные переменные

In [1]:
import pandas as pd
import csv
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score


from lightgbm import LGBMClassifier
import numpy as np
from gensim.models import Word2Vec

import torch
from itertools import product
from tqdm import tqdm
import re

from sklearn.model_selection import train_test_split
import numpy as np

import datetime
from nltk import word_tokenize

In [2]:
class Log:
    """Append-only, delimiter-separated experiment log.

    On construction the file is created with a header row built from
    ``cols`` if it does not exist yet; an existing file is left untouched.
    Each call to :meth:`write` appends one row.
    """

    # Header written to a freshly created log file.
    cols = ["experiment_name", "experiment_group", "model_name", "PCA", "F1_cv", "F1_holdout", "F1_unkn"]

    def __init__(self, fname: Path, sep: str):
        """Remember the target file and separator; create the file if missing.

        Args:
            fname: Path of the log file (a plain string is accepted as well).
            sep: Separator placed between the values of a row.
        """
        # Normalise so that ``.exists()`` works even when a str is passed.
        self.fname = Path(fname)
        self.sep = sep

        if not self.fname.exists():
            with open(self.fname, "w", encoding="utf-8") as file:
                file.write(self.sep.join(self.cols) + "\n")

    def write(self, vals):
        """Append one row to the log.

        Args:
            vals: Iterable of row values. Every value is converted with
                ``str`` first, so numbers (e.g. F1 scores), booleans and
                ``None`` can be passed directly; ``str.join`` alone would
                raise ``TypeError`` on them.
        """
        with open(self.fname, "a", encoding="utf-8") as file:
            file.write(self.sep.join(str(val) for val in vals) + "\n")

In [3]:
# Placeholder string used to replace missing values in the dataset.
PAD_TOK = "[PAD]"
# Seed shared by the data splits and the torch / numpy generators.
RANDOM_STATE = 42

# Input data is expected in ./data relative to the current working directory.
DATADIR = Path.cwd() / "data"
DATAPATH = DATADIR / "datasetForAnalysis.csv"

# Log file name derived from the moment this cell is executed
# (12-hour clock with AM/PM, minute resolution, full month name).
LOGNAME_TMPL = datetime.datetime.now().strftime("%I-%M%p on %B %d %Y.csv")
# Column separator used for the experiment log.
SEP = "\t"

%matplotlib inline
# Colour map name; presumably intended for seaborn plots, which are not part
# of the visible notebook section -- TODO confirm it is still needed.
SNS_COLOR = "coolwarm"

# Seed the torch and numpy global random generators for reproducibility.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [4]:
LOG = Log(DATADIR / LOGNAME_TMPL, SEP)

### Загрузка и подготовка датасета

In [5]:
# Load the tab-separated dataset. Quote characters are treated as ordinary
# text (QUOTE_NONE), the first column becomes the index, and every column is
# read as a string.
df = pd.read_csv(
    DATAPATH,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    index_col=0,
    dtype=str
)

In [6]:
# Remove the "POS" and "db_id" columns; they are not used further on.
df = df.drop(["POS", "db_id"], axis=1)

In [7]:
# Replace every missing value with the padding placeholder so that all cells
# are strings.
df = df.fillna(PAD_TOK)

In [8]:
# Label columns: these are split off from the feature frames wherever
# `drop(target_vars, axis=1)` / `[target_vars]` is used below.
target_vars = [
    "aspect",
    "disambig",
]

# NOTE(review): dict_vars, grapheme_vars, grammar_vars and text_vars are not
# referenced in the visible part of the notebook; the groupings below are
# described by their column names only.

# Lemma and surface form of the verb.
dict_vars = [
    "lemma",
    "word",
]

# Presumably characters/graphemes to the left (l_) and right (r_) -- TODO
# confirm against the dataset builder.
grapheme_vars = [
    "l_gr_1",
    "l_gr_2",
    "l_gr_3",
    "r_gr_1",
    "r_gr_2",
    "r_gr_3",
]

# Morphological features and dependency counts.
grammar_vars = [
    "Gender",
    "Mood",
    "Number",
    "Person",
    "Tense",
    "VerbForm",
    "Voice",
    "nsubj_count",
    "obj_count",
    "obl_count",
    "advmod_count",
    "polarity",
]

# Free-text columns; "lemma" and "word" also appear in dict_vars.
text_vars = [
    "lemma",
    "word",
    "text",
    "l_tok_3",
    "l_tok_2",
    "l_tok_1",
    "r_tok_1",
    "r_tok_2",
    "r_tok_3",
    "nsubj",
    "obj",
    "obl",
    "advmod",
    "polarity_word",
]

In [9]:
# Sample of lemmas the classifier will know nothing about: the split is done
# over unique lemmas, so the held-out lemmas never occur in the training data.
lemmas_train, lemmas_test = train_test_split(
    df["lemma"].unique(),
    test_size=0.094,
    random_state=RANDOM_STATE,
    shuffle=True
)
# Displayed as the cell output to check the sizes of both lemma sets.
lemmas_train.shape, lemmas_test.shape

((281,), (30,))

In [10]:
# Rows whose lemma is in the "known" set form the training pool; rows whose
# lemma was held out form the unknown-lemma test set.
_known_lemma_rows = df["lemma"].isin(lemmas_train)
_held_out_lemma_rows = df["lemma"].isin(lemmas_test)

train_df = df.loc[_known_lemma_rows]
test_df_unkn = df.loc[_held_out_lemma_rows]
train_df.shape, test_df_unkn.shape

((4409, 34), (550, 34))

In [11]:
# Separate the unknown-lemma test set into features and labels.
X_test_unkn = test_df_unkn.drop(columns=target_vars)
y_test_unkn = test_df_unkn[target_vars]
X_test_unkn.shape, y_test_unkn.shape

((550, 32), (550, 2))

In [12]:
# Split the known-lemma rows into the final training set and a hold-out set
# (same lemmas may occur on both sides, unlike the unknown-lemma test set).
_known_features = train_df.drop(columns=target_vars)
_known_labels = train_df[target_vars]

X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(
    _known_features,
    _known_labels,
    test_size=0.1247,
    random_state=RANDOM_STATE,
    shuffle=True,
)
X_train.shape, X_test_holdout.shape, y_train.shape, y_test_holdout.shape

((3859, 32), (550, 32), (3859, 2), (550, 2))

### Модели

In [13]:
# get_lgbm = lambda: LGBMClassifier(
#     random_state = RANDOM_STATE,
#     learning_rate = 0.1,
#     n_estimators = 100,
#     max_depth = 7,
#     num_leaves = 30,
#     subsample = 0.6,
#     verbose = -1
# )

### Текстовые фичи

In [14]:
class Encoder:
    """Encode a text as the mean of the Word2Vec vectors of its tokens."""

    def __init__(self, path_to_model):
        """Load a saved gensim ``Word2Vec`` model.

        Args:
            path_to_model: Path passed to ``Word2Vec.load``.
        """
        self.model = Word2Vec.load(path_to_model)
        # Take the dimensionality from the model itself instead of looking up
        # a hard-coded probe word, which fails with KeyError whenever that
        # word is not in the vocabulary.
        self.ndim = self.model.wv.vector_size

    def encode(self, text: str):
        """Return the average vector of all in-vocabulary tokens of ``text``.

        The text is split with ``word_tokenize``; tokens missing from the
        model vocabulary are skipped. If no token is known (or the text is
        empty) a zero vector of length ``self.ndim`` is returned.

        NOTE(review): placeholder strings are tokenized like ordinary text;
        confirm that none of the resulting pieces are in the vocabulary.
        """
        keyed_vectors = self.model.wv

        vecs = [
            keyed_vectors.get_vector(token)
            for token in word_tokenize(text)
            if token in keyed_vectors.key_to_index
        ]

        if not vecs:
            return np.zeros((self.ndim,))

        return np.mean(vecs, axis=0)


In [15]:
encoder = Encoder(str(Path.cwd() / "SrW2V" / "TeslaW2V"))

In [16]:
# Context columns joined (in this order) into one string before encoding.
_CTX_COLUMNS = ["l_tok_3", "l_tok_2", "l_tok_1", "r_tok_1", "r_tok_2", "r_tok_3"]


def _encode_words_and_ctx(frame):
    """Build one feature vector per row of ``frame``.

    Each vector is the encoding of the row's "word" followed by the encoding
    of its six context tokens joined with spaces. A single helper is used for
    all splits so that their feature layout cannot diverge.

    Returns:
        A list of 1-D numpy arrays, one per row, in row order.
    """
    return [
        np.append(
            encoder.encode(row["word"]),
            encoder.encode(" ".join(row[col] for col in _CTX_COLUMNS)),
        )
        for _, row in frame.iterrows()
    ]


X_train_words_and_ctx_encoded = _encode_words_and_ctx(X_train)

X_test_holdout_words_and_ctx_encoded = _encode_words_and_ctx(X_test_holdout)

X_test_unkn_words_and_ctx_encoded = _encode_words_and_ctx(X_test_unkn)

In [17]:
def custom_gridsearch(model: "type[LGBMClassifier]", param_grid, X_train, y_train, X_test, y_test):
    """Exhaustively search ``param_grid`` and keep the best weighted F1.

    For every combination of the grid values a fresh estimator is built with
    ``model(**params)``, fitted on the training data and scored on the
    evaluation data with weighted F1.

    Args:
        model: Estimator class (or any factory) called with the parameters as
            keyword arguments; the result must offer ``fit`` and ``predict``.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train, y_train: Data the estimator is fitted on.
        X_test, y_test: Data the estimator is scored on.

    Returns:
        ``(best_params, best_metric)``. On ties the earliest combination is
        kept. ``best_params`` is ``None`` only if the grid yields no
        combination at all, in which case ``best_metric`` is ``0.0``.
    """
    keys = list(param_grid)
    # Materialised so that tqdm knows the total number of iterations.
    combinations = list(product(*param_grid.values()))

    best_metric = 0.0
    best_params = None

    for combination in tqdm(combinations):
        params = dict(zip(keys, combination))

        cur_model = model(**params)
        cur_model.fit(X_train, y_train)
        preds = cur_model.predict(X_test)

        f1_wtd = f1_score(y_test, preds, average="weighted")

        # Always accept the first evaluated combination so that a run in which
        # every score is 0.0 still reports parameters instead of None.
        if best_params is None or f1_wtd > best_metric:
            best_metric = f1_wtd
            best_params = params

    return best_params, best_metric

In [23]:
# Search space for the grid search: every combination of the listed values is
# evaluated (1 * 3 * 4 * 3 * 3 * 1 = 108 fits).
# NOTE(review): in LightGBM, `subsample` is presumably only applied when
# `subsample_freq` > 0; if so, the three subsample values give identical
# models -- confirm against the LightGBM parameter documentation.
param_grid = {
    "learning_rate": [0.05],
    # Ranges tried earlier:
    # "n_estimators": [30, 50, 100,],
    # "n_estimators": [100, 150, 200],
    "n_estimators": [80, 90, 100],
    "max_depth": [6, 7, 8, 9,],
    "num_leaves": [20, 30, 40],
    "subsample": [0.6, 0.8, 1.0],
    "verbose": [-1]
}

In [24]:
# Run the grid search: fit on the training features and score each candidate
# on the unknown-lemma test set, using the "aspect" label only.
# NOTE(review): the hyperparameters are selected on the same unknown-lemma
# set that is later reported as the final unknown-lemma F1, so that score is
# optimistically biased; consider tuning with cross-validation on the
# training data instead.
result = custom_gridsearch(
    LGBMClassifier,
    param_grid,
    X_train_words_and_ctx_encoded,
    y_train["aspect"],
    X_test_unkn_words_and_ctx_encoded,
    y_test_unkn["aspect"]
)

# Displayed as the cell output: (best parameters, best weighted F1).
result

100%|██████████| 108/108 [04:11<00:00,  2.33s/it]


({'learning_rate': 0.05,
  'n_estimators': 90,
  'max_depth': 7,
  'num_leaves': 30,
  'subsample': 0.6,
  'verbose': -1},
 0.9527626446855212)

In [25]:
# Final classifier, configured with the best parameter combination reported
# by the grid search above.
# NOTE(review): RANDOM_STATE is not passed to the classifier.
lgbm = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=90,
    max_depth=7,
    num_leaves=30,
    subsample=0.6,
    verbose=-1,
)

In [26]:
# Train the final classifier on the encoded training rows, predicting the
# "aspect" label.
lgbm.fit(
    X_train_words_and_ctx_encoded,
    y_train["aspect"],
)

In [27]:
# Predictions for rows with lemmas seen in training (hold-out split).
preds_holdout = lgbm.predict(
    X_test_holdout_words_and_ctx_encoded
)

# Predictions for rows whose lemmas were excluded from training.
preds_unknown = lgbm.predict(
    X_test_unkn_words_and_ctx_encoded
)

In [29]:
# Weighted F1 on the hold-out split and on the unknown-lemma split,
# displayed as a tuple in that order.
(
f1_score(y_test_holdout["aspect"], preds_holdout, average="weighted"),
f1_score(y_test_unkn["aspect"], preds_unknown, average="weighted")
)

(0.983568342677472, 0.9527626446855212)

In [35]:
def _row_to_features(row):
    """Encode a row as [word vector, context vector] concatenated."""
    context = " ".join([
        row["l_tok_3"],
        row["l_tok_2"],
        row["l_tok_1"],
        row["r_tok_1"],
        row["r_tok_2"],
        row["r_tok_3"],
    ])
    word_vec = encoder.encode(row["word"])
    context_vec = encoder.encode(context)
    return np.append(word_vec, context_vec)


# Store one feature vector per row of the full dataset in column "X".
df['X'] = df.apply(_row_to_features, axis=1)

In [42]:
# Predict the aspect for every row of the full dataset.
# NOTE(review): df also contains the rows the model was trained on, so the
# agreement between "aspect" and "y_bar" computed from this column is not an
# out-of-sample estimate.
df["y_bar"] = lgbm.predict(df["X"].to_list())

In [44]:
df[["word", "aspect", "y_bar"]]

Unnamed: 0,word,aspect,y_bar
0,baca,imp,imp
1,bacaju,imp,imp
2,bacaju,imp,imp
3,bacaju,imp,imp
4,bacala,imp,imp
...,...,...,...
4954,živimo,imp,imp
4955,živimo,imp,imp
4956,živimo,imp,imp
4957,živimo,imp,imp


In [52]:
# Rows where the predicted aspect differs from the annotated one.
_err_mask = df["aspect"] != df["y_bar"]
_err_cols = ["word", "lemma", "aspect", "y_bar"]
df_err = df.loc[_err_mask, _err_cols]
df_err.shape

(35, 4)

In [53]:
# Rows where the predicted aspect matches the annotated one.
_true_mask = df["aspect"] == df["y_bar"]
_true_cols = ["word", "lemma", "aspect", "y_bar"]
df_true = df.loc[_true_mask, _true_cols]
df_true.shape

(4924, 4)

In [54]:
df_err

Unnamed: 0,word,lemma,aspect,y_bar
92,daju,davati,imp,perf
402,držao,držati,imp,perf
403,držao,držati,imp,perf
889,iznosila,iznositi,imp,perf
892,iznosiće,iznositi,imp,perf
1772,obećajmo,obećati,imp,perf
1958,olakšaće,olakšati,perf,imp
2008,organizovati,organizovati,both,perf
2608,Posetili,posetiti,perf,imp
2649,postaju,postati,perf,imp
