# Предсказание вида (аспекта) сербских глаголов
## _На материалах UD_Serbian-SET_

Основная идея: в _UD_Serbian-SET_ глагольный вид не размечен, однако имеющихся данных и инструментов достаточно, чтобы вручную разметить небольшую выборку и обучить модель, которая с высокой точностью предскажет вид (почти) для всей генеральной совокупности.

### Импорты, настройки, глобальные переменные

In [1]:
import pandas as pd
import csv
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.model_selection import FeatureImportances

import numpy as np
from gensim.models import Word2Vec

import torch
from itertools import product
from tqdm import tqdm
import re

from sklearn.model_selection import train_test_split
import numpy as np

import datetime
from nltk import word_tokenize

In [2]:
class Log:
    """Append-only delimited text log of experiment results.

    The file is created with a header row (``cols`` joined by ``sep``) the
    first time a ``Log`` is constructed for a path that does not exist yet.
    An existing file is left untouched, so later rows are appended to it.
    """

    # Header written to a new file; ``write`` is expected to receive one value
    # per column, in this order.
    cols = ["experiment_name", "experiment_group", "model_name", "PCA", "F1_cv", "F1_holdout", "F1_unkn"]

    def __init__(self, fname: Path, sep: str):
        """Remember the target file and separator, creating the file if needed.

        Args:
            fname: Location of the log file (a plain string is accepted too).
            sep: Separator placed between the fields of each row.
        """
        # Normalise to Path so that ``exists``/``parent`` work for str input.
        self.fname = Path(fname)
        self.sep = sep

        if not self.fname.exists():
            # Create missing parent directories so the first run does not fail
            # just because the output folder is absent.
            self.fname.parent.mkdir(parents=True, exist_ok=True)
            with open(self.fname, "w", encoding="utf-8") as file:
                file.write(self.sep.join(self.cols) + "\n")

    def write(self, vals):
        """Append one row to the log.

        Args:
            vals: Iterable of field values. Each one is converted with ``str``,
                so numbers and ``None`` may be passed directly.
        """
        with open(self.fname, "a", encoding="utf-8") as file:
            file.write(self.sep.join(map(str, vals)) + "\n")

In [3]:
# Token substituted for missing values in the dataset (see the fillna cell).
PAD_TOK = "[PAD]"
# Single seed reused for data splits, model factories and the RNGs below.
RANDOM_STATE = 42

# Input data is expected in ./data relative to the current working directory.
DATADIR = Path.cwd() / "data"
DATAPATH = DATADIR / "datasetForAnalysis.csv"

# Log file name built from the notebook start time (12-hour clock, no seconds),
# e.g. "03-15PM on May 01 2024.csv". Runs started within the same minute share
# a name and therefore append to the same file.
LOGNAME_TMPL = datetime.datetime.now().strftime("%I-%M%p on %B %d %Y.csv")
# Field separator used for the log file.
SEP = "\t"

# IPython magic: only valid when executed inside a notebook kernel.
%matplotlib inline
SNS_COLOR = "coolwarm"

# Seed torch and numpy global RNGs for reproducibility.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Shared results log for this session: a file in DATADIR named after the start
# timestamp, with SEP as field separator.
LOG = Log(DATADIR / LOGNAME_TMPL, SEP)

### Вспомогательные функции

In [5]:
def one_hot_encode(X_train: pd.DataFrame, *X_tests: pd.DataFrame):
    """One-hot encode several frames with an encoder fitted on ``X_train`` only.

    Categories that occur only in a test frame are ignored by the encoder
    (``handle_unknown="ignore"``), i.e. they produce all-zero indicator columns.

    Args:
        X_train: Frame whose columns define the categories to encode.
        *X_tests: Any number of further frames with the same columns as
            ``X_train``; each is transformed with the fitted encoder.

    Returns:
        A tuple ``(train, *tests)`` of new dense DataFrames. All of them share
        the same generated column names and carry a default integer index;
        the input frames are not modified.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    # Fit on the training frame first, then reuse the fitted encoder for the rest.
    matrices = [encoder.fit_transform(X_train)]
    matrices.extend(encoder.transform(X_test) for X_test in X_tests)

    # Square brackets (e.g. from the "[PAD]" token) are replaced with "_";
    # presumably some downstream models reject such feature names — TODO confirm.
    new_columns = [
        re.sub(r"\[|\]", "_", col)
        for col in encoder.get_feature_names_out(X_train.columns)
    ]

    return tuple(pd.DataFrame(matrix, columns=new_columns) for matrix in matrices)

In [6]:
def cross_val(model, X, y):
    """Return the mean weighted F1 of ``model`` over a shuffled 5-fold split.

    The fold assignment is seeded with ``RANDOM_STATE``.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_scores = cross_val_score(model, X, y, cv=folds, scoring="f1_weighted")
    return np.mean(fold_scores)

In [7]:
def experiment(
    experiment_name,
    experiment_group,
    get_model,
    X_train,
    X_test_holdout,
    X_test_unkn,
    y_train,
    y_test_holdout,
    y_test_unkn,
    log=None,
    cv=False,
    confusion=False,
    report=False,
    fi=False,
    pca=False
):
    """Train one model, evaluate it on both test sets, print and optionally log.

    Args:
        experiment_name: Label printed in the header and written to the log.
        experiment_group: Group label written to the log.
        get_model: Zero-argument callable returning a fresh, unfitted model.
        X_train, y_train: Training features and labels.
        X_test_holdout, y_test_holdout: First evaluation set.
        X_test_unkn, y_test_unkn: Second evaluation set.
        log: Object with a ``write(list_of_str)`` method, or ``None`` to skip
            logging.
        cv: If true, additionally compute a cross-validated F1 on the
            training data via ``cross_val``.
        confusion: If true, print confusion matrices for both test sets.
        report: If true, print classification reports for both test sets.
        fi: If true, fit and show a ``FeatureImportances`` visualizer.
        pca: Not used for computation; only recorded in the log row.

    Returns:
        None. Results are printed and, when ``log`` is given, appended to it.
    """
    model = get_model()
    model_name = model.__class__.__name__

    model.fit(X_train, y_train)
    preds_holdout = model.predict(X_test_holdout)
    preds_unkn = model.predict(X_test_unkn)

    to_print = [f"\n--- {experiment_name}: {model_name} ---\n"]
    # Sanity check: number of predictions vs. number of holdout labels.
    print(len(preds_holdout), len(y_test_holdout))
    f1_wtd_holdout = f1_score(y_test_holdout, preds_holdout, average="weighted")
    f1_wtd_unkn = f1_score(y_test_unkn, preds_unkn, average="weighted")
    to_print.append(f"F1 weighted (Holdout): {f1_wtd_holdout:.3f}")
    to_print.append(f"F1 weighted (Unknown): {f1_wtd_unkn:.3f}")

    cv_f1 = None
    if cv:
        cv_f1 = cross_val(model, X_train, y_train)
        to_print.append(f"CV F1: {cv_f1:.3f}")

    if confusion:
        to_print.append("Holdout confusion matrix")
        to_print.append(confusion_matrix(y_test_holdout, preds_holdout))
        to_print.append("Unknown confusion matrix")
        to_print.append(confusion_matrix(y_test_unkn, preds_unkn))

    if report:
        to_print.append("Holdout report:")
        to_print.append(
            classification_report(y_test_holdout, preds_holdout, zero_division=1.0)
        )
        to_print.append("Unknown report:")
        to_print.append(
            classification_report(y_test_unkn, preds_unkn, zero_division=1.0)
        )

    for item in to_print:
        print(item, "\n")

    if fi:
        # Show at most the 25 most important features.
        viz = FeatureImportances(model, relative=False, topn=min(X_train.shape[1], 25))
        viz.fit(X_train, y_train)
        viz.show()

    if log is not None:
        # cv_f1 stays None when cv=False; np.round(None) would raise, so a
        # missing score is logged as "nan" instead.
        scores = (cv_f1, f1_wtd_holdout, f1_wtd_unkn)
        log.write(
            [
                str(experiment_name),
                str(experiment_group),
                str(model_name),
                str(pca),
            ]
            + [
                "nan" if score is None else str(np.round(score, decimals=3))
                for score in scores
            ]
        )

### Загрузка и подготовка датасета

In [8]:
# Load the dataset as tab-separated text. Every column is read as a string,
# quote characters are treated as ordinary characters, and the first column
# becomes the index.
df = pd.read_csv(
    DATAPATH,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    index_col=0,
    dtype=str
)

In [9]:
# Discard the two columns that are not used anywhere below.
df = df.drop(columns=["POS", "db_id"])

In [10]:
# Replace every missing value with the padding token.
df = df.fillna(PAD_TOK)

In [11]:
# Columns that hold the labels.
target_vars = ["aspect", "disambig"]

# Columns that hold the verb itself (dictionary form and surface form).
dict_vars = ["lemma", "word"]

# Grapheme columns: positions 1-3 on the left side, then 1-3 on the right side.
grapheme_vars = [f"{side}_gr_{pos}" for side in "lr" for pos in (1, 2, 3)]

# Morphological categories, dependent counts and polarity.
grammar_vars = [
    "Gender", "Mood", "Number", "Person", "Tense", "VerbForm", "Voice",
    *(f"{rel}_count" for rel in ("nsubj", "obj", "obl", "advmod")),
    "polarity",
]

# Columns that contain raw text: the verb, the full text, the token window
# around the verb and its syntactic dependents.
text_vars = [
    *dict_vars,
    "text",
    *(f"l_tok_{dist}" for dist in (3, 2, 1)),
    *(f"r_tok_{dist}" for dist in (1, 2, 3)),
    "nsubj", "obj", "obl", "advmod",
    "polarity_word",
]

In [12]:
# Sample of lemmas that the classifier will know nothing about: the unique
# lemmas are shuffled with a fixed seed and a fraction of them is set aside.
lemmas_train, lemmas_test = train_test_split(
    df["lemma"].unique(),
    test_size=0.094,
    random_state=RANDOM_STATE,
    shuffle=True
)
# Displayed as the cell output.
lemmas_train.shape, lemmas_test.shape

((281,), (30,))

In [13]:
# Split rows by lemma membership: rows whose lemma was set aside form the
# "unknown" test set, all remaining rows are available for training.
train_df      = df[df["lemma"].isin(lemmas_train)]
test_df_unkn  = df[df["lemma"].isin(lemmas_test )]
train_df.shape, test_df_unkn.shape

((4409, 34), (550, 34))

In [14]:
# Separate label columns from feature columns for the unknown-lemma test set.
y_test_unkn = test_df_unkn[target_vars]
X_test_unkn = test_df_unkn.drop(target_vars, axis=1)
X_test_unkn.shape, y_test_unkn.shape

((550, 32), (550, 2))

In [15]:
# Split the known-lemma rows into a training part and a holdout part; the
# holdout rows share lemmas with the training rows. Features and labels are
# split together with a fixed seed.
X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(
    train_df.drop(target_vars, axis=1),
    train_df[target_vars],
    test_size=0.1247,
    random_state=RANDOM_STATE,
    shuffle=True
)
X_train.shape, X_test_holdout.shape, y_train.shape, y_test_holdout.shape

((3859, 32), (550, 32), (3859, 2), (550, 2))

### Бейзлайн

In [16]:
from random_classifier import RandomClassifier

In [17]:
# Baseline run with the project's RandomClassifier. Features are passed as
# lists of (index, row) pairs and labels as plain lists; nothing is written
# to the log (log=None), but CV and the classification reports are printed.
experiment(
    "baseline_random",
    "baseline_random",
    lambda: RandomClassifier(),
    list(X_train.iterrows()),
    list(X_test_holdout.iterrows()),
    list(X_test_unkn.iterrows()),
    y_train["aspect"].to_list(),
    y_test_holdout["aspect"].to_list(),
    y_test_unkn["aspect"].to_list(),
    log=None,
    cv=True,
    confusion=False,
    report=True,
    fi=False,
)

550 550

--- baseline_random: RandomClassifier ---
 

F1 weighted (Holdout): 0.458 

F1 weighted (Unknown): 0.468 

CV F1: 0.453 

Holdout report: 

              precision    recall  f1-score   support

        both       0.03      0.05      0.04        19
         imp       0.48      0.43      0.45       261
        perf       0.48      0.50      0.49       270

    accuracy                           0.45       550
   macro avg       0.33      0.33      0.33       550
weighted avg       0.46      0.45      0.46       550
 

Unknown report: 

              precision    recall  f1-score   support

        both       0.00      1.00      0.00         0
         imp       0.54      0.41      0.47       305
        perf       0.44      0.51      0.47       245

    accuracy                           0.45       550
   macro avg       0.33      0.64      0.31       550
weighted avg       0.49      0.45      0.47       550
 



### Модели

In [18]:
def get_regression():
    """Return a new logistic regression limited to 200 iterations."""
    return LogisticRegression(max_iter=200)


def get_dt():
    """Return a new decision tree seeded with RANDOM_STATE."""
    return DecisionTreeClassifier(random_state=RANDOM_STATE)


def get_lgbm():
    """Return a new LightGBM classifier seeded with RANDOM_STATE, logging silenced."""
    return LGBMClassifier(random_state=RANDOM_STATE, verbose=-1)


def get_nn():
    """Return a new three-hidden-layer perceptron seeded with RANDOM_STATE."""
    return MLPClassifier(
        random_state=RANDOM_STATE,
        hidden_layer_sizes=(100, 50, 25),
        alpha=0.0001,
        max_iter=200,
    )


# Model factories; every experiment is run once per entry.
models = [get_regression, get_dt, get_lgbm, get_nn]

### Эксперименты с графемами

In [19]:
# Select the six grapheme columns (three "l_gr_*" and three "r_gr_*") from
# each of the three splits.
X_train_graphemes          = X_train         [grapheme_vars]
X_test_holdout_graphemes   = X_test_holdout  [grapheme_vars]
X_test_unkn_graphemes      = X_test_unkn     [grapheme_vars]

X_train_graphemes.head()

Unnamed: 0,l_gr_1,l_gr_2,l_gr_3,r_gr_1,r_gr_2,r_gr_3
1898,o,d,r,i,l,a
3998,s,p,r,e,d,e
2987,p,o,š,e,j,u
4256,u,b,i,o,i,b
4506,u,č,e,a,l,a


In [20]:
# Four-grapheme variant: remove the third position on both sides.
X_train_graphemes_2 = X_train_graphemes.drop(columns=["l_gr_3", "r_gr_3"])
X_test_holdout_graphemes_2 = X_test_holdout_graphemes.drop(columns=["l_gr_3", "r_gr_3"])
X_test_unkn_graphemes_2 = X_test_unkn_graphemes.drop(columns=["l_gr_3", "r_gr_3"])

X_train_graphemes_2.head()

Unnamed: 0,l_gr_1,l_gr_2,r_gr_1,r_gr_2
1898,o,d,i,l
3998,s,p,e,d
2987,p,o,e,j
4256,u,b,o,i
4506,u,č,a,l


In [21]:
# Two-grapheme variant: additionally remove the second position on both sides.
X_train_graphemes_1 = X_train_graphemes_2.drop(columns=["l_gr_2", "r_gr_2"])
X_test_holdout_graphemes_1 = X_test_holdout_graphemes_2.drop(columns=["l_gr_2", "r_gr_2"])
X_test_unkn_graphemes_1 = X_test_unkn_graphemes_2.drop(columns=["l_gr_2", "r_gr_2"])

X_train_graphemes_1.head()

Unnamed: 0,l_gr_1,r_gr_1
1898,o,i
3998,s,e
2987,p,e
4256,u,o
4506,u,a


In [22]:
def _affix_frame(graphemes):
    """Concatenate the three left and the three right graphemes into two strings.

    Returns a frame with the columns ``pref`` and ``suf`` that keeps the index
    of ``graphemes``.
    """
    return pd.DataFrame(
        {
            "pref": graphemes["l_gr_1"] + graphemes["l_gr_2"] + graphemes["l_gr_3"],
            "suf": graphemes["r_gr_1"] + graphemes["r_gr_2"] + graphemes["r_gr_3"],
        }
    )


X_train_sufs = _affix_frame(X_train_graphemes)
X_test_holdout_sufs = _affix_frame(X_test_holdout_graphemes)
X_test_unkn_sufs = _affix_frame(X_test_unkn_graphemes)

X_train_sufs.head()

Unnamed: 0,pref,suf
1898,odr,ila
3998,spr,ede
2987,poš,eju
4256,ubi,oib
4506,uče,ala


In [23]:
def _grapheme_experiment(name, train, holdout, unkn):
    """Build a runner for one grapheme feature set.

    The returned callable takes a model factory and passes it to ``experiment``
    together with the one-hot-encoded frames (encoded anew on every call), the
    "aspect" labels, cross-validation enabled and the shared log.
    """
    def run(model):
        return experiment(
            name,
            "Graphemes",
            model,
            *one_hot_encode(train, holdout, unkn),
            y_train["aspect"],
            y_test_holdout["aspect"],
            y_test_unkn["aspect"],
            cv=True,
            log=LOG,
        )

    return run


# One runner per grapheme feature set.
experiments = [
    _grapheme_experiment(
        "6_graphemes",
        X_train_graphemes,
        X_test_holdout_graphemes,
        X_test_unkn_graphemes,
    ),
    _grapheme_experiment(
        "4_graphemes",
        X_train_graphemes_2,
        X_test_holdout_graphemes_2,
        X_test_unkn_graphemes_2,
    ),
    _grapheme_experiment(
        "2_graphemes",
        X_train_graphemes_1,
        X_test_holdout_graphemes_1,
        X_test_unkn_graphemes_1,
    ),
    _grapheme_experiment(
        "sufs",
        X_train_sufs,
        X_test_holdout_sufs,
        X_test_unkn_sufs,
    ),
]

In [24]:
# Run every experiment with every model factory.
# Recorded wall-clock time for this cell: 6 min 20 sec.
for e in experiments:
    for model in models:
        e(model)

550 550

--- 6_graphemes: LogisticRegression ---
 

F1 weighted (Holdout): 0.873 

F1 weighted (Unknown): 0.818 

CV F1: 0.889 

550 550

--- 6_graphemes: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.953 

F1 weighted (Unknown): 0.519 

CV F1: 0.960 

550 550

--- 6_graphemes: LGBMClassifier ---
 

F1 weighted (Holdout): 0.967 

F1 weighted (Unknown): 0.596 

CV F1: 0.972 

550 550

--- 6_graphemes: MLPClassifier ---
 

F1 weighted (Holdout): 0.971 

F1 weighted (Unknown): 0.748 

CV F1: 0.974 

550 550

--- 4_graphemes: LogisticRegression ---
 

F1 weighted (Holdout): 0.838 

F1 weighted (Unknown): 0.882 

CV F1: 0.862 

550 550

--- 4_graphemes: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.911 

F1 weighted (Unknown): 0.832 

CV F1: 0.907 

550 550

--- 4_graphemes: LGBMClassifier ---
 

F1 weighted (Holdout): 0.920 

F1 weighted (Unknown): 0.808 

CV F1: 0.914 

550 550

--- 4_graphemes: MLPClassifier ---
 

F1 weighted (Holdout): 0.920 

F1 weighted (Unknown): 0

### Эксперименты с грамматикой

In [25]:
# Select the grammar feature columns from each of the three splits.
X_train_grammar = X_train[grammar_vars]
X_test_holdout_grammar = X_test_holdout[grammar_vars]
X_test_unkn_grammar = X_test_unkn[grammar_vars]

In [26]:
# Subset of grammar_vars: the morphological category columns, i.e. everything
# except the "*_count" columns and "polarity".
categories = ["Gender", "Mood", "Number", "Person", "Tense", "VerbForm", "Voice"]

In [27]:
# Variant 1: only the columns listed in ``categories``.
X_train_grammar_only_categories = X_train_grammar[categories]
X_test_holdout_grammar_only_categories = X_test_holdout_grammar[categories]
X_test_unkn_grammar_only_categories = X_test_unkn_grammar[categories]

# Variant 2: every grammar column except those listed in ``categories``.
X_train_grammar_no_categories = X_train_grammar.drop(columns=categories)
X_test_holdout_grammar_no_categories = X_test_holdout_grammar.drop(columns=categories)
X_test_unkn_grammar_no_categories = X_test_unkn_grammar.drop(columns=categories)

In [28]:
def _grammar_experiment(name, train, holdout, unkn):
    """Build a runner for one grammar feature set.

    The returned callable takes a model factory and passes it to ``experiment``
    together with the one-hot-encoded frames (encoded anew on every call), the
    "aspect" labels, cross-validation enabled and the shared log.
    """
    def run(model):
        return experiment(
            name,
            "Grammar",
            model,
            *one_hot_encode(train, holdout, unkn),
            y_train["aspect"],
            y_test_holdout["aspect"],
            y_test_unkn["aspect"],
            cv=True,
            log=LOG,
        )

    return run


# One runner per grammar feature set.
experiments = [
    _grammar_experiment(
        "all grammar",
        X_train_grammar,
        X_test_holdout_grammar,
        X_test_unkn_grammar,
    ),
    _grammar_experiment(
        "only categories grammar",
        X_train_grammar_only_categories,
        X_test_holdout_grammar_only_categories,
        X_test_unkn_grammar_only_categories,
    ),
    _grammar_experiment(
        "no categories grammar",
        X_train_grammar_no_categories,
        X_test_holdout_grammar_no_categories,
        X_test_unkn_grammar_no_categories,
    ),
]

In [29]:
# Run every grammar experiment with every model factory.
# Recorded wall-clock time for this cell: 3 min 20 sec.
for e in experiments:
    for model in models:
        e(model)

550 550

--- all grammar: LogisticRegression ---
 

F1 weighted (Holdout): 0.736 

F1 weighted (Unknown): 0.689 

CV F1: 0.706 

550 550

--- all grammar: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.738 

F1 weighted (Unknown): 0.699 

CV F1: 0.705 

550 550

--- all grammar: LGBMClassifier ---
 

F1 weighted (Holdout): 0.742 

F1 weighted (Unknown): 0.699 

CV F1: 0.713 

550 550

--- all grammar: MLPClassifier ---
 

F1 weighted (Holdout): 0.740 

F1 weighted (Unknown): 0.702 

CV F1: 0.708 

550 550

--- only categories grammar: LogisticRegression ---
 

F1 weighted (Holdout): 0.743 

F1 weighted (Unknown): 0.693 

CV F1: 0.711 

550 550

--- only categories grammar: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.738 

F1 weighted (Unknown): 0.701 

CV F1: 0.713 

550 550

--- only categories grammar: LGBMClassifier ---
 

F1 weighted (Holdout): 0.738 

F1 weighted (Unknown): 0.701 

CV F1: 0.714 

550 550

--- only categories grammar: MLPClassifier ---
 

F1 weig

### Совмещаем грамматику и графемы.

In [30]:
# Show which grapheme columns the four-grapheme variant contains.
X_train_graphemes_2.columns.to_list()

['l_gr_1', 'l_gr_2', 'r_gr_1', 'r_gr_2']

In [31]:
# Four grapheme columns followed by the morphological category columns.
_train_combined_columns = [
    *X_train_graphemes_2.columns,
    *X_train_grammar_only_categories.columns,
]
X_train_graphemes_and_grammar_categories = X_train[_train_combined_columns]
X_train_graphemes_and_grammar_categories.head()

Unnamed: 0,l_gr_1,l_gr_2,r_gr_1,r_gr_2,Gender,Mood,Number,Person,Tense,VerbForm,Voice
1898,o,d,i,l,Masc,[PAD],Plur,[PAD],Past,Part,Act
3998,s,p,e,d,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
2987,p,o,e,j,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
4256,u,b,o,i,Masc,[PAD],Sing,[PAD],Past,Part,Act
4506,u,č,a,l,Fem,[PAD],Sing,[PAD],Past,Part,Act


In [32]:
# The column selection is taken from the training frames so that both test
# sets get exactly the same columns in the same order.
_test_combined_columns = [
    *X_train_graphemes_2.columns,
    *X_train_grammar_only_categories.columns,
]

X_test_holdout_graphemes_and_grammar_categories = X_test_holdout[_test_combined_columns]

X_test_unkn_graphemes_and_grammar_categories = X_test_unkn[_test_combined_columns]

In [33]:
# Recorded wall-clock time of the original cell: 1 min 30 sec.
# The one-hot encoding does not depend on the model, so it is computed once
# and reused for every model instead of being rebuilt on each iteration.
_encoded_grammar_and_graphemes = one_hot_encode(
    X_train_graphemes_and_grammar_categories,
    X_test_holdout_graphemes_and_grammar_categories,
    X_test_unkn_graphemes_and_grammar_categories,
)

for model in models:

    experiment(
        "grammar and graphemes",
        "GrammarAndGraphemes",
        model,
        *_encoded_grammar_and_graphemes,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    )

550 550

--- grammar and graphemes: LogisticRegression ---
 

F1 weighted (Holdout): 0.850 

F1 weighted (Unknown): 0.876 

CV F1: 0.863 

550 550

--- grammar and graphemes: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.922 

F1 weighted (Unknown): 0.811 

CV F1: 0.914 

550 550

--- grammar and graphemes: LGBMClassifier ---
 

F1 weighted (Holdout): 0.927 

F1 weighted (Unknown): 0.828 

CV F1: 0.917 

550 550

--- grammar and graphemes: MLPClassifier ---
 

F1 weighted (Holdout): 0.924 

F1 weighted (Unknown): 0.793 

CV F1: 0.923 



### Текстовые фичи

In [34]:
class Encoder:
    """Encode a text as the average of the Word2Vec vectors of its tokens."""

    def __init__(self, path_to_model):
        """Load a saved gensim ``Word2Vec`` model.

        Args:
            path_to_model: Location of the saved model, passed to
                ``Word2Vec.load``.
        """
        self.model = Word2Vec.load(path_to_model)
        # Take the dimensionality from the model itself. Probing a hard-coded
        # word would raise KeyError for any model whose vocabulary lacks it.
        self.ndim = self.model.wv.vector_size

    def encode(self, text: str):
        """Return one vector of length ``ndim`` for ``text``.

        The text is split with ``word_tokenize``; tokens missing from the
        model vocabulary are skipped.

        Args:
            text: Text to encode.

        Returns:
            The element-wise mean of the vectors of all known tokens, or a
            zero vector when no token is known (including empty input).
        """
        keyed_vectors = self.model.wv

        vecs = [
            keyed_vectors[token]
            for token in word_tokenize(text)
            if token in keyed_vectors.key_to_index
        ]

        if not vecs:
            return np.zeros((self.ndim,))
        return np.mean(vecs, axis=0)


In [35]:
# Load the Word2Vec model from ./SrW2V/TeslaW2V relative to the working directory.
encoder = Encoder(str(Path.cwd() / "SrW2V" / "TeslaW2V"))

In [36]:
# Demo: encode a short phrase and display the resulting vector.
encoder.encode("Nisam bio")

array([ 8.47340345e-01, -1.24501288e-01,  4.86958170e+00, -1.34273803e+00,
       -2.97086954e+00,  3.71142268e+00,  2.43387508e+00,  3.85181069e+00,
       -2.95421219e+00,  3.34436417e-01, -1.58875257e-01, -1.58686471e+00,
       -1.28946507e+00, -3.01051617e-01,  9.45121050e-01,  1.77668595e+00,
        5.86993098e-01, -1.22962356e+00,  2.33354139e+00, -3.38673544e+00,
       -1.20569634e+00, -2.75937104e+00, -5.83984470e+00,  1.30355811e+00,
       -4.45962906e+00,  1.23837054e-01, -4.12644053e+00,  3.44431496e+00,
       -2.49842501e+00, -6.92725182e-03, -1.94456697e-01, -3.43638372e+00,
       -2.82443762e-02,  8.21025610e-01, -2.22832203e+00, -3.07466745e+00,
        2.21040154e+00,  8.69077206e-01,  1.93974113e+00,  1.94455415e-01,
        1.29528332e+00,  4.54815626e+00,  6.07638001e-01,  1.40939832e-01,
        4.75796402e-01,  3.91436172e+00, -1.04877102e+00,  5.56937504e+00,
       -4.74003887e+00,  4.52731252e-01, -3.81715775e+00, -4.98578358e+00,
       -3.72529936e+00,  

In [37]:
# Demo: when no token is found in the vocabulary, encode returns a zero vector.
encoder.encode("alsdjhja akjdbha")

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [38]:
def _embed_values(frame, column):
    """Encode every value of ``frame[column]`` and return the vectors as a list."""
    return [encoder.encode(value) for value in frame[column].values]


# Embeddings of the surface word forms.
X_train_words_encoded = _embed_values(X_train, "word")
X_test_holdout_words_encoded = _embed_values(X_test_holdout, "word")
X_test_unkn_words_encoded = _embed_values(X_test_unkn, "word")

# Embeddings of the lemmas.
X_train_lemmas_encoded = _embed_values(X_train, "lemma")
X_test_holdout_lemmas_encoded = _embed_values(X_test_holdout, "lemma")
X_test_unkn_lemmas_encoded = _embed_values(X_test_unkn, "lemma")

In [39]:
# Word2Vec experiments. Each entry takes a model factory and runs
# ``experiment`` with cross-validation enabled, writing to the shared log.
experiments = [
    # Word-form embeddings for training and for both test sets.
    lambda model: experiment(
        "word embeddings",
        "W2V",
        model,
        X_train_words_encoded,
        X_test_holdout_words_encoded,
        X_test_unkn_words_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    # Lemma embeddings for training and for both test sets.
    lambda model: experiment(
        "lemma embeddings",
        "W2V",
        model,
        X_train_lemmas_encoded,
        X_test_holdout_lemmas_encoded,
        X_test_unkn_lemmas_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    # Training rows are the lemma vectors followed by the word-form vectors
    # (labels repeated twice); both test sets use lemma vectors only.
    # NOTE(review): the experiment name ends with a space, which is written to
    # the log as-is — confirm this is intended.
    lambda model: experiment(
        "word and lemma embeddings ",
        "W2V",
        model,
        X_train_lemmas_encoded + X_train_words_encoded,
        X_test_holdout_lemmas_encoded,
        X_test_unkn_lemmas_encoded,
        y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist(),
        y_test_holdout["aspect"].values.tolist(),
        y_test_unkn["aspect"].values.tolist(),
        cv=True,
        log=LOG,
    ),
    # Lemma vectors once plus the word-form vectors three times (labels
    # repeated four times); both test sets use lemma vectors only.
    # NOTE(review): identical rows are repeated in the training data, so with
    # cv=True copies of a row can presumably land in both the training and the
    # validation folds, making F1_cv optimistic — confirm this is intended.
    lambda model: experiment(
        "word and lemma embeddings many",
        "W2V",
        model,
        X_train_lemmas_encoded + X_train_words_encoded + X_train_words_encoded + X_train_words_encoded,
        X_test_holdout_lemmas_encoded,
        X_test_unkn_lemmas_encoded,
        y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist(),
        y_test_holdout["aspect"].values.tolist(),
        y_test_unkn["aspect"].values.tolist(),
        cv=True,
        log=LOG,
    ),
    # Word-form vectors repeated three times (labels repeated three times);
    # both test sets use word-form vectors.
    # NOTE(review): the same duplicated-row concern for F1_cv applies here.
    lambda model: experiment(
        "word embeddings many",
        "W2V",
        model,
        X_train_words_encoded + X_train_words_encoded + X_train_words_encoded,
        X_test_holdout_words_encoded,
        X_test_unkn_words_encoded,
        y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist() + y_train["aspect"].values.tolist(),
        y_test_holdout["aspect"].values.tolist(),
        y_test_unkn["aspect"].values.tolist(),
        cv=True,
        log=LOG,
    ),
]

In [40]:
# Run every Word2Vec experiment with every model factory.
for e in experiments:
    for model in models:
        e(model)

550 550

--- word embeddings: LogisticRegression ---
 

F1 weighted (Holdout): 0.964 

F1 weighted (Unknown): 0.785 

CV F1: 0.944 

550 550

--- word embeddings: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.955 

F1 weighted (Unknown): 0.649 

CV F1: 0.934 

550 550

--- word embeddings: LGBMClassifier ---
 

F1 weighted (Holdout): 0.985 

F1 weighted (Unknown): 0.935 

CV F1: 0.970 

550 550

--- word embeddings: MLPClassifier ---
 

F1 weighted (Holdout): 0.987 

F1 weighted (Unknown): 0.881 

CV F1: 0.980 

550 550

--- lemma embeddings: LogisticRegression ---
 

F1 weighted (Holdout): 1.000 

F1 weighted (Unknown): 0.556 

CV F1: 1.000 

550 550

--- lemma embeddings: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 1.000 

F1 weighted (Unknown): 0.314 

CV F1: 1.000 

550 550

--- lemma embeddings: LGBMClassifier ---
 

F1 weighted (Holdout): 1.000 

F1 weighted (Unknown): 0.780 

CV F1: 1.000 

550 550

--- lemma embeddings: MLPClassifier ---
 

F1 weighted (Holdou

In [41]:
def _word_plus_column(frame, column):
    """For each row, append the embedding of ``row[column]`` to that of ``row["word"]``."""
    return [
        np.append(encoder.encode(row["word"]), encoder.encode(row[column]))
        for _, row in frame.iterrows()
    ]


# Verb + subject.
X_train_words_and_nsubj_encoded = _word_plus_column(X_train, "nsubj")
X_test_holdout_words_and_nsubj_encoded = _word_plus_column(X_test_holdout, "nsubj")
X_test_unkn_words_and_nsubj_encoded = _word_plus_column(X_test_unkn, "nsubj")

# Verb + object.
X_train_words_and_obj_encoded = _word_plus_column(X_train, "obj")
X_test_holdout_words_and_obj_encoded = _word_plus_column(X_test_holdout, "obj")
X_test_unkn_words_and_obj_encoded = _word_plus_column(X_test_unkn, "obj")

# Verb + oblique.
X_train_words_and_obl_encoded = _word_plus_column(X_train, "obl")
X_test_holdout_words_and_obl_encoded = _word_plus_column(X_test_holdout, "obl")
X_test_unkn_words_and_obl_encoded = _word_plus_column(X_test_unkn, "obl")

# Verb + adverbial modifier.
X_train_words_and_advmod_encoded = _word_plus_column(X_train, "advmod")
X_test_holdout_words_and_advmod_encoded = _word_plus_column(X_test_holdout, "advmod")
X_test_unkn_words_and_advmod_encoded = _word_plus_column(X_test_unkn, "advmod")

In [42]:
def _word_plus_context(frame):
    """For each row, append the embedding of the six-token window to that of the word.

    The window is the space-joined text of the three left tokens (farthest
    first) followed by the three right tokens (nearest first).
    """
    window_columns = ("l_tok_3", "l_tok_2", "l_tok_1", "r_tok_1", "r_tok_2", "r_tok_3")
    encoded = []
    for _, row in frame.iterrows():
        window_text = " ".join(row[column] for column in window_columns)
        encoded.append(
            np.append(encoder.encode(row["word"]), encoder.encode(window_text))
        )
    return encoded


X_train_words_and_ctx_encoded = _word_plus_context(X_train)

X_test_holdout_words_and_ctx_encoded = _word_plus_context(X_test_holdout)

X_test_unkn_words_and_ctx_encoded = _word_plus_context(X_test_unkn)

In [43]:
def _embed_dependent(frame, column):
    """Encode ``row[column]`` for every row of ``frame`` and return the vectors as a list."""
    return [encoder.encode(row[column]) for _, row in frame.iterrows()]


# Subject only.
X_train_nsubj_encoded = _embed_dependent(X_train, "nsubj")
X_test_holdout_nsubj_encoded = _embed_dependent(X_test_holdout, "nsubj")
X_test_unkn_nsubj_encoded = _embed_dependent(X_test_unkn, "nsubj")

# Object only.
X_train_obj_encoded = _embed_dependent(X_train, "obj")
X_test_holdout_obj_encoded = _embed_dependent(X_test_holdout, "obj")
X_test_unkn_obj_encoded = _embed_dependent(X_test_unkn, "obj")

# Oblique only.
X_train_obl_encoded = _embed_dependent(X_train, "obl")
X_test_holdout_obl_encoded = _embed_dependent(X_test_holdout, "obl")
X_test_unkn_obl_encoded = _embed_dependent(X_test_unkn, "obl")

# Adverbial modifier only.
X_train_advmod_encoded = _embed_dependent(X_train, "advmod")
X_test_holdout_advmod_encoded = _embed_dependent(X_test_holdout, "advmod")
X_test_unkn_advmod_encoded = _embed_dependent(X_test_unkn, "advmod")

In [44]:
def _all_syntax(frame):
    """For each row, join the nsubj, obj, obl and advmod embeddings into one vector."""
    rows = []
    for _, row in frame.iterrows():
        # Build the vector from the tail so the final order is
        # nsubj, obj, obl, advmod.
        vector = encoder.encode(row["advmod"])
        for column in ("obl", "obj", "nsubj"):
            vector = np.append(encoder.encode(row[column]), vector)
        rows.append(vector)
    return rows


X_train_all_syntax = _all_syntax(X_train)

X_test_holdout_all_syntax = _all_syntax(X_test_holdout)

X_test_unkn_all_syntax = _all_syntax(X_test_unkn)

In [45]:
def _embed_context(frame):
    """Encode, for each row, the space-joined six-token window around the verb.

    The window consists of the three left tokens (farthest first) followed by
    the three right tokens (nearest first).
    """
    window_columns = ("l_tok_3", "l_tok_2", "l_tok_1", "r_tok_1", "r_tok_2", "r_tok_3")
    return [
        encoder.encode(" ".join(row[column] for column in window_columns))
        for _, row in frame.iterrows()
    ]


X_train_ctx_encoded = _embed_context(X_train)

X_test_holdout_ctx_encoded = _embed_context(X_test_holdout)

X_test_unkn_ctx_encoded = _embed_context(X_test_unkn)

In [46]:
# Registry of the "W2V_ext" experiments. Each entry is a one-argument callable
# that takes a model and runs `experiment(...)` on one feature set:
# train features, holdout features, "unknown" features, then the matching
# aspect labels, with cross-validation enabled and results written to LOG.
#
# The lambdas look their feature variables up only when called, so the
# feature sets must be defined before the experiments are run.
#
# The two "obl" entries originally trained on the *obj* train features while
# being evaluated on *obl* test features (copy-paste slip); they now use the
# obl train features, consistent with every other entry.
experiments = [
    # --- target word combined with one extra source of context ---
    lambda model: experiment(
        "word embeddings word+nsubj",
        "W2V_ext",
        model,
        X_train_words_and_nsubj_encoded,
        X_test_holdout_words_and_nsubj_encoded,
        X_test_unkn_words_and_nsubj_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings word+obj",
        "W2V_ext",
        model,
        X_train_words_and_obj_encoded,
        X_test_holdout_words_and_obj_encoded,
        X_test_unkn_words_and_obj_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings word+obl",
        "W2V_ext",
        model,
        X_train_words_and_obl_encoded,  # was X_train_words_and_obj_encoded
        X_test_holdout_words_and_obl_encoded,
        X_test_unkn_words_and_obl_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings word+advmod",
        "W2V_ext",
        model,
        X_train_words_and_advmod_encoded,
        X_test_holdout_words_and_advmod_encoded,
        X_test_unkn_words_and_advmod_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings word+ctx",
        "W2V_ext",
        model,
        X_train_words_and_ctx_encoded,
        X_test_holdout_words_and_ctx_encoded,
        X_test_unkn_words_and_ctx_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),

    # --- context only, without the target word ---
    lambda model: experiment(
        "word embeddings nsubj",
        "W2V_ext",
        model,
        X_train_nsubj_encoded,
        X_test_holdout_nsubj_encoded,
        X_test_unkn_nsubj_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings obj",
        "W2V_ext",
        model,
        X_train_obj_encoded,
        X_test_holdout_obj_encoded,
        X_test_unkn_obj_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings obl",
        "W2V_ext",
        model,
        X_train_obl_encoded,  # was X_train_obj_encoded
        X_test_holdout_obl_encoded,
        X_test_unkn_obl_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings advmod",
        "W2V_ext",
        model,
        X_train_advmod_encoded,
        X_test_holdout_advmod_encoded,
        X_test_unkn_advmod_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings all syntax",
        "W2V_ext",
        model,
        X_train_all_syntax,
        X_test_holdout_all_syntax,
        X_test_unkn_all_syntax,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),
    lambda model: experiment(
        "word embeddings ctx",
        "W2V_ext",
        model,
        X_train_ctx_encoded,
        X_test_holdout_ctx_encoded,
        X_test_unkn_ctx_encoded,
        y_train["aspect"],
        y_test_holdout["aspect"],
        y_test_unkn["aspect"],
        cv=True,
        log=LOG,
    ),

]

In [47]:
# Run every experiment with every model; experiments vary slowest, so all
# models are tried on one feature set before moving on to the next.
for run_experiment, model in product(experiments, models):
    run_experiment(model)

550 550

--- word embeddings word+nsubj: LogisticRegression ---
 

F1 weighted (Holdout): 0.958 

F1 weighted (Unknown): 0.775 

CV F1: 0.939 

550 550

--- word embeddings word+nsubj: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.955 

F1 weighted (Unknown): 0.793 

CV F1: 0.933 

550 550

--- word embeddings word+nsubj: LGBMClassifier ---
 

F1 weighted (Holdout): 0.985 

F1 weighted (Unknown): 0.923 

CV F1: 0.970 

550 550

--- word embeddings word+nsubj: MLPClassifier ---
 

F1 weighted (Holdout): 0.984 

F1 weighted (Unknown): 0.910 

CV F1: 0.969 

550 550

--- word embeddings word+obj: LogisticRegression ---
 

F1 weighted (Holdout): 0.958 

F1 weighted (Unknown): 0.751 

CV F1: 0.942 

550 550

--- word embeddings word+obj: DecisionTreeClassifier ---
 

F1 weighted (Holdout): 0.960 

F1 weighted (Unknown): 0.820 

CV F1: 0.933 

550 550

--- word embeddings word+obj: LGBMClassifier ---
 

F1 weighted (Holdout): 0.989 

F1 weighted (Unknown): 0.933 

CV F1: 0.971 

550

In [48]:
# Deliberate hard stop: this cell always raises. Presumably it keeps a
# "Run All" from continuing into the cells below — NOTE(review): confirm the
# guard is still wanted before removing or changing it.
raise ZeroDivisionError("Stop right there!")

ZeroDivisionError: Stop right there!

### Собираем финальный инструмент

In [None]:
# Classifier over the one-hot encoded grapheme + grammar-category features.
# NOTE(review): LightGBM only applies row subsampling when `subsample_freq`
# is > 0; with the default this `subsample=0.6` is likely a no-op — confirm.
categorical_lgbm = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=50,
    max_depth=-1,
    num_leaves=30,
    subsample=0.6,
    verbose=-1,
)

# `one_hot_encode` is given all three splits; only its first result (index 0)
# is used as the training matrix here.
X_train_categorical_one_hot = one_hot_encode(
    X_train_graphemes_and_grammar_categories,
    X_test_holdout_graphemes_and_grammar_categories,
    X_test_unkn_graphemes_and_grammar_categories,
)[0]

categorical_lgbm.fit(X_train_categorical_one_hot, y_train["aspect"])

In [None]:
# Classifier over the encoded target words.
# NOTE(review): LightGBM only applies row subsampling when `subsample_freq`
# is > 0; with the default this `subsample=0.6` is likely a no-op — confirm.
w2v_lgbm = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=7,
    num_leaves=30,
    subsample=0.6,
    verbose=-1,
)

# NOTE(review): features and labels are each concatenated with themselves,
# so every training example appears twice. Kept as in the original; confirm
# whether a different second operand was intended.
train_aspect_labels = y_train["aspect"].values.tolist()
w2v_lgbm.fit(
    np.array(X_train_words_encoded + X_train_words_encoded),
    np.array(train_aspect_labels + train_aspect_labels),
)

In [None]:
# Names of the grapheme + grammar-category feature columns, in frame order.
categorical_columns = X_train_graphemes_and_grammar_categories.columns.tolist()
# Bare expression so the notebook displays the list.
categorical_columns

['l_gr_1',
 'l_gr_2',
 'r_gr_1',
 'r_gr_2',
 'Gender',
 'Mood',
 'Number',
 'Person',
 'Tense',
 'VerbForm',
 'Voice']

In [None]:
# One-hot encoder for the categorical columns, fitted on the training split
# only. With handle_unknown="ignore", categories not seen during fit are
# encoded as all zeros instead of raising; sparse_output=False yields a dense
# array.
# NOTE(review): categorical_lgbm is trained on the output of
# one_hot_encode(...), not of this encoder — confirm both produce the same
# column layout.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
onehot_encoder.fit(X_train[categorical_columns])

In [None]:
class Predictor:
    """
        Predict from the word's encoded vector when one is available;
        otherwise fall back to the categorical (grammar and grapheme)
        features.

        A word counts as "not available" when every component of its
        encoded vector equals zero.
    """

    def __init__(
        self,
        string_encoder,
        categorical_encoder,
        categorical_classifier,
        embedding_classifier
    ):
        # string_encoder: object with .encode(word) returning a vector
        # categorical_encoder: object with .transform(2-D array)
        # both classifiers: objects with .predict(2-D array)
        self.string_encoder = string_encoder
        self.categorical_encoder = categorical_encoder
        self.categorical_classifier = categorical_classifier
        self.embedding_classifier = embedding_classifier

    def predict_(self, x):
        """
            Predict a single sample.

            x -- sequence whose first item is the word and whose remaining
                 items are the categorical feature values
        """
        word, *categorical = x
        embedding = np.array(self.string_encoder.encode(word))

        # Primary path: the encoder produced a non-zero vector for the word.
        if not np.all(embedding == 0):
            single_row = embedding.reshape(1, -1)
            return self.embedding_classifier.predict(single_row)[0]

        # Fallback path: encode the categorical values as one row and
        # classify that instead.
        categorical_row = np.array(categorical).reshape(1, -1)
        oh_vector = self.categorical_encoder.transform(categorical_row)
        return self.categorical_classifier.predict(oh_vector)[0]

    def predict(self, X):
        """
            X -- iterable of samples, each a word followed by its
                 categorical features

            Returns a list with one prediction per sample, in input order.
        """

        return list(map(self.predict_, X))

In [None]:
# Assemble the final tool. Argument order follows Predictor.__init__:
# string encoder, one-hot encoder, categorical classifier, embedding classifier.
predictor = Predictor(encoder, onehot_encoder, categorical_lgbm, w2v_lgbm)

In [None]:
# Smoke test. The values 1, 2, 3 are placeholders in the categorical slots;
# presumably they are never read because this word has a non-zero vector and
# takes the embedding path — NOTE(review): confirm.
predictor.predict([["raditi", 1 ,2 ,3]])

['imp']

In [None]:
# Smoke test with placeholder categorical values (1, 2, 3).
predictor.predict([["radio", 1 ,2 ,3]])

['imp']

In [None]:
# Smoke test with placeholder categorical values (1, 2, 3).
predictor.predict([["radila", 1 ,2 ,3]])

['imp']

In [None]:
# Smoke test with placeholder categorical values (1, 2, 3).
predictor.predict([["uraditi", 1 ,2 ,3]])

['perf']

In [None]:
# Smoke test with placeholder categorical values (1, 2, 3).
predictor.predict([["uradio", 1 ,2 ,3]])

['perf']

In [None]:
# Smoke test with placeholder categorical values (1, 2, 3).
predictor.predict([["uradila", 1 ,2 ,3]])

['perf']

In [None]:
# Display the categorical column order again: these values follow the word in
# each row passed to predictor.predict.
categorical_columns

['l_gr_1',
 'l_gr_2',
 'r_gr_1',
 'r_gr_2',
 'Gender',
 'Mood',
 'Number',
 'Person',
 'Tense',
 'VerbForm',
 'Voice']

In [None]:
# Smoke test with a complete row: the word followed by one value for each of
# the eleven categorical columns, in categorical_columns order.
# NOTE(review): Person is "3" here, whereas the past-participle rows shown in
# this notebook carry "[PAD]" in that column — confirm this is intentional.
predictor.predict([["dobila", "d", "o", "a", "l", "Fem", "[PAD]", "Sing", "3", "Past", "Part", "Act"]])

['perf']

In [None]:
# Holdout rows in the layout Predictor.predict expects: the word first, then
# the categorical feature columns.
X_test_holdout_final = X_test_holdout[["word", *categorical_columns]]
X_test_holdout_final.head()

Unnamed: 0,word,l_gr_1,l_gr_2,r_gr_1,r_gr_2,Gender,Mood,Number,Person,Tense,VerbForm,Voice
3437,razgovara,r,a,a,r,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
1781,obećao,o,b,o,a,Masc,[PAD],Sing,[PAD],Past,Part,Act
4583,vidi,v,i,i,d,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
519,ima,i,m,a,m,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
571,imaju,i,m,u,j,[PAD],Ind,Plur,3,Pres,Fin,[PAD]


In [None]:
# "Unknown" test rows in the layout Predictor.predict expects: the word
# first, then the categorical feature columns.
X_test_unkn_final = X_test_unkn[["word", *categorical_columns]]
X_test_unkn_final.head()

Unnamed: 0,word,l_gr_1,l_gr_2,r_gr_1,r_gr_2,Gender,Mood,Number,Person,Tense,VerbForm,Voice
36,boravi,b,o,i,v,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
37,boravi,b,o,i,v,[PAD],Ind,Sing,3,Pres,Fin,[PAD]
38,boravila,b,o,a,l,Fem,[PAD],Sing,[PAD],Past,Part,Act
39,boravila,b,o,a,l,Fem,[PAD],Sing,[PAD],Past,Part,Act
40,boravio,b,o,o,i,Masc,[PAD],Sing,[PAD],Past,Part,Act


In [None]:
# Final evaluation of the combined predictor on both test splits: weighted F1,
# confusion matrices and per-class reports, collected in `to_print` and then
# printed in order.
preds_holdout = predictor.predict(X_test_holdout_final.to_numpy())
preds_unkn    = predictor.predict(X_test_unkn_final.to_numpy())

to_print = ["\n--- FINAL ---\n"]

f1_wtd_holdout = f1_score(y_test_holdout["aspect"], preds_holdout, average="weighted")
f1_wtd_unkn    = f1_score(y_test_unkn["aspect"], preds_unkn, average="weighted")
to_print.append(f"F1 weighted (Holdout): {f1_wtd_holdout:.3f}")
to_print.append(f"F1 weighted (Unknown): {f1_wtd_unkn:.3f}")

# No explicit `labels=` is passed, so each matrix covers only the classes that
# occur in that split's true/predicted labels; the two matrices can therefore
# differ in size.
# The name `confusions_holdount` (sic) is kept unchanged in case later cells
# refer to it.
confusions_holdount = confusion_matrix(y_test_holdout["aspect"], preds_holdout)
confusions_unkn     = confusion_matrix(y_test_unkn["aspect"], preds_unkn)
to_print.append("Holdout confusion matrix")
to_print.append(confusions_holdount)
to_print.append("Unknown confusion matrix")
to_print.append(confusions_unkn)

holdout_report = classification_report(
    y_test_holdout["aspect"],
    preds_holdout,
    zero_division=1.0
)
unkn_report = classification_report(
    y_test_unkn["aspect"],
    preds_unkn,
    zero_division=1.0
)
to_print.append("Holdout report:")
to_print.append(holdout_report)
to_print.append("Unknown report:")
to_print.append(unkn_report)

# A plain loop instead of list(map(print, ...)): the printed text is the same,
# but the cell no longer ends with a list of None values as its result.
for item in to_print:
    print(item, "\n")


--- FINAL ---
 

F1 weighted (Holdout): 0.987 

F1 weighted (Unknown): 0.953 

Holdout confusion matrix 

[[ 17   0   2]
 [  0 258   3]
 [  0   2 268]] 

Unknown confusion matrix 

[[290  15]
 [ 11 234]] 

Holdout report: 

              precision    recall  f1-score   support

        both       1.00      0.89      0.94        19
         imp       0.99      0.99      0.99       261
        perf       0.98      0.99      0.99       270

    accuracy                           0.99       550
   macro avg       0.99      0.96      0.97       550
weighted avg       0.99      0.99      0.99       550
 

Unknown report: 

              precision    recall  f1-score   support

         imp       0.96      0.95      0.96       305
        perf       0.94      0.96      0.95       245

    accuracy                           0.95       550
   macro avg       0.95      0.95      0.95       550
weighted avg       0.95      0.95      0.95       550
 



[None, None, None, None, None, None, None, None, None, None, None]