In [1]:
import pickle
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from collections import namedtuple

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import tqdm

import pandas as pd

from itertools import combinations

## Load embeds

Each embed object is a dict with the keys `name` (name of the model), `x` (embedding matrix) and `y` (labels).

For now we assume that the rows of `x` are in the same order for every model (in the future we should add `ids` so that this can be checked).

In [2]:
# Pickle files to read embeddings from; each name is resolved against the
# embeddings directory used by the loading cell.
embed_filenames = (
    "elmo_ru-news_wmt11-16_1.5M_steps.pickle",
    "fasttext-hh-test.pickle",
    "glove-hh-test.pickle",
    "word2vec-hh-test.pickle",
    "models_data.pickle",
)

In [3]:
# Collect the contents of every pickle file into one flat list.
# presumably each pickle holds a list of embed dicts — verify against the files.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): the directory is a hard-coded absolute path of one machine.
embeds = []
for embed_filename in embed_filenames:
    embed_path = "/home/vera/projects/embeddings-test/{}".format(embed_filename)
    with open(embed_path, "rb") as handle:
        embeds.extend(pickle.load(handle))

### Subset embeds from list

In [4]:
# Names of the models that take part in the comparison; loaded embeds whose
# `name` is not listed here are dropped in the next cell.
SELECTED_EMBEDS = [
    "elmo_ru-news_wmt11-16_1.5M_steps",
    "fasttext-hh-test",
    "glove-hh-test",
    "word2vec-hh-test",
    "araneum_upos_skipgram_600_2_2017",
    "news_mystem_skipgram_1000_20_2015",
    "ruscorpora_upos_skipgram_600_10_2017",
    "ruwikiruscorpora_upos_skipgram_300_2_2018",
    "taiga_upos_skipgram_300_2_2018",
    "web_upos_cbow_300_20_2017",
    "araneum_none_fasttextskipgram_300_5_2018",
]

In [5]:
# Keep only the embeds whose model name appears in SELECTED_EMBEDS.
embeds = [
    candidate
    for candidate in embeds
    if candidate["name"] in SELECTED_EMBEDS
]

## Define classifiers

Define the classifiers and the grid-search parameters for each of them

In [6]:
# Pairs a display name with the GridSearchCV object that tunes the model.
Classifier = namedtuple("Classifier", ["name", "gcv"])

In [7]:
# Candidate models, each wrapped in a GridSearchCV over a small parameter grid.
# The `cv` splitter of every search is assigned later, right before fitting.
classifiers = [
    Classifier(
        "Logistic",
        GridSearchCV(
            OneVsRestClassifier(LogisticRegression()),
            {
                # `estimator__` routes the parameter to the wrapped model
                "estimator__C": [0.1, 1, 10]
            }
        )
    ),
    Classifier(
        "KNN",
        GridSearchCV(
            KNeighborsClassifier(),
            {
                "n_neighbors": [1, 3, 5, 10],
                # "minkowski" with the default p=2 IS the Euclidean distance,
                # so also listing "euclidean" only repeated every KNN fit.
                "metric": ["minkowski"]
            }
        )
    ),
    Classifier(
        "SVC",
        GridSearchCV(
            OneVsRestClassifier(SVC()),
            {
                "estimator__kernel": ["linear", "rbf"]
            }
        )
    )
]

## Iterate through embeddings

and check the cross-validation score of each classifier

In [8]:
def get_combinations(values, min_len, max_len):
    """Return every combination of `values` whose size is in [min_len; max_len].

    Combinations are produced by `itertools.combinations`, grouped by size
    in increasing order, and returned as one flat list of tuples.
    """
    return [
        combo
        for size in range(min_len, max_len + 1)
        for combo in combinations(values, size)
    ]

In [9]:
# Shared 10-fold shuffled splitter with a fixed seed, used by every grid search.
cv = KFold(random_state=23, shuffle=True, n_splits=10)

In [None]:
# Grid-search every classifier on every combination of 1 to 3 embeddings and
# collect one (embedding names, classifier, best score, best params) row each.
result_datas = []

# The splitter is the same for every search, so assign it once up front.
for classifier in classifiers:
    classifier.gcv.cv = cv

# NOTE(review): tqdm_notebook is deprecated in recent tqdm releases in favour
# of tqdm.notebook.tqdm — confirm the installed version before switching.
for embed_combination in tqdm.tqdm_notebook(get_combinations(embeds, 1, 3)):
    # stack all X matrices side by side (features of each embed become columns)
    X = np.concatenate([embed["x"] for embed in embed_combination], axis=1)

    # take Y from the first embed only
    # assumes every embed stores its rows in the same order — TODO confirm
    Y = embed_combination[0]["y"]

    # space-joined names; the best-combination cell splits on " " again
    name = " ".join([embed["name"] for embed in embed_combination])

    for classifier in classifiers:
        # fit (re-runs the whole grid search on the stacked features)
        classifier.gcv.fit(X, Y)

        result_datas.append((
            name,
            classifier.name,
            classifier.gcv.best_score_,
            str(classifier.gcv.best_params_)
        ))

# Passing `columns` to the constructor also works when no rows were collected;
# assigning `.columns` afterwards raised a length-mismatch ValueError then.
results_datas_df = pd.DataFrame(
    result_datas,
    columns=["embedding_name", "classifier_name", "accuracy", "params"]
)

HBox(children=(IntProgress(value=0, max=231), HTML(value='')))

## Select the best combination and re-check it with a larger number of CV folds

### Show top results

In [None]:
# Preview the five rows with the highest accuracy.
results_datas_df.sort_values(by="accuracy", ascending=False).head(5)

In [None]:
# Save the whole results table, best accuracy first, without the index column.
results_datas_df.sort_values(by="accuracy", ascending=False).to_csv(
    "../results/results.csv", index=False
)

### Set best

In [None]:
# Embedding names of the top-accuracy row; the names were joined with spaces
# when the results were collected, so splitting on " " recovers them.
ranked_results = results_datas_df.sort_values(["accuracy"], ascending=False)
best_names = ranked_results["embedding_name"].values[0].split(" ")
print(best_names)

# Rebuild the winning combination from the loaded embeds.
embed_combination = [embed for embed in embeds if embed["name"] in best_names]
print(len(embed_combination))

X = np.concatenate([embed["x"] for embed in embed_combination], axis=1)
Y = embed_combination[0]["y"]

# NOTE(review): a default LogisticRegression is used here regardless of which
# classifier and parameters produced the top row — confirm this is intended.
# NOTE(review): n_splits=168 presumably relates to the sample count — confirm.
large_cv = KFold(n_splits=168, shuffle=True, random_state=34)
scores = cross_val_score(LogisticRegression(), X, Y, cv=large_cv)

print(scores.mean())