In [25]:
import json
import pickle

import editdistance
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models import KeyedVectors
from pymystem3 import Mystem
from sklearn import svm, linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [26]:
from enum import Enum
class Tagset(Enum):
    """Part-of-speech tag scheme appended to lemmas by `tag` (`lemma_POS`)."""
    # Universal POS tags: `tag` converts the Mystem tag through `mystem_to_upos`.
    UPOS = 1
    # Native Mystem tags: `tag` uses the analyzer's tag unchanged.
    MYSTEM = 2

class ModelType(Enum):
    """Kind of embedding model in `config`; picks the lookup API in `calculate_vector`."""
    # Queried in `calculate_vector` with `word in model` / `model.get_vector(word)`.
    W2V = 1
    # Queried in `calculate_vector` with `word in model.wv.vocab` / `model[word]`.
    FASTTEXT = 2

# Directory that holds every pretrained embedding file listed below.
folder_path = '/mnt/data/embedding_models/'

# One entry per embedding model: where it lives, which lookup API it needs
# (`type`) and which tag scheme its vocabulary uses (`tagset`, None = untagged).
config = [
    {'path': folder_path + relative_path, 'type': kind, 'tagset': tags}
    for relative_path, kind, tags in (
        ('araneum_upos_skipgram_300_2_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('araneum_upos_skipgram_600_2_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('news_mystem_skipgram_1000_20_2015.bin', ModelType.W2V, Tagset.MYSTEM),
        ('news_upos_cbow_300_2_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('news_upos_cbow_600_2_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('ruscorpora_mystem_cbow_300_2_2015.bin', ModelType.W2V, Tagset.MYSTEM),
        ('ruscorpora_upos_skipgram_300_10_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('ruscorpora_upos_skipgram_300_5_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('ruscorpora_upos_skipgram_600_10_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('ruwikiruscorpora-superbigrams_skipgram_300_2_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('ruwikiruscorpora_upos_cbow_300_20_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('ruwikiruscorpora_upos_skipgram_300_2_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('taiga_upos_skipgram_300_2_2018.vec', ModelType.W2V, Tagset.UPOS),
        ('web_mystem_skipgram_500_2_2015.bin', ModelType.W2V, Tagset.MYSTEM),
        ('web_upos_cbow_300_20_2017.bin', ModelType.W2V, Tagset.UPOS),
        ('araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model',
         ModelType.FASTTEXT, None),
        ('araneum_none_fasttextskipgram_300_5_2018/araneum_none_fasttextskipgram_300_5_2018.model',
         ModelType.FASTTEXT, None),
    )
]
# ruwikiruscorpora_mystem_cbow_500_2_2015.bin.gz - damaged, can't unpack

In [27]:
# Single Mystem analyzer instance shared by every `tag` call.
stemmer = Mystem()

# Mystem part-of-speech tag -> Universal POS tag; consulted by `tag` when the
# requested tagset is Tagset.UPOS.
# NOTE(review): every Mystem CONJ becomes SCONJ (never CCONJ) — presumably this
# matches how the pretrained *_upos_* models were tagged; confirm.
mystem_to_upos = {
    "A": "ADJ",
    "ADV": "ADV",
    "ADVPRO": "ADV",
    "ANUM": "ADJ",
    "APRO": "DET",
    "COM": "ADJ",
    "CONJ": "SCONJ",
    "INTJ": "INTJ",
    "NONLEX": "X",
    "NUM": "NUM",
    "PART": "PART",
    "PR": "ADP",
    "S": "NOUN",
    "SPRO": "PRON",
    "UNKN": "X",
    "V": "VERB",
}

def tag(word, tagset=None):
    """Lemmatize `word` with Mystem and append its part-of-speech tag.

    Args:
        word: A single token.
        tagset: `Tagset.UPOS` to emit `lemma_<UPOS tag>`, `Tagset.MYSTEM` to
            emit `lemma_<Mystem tag>`. Any falsy value skips the analysis.

    Returns:
        The lower-cased `word` when `tagset` is falsy; otherwise the tagged
        lemma, or the literal placeholder "word" when Mystem returns no usable
        analysis or the tag has no entry in `mystem_to_upos`.
    """
    if not tagset:
        return word.lower()

    try:
        # Kept inside the try so an empty analyzer result also takes the fallback.
        first_parse = stemmer.analyze(word)[0]["analysis"][0]
        lemma = first_parse["lex"].lower().strip()

        # The POS is whatever precedes the first ',' and the first '=' in "gr".
        pos = first_parse["gr"].split(',')[0].split('=')[0].strip()

        if tagset == Tagset.UPOS:
            pos = mystem_to_upos[pos]
    except (KeyError, IndexError):
        # Only "no analysis / unknown tag" is treated as best-effort; anything
        # else (including KeyboardInterrupt) now propagates instead of being
        # silently swallowed.
        # NOTE(review): the fallback is the literal string "word", not the
        # input token — looks unintended, but kept for compatibility; confirm.
        return "word"

    return "{}_{}".format(lemma, pos)

# Quick sanity check of the tagger; the cell output shows 'красивый_A'.
tag("красивый", Tagset.MYSTEM)

'красивый_A'

In [28]:
def get_words(text):
    """Return `text` with each whitespace-separated token passed through `tag`.

    `tag` is called without a tagset, so every token is only lower-cased; the
    tokens are re-joined with single spaces.
    """
    normalized_tokens = (tag(token) for token in text.split())
    return " ".join(normalized_tokens)

In [29]:
def calculate_vector(text, model, dim, tagset, model_type=ModelType.W2V):
    """Average the embeddings of the tokens of `text` that `model` knows.

    Args:
        text: Whitespace-separated text; each token is run through `tag`.
        model: Embedding model, queried according to `model_type`.
        dim: Length of the returned vector.
        tagset: Tag scheme forwarded to `tag`.
        model_type: `ModelType.FASTTEXT` looks tokens up via `model.wv.vocab`
            and `model[token]`; any other value uses `token in model` and
            `model.get_vector(token)`.

    Returns:
        A length-`dim` array holding the mean of the found embeddings, or all
        zeros when no token is in the model.
    """
    mean_vector = np.zeros(dim)
    tokens = [tag(token, tagset) for token in text.split()]

    if model_type == ModelType.FASTTEXT:
        found = [model[token] for token in tokens if token in model.wv.vocab]
    else:
        found = [model.get_vector(token) for token in tokens if token in model]

    # Accumulate in place, one embedding at a time, then divide by the hit count.
    for embedding in found:
        mean_vector += embedding

    if found:
        mean_vector /= len(found)

    return mean_vector

In [30]:

# Install the OS default action (SIG_DFL) as the handler for SIGPIPE.
# NOTE(review): presumably a workaround for broken-pipe errors when talking to
# the Mystem subprocess — confirm. SIGPIPE is only available on Unix.
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)   

<Handlers.SIG_DFL: 0>

In [49]:
# Base estimators. The hyper-parameters listed in the grids below are
# overridden by GridSearchCV; the rest stay as configured here.
knn = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
svm_cl = svm.SVC()
# C is the inverse regularization strength, so 1e5 means almost no regularization.
logreg = linear_model.LogisticRegression(C=1e5)
# OneVsRestClassifier lives in sklearn.multiclass and is imported with the other
# sklearn names at the top of the notebook (it used to raise NameError here).
# NOTE(review): `ovr` is not added to the searched models in the visible cells.
ovr = OneVsRestClassifier(linear_model.LogisticRegression(C=1e5))

# Hyper-parameter grids, one per estimator above.
parameters_knn = {'metric':('manhattan', 'euclidean', 'chebyshev', 'minkowski'), 'n_neighbors':[2, 3, 5, 10, 20]}
parameters_svm = {'kernel':('linear', 'poly', 'rbf')}
# Wider logistic-regression grid, currently disabled:
# parameters_logreg = {'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'), 'multi_class': ('ovr', 'multinomial')}
parameters_logreg = {'solver': ('newton-cg', 'liblinear')}

# Leave-one-out splitter used as the `cv` strategy of every grid search.
loo = LeaveOneOut()


In [50]:
# Every estimator to grid-search, paired with its hyper-parameter grid.
models_list = [
    {'model': knn, 'parameters': parameters_knn},
    {'model': svm_cl, 'parameters': parameters_svm},
    {'model': logreg, 'parameters': parameters_logreg},
]

In [None]:
# Load the precomputed datasets: each entry is read below through its 'name',
# 'x' and 'y' keys.
# SECURITY: pickle.load can execute arbitrary code — only open a
# models_data.pickle that you produced yourself.
with open('models_data.pickle', 'rb') as handle:
    models_data_list = pickle.load(handle)

# Best result seen so far across every (dataset, estimator) pair.
best_of_all = {'score': 0}
for line in models_data_list:
    # Features and labels depend only on the dataset, so read them once per entry.
    X = line['x']
    Y = line['y']

    for mod in models_list:
        # Exhaustive search over this estimator's grid, using the `loo` splitter.
        grd = GridSearchCV(mod['model'], mod['parameters'], cv=loo)
        grd.fit(X, Y)
        print(line['name'])
        print(grd.best_score_)
        print(grd.best_params_)

        if grd.best_score_ > best_of_all['score']:
            best_of_all['score'] = grd.best_score_
            best_of_all['name'] = line['name']
            # The parameters alone do not identify the estimator they belong
            # to, so record its class name as well.
            best_of_all['model'] = type(mod['model']).__name__
            best_of_all['params'] = grd.best_params_

print(best_of_all)
    
    