In [26]:
import pandas as pd
import numpy as np
import editdistance

from pymystem3 import Mystem
import json

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, linear_model

from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score

from gensim.models import KeyedVectors
from gensim.models import FastText
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

import pickle

In [2]:
from enum import Enum
class Tagset(Enum):
    """Part-of-speech tag inventory appended to a lemma by ``tag()``.

    The ``tagset`` field of each ``config`` entry holds one of these members
    (or ``None`` for models that are queried with plain lower-cased words).
    """
    UPOS = 1    # lemma is suffixed with the tag looked up in ``mystem_to_upos``
    MYSTEM = 2  # lemma is suffixed with the tag exactly as Mystem reports it

class ModelType(Enum):
    """Kind of embedding model an entry of ``config`` points at.

    ``calculate_vector()`` uses this value to pick the lookup API it calls
    on the loaded model object.
    """
    W2V = 1       # queried with ``word in model`` / ``model.get_vector(word)``
    FASTTEXT = 2  # queried through the model's ``.wv`` attribute

# Directory that holds every pre-trained embedding model listed below.
folder_path = '/mnt/data/embedding_models/'

# (file name, tagset) for each enabled word2vec-format model.
# Disabled models are kept as commented-out rows so they can be re-enabled.
_w2v_model_files = (
    # ('araneum_upos_skipgram_300_2_2018.vec', Tagset.UPOS),
    ('araneum_upos_skipgram_600_2_2017.bin', Tagset.UPOS),
    ('news_mystem_skipgram_1000_20_2015.bin', Tagset.MYSTEM),
    # ('news_upos_cbow_300_2_2017.bin', Tagset.UPOS),
    ('news_upos_cbow_600_2_2018.vec', Tagset.UPOS),
    ('ruscorpora_mystem_cbow_300_2_2015.bin', Tagset.MYSTEM),
    ('ruscorpora_upos_skipgram_300_10_2017.bin', Tagset.UPOS),
    # ('ruscorpora_upos_skipgram_300_5_2018.vec', Tagset.UPOS),
    ('ruscorpora_upos_skipgram_600_10_2017.bin', Tagset.UPOS),
    ('ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec', Tagset.UPOS),
    # ('ruwikiruscorpora-superbigrams_skipgram_300_2_2018.vec', Tagset.UPOS),
    ('ruwikiruscorpora_upos_cbow_300_20_2017.bin', Tagset.UPOS),
    # ('ruwikiruscorpora_upos_skipgram_300_2_2018.vec', Tagset.UPOS),
    ('taiga_upos_skipgram_300_2_2018.vec', Tagset.UPOS),
    ('web_mystem_skipgram_500_2_2015.bin', Tagset.MYSTEM),
    ('web_upos_cbow_300_20_2017.bin', Tagset.UPOS),
)

# Paths (relative to folder_path) of the enabled FastText models; these carry no tagset.
_fasttext_model_files = (
    # 'araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model',
    'araneum_none_fasttextskipgram_300_5_2018/araneum_none_fasttextskipgram_300_5_2018.model',
)

# One dict per model: where it lives, how to query it, and which tagset its vocabulary uses.
config = [
    {'path': folder_path + file_name, 'type': ModelType.W2V, 'tagset': tagset}
    for file_name, tagset in _w2v_model_files
] + [
    {'path': folder_path + file_name, 'type': ModelType.FASTTEXT, 'tagset': None}
    for file_name in _fasttext_model_files
]
# ruwikiruscorpora_mystem_cbow_500_2_2015.bin.gz - damaged, can't unpack

In [3]:
# Shared Mystem analyzer instance; tag() calls stemmer.analyze() on it for every word.
stemmer = Mystem()

# Lookup table: part-of-speech tag as reported by Mystem -> tag appended to the
# lemma when tag() is called with Tagset.UPOS. Several Mystem tags share a target.
mystem_to_upos = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'ADV',
    'ANUM': 'ADJ',
    'APRO': 'DET',
    'COM': 'ADJ',
    'CONJ': 'SCONJ',
    'INTJ': 'INTJ',
    'NONLEX': 'X',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'UNKN': 'X',
    'V': 'VERB',
}

def tag(word, tagset=None):
    """Return the lemma of ``word`` with a part-of-speech suffix.

    Args:
        word: A single token.
        tagset: ``Tagset.UPOS`` to suffix the lemma with the tag found in
            ``mystem_to_upos``; any other truthy value to suffix it with the
            tag exactly as Mystem reports it; a falsy value (the default) to
            skip lemmatisation altogether.

    Returns:
        ``word.lower()`` when ``tagset`` is falsy, otherwise ``"<lemma>_<POS>"``.
        The placeholder string ``"word"`` is returned when Mystem yields no
        usable analysis for the token or its tag is missing from
        ``mystem_to_upos``.
    """
    if not tagset:
        return word.lower()

    # Errors raised by the analyzer itself are deliberately NOT caught here.
    analyses = stemmer.analyze(word)

    try:
        # Indexing happens inside the try so that an empty result list or an
        # empty "analysis" list falls back to the placeholder instead of crashing.
        first_reading = analyses[0]["analysis"][0]
        lemma = first_reading["lex"].lower().strip()

        # The POS is the leading token of "gr": everything before the first ',' or '='.
        pos = first_reading["gr"].split(',')[0].split('=')[0].strip()

        if tagset == Tagset.UPOS:
            pos = mystem_to_upos[pos]
    except (KeyError, IndexError):
        # Only "nothing to look up" conditions are swallowed; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        return "word"

    return "{}_{}".format(lemma, pos)

# Quick manual check of tag(); the cell displays the returned string.
tag("красивый", Tagset.MYSTEM)

'красивый_A'

In [4]:
def get_words(text):
    """Normalise ``text`` token by token and re-join the tokens with single spaces.

    Every whitespace-separated token is passed through ``tag()`` without a
    tagset, so leading/trailing/repeated whitespace collapses away.
    """
    normalized_tokens = []
    for token in text.split():
        normalized_tokens.append(tag(token))
    return " ".join(normalized_tokens)

In [5]:
def calculate_vector(text, model, dim, tagset, model_type=ModelType.W2V):
    """Average the embeddings of the words of ``text`` that ``model`` knows.

    Each whitespace-separated token is normalised with ``tag(token, tagset)``
    and looked up in the model; tokens absent from the vocabulary are skipped.

    Args:
        text: Whitespace-separated input string.
        model: For ``ModelType.FASTTEXT`` an object exposing its keyed vectors
            as ``model.wv``; for any other ``model_type`` an object supporting
            ``token in model`` and ``model.get_vector(token)``.
        dim: Length of the returned vector; must match the model's vectors.
        tagset: Forwarded to ``tag()``.
        model_type: Selects which lookup API is used on ``model``.

    Returns:
        A ``numpy`` array of length ``dim``: the mean of the vectors that were
        found, or all zeros when no token is in the vocabulary.
    """
    tokens = [tag(word, tagset) for word in text.split()]

    if model_type == ModelType.FASTTEXT:
        keyed_vectors = model.wv
        # gensim >= 4.0 exposes the vocabulary as `key_to_index`; earlier
        # releases call it `vocab`. Support both so the notebook survives upgrades.
        vocabulary = getattr(keyed_vectors, "key_to_index", None)
        if vocabulary is None:
            vocabulary = keyed_vectors.vocab
        # Index through `.wv`: subscripting the FastText model object itself
        # (`model[token]`) is deprecated in favour of `model.wv[token]`.
        found_vectors = [keyed_vectors[token] for token in tokens if token in vocabulary]
    else:
        found_vectors = [model.get_vector(token) for token in tokens if token in model]

    vector = np.zeros(dim)
    for word_vector in found_vectors:
        vector += word_vector

    # Guard against division by zero when nothing was found.
    if found_vectors:
        vector /= len(found_vectors)

    return vector

In [6]:

# Install the OS default action (SIG_DFL) for SIGPIPE in this process.
# signal() returns the handler that was installed before the call, which is
# what the cell displays.
# NOTE(review): presumably needed because of the pipe to the mystem subprocess — confirm.
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)   

<Handlers.SIG_IGN: 1>

In [34]:
# Seed for the shuffled K-fold splitter. Without it every call to kf.split()
# draws different folds, so scores of different embeddings / classifiers are
# measured on different data partitions and change from run to run.
CV_RANDOM_STATE = 42

# Base estimators. knn / svm_cl / logreg are tuned with GridSearchCV using the
# grids below; nb has no grid and is scored directly with cross_val_score.
knn = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
svm_cl = svm.SVC()
logreg = linear_model.LogisticRegression(solver='liblinear')
nb = GaussianNB()
# ovr = OneVsRestClassifier(linear_model.LogisticRegression(C=1e5))

# Hyper-parameter grids, one per tuned estimator.
parameters_knn = {'metric':('manhattan', 'euclidean'), 'n_neighbors':[2, 3, 10]}
parameters_svm = {'kernel':('linear', 'rbf')}
# parameters_logreg = {'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'), 'multi_class': ('ovr', 'multinomial')}
parameters_logreg = {'C': (0.1, 1, 10, 100)}

# Cross-validation splitters. `kf` is the one used by run_gridsearch().
loo = LeaveOneOut()
kf = KFold(n_splits=10, shuffle=True, random_state=CV_RANDOM_STATE)


In [40]:
# Estimators tuned by run_gridsearch(): the estimator object, the label that is
# printed/recorded for it, and the parameter grid handed to GridSearchCV.
models_list = [
    {'model': knn, 'name': 'knn', 'parameters': parameters_knn},
    {'model': svm_cl, 'name': 'svm', 'parameters': parameters_svm},
    {'model': logreg, 'name': 'log regr', 'parameters': parameters_logreg},
]

In [37]:
# Pickle files (relative to the working directory) that are loaded and
# concatenated into models_data_list, in this order.
embed_names = (
    "elmo_ru-news_wmt11-16_1.5M_steps.pickle",
    "fasttext-hh-test.pickle",
    "glove-hh-test.pickle",
    "word2vec-hh-test.pickle",
    "models_data.pickle",
)

In [59]:
def run_gridsearch(X, Y, name, results_file):
    """Score one feature matrix with every classifier and track the best result.

    First cross-validates the ``nb`` estimator with ``cross_val_score``, then
    runs ``GridSearchCV`` for every entry of ``models_list``; all evaluations
    use the ``kf`` splitter. Timings, scores and best parameters are printed.

    The best grid-search result seen so far (across calls) is kept in the
    module-level dict ``best_of_all`` under the keys ``'score'``,
    ``'emb_name'``, ``'clf_name'`` and ``'params'``. The dict is created on
    first use if the notebook has not defined it. The ``nb`` score is printed
    but never recorded there.

    Args:
        X: Feature matrix passed to the estimators.
        Y: Target labels passed to the estimators.
        name: Label of the embedding (combination); printed and stored as
            ``best_of_all['emb_name']``.
        results_file: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    # Local import: the notebook only imports `time` in a later cell, so the
    # function must not rely on that cell having been executed.
    import time

    # `best_of_all` is not initialised anywhere else; create it on first use so
    # a fresh kernel does not fail with NameError, but reuse an existing one.
    best_so_far = globals().setdefault(
        'best_of_all',
        {'score': float('-inf'), 'emb_name': None, 'clf_name': None, 'params': None},
    )

    start_time = time.time()
    scores = cross_val_score(nb, X, Y, cv=kf)
    print('time:', time.time()-start_time)
    print('bayes')
    print(name)
    print(scores.mean(), '\n')

    for mod in models_list:
        start_time = time.time()
        grd = GridSearchCV(mod['model'], mod['parameters'], cv = kf)
        grd.fit(X, Y)
        print('time:', time.time()-start_time)
        print(mod['name'])
        print(name)
        print(grd.best_score_)
        print(grd.best_params_)
        print('\n')

        # Strict '>' keeps the earliest result on ties.
        if grd.best_score_ > best_so_far['score']:
            best_so_far['score'] = grd.best_score_
            best_so_far['emb_name'] = name
            best_so_far['clf_name'] = mod['name']
            best_so_far['params'] = grd.best_params_

In [60]:
import time

# Gather the embedding records of every pickle file into one flat list.
# NOTE: pickle.load executes arbitrary code from the file — only load trusted files.
models_data_list = []
for pickle_path in embed_names:
    with open(pickle_path, 'rb') as handle:
        models_data_list.extend(pickle.load(handle))

results_file = 'res.txt'


def _evaluate_combination(members, labels):
    """Run run_gridsearch() on the column-wise concatenation of ``members``.

    A single member is passed through as-is; the combination label is the
    member names joined with ' + '.
    """
    if len(members) == 1:
        features = members[0]['x']
    else:
        features = np.concatenate(tuple(member['x'] for member in members), axis=1)
    label = ' + '.join(member['name'] for member in members)
    run_gridsearch(features, labels, label, results_file)


# Evaluate every single embedding, every pair and every triple. Labels always
# come from the first member of the combination.
total = len(models_data_list)
for first_idx, first in enumerate(models_data_list):
    print(first_idx)
    labels = first['y']
    _evaluate_combination((first,), labels)

    for second_idx in range(first_idx + 1, total):
        second = models_data_list[second_idx]
        _evaluate_combination((first, second), labels)

        for third in models_data_list[second_idx + 1:]:
            _evaluate_combination((first, second, third), labels)


    
    

0
time: 0.08033275604248047
bayes
elmo_ru-news_wmt11-16_1.5M_steps
0.43382352941176466 

time: 2.556548833847046
knn
elmo_ru-news_wmt11-16_1.5M_steps
0.6011904761904762
{'metric': 'manhattan', 'n_neighbors': 3}
{'mean_fit_time': array([0.00195398, 0.00188003, 0.00186899, 0.00193706, 0.00188134,
       0.00203099]), 'std_fit_time': array([2.08464164e-04, 4.22048297e-05, 6.38063933e-05, 5.45932943e-05,
       6.20258157e-05, 2.89846325e-04]), 'mean_score_time': array([0.00464745, 0.00455112, 0.00463138, 0.00470605, 0.00473371,
       0.00477629]), 'std_score_time': array([0.00015614, 0.00015473, 0.0001191 , 0.00017106, 0.00019208,
       0.00010257]), 'param_metric': masked_array(data=['manhattan', 'manhattan', 'manhattan', 'euclidean',
                   'euclidean', 'euclidean'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_neighbors': masked_array(data=[2, 3, 10, 2, 3, 10],
             mask=[False, False, Fal

KeyboardInterrupt: 