In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as scs

import json, pymystem3, re

from collections import defaultdict, Counter

from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import SnowballStemmer

from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

#### Define word2vec class

In [2]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec, morph=None):
        self.word2vec = word2vec
        self.all_stopwords = self.get_russian_stopwords() + \
            self.get_russian_alphabet() + self.get_english_alphabet()
        self.dim = word2vec.vector_size
        self.tfidf = TfidfVectorizer(stop_words=self.all_stopwords,
                                     analyzer=lambda x: x)
        self.word2weight = None
        self.morph = morph if morph else pymystem3.Mystem()

    @staticmethod
    def get_punctuation():
        return ") ( , ; : ? - _ * [ ] } { \" ' > < ! . * # \` / \\ | & ^ % $ @ ~ ± ... = +" \
               "0 1 2 3 4 5 6 7 8 9".split()

    @staticmethod
    def get_russian_alphabet():
        return "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я " \
               "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я".split()

    @staticmethod
    def get_english_alphabet():
        return "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z " \
               "a b c d e f g h i j k l m n o p q r s t u v w x y z".split()

    @staticmethod
    def get_russian_stopwords():
        return [word for word in stopwords.words('russian')
                if word not in ['не', 'нет', 'ни', 'хорошо', 'плохо', 'хороший', 'плохой', 'против',
                                'правильно', 'неправильно', 'правильный', 'неправильный', 'без',
                                'лучше', 'хуже', 'можно', 'нельзя', 'отлично', 'верно', 'неверно',
                                'никому', 'никогда', 'нигде', 'ничего', 'ничем', 'верный', 'неверный',
                                'некому', 'накогда', 'негде', 'нечего', 'нечем']]
    
    def lemmatized_texts(self, X):
        self.morph.start()
        X_new = [' '.join(self.morph.lemmatize(x)) for x in X]
        self.morph.close()
        
        return X_new

    def get_stopwords(self):
        return self.all_stopwords

    def get_tfidf(self):
        return self.tfidf

    def tokenize(self, text):
        return text.split()

    def fit(self, X, y=None):
        X = self.lemmatized_texts(X)
        self.tfidf.fit_transform(X)

        max_idf = max(self.tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            {w: self.tfidf.idf_[i] for w, i in self.tfidf.vocabulary_.items()}
        )

        return self

    def transform(self, X, y=None):
        return scs.csc_matrix(np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in self.tokenize(words)
                     if w not in self.tfidf.stop_words
                     and w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ]))

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [3]:
morph = pymystem3.Mystem()

In [4]:
def lemmatized_texts_my_w2v(X, morph):
    morph.start()
    X_new = [list(filter(lambda t: not t.isspace(), morph.lemmatize(x))) for x in X]
    morph.close()
        
    return X_new

### Read train data

All training/testing samples you can find here: https://www.kaggle.com/c/sentiment-analysis-in-russian/overview

In [5]:
with open('input/train.json') as f:
    raw_train = json.load(f)

In [6]:
with open('input/test.json') as f:
    raw_test = json.load(f)

In [7]:
raw_train[:10]

[{'id': 1945,
  'sentiment': 'negative',
  'text': 'Досудебное расследование по факту покупки ЕНПФ пакета облигаций ТОО "Бузгул Аурум" было начато по инициативе Национального банка РК, сообщил директор департамента защиты прав потребителей и финансовых услуг Нацбанка Казахстана Александр Терентьев.\n"Основанием для досудебного расследования стало обращение Национального банка, письмо от 25 ноября 2016 года. Было обращение Национального банка в правоохранительные органы. Нам эта сделка показалась сомнительной, недостаточно корректной, поэтому Нацбанк 25 ноября 2016 года обратился в правоохранительные органы. Это то, что я могу озвучить на сегодня. Идёт следствие, проводится проверка", – сказал Терентьев.\n28 декабря в Нацбанке заявили, что не знают, что стало основанием для проверки ЕНПФ.\n23 декабря факт проведения проверки в АО "Единый накопительный пенсионный фонд" подтвердился. Пресс-служба Национального банка сообщила, что проверку проводят по операциям, совершённым АО "ЕНПФ" в отн

In [8]:
len(raw_train)

8263

In [9]:
raw_test[:10]

[{'id': 0,
  'text': 'Как сообщает пресс-служба акимата Алматы, для ликвидиции последствий снегопада задействовали 415 единиц спецтехники, 150 самосвалов, 20 погрузчиков и 6 автогрейдеров. Уборка снега началась еще ночью. Сначала был убран снег, потом на дорогах использовали более 4 тысяч тонн противогололедных реагентов. По состоянию на 7 утра 24 января высота снежного покрова в Алматы составила до 20 сантиметров в горной и до 15 сантиметров в предгорной местностях.\n'},
 {'id': 1,
  'text': 'Казахстанские авиакомпании перевозят 250 тысяч транзитных пассажиров в год  со ссылкой на пресс-службу Министерства по инвестициям и развитию.  \n\n«В настоящее время отечественными авиакомпаниями через территорию Казахстана перевозится 250 тысяч транзитных пассажиров в год, что в 10 раз больше показателя 2010 года. В среднем около 70 процентов всех транзитных пассажиров приходится на столичный аэропорт. Это 175 тысяч транзитных пассажиров приходится на Астану и 75 тысяч на Алматы», — об этом зая

In [10]:
len(raw_test)

2056

In [11]:
Counter([i['sentiment'] for i in raw_train])

Counter({'negative': 1434, 'neutral': 4034, 'positive': 2795})

### Train tf-idf & lda

In [12]:
ru_stem = SnowballStemmer('russian')

In [13]:
def ru_pos(string):
    """russian tokenize based on nltk.tag.pos_tag. only russian letter remaind."""
    return ['%s' % p.split('=')[0] for w, p in pos_tag(word_tokenize(string), lang='rus')
            if p not in ['PR', 'PRON', 'CONJ', 'S-PRO']]

In [14]:
X_all = [i['text'] for i in raw_train + raw_test]

In [15]:
%%time
sentences = lemmatized_texts_my_w2v(X_all, morph)

CPU times: user 1min 14s, sys: 7.21 s, total: 1min 21s
Wall time: 6min 25s


In [66]:
%%time
model = Word2Vec(sentences, sg=1, min_count=3, size=500, window=7, iter=2)

CPU times: user 5min 41s, sys: 3.5 s, total: 5min 44s
Wall time: 2min 13s


In [67]:
w2v_model = model

In [68]:
%%time
vectorizer = MeanEmbeddingVectorizer(w2v_model)

CPU times: user 2.26 ms, sys: 3.34 ms, total: 5.6 ms
Wall time: 5.08 ms


In [69]:
%%time
vectorizer.fit(X_all)

CPU times: user 1min 25s, sys: 8.09 s, total: 1min 33s
Wall time: 6min 2s


<__main__.MeanEmbeddingVectorizer at 0x1a4d22b9e8>

In [23]:
pos_params = {}
pos_params['tokenizer'] = ru_pos
pos_params['ngram_range'] = (2, 3)
pos_params['max_df'] = 0.95
pos_params['max_features'] = 25

In [24]:
pos_tfidf = TfidfVectorizer(**pos_params)

In [25]:
%%time
pos_tfidf.fit([i['text'] for i in raw_train + raw_test])

CPU times: user 10min 31s, sys: 2.56 s, total: 10min 33s
Wall time: 10min 37s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=25, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function ru_pos at 0x1a160c38c8>, use_idf=True,
        vocabulary=None)

### Train validation set split

In [26]:
train = {}
val = {}
tmp = defaultdict(list)
for e in raw_train:
    tmp[e['sentiment']].append(e['text'])
for l in tmp:
    train[l], val[l] = train_test_split(tmp[l], test_size=0.2, random_state=2018)

### Unsampling align for balance

In [27]:
def upsampling_align(some_dict, random_state=2018):
    rand = np.random.RandomState(random_state)
    upper = max([len(some_dict[l]) for l in some_dict])
    print('upper bound: {}'.format(upper))
    tmp = {}
    for l in some_dict:
        if len(some_dict[l]) < upper:
            repeat_time = int(upper/len(some_dict[l]))
            remainder = upper % len(some_dict[l])
            _tmp = some_dict[l].copy()
            rand.shuffle(_tmp)
            tmp[l] = some_dict[l] * repeat_time + _tmp[:remainder]
            rand.shuffle(tmp[l])
        else:
            tmp[l] = some_dict[l]
    return tmp

In [28]:
btrain = upsampling_align(train)

upper bound: 3227


### features generation

In [29]:
def count_of_digits(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.match(r'\d+([\.,]\d+)?', i)]) / len(tokens)

def count_of_capslocks(tokens):
    # tokens = word_tokenize(text)
    return sum([i.isupper() for i in tokens]) / len(tokens)

def count_of_titles(tokens):
    # tokens = word_tokenize(text)
    return sum([(i.istitle() and not i.isupper()) for i in tokens]) / len(tokens)

def count_of_chars(tokens):
    # tokens = word_tokenize(text)
    return sum([(i.isalpha() and len(i) <= 1) for i in tokens]) / len(tokens)

def count_of_latin_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.match(r'[a-zA-Z]+$', i)]) / len(tokens)

def count_of_part_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^н[еи]', i, re.I)]) / len(tokens)

def count_of_bezs_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^бе[зс]', i, re.I)]) / len(tokens)

def count_of_vozs_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^во[зс]', i, re.I)]) / len(tokens)

def count_of_prie_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^пр[ие]', i, re.I)]) / len(tokens)

def count_of_pro_kontr_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^(против|контр)', i, re.I)]) / len(tokens)

def count_of_u_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^у\w+', i, re.I)]) / len(tokens)

def count_of_na_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^на\w+', i, re.I)]) / len(tokens)

def count_of_ot_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^от\w+', i, re.I)]) / len(tokens)

def count_of_do_words(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'^до\w+', i, re.I)]) / len(tokens)

def count_of_quotes(tokens):
    # tokens = word_tokenize(text)
    return (0.5 * len([i for i in tokens if re.search(r'[\"\«\»]', i)])) / len(tokens)

def count_of_exclaminations(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.search(r'!+', i)]) / len(tokens)

def count_of_questions(tokens):
    # tokens = word_tokenize(text)
    return len([i for i in tokens if re.match(r'\?+', i)]) / len(tokens)

def count_of_short_words(tokens):
    # tokens = word_tokenize(text)
    return sum([len(i) <= 3 for i in tokens]) / len(tokens)

def count_of_adverbs(pos_tags, tokens):
    tokens = word_tokenize(text)
    return sum([(w.isalpha() and p.startswith('ADV')) 
                for w, p in pos_tags]) / len(tokens)

def count_of_adjs(pos_tags, tokens):
    # tokens = word_tokenize(text)
    return sum([(w.isalpha() and p.startswith('A=')) 
                for w, p in pos_tags]) / len(tokens)

def count_of_verbs(pos_tags, tokens):
    # tokens = word_tokenize(text)
    return sum([(w.isalpha() and p.startswith('V')) 
                for w, p in pos_tags]) / len(tokens)


def featgen(texts):
    try:
        result = []
        for text in texts:
            tokens = word_tokenize(text)
            pos_tags = pos_tag(tokens, lang='rus')
            result.append(
                [
                    count_of_digits(tokens),
                    count_of_capslocks(tokens),
                    count_of_titles(tokens),
                    count_of_chars(tokens),
                    count_of_latin_words(tokens),
                    count_of_short_words(tokens),
                    count_of_part_words(tokens),
                    count_of_bezs_words(tokens),
                    count_of_vozs_words(tokens),
                    count_of_prie_words(tokens),
                    count_of_u_words(tokens),
                    count_of_pro_kontr_words(tokens),
                    count_of_do_words(tokens),
                    count_of_ot_words(tokens),
                    count_of_quotes(tokens),
                    count_of_exclaminations(tokens),
                    count_of_questions(tokens)
                ]
            )

        return np.array(result)
    except:
        print(text)


### SVM model training

In [30]:
def lemmatized_texts(X, morph):
    morph.start()
    X_new = [' '.join(morph.lemmatize(x)) for x in X]
    morph.close()
        
    return X_new

In [31]:
m_params = {}
m_params['solver'] = 'saga'
m_params['multi_class'] = 'multinomial'

In [32]:
# softmax = MLPClassifier((100, 100, 50), learning_rate_init=0.01, 
#                         activation='logistic') # LogisticRegression(**m_params)

softmax = VotingClassifier([
    ('gbm', LGBMClassifier(n_estimators=300, objective='multiclass')),
    ('svm', LinearSVC(max_iter=300)),
    ('lr', LogisticRegression(**m_params)),
    ('mlp', MLPClassifier((100, 100, 50), learning_rate_init=0.01,
                        activation='logistic'))
])

In [33]:
train_x = [j for i in sorted(btrain.keys()) for j in btrain[i]]
train_y = [i for i in sorted(btrain.keys()) for j in btrain[i]]

In [34]:
%%time
train_x_w2v = lemmatized_texts(train_x, morph)

CPU times: user 1min 14s, sys: 7.98 s, total: 1min 22s
Wall time: 5min 24s


In [70]:
%%time
features = featgen(train_x)

CPU times: user 11min 10s, sys: 2.84 s, total: 11min 13s
Wall time: 11min 22s


In [71]:
%%time
X_train = scs.hstack([
    vectorizer.transform(train_x_w2v), 
    pos_tfidf.transform(train_x),
    features
])



CPU times: user 11min 25s, sys: 4.44 s, total: 11min 29s
Wall time: 21min 42s


In [72]:
%%time
softmax.fit(X_train, train_y)



CPU times: user 3min 58s, sys: 11 s, total: 4min 9s
Wall time: 1min 52s


VotingClassifier(estimators=[('gbm', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,
        n_jobs=-1, num_leaves=31, objective='multiclass',
        ran...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

### evaluate the SVM model

In [38]:
test_x = [j for i in sorted(val.keys()) for j in val[i]]
true = [i for i in sorted(val.keys()) for j in val[i]]

In [39]:
%%time
test_x_w2v = lemmatized_texts(test_x, morph)

CPU times: user 10.6 s, sys: 2.15 s, total: 12.7 s
Wall time: 1min 7s


In [73]:
%%time
features = featgen(test_x)

CPU times: user 2min 13s, sys: 1.96 s, total: 2min 15s
Wall time: 2min 34s


In [74]:
%%time
X_test = scs.hstack([
    vectorizer.transform(test_x_w2v), 
    pos_tfidf.transform(test_x),
    features
])



CPU times: user 2min 13s, sys: 2.83 s, total: 2min 16s
Wall time: 2min 32s


In [75]:
%%time
pred = softmax.predict(X_test)



CPU times: user 568 ms, sys: 32.9 ms, total: 601 ms
Wall time: 497 ms


In [76]:
accuracy_score(true, pred)

0.65638233514821531

### macro recall

In [77]:
lab = LabelEncoder()
c_true = lab.fit_transform(true)
c_pred = lab.transform(pred)
print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))

             precision    recall  f1-score   support

   negative    0.51836   0.83624   0.64000       287
    neutral    0.73197   0.57869   0.64637       807
   positive    0.68478   0.67621   0.68047       559

avg / total    0.67893   0.65638   0.65679      1653



### balance score

In [45]:
bval = upsampling_align(val)

upper bound: 807


In [46]:
b_test_x = [j for i in sorted(bval.keys()) for j in bval[i]]
b_true = [i for i in sorted(bval.keys()) for j in bval[i]]

In [47]:
%%time
b_test_x_w2v = lemmatized_texts(b_test_x, morph)

CPU times: user 15.3 s, sys: 1.75 s, total: 17.1 s
Wall time: 1min 28s


In [78]:
%%time
features = featgen(b_test_x)

CPU times: user 2min 56s, sys: 1.77 s, total: 2min 58s
Wall time: 3min 4s


In [79]:
%%time
X_b_test = scs.hstack([
    vectorizer.transform(b_test_x_w2v), 
    pos_tfidf.transform(b_test_x),
    features
])



CPU times: user 3min 5s, sys: 2.45 s, total: 3min 7s
Wall time: 2min 58s


In [80]:
b_pred = softmax.predict(X_b_test)



In [81]:
lab = LabelEncoder()
c_true = lab.fit_transform(b_true)
c_pred = lab.transform(b_pred)
print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))

             precision    recall  f1-score   support

   negative    0.73449   0.83643   0.78216       807
    neutral    0.62433   0.57869   0.60064       807
   positive    0.73475   0.68649   0.70980       807

avg / total    0.69786   0.70054   0.69753      2421



### predict

In [None]:
X_sub = [i['text'] for i in raw_test]

In [None]:
%%time
X_sub_w2v = lemmatized_texts(X_sub, morph)

In [None]:
%%time
features = featgen(X_sub)

In [None]:
%%time
X_sub_test = scs.hstack([
    vectorizer.transform(X_sub_w2v), 
    pos_tfidf.transform(X_sub),
    features
])

In [None]:
sub_pred = softmax.predict(X_sub_test)
sub_df = pd.DataFrame()
sub_df['id'] = [i['id'] for i in raw_test]
sub_df['sentiment'] = sub_pred

In [None]:
sub_df.head()

In [None]:
sub_df.to_csv('output/mlp_my_w2v_pos_manual_feats.csv', index=False)