In [None]:
#import libs
import gensim
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score,balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import *
from nltk.stem import WordNetLemmatizer
import pymorphy2
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier 

from bs4 import BeautifulSoup
import re
import csv

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from random import shuffle
import itertools

import multiprocessing
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter

In [None]:
#preprocessing
def clean_text(text):
    """Extract the title and body of a raw message and normalise them.

    The raw message is expected to contain the markers "Заголовок:",
    "Оценка:", "Текст", "Адрес отзыва:" and "Адрес вопроса:". The title
    (between "Заголовок:" and "Оценка:") and the body (the segment after the
    first "Текст" marker, cut at the first address marker) are concatenated,
    stripped of HTML, lower-cased and have URLs / percent signs replaced by
    placeholder words.

    Parameters
    ----------
    text : str
        Raw message. Non-string values (e.g. None or NaN from an empty CSV
        cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned text; "" for non-string input.
    """
    # Empty CSV cells arrive as NaN/None; treat them as "no text" instead of crashing.
    if not isinstance(text, str):
        return ""

    header = ""
    sections = text.split("Текст")
    if len(sections) > 1:
        if "Заголовок:" in text:
            header = text.split("Заголовок:")[1].split("Оценка:")[0]
        body = sections[1]
    else:
        # No "Текст" marker: keep the whole message rather than raising IndexError.
        body = text

    body = body.split("Адрес отзыва:")[0]
    # Accept the marker with either a Latin 'A' or a Cyrillic 'А' as first letter.
    body = re.split('[A\u0410]дрес вопроса:', body)[0]
    text = header + body

    text = BeautifulSoup(text, 'lxml').text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    # Drops every Latin 'x' (presumably masking characters such as "xxxx" -- TODO confirm).
    text = text.replace('x', '')
    text = text.replace('\n', ' ')
    text = text.replace('%', ' <проценты>')
    return text

# Russian Snowball stemmer (only referenced from commented-out code in the visible cells).
stemmer = SnowballStemmer('russian')

# Tokens that the preprocess functions collapse into the '<дата>' placeholder.
date_list = [
    'года', 'месяца',
    'января', 'февраля', 'марта', 'апреля', 'мая', 'июня',
    'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря',
    'месяце',
    'январе', 'феврале', 'марте', 'апреле', 'мае', 'июне',
    'июле', 'августе', 'сентябре', 'октябре', 'ноябре', 'декабре',
]
# Tokens rewritten to 'рублей'.
rubles_list = ['руб']
# Tokens collapsed into the '<локация>' placeholder.
location_list = ['ул', 'улица', 'кор', 'пр', 'пр-кт', 'проезд', "проспект", 'гор', "пр-т"]

# Project-specific stop words, one per line, read from a cp1251-encoded file.
my_stopwords_rus = []
with open('my_stopwords_rus.txt', encoding="cp1251") as file:
    my_stopwords_rus = list(map(str.strip, file))
    
def preprocess(text):
    """Tokenise a raw message into a list of filtered tokens (no lemmatisation).

    The message is cleaned with ``clean_text`` and tokenised with
    ``gensim.utils.simple_preprocess`` (tokens of 2..30 characters). Tokens
    found in the NLTK Russian stop-word list or in ``my_stopwords_rus`` are
    dropped. Tokens listed in ``date_list`` / ``rubles_list`` /
    ``location_list`` are replaced by '<дата>' / 'рублей' / '<локация>'.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in order of appearance.
    """
    # stopwords.words() returns a list; calling it inside the loop rebuilt it
    # for every token and scanned it linearly. Build one set per document.
    blocked = set(stopwords.words('russian'))
    blocked.update(my_stopwords_rus)

    result = []
    for token in gensim.utils.simple_preprocess(clean_text(text), min_len=2, max_len=30):
        if token in blocked:
            continue
        if token in date_list:
            token = '<дата>'
        elif token in rubles_list:
            token = 'рублей'
        elif token in location_list:
            token = '<локация>'
        result.append(token)
    return result

def get_len(a):
    """Return the length of ``a`` scaled by a factor of 0.001."""
    scale = 0.001
    return len(a) * scale

def tfidf_clean_text(text):
    """Normalise a raw message for the TF-IDF pipeline.

    Unlike ``clean_text`` this keeps the whole message (no title/body
    extraction). The text is lower-cased, stripped of HTML, and has '|||'
    separators, URLs, Latin 'x' characters, newlines and percent signs
    replaced or removed.

    Parameters
    ----------
    text : str
        Raw message. Non-string values (e.g. None or NaN from an empty CSV
        cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned text; "" for non-string input.
    """
    # Empty CSV cells arrive as NaN/None; treat them as "no text" instead of crashing.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = BeautifulSoup(text, 'lxml').text
    text = re.sub(r'\|\|\|', r' ', text)
    # Lower-casing happens before this substitution, so the placeholder stays upper-case here.
    text = re.sub(r'http\S+', r'<URL>', text)
    # Drops every Latin 'x' (presumably masking characters such as "xxxx" -- TODO confirm).
    text = text.replace('x', '')
    text = text.replace('\n', ' ')
    text = text.replace('%', ' <проценты>')
    return text

# Shared morphological analyser used to lemmatise tokens.
morph= pymorphy2.MorphAnalyzer()

def tfidf_preprocess(text):
    """Tokenise and lemmatise a raw message for the TF-IDF vectoriser.

    The message is cleaned with ``tfidf_clean_text`` and tokenised with
    ``gensim.utils.simple_preprocess`` (tokens of 2..30 characters). Stop
    words (NLTK Russian list plus ``my_stopwords_rus``) are dropped. Tokens
    listed in ``date_list`` / ``rubles_list`` / ``location_list`` become
    '<дата>' / 'рублей' / '<локация>'. Every other token is replaced by the
    normal form of its first pymorphy2 parse, and dropped if that normal form
    is itself a stop word.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Placeholders and lemmas, in order of appearance.
    """
    # stopwords.words() returns a list; the original rebuilt and scanned it up
    # to twice per token. Build one set per document instead.
    blocked = set(stopwords.words('russian'))
    blocked.update(my_stopwords_rus)

    result = []
    for token in gensim.utils.simple_preprocess(tfidf_clean_text(text), min_len=2, max_len=30):
        if token in blocked:
            continue
        if token in date_list:
            result.append('<дата>')
        elif token in rubles_list:
            result.append('рублей')
        elif token in location_list:
            result.append('<локация>')
        else:
            norm = morph.parse(token)[0].normal_form
            # The lemma can be a stop word even when the surface form was not.
            if norm not in blocked:
                result.append(norm)
    return result


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array
        Confusion matrix; indexed as ``cm[i, j]`` and annotated with true
        labels on the y axis and predicted labels on the x axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When true, every row is divided by its sum before plotting and the
        cell values are printed with two decimals; otherwise they are
        formatted as integers.
    title : str
        Plot title.
    cmap : colormap
        Colormap passed to ``plt.imshow``.

    Side effects: prints which variant is drawn and sets the global
    matplotlib ``font.size`` to 22.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    plt.rcParams.update({'font.size':22})
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half of the maximum get white text so it stays readable.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Path to the ';'-separated, cp1251-encoded CSV with the labelled messages.
# It must be set before this cell is run.
file_path=None
if file_path is None:
    raise ValueError("file_path is not set: point it at the input CSV before running this cell")

try:
    # pandas >= 1.3: 'warn' skips malformed rows and reports them, which is what
    # error_bad_lines=False did with its default warn_bad_lines=True.
    data=pd.read_csv(file_path,on_bad_lines='warn', encoding = "cp1251", sep=';')
except TypeError:
    # Older pandas does not know on_bad_lines; fall back to the legacy flag.
    data=pd.read_csv(file_path,error_bad_lines=False, encoding = "cp1251", sep=';')

In [None]:
%%time
# Token lists without lemmatisation (consumed by the Doc2Vec section).
data['PROCESSED']=data['MSG_TEXT'].apply(preprocess)
# Lemmatised token lists. NOTE(review): this column is not read again in the
# visible cells -- the TF-IDF vectoriser re-tokenises MSG_TEXT itself.
data['PROCESSED_LEMMA']=data['MSG_TEXT'].apply(tfidf_preprocess)

# 4. Tf-Idf

In [None]:
# TF-IDF over unigrams and bigrams of the tokens produced by tfidf_preprocess.
# max_df=0.8 / min_df=5 are passed to drop very common and very rare terms.
# NOTE(review): tfidf_preprocess already filters my_stopwords_rus, so the
# stop_words argument looks redundant -- confirm before removing.
tfidf= TfidfVectorizer(use_idf=True, tokenizer=tfidf_preprocess, analyzer='word', stop_words = my_stopwords_rus, ngram_range=(1,2),max_df=0.8,min_df=5)
# Fit only; the training matrix is produced by a separate transform() call.
tfidf.fit(data['MSG_TEXT'])

In [None]:
# TF-IDF matrix of the training messages (tokenises MSG_TEXT a second time).
x_tf_train = tfidf.transform(data['MSG_TEXT'])
# Target labels for the TF-IDF models.
y_tf_train = data["CLASS"]

In [None]:
# Project the TF-IDF matrix onto 300 SVD components (seeded).
svd = TruncatedSVD(n_components=300,random_state=42)
x_tf_train_SVD = svd.fit_transform(x_tf_train)

# Final training features: one array per row, holding the 300 SVD components
# followed by that row's NAR_REI and MARK values.
x_tf_final = [
    np.append(doc_vec, [nar_rei, mark])
    for doc_vec, nar_rei, mark in zip(x_tf_train_SVD, data['NAR_REI'], data['MARK'])
]

In [None]:
# NOTE(review): test_dataset is not defined in the visible cells -- it must be
# created (with the same columns as `data`) before this cell runs.
# Project the test TF-IDF matrix with the SVD that was fitted on the training
# data. Re-fitting here (fit_transform) would place the test vectors in a
# different component space from the one the classifiers were trained on.
x_tf_test_SVD = svd.transform(tfidf.transform(test_dataset['MSG_TEXT']))
# Same layout as the training features: SVD components, then NAR_REI and MARK.
x_tf_test_final = pd.DataFrame({'VEC':list(x_tf_test_SVD),'NAR_REI':test_dataset['NAR_REI'],'MARK':test_dataset['MARK']})
x_tf_test_final = x_tf_test_final.values
x_tf_test_final = [np.append(x[0],[x[1],x[2]]) for x in x_tf_test_final]
# Ground-truth labels of the test set.
y_tf_test = test_dataset["CLASS"]


## 4.1 Tf-Idf. CV LogReg, CV XGB

#### a) LogReg

In [None]:
%%time
# Multinomial L2-penalised logistic regression on the TF-IDF/SVD features.
logreg = LogisticRegression(penalty='l2', solver='lbfgs',n_jobs=-1, multi_class="multinomial")
# Single-candidate grid: the search is used only to get 10-fold CV scores.
logreg_params={'C':[1e5]}
# Scores both balanced accuracy and macro F1; the model is refitted on (and
# best_score_ reports) balanced accuracy.
best_logreg = GridSearchCV(logreg,logreg_params,cv=10,n_jobs=-1,refit ='balanced_accuracy',verbose=True, scoring=['balanced_accuracy','f1_macro'])
best_logreg.fit(x_tf_final,y_tf_train)


In [None]:
# Parameter combination selected by the grid search.
best_logreg.best_params_

In [None]:
# Mean cross-validated score of the selected candidate for the refit metric.
best_logreg.best_score_

#### b) XGB

In [None]:
%%time
# Gradient-boosted trees on the TF-IDF/SVD features with fixed hyper-parameters.
xgb = XGBClassifier(learning_rate=0.1,n_estimators=150,max_depth=10,min_child_weight=7,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=1e-05)
# Empty grid: the search evaluates only the estimator as configured above.
xgb_params={}

# 10-fold CV scoring balanced accuracy and macro F1; refit on balanced accuracy.
best_xgb = GridSearchCV(xgb,xgb_params,cv=10,n_jobs=-1,refit ='balanced_accuracy',verbose=True, scoring=['balanced_accuracy','f1_macro'])
# x_tf_final is a list of per-row arrays; it is converted to one array for the fit.
best_xgb.fit(np.asarray(x_tf_final),y_tf_train)


In [None]:
# Parameter combination selected by the grid search (the grid passed to it is
# empty, so this is presumably an empty dict).
best_xgb.best_params_

In [None]:
# Mean cross-validated score of the selected candidate for the refit metric.
best_xgb.best_score_

# 5. Doc2Vec

In [None]:
# Give every row an ID (its index value) to use as the document tag.
data['ID']=data.index.values

# Two taggings of the same token lists:
#   data_tagged    -- tagged with the class label (used later to read targets),
#   doc2vec_tagged -- tagged with the row ID (used to train the model).
data_tagged = data.apply(lambda r: TaggedDocument(r['PROCESSED'], tags=[r.CLASS]), axis=1)
doc2vec_tagged = data.apply(lambda r: TaggedDocument(r['PROCESSED'], tags=[r.ID]), axis=1)


cores= multiprocessing.cpu_count()

# Create the model WITHOUT a corpus. Passing the corpus to the constructor
# already builds the vocabulary and trains, so the explicit train() call that
# followed trained the model a second time with a restarted learning rate.
model_dbow = Doc2Vec(dm=0, window=5, vector_size=300,negative=5,hs=0,min_count=5,sample=1e-5,workers=cores,epochs=30)

# list(...) makes a real copy; slicing the array with [:] returned a view, so
# shuffling it also reordered the data behind doc2vec_tagged.
doc_list=list(doc2vec_tagged.values)
shuffle(doc_list)

model_dbow.build_vocab(doc_list)
model_dbow.train(doc_list, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

def vec_for_learning(model, tagged_docs):
    """Infer a vector for every tagged document and pair it with its first tag.

    Parameters
    ----------
    model : object exposing ``infer_vector(words)``
    tagged_docs : object whose ``.values`` iterates over documents that have
        ``tags`` and ``words`` attributes

    Returns
    -------
    (targets, regressors) : two tuples of equal length -- the first tag of
        each document and the vector inferred from its words.
    """
    pairs = []
    for document in tagged_docs.values:
        label = document.tags[0]
        pairs.append((label, model.infer_vector(document.words)))
    targets, regressors = zip(*pairs)
    return targets, regressors

# data_tagged carries the CLASS label as its tag, so y_all holds the class
# labels and x_all the vectors inferred by the trained model.
y_all,x_all = vec_for_learning(model_dbow, data_tagged)

In [None]:
# Training features: each inferred document vector followed by the row's
# NAR_REI and MARK values.
# NOTE: x_all is rebuilt from itself, so running this cell twice appends the
# two extra columns twice.
x_all = [
    np.append(doc_vec, [nar_rei, mark])
    for doc_vec, nar_rei, mark in zip(x_all, data['NAR_REI'], data['MARK'])
]

# Test features in the same layout, with vectors inferred from the test tokens.
x_test = [model_dbow.infer_vector(tokens) for tokens in test_dataset['PROCESSED']]
x_test = [
    np.append(doc_vec, [nar_rei, mark])
    for doc_vec, nar_rei, mark in zip(x_test, test_dataset['NAR_REI'], test_dataset['MARK'])
]

### 5.1 Doc2Vec. CV LogReg, CV XGB

#### a) LogReg

In [None]:
%%time
# Multinomial L2-penalised logistic regression on the Doc2Vec features.
logreg = LogisticRegression(penalty='l2', solver='lbfgs',n_jobs=-1, multi_class="multinomial")
# Single-candidate grid: the search is used only to get 10-fold CV scores.
logreg_params={'C':[1e5]}
# Scores both balanced accuracy and macro F1; refit on balanced accuracy.
# This rebinds best_logreg, replacing any search stored under that name earlier.
best_logreg = GridSearchCV(logreg,logreg_params,cv=10,n_jobs=-1,refit ='balanced_accuracy',verbose=True, scoring=['balanced_accuracy','f1_macro'])
best_logreg.fit(x_all,y_all)

In [None]:
# Mean cross-validated score of the selected candidate for the refit metric.
best_logreg.best_score_

#### b) XGB

In [None]:
%%time
# Gradient-boosted trees on the Doc2Vec features with fixed hyper-parameters.
xgb = XGBClassifier(learning_rate=0.1,n_estimators=150,max_depth=10,min_child_weight=7,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=1e-05)
# Empty grid: the search evaluates only the estimator as configured above.
xgb_params={}
# 10-fold CV scoring balanced accuracy and macro F1; refit on balanced accuracy.
best_xgb = GridSearchCV(xgb,xgb_params,cv=10,n_jobs=-1,refit ='balanced_accuracy',verbose=True, scoring=['balanced_accuracy','f1_macro'])
# Fit on the Doc2Vec features and labels. The original passed the TF-IDF
# features (x_tf_final, y_tf_train) here, so this section re-scored the
# TF-IDF model instead of the Doc2Vec one.
best_xgb.fit(np.asarray(x_all),np.asarray(y_all))

In [None]:
# Mean cross-validated score of the selected candidate for the refit metric.
best_xgb.best_score_

### 5.2 Doc2Vec. Test LogReg, Test XGB

In [None]:
%%time

# Doc2Vec training features and labels as arrays for the resampler.
np_x_all=np.array(x_all)
np_y_all=np.array(y_all)

# Oversample minority classes on the training data only. The sampler is seeded
# (same seed as the SVD in this notebook) so the metrics below are reproducible.
sampler = SMOTE(random_state=42)
res_x,res_y = sampler.fit_resample(np_x_all,np_y_all)

# Train on the resampled data, evaluate on the untouched test set.
classifier = LogisticRegression(C=1e5, penalty='l2', solver='lbfgs',n_jobs=-1, multi_class="multinomial")
classifier.fit(res_x,res_y)
y_test_pred = classifier.predict(x_test)

print ("F1 W: {}".format(f1_score(y_tf_test,y_test_pred,average="weighted")))
print ("F1 MAC: {}".format(f1_score(y_tf_test,y_test_pred,average="macro")))
print ("B ACC: {}".format(balanced_accuracy_score(y_tf_test,y_test_pred)))

In [None]:
%%time
# XGBoost trained on the resampled features (res_x, res_y) and scored on the test set.
# NOTE(review): objective='reg:logistic' is unusual for a classifier on these
# labels, and reg_alpha is not set here although the CV cells set it -- confirm
# both are intended.
classifier = XGBClassifier(max_depth=10,min_child_weight=7,learning_rate=0.1,n_estimators=150,seed=0,subsample=0.8,colsample_bytree=0.8,objective='reg:logistic',n_jobs=-1)
classifier.fit(res_x,res_y)
# Rebinds y_test_pred, replacing any predictions stored under that name earlier.
y_test_pred = classifier.predict(x_test)

# Weighted F1, macro F1 and balanced accuracy against the test labels.
print ("F1 W: {}".format(f1_score(y_tf_test,y_test_pred,average="weighted")))
print ("F1 MAC: {}".format(f1_score(y_tf_test,y_test_pred,average="macro")))
print ("B ACC: {}".format(balanced_accuracy_score(y_tf_test,y_test_pred)))

### 5.3 Doc2Vec. Test XGB Confusion Matrix

In [None]:
import matplotlib

# Same tick-label size on both axes of the plots below.
for tick_group in ('xtick', 'ytick'):
    matplotlib.rc(tick_group, labelsize=20)

# Class labels in order of first appearance in the test set; this order also
# fixes the row/column order of the confusion matrix.
class_names=test_dataset['CLASS'].unique().tolist()
print(class_names)

# Confusion matrix of the most recent predictions stored in y_test_pred.
cnf_matrix = confusion_matrix(y_true=y_tf_test,y_pred=y_test_pred,labels=class_names)
np.set_printoptions(precision=2)

# Raw counts.
plt.figure(figsize=(20,20))
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix, without normalization')

# Row-normalised values.
plt.figure(figsize=(20,20))
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      normalize=True,
                      title='Normalized confusion matrix')
plt.show()