### import dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import collections
import numpy as np
import operator

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pymorphy2
import re
from stop_words import get_stop_words
import pickle

### load data

In [2]:
fields = ['sku', 'comment', 'commentNegative', 'commentPositive', 'reting']
raw_data = pd.read_csv('feedback.csv/X_train.csv', delimiter=',', usecols=fields)
print('Data frame size:',raw_data.shape)

Data frame size: (15587, 5)


### feedbacks by rating, test dataset

In [3]:
feedbacks_keys = ['1', '2', '3', '4', '5', '123', '1234', '12345', '23'];
feedbacks = {}
rows = list(map(lambda x: x[1], raw_data.iterrows()))
np.random.shuffle(rows)
train_size = int(raw_data.shape[0]*.95)
rows_all = rows[:train_size]
rows_test = rows[train_size:]

for row in rows_all:
    if not row['reting'].is_integer():
        continue
    rating = row['reting']
    for key in feedbacks_keys:
        if str(int(rating)) in key:
            if not key in feedbacks:
                feedbacks[key] = []
            feedbacks[key].append(row)

for f in feedbacks:
    np.random.shuffle(feedbacks[f])

### functions to normilize text and evaluate results

In [4]:
morph = pymorphy2.MorphAnalyzer()

regex_clear = re.compile('[^а-яё]',re.DOTALL)
regex_space = re.compile('\s{1,}',re.DOTALL)

def clear(text):
    tmp = re.sub(regex_clear, ' ', text.lower().replace('ё', 'е'))
    return re.sub(regex_space, ' ', tmp)

def prepare(text):
    return text.lower()

def to_normal(word):
    return morph.parse(word)[0].normal_form
    
def normalize(text):
    words = []
    for word in text.split(' '):
        words.append(to_normal(word))
    norm=' '.join(words)  
    return norm.replace('ё', 'е')

class TextNormalizer(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, texts):
        cleared = [clear(text) for text in texts]
        return [normalize(text) for text in texts]
    
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, texts):
        return [prepare(text) for text in texts]
    
def evaluate_cross_validation(clf, X, y, K):
    cv = StratifiedKFold(K, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv) 
    print(scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(scores.mean(), scores.std()))
    
class RandomForestClassifierWithCoef(RandomForestClassifier):
    def fit(self, *args, **kwargs):
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_

### feature union

In [5]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('count_vectorizer', Pipeline([
                ('selector', TextCleaner()),
                ('cv', CountVectorizer(min_df=0.0001, max_df=0.95, ngram_range=(4,4), analyzer="char_wb")),
                #('best', TruncatedSVD(n_components=100)),
            ])),
            ('tfidf', Pipeline([
                 ('selector', TextCleaner()),
                 ('tfidf', TfidfVectorizer(analyzer="word", stop_words=get_stop_words('ru'))),
                 #('best', TruncatedSVD(n_components=10)), 
             ])),
            ('count_vectorizer_default', Pipeline([
                ('selector', TextNormalizer()),
                ('cv', CountVectorizer(min_df=0.0001, max_df=0.95, stop_words=get_stop_words('ru'), analyzer='word')),                                
            ])),

        ],
        transformer_weights={ 
            'count_vectorizer': 1,
            'tfidf': 1,
            'count_vectorizer_default': 1
       },
    )),

    #('model', SVC(kernel='linear')) #rbf, sigmoid, polynomial
    ('model', RandomForestClassifierWithCoef(n_estimators=512, min_samples_leaf=1, n_jobs=-1))
])

### normilized and shuffled datasets to fit classifiers

In [6]:
def getShuffledNormalizedFeedbacks(keys):
    result = []
    feedbacks_by_keys = []
    feedbacks_limit = 0
    for key in keys:
        f = feedbacks[key]
        feedbacks_limit = len(f) if feedbacks_limit == 0 else min(feedbacks_limit, len(f))
        feedbacks_by_keys.append(f)
    for f in feedbacks_by_keys:
        result.extend(list(f[:feedbacks_limit]))
    np.random.shuffle(result)
    return result

def toInput(num, val):
    if num == val:
        return 1
    else:
        return -1

feedbacks_5 = getShuffledNormalizedFeedbacks(['5', '1234']);
feedbacks_4 = getShuffledNormalizedFeedbacks(['4', '123']);
feedbacks_1 = getShuffledNormalizedFeedbacks(['1', '23']);
feedbacks_2 = getShuffledNormalizedFeedbacks(['2', '3']);

In [7]:
def to_dataframe(rows, num):
    tmp_df = pd.DataFrame()
    tmp_df['comment'] = list(r['comment'] for r in rows)
    if num > 0:
        tmp_df['result'] = list(toInput(r['reting'],num) for r in rows)
    else:
        tmp_df['result'] = list(int(r['reting']) for r in rows)
    return tmp_df

### fit classifiers

#### five or below classifier

In [8]:
feedback_5_df = to_dataframe(feedbacks_5, 5)
X_train, X_test, y_train, y_test = train_test_split(feedback_5_df['comment'], feedback_5_df['result'], test_size = 0.05)
pipeline.fit(X_train, y_train)
print('5 against 1234 report:')
print(classification_report(pipeline.predict(X_test), y_test))
pickle.dump(pipeline, open('mvideo.feedback.pickle.5', 'wb'))

5 against 1234 report:
             precision    recall  f1-score   support

         -1       0.78      0.78      0.78       306
          1       0.77      0.77      0.77       282

avg / total       0.78      0.78      0.78       588



#### four or below classifier

In [9]:
feedback_4_df = to_dataframe(feedbacks_4, 4)
X_train, X_test, y_train, y_test = train_test_split(feedback_4_df['comment'], feedback_4_df['result'], test_size = 0.05)
pipeline.fit(X_train, y_train)
print('4 against 123 report:')
print(classification_report(pipeline.predict(X_test), y_test))
pickle.dump(pipeline, open('mvideo.feedback.pickle.4', 'wb'))

4 against 123 report:
             precision    recall  f1-score   support

         -1       0.65      0.72      0.69       112
          1       0.75      0.69      0.72       138

avg / total       0.71      0.70      0.70       250



#### one or above classifier

In [10]:
feedbacks_1_df = to_dataframe(feedbacks_1, 1)
X_train, X_test, y_train, y_test = train_test_split(feedbacks_1_df['comment'], feedbacks_1_df['result'], test_size = 0.05)
pipeline.fit(X_train, y_train)
print('1 against 23 report:')
print(classification_report(pipeline.predict(X_test), y_test))
pickle.dump(pipeline, open('mvideo.feedback.pickle.1', 'wb'))

1 against 23 report:
             precision    recall  f1-score   support

         -1       0.79      0.59      0.68        83
          1       0.56      0.77      0.65        57

avg / total       0.70      0.66      0.67       140



#### two or three classifier

In [11]:
feedbacks_2_df = to_dataframe(feedbacks_2, 2)
X_train, X_test, y_train, y_test = train_test_split(feedbacks_2_df['comment'], feedbacks_2_df['result'], test_size = 0.05)
pipeline.fit(X_train, y_train)
print('2 against 3 report:')
print(classification_report(pipeline.predict(X_test), y_test))
pickle.dump(pipeline, open('mvideo.feedback.pickle.2', 'wb'))

2 against 3 report:
             precision    recall  f1-score   support

         -1       0.64      0.61      0.63        44
          1       0.57      0.61      0.59        38

avg / total       0.61      0.61      0.61        82



### feedback prediction functions

In [12]:
pipeline_5 = pickle.load(open('mvideo.feedback.pickle.5', 'rb'))
pipeline_4 = pickle.load(open('mvideo.feedback.pickle.4', 'rb'))
pipeline_2 = pickle.load(open('mvideo.feedback.pickle.2', 'rb'))
pipeline_1 = pickle.load(open('mvideo.feedback.pickle.1', 'rb'))

def predict(text):
    if pipeline_5.predict([text])[0] == 1:
        return 5
    elif pipeline_4.predict([text])[0] == 1:
        return 4
    elif pipeline_1.predict([text])[0] == 1:
        return 1
    elif pipeline_2.predict([text])[0] == 1:
        return 2
    else:
        return 3

def predict_all(text_array):
    result = []
    for text in text_array:
        result.append(predict(text))
    return result

### prediction examples

In [13]:
for row in rows_all[:20]:
    print(row['comment'][:50],'... actual:',row['reting'],'predicted:',predict(row['comment']))

Стильный,хорошо смотрится ... actual: 1.0 predicted: 1
Компьютер очень качественный не считая того, что н ... actual: 5.0 predicted: 5
Класс!!!!!  Можно камеру подключить к  ps3 ... actual: 5.0 predicted: 5
Брали в М.Видео месяц назад и пока не пожалели. От ... actual: 5.0 predicted: 5
Рекомендую посмотреть всем мужчинам, несмотря на т ... actual: 5.0 predicted: 5
Телефончик подарил муж.Спасибо тебе милый!Я в вост ... actual: 5.0 predicted: 5
Самый Coolьный телефон по всей планете! ... actual: 5.0 predicted: 5
Отлично работает, очень приятный дизайн. ... actual: 5.0 predicted: 5
Телевизор за свои деньги устроил полностью, всё чт ... actual: 4.0 predicted: 4
Очень классный плеер, всё в нём круто кроме батаре ... actual: 5.0 predicted: 5
Симпатичный дизайн, хорошо вписалась в ванную. Оче ... actual: 5.0 predicted: 5
Этот тостер нам с супругой подарила теща. Поначалу ... actual: 3.0 predicted: 3
Использование около года, вкус воды лучше чем в пл ... actual: 5.0 predicted: 5
Я пользуюсь уж

### results on the test set

In [14]:
feedbacks_test = to_dataframe(rows_test, 0)
print(classification_report(predict_all(feedbacks_test['comment']), feedbacks_test['result']))

             precision    recall  f1-score   support

          1       0.67      0.49      0.56       102
          2       0.20      0.19      0.19        43
          3       0.15      0.27      0.19        41
          4       0.46      0.31      0.37       178
          5       0.74      0.84      0.78       416

avg / total       0.61      0.61      0.60       780

