import dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import collections
import numpy as np
import operator

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pymorphy2
import re
from stop_words import get_stop_words
import pickle

In [2]:
# Columns to load from the training CSV. 'reting' is the column name exactly
# as it is spelled in the data file, so it must not be "corrected" here.
fields = [
    'sku',
    'comment',
    'commentNegative',
    'commentPositive',
    'reting',
]
raw_data = pd.read_csv(
    'feedback.csv/X_train.csv',
    sep=',',
    usecols=fields,
)
print('Data frame size:', raw_data.shape)

Data frame size: (15587, 5)


In [37]:
# Bucket the review rows by rating. The key names the ratings a bucket holds:
# '5' and '4' hold exactly that rating, '123' holds every other whole-number
# rating, and '1234' holds every whole-number rating that is not 5.
feedbacks = {'1234': [], '123': [], '5': [], '4': []}
rows_all = [record for _, record in raw_data.iterrows()]

for row in rows_all:
    # Rows whose rating is not a whole number are left out of every bucket.
    if not row['reting'].is_integer():
        continue
    rating = row['reting']
    if rating == 5:
        feedbacks['5'].append(row)
        continue
    # A non-5 rating lands in its own bucket and in the combined '1234' one.
    feedbacks['4' if rating == 4 else '123'].append(row)
    feedbacks['1234'].append(row)

# Shuffle every bucket in place (unseeded, so the order differs between runs).
for bucket in feedbacks.values():
    np.random.shuffle(bucket)

In [38]:
# Shared pymorphy2 analyzer; created once at module level and reused by
# to_normal() for every word.
morph = pymorphy2.MorphAnalyzer()

# Matches any single character that is not a lowercase Cyrillic letter
# (а-я plus ё, which lies outside the а-я range).
regex_clear = re.compile(r'[^а-яё]')
# Matches a run of one or more whitespace characters. Written as a raw string
# so that "\s" reaches the regex engine instead of being parsed as an (invalid)
# string escape. re.DOTALL was dropped from both patterns: it only changes the
# meaning of '.', and neither pattern contains one.
regex_space = re.compile(r'\s+')

def clear(text):
    """Return *text* lowercased with everything but Cyrillic letters blanked.

    'ё' is folded to 'е', every match of ``regex_clear`` is replaced by a
    space, and each match of ``regex_space`` is then replaced by a single
    space. Leading and trailing spaces are not stripped.
    """
    lowered = text.lower().replace('ё', 'е')
    letters_only = regex_clear.sub(' ', lowered)
    return regex_space.sub(' ', letters_only)

def prepare(text):
    """Return a lowercased copy of *text*; nothing else is changed."""
    lowered = text.lower()
    return lowered

def to_normal(word):
    """Return the ``normal_form`` of the first parse ``morph`` gives for *word*."""
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form
    
def normalize(text):
    """Replace every space-separated word of *text* with its normal form.

    The text is split on single spaces, each piece goes through to_normal(),
    the results are joined back with single spaces, and 'ё' is folded to 'е'
    in the final string.
    """
    lemmas = [to_normal(token) for token in text.split(' ')]
    return ' '.join(lemmas).replace('ё', 'е')

class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans each text and then normalizes it.

    Each text is passed through clear() and the result through normalize().
    """

    def fit(self, x, y=None):
        """Do nothing (there is no state to learn) and return self."""
        return self

    def transform(self, texts):
        """Return a list with the cleaned, normalized form of every text.

        Parameters:
            texts: iterable of strings.

        Returns:
            list of str, one entry per input text, in input order.
        """
        # normalize() must receive the *cleaned* text. The previous version
        # built the cleaned list and then normalized the raw texts, so the
        # cleaning step had no effect on the output.
        return [normalize(clear(text)) for text in texts]
    
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies prepare() to every text."""

    def fit(self, x, y=None):
        """Do nothing (there is no state to learn) and return self."""
        return self

    def transform(self, texts):
        """Return a list holding prepare(text) for each text, in input order."""
        return list(map(prepare, texts))
    
def evaluate_cross_validation(clf, X, y, K):
    """Print the accuracy of *clf* on each of K stratified folds and the mean.

    Folds come from StratifiedKFold with shuffling and a fixed random_state
    of 42. Two lines are printed: the array of per-fold scores, then the mean
    with the standard deviation. Nothing is returned.

    Parameters:
        clf: estimator handed to cross_val_score.
        X: feature matrix.
        y: target labels.
        K: number of folds.
    """
    folds = StratifiedKFold(K, shuffle=True, random_state=42)
    fold_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=folds)
    print(fold_scores)
    mean_score, spread = fold_scores.mean(), fold_scores.std()
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(mean_score, spread))
    
class RandomForestClassifierWithCoef(RandomForestClassifier):
    """RandomForestClassifier that also exposes its importances as ``coef_``.

    After fitting, ``coef_`` is set to ``feature_importances_`` —
    presumably so that tools which look for a ``coef_`` attribute can use
    the forest (confirm against the callers).
    """

    def fit(self, *args, **kwargs):
        """Fit the forest, mirror the importances into ``coef_``, return self.

        All arguments are forwarded unchanged to RandomForestClassifier.fit.

        Returns:
            self, as the scikit-learn estimator contract requires. The
            previous version returned None, which breaks chained calls such
            as ``clf.fit(X, y).predict(X)``.
        """
        super().fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self

In [39]:
def _build_pipeline():
    """Assemble the three-branch feature union followed by the random forest."""
    # Branch 1: counts of character 4-grams (char_wb) over lowercased text.
    # A TruncatedSVD(n_components=100) step named 'best' is left disabled here.
    char_ngram_counts = Pipeline([
        ('selector', TextCleaner()),
        ('cv', CountVectorizer(min_df=0.0001, max_df=0.95, ngram_range=(4, 4), analyzer="char_wb")),
    ])

    # Branch 2: word-level TF-IDF over lowercased text.
    # A TruncatedSVD(n_components=10) step named 'best' is left disabled here.
    # NOTE(review): `stop_words` is not defined in the visible part of this
    # file; it must already exist in the session when this runs — confirm.
    word_tfidf = Pipeline([
        ('selector', TextCleaner()),
        ('tfidf', TfidfVectorizer(analyzer="word", stop_words=stop_words)),
    ])

    # Branch 3: word counts over text processed by TextNormalizer, with the
    # 'ru' stop-word list.
    normalized_word_counts = Pipeline([
        ('selector', TextNormalizer()),
        ('cv', CountVectorizer(min_df=0.0001, max_df=0.95, stop_words=get_stop_words('ru'), analyzer='word')),
    ])

    # All three branches are concatenated with the same weight.
    features = FeatureUnion(
        transformer_list=[
            ('count_vectorizer', char_ngram_counts),
            ('tfidf', word_tfidf),
            ('count_vectorizer_default', normalized_word_counts),
        ],
        transformer_weights={
            'count_vectorizer': 1,
            'tfidf': 1,
            'count_vectorizer_default': 1,
        },
    )

    # Disabled alternative model: SVC(kernel='linear') (rbf, sigmoid, polynomial).
    classifier = RandomForestClassifierWithCoef(n_estimators=512, min_samples_leaf=5, n_jobs=-1)
    return Pipeline([
        ('union', features),
        ('model', classifier),
    ])


pipeline = _build_pipeline()

In [47]:
def getShuffledNormalizedFeedbacks(first, second):
    """Return a shuffled list with equally many rows from two buckets.

    Both buckets are looked up in the module-level ``feedbacks`` dict. The
    leading rows of each are taken, as many as the smaller bucket has, and
    the combined list is shuffled in place with np.random.shuffle.

    Parameters:
        first: key of the first bucket in ``feedbacks``.
        second: key of the second bucket in ``feedbacks``.

    Returns:
        A new list; the buckets themselves are not modified.
    """
    first_pool, second_pool = feedbacks[first], feedbacks[second]
    per_bucket = min(len(first_pool), len(second_pool))
    balanced = [*first_pool[:per_bucket], *second_pool[:per_bucket]]
    np.random.shuffle(balanced)
    return balanced

# Report, one value per line: the size of the '5' bucket, the size of the
# '1234' bucket, and the size of a balanced sample drawn from the two.
print(
    len(feedbacks['5']),
    len(feedbacks['1234']),
    len(getShuffledNormalizedFeedbacks('5', '1234')),
    sep='\n',
)

def isFive(num):
    """Map a rating to a binary label: 1 when it equals 5, otherwise -1."""
    return 1 if num == 5 else -1

# Balanced, shuffled sample built from the '5' bucket and the '1234' bucket.
feedbacks_5 = getShuffledNormalizedFeedbacks('5', '1234')

9211
6176
12352


In [48]:
# Bag-of-words baseline for the "is the rating 5?" task.
# NOTE(review): `stop_words` is not defined in the visible part of this file;
# it must already exist in the session when this cell runs — confirm.
count_vectorizer_five = CountVectorizer(min_df=0.0001, max_df=0.95, analyzer="word", stop_words=stop_words)

# Features come from the main comment text only; labels are 1 for rating 5
# and -1 for anything else.
X = count_vectorizer_five.fit_transform(r['comment'] for r in feedbacks_5)
y = [isFive(r['reting']) for r in feedbacks_5]

# Total number of occurrences of each vocabulary term over all comments.
term_totals = np.asarray(X.sum(axis=0)).ravel()
# vocabulary_ maps term -> column index and exists in every scikit-learn
# release, unlike get_feature_names(), which was removed in 1.2. Walking it in
# column order keeps the same term order get_feature_names() produced, so
# terms with equal counts still sort the same way below.
frequent_words = {
    term: term_totals[column]
    for term, column in sorted(count_vectorizer_five.vocabulary_.items(), key=operator.itemgetter(1))
}
sorted_frequent_words = sorted(frequent_words.items(), key=operator.itemgetter(1))
print('the most frequent words', sorted_frequent_words[-20:])

clf_five = RandomForestClassifierWithCoef(n_estimators=512, min_samples_leaf=10, n_jobs=-1)
evaluate_cross_validation(clf_five, X, y, 6)

the most frequent words [('довольна', 701), ('деньги', 707), ('супер', 707), ('телефон', 713), ('купили', 714), ('рекомендую', 733), ('цена', 748), ('купила', 751), ('модель', 824), ('хороший', 826), ('звук', 874), ('отличный', 878), ('вообще', 901), ('отлично', 913), ('быстро', 921), ('советую', 986), ('пользуюсь', 997), ('качество', 1086), ('купил', 1391), ('работает', 1665)]
[ 0.75582524  0.74757282  0.77016521  0.77259475  0.76676385  0.77065112]
Mean score: 0.764 (+/-0.009)
