In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

In [2]:
# Read data
data = pd.read_csv("X_train.csv")

In [3]:
data

Unnamed: 0,sku,categoryLevel1Id,categoryLevel2Id,brandId,property,userName,reting,date,comment,commentNegative,commentPositive
0,20005023,401,4010201,826,"[{34: 'f982777489055c6563d68c005fd24aad'}, {36...",b2898a81b45310b30beb8fc0c0a9ce1e,2.0,2013-06-28,"2,5 года работала и все...устала! Лампочка гор...",,
1,20020647,403,4030101,1425,"[{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...",538c73d64461e13907bb95c51c38bfbc,2.0,2010-07-04,Через 2 месяца после истечении гарантийного ср...,,
2,20020701,401,4010401,124,"[{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...",ddca2d0101513a6209db7868eed8be05,4.0,2010-05-27,пользуюсь уже три недели. нареканий ни каких н...,,
3,30012256,203,2030301,93,"[{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...",289c20015b3713a82ba5ddf774d996f7,5.0,2016-10-11,Ребят этот системный блок подойдёт для игры кс...,,
4,30011341,205,2050201,656,"[{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...",5576f82d149d4f688644fef2322c63ef,5.0,2010-02-26,"я считаю, что яри замечательный телефон! Прият...",,
5,20023626,405,4050102,829,"[{12671: 'e04af96afe53462f72f39331b209a810'}, ...",2daa7d6326bc2918c6dd46e35ace6b6d,1.0,2014-05-21,Сегодня купила 2 таких вентилятора! Ужасный. С...,,
6,30011639,203,2030201,995,"[{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...",d6c4575df3a246fe76b975dec9201a84,5.0,2010-02-27,"привезли ноут, по качеству корпуса и дизайну 5...",,
7,20022938,404,4040203,759,"[{897: 'e4da3b7fbbce2345d7772b0674a318d5'}, {2...",720167647dffa170deceda1549c0c906,2.0,2015-01-16,"Купила этот увлажнитель, шумный, подсветка меш...",,
8,20007867,406,4060101,48,"[{769: '6b101662e3fb18552fa38924077c789a'}, {1...",7bb1d51e3697a6f9596e9c817c421962,5.0,2010-08-06,Комбайн отличный. Пользуюсь 2 года. Пользуюсь ...,,
9,20002766,412,4120101,11,"[{34: 'f982777489055c6563d68c005fd24aad'}, {36...",009763d797b076d86bc5ce262cbf1de0,5.0,2012-07-23,Отличный аппарат. авто выключение вообще преле...,,


In [4]:
comments = data["comment"].values
comments_pos = data["commentPositive"].values
comments_neg = data["commentNegative"].values
ratings = data["reting"].values

In [5]:
neg_inds = np.arange(comments_neg.size)[~pd.isnull(comments_neg)]
pos_inds = np.arange(comments_pos.size)[~pd.isnull(comments_pos)]

In [6]:
# Merge all comments with positive and negative ones
split_array = np.array([' ' for _ in range(comments.size)])
comments[neg_inds] += split_array[neg_inds] + comments_neg[neg_inds]
comments[pos_inds] += split_array[pos_inds] + comments_pos[pos_inds]

In [7]:
# Some enhancements
word_punct_tokenizer = WordPunctTokenizer()
snowball_stemmer = SnowballStemmer("russian")

for i in range(comments.size):
    comments[i] = ' '.join([snowball_stemmer.stem(word) for word in word_punct_tokenizer.tokenize(comments[i]) if word not in punctuation])

In [8]:
# Train-test inds split
train_inds, test_inds = train_test_split(np.arange(ratings.size), test_size=0.33, random_state=42)

In [9]:
X_train, y_train = comments[train_inds], ratings[train_inds]
X_test, y_test = comments[test_inds], ratings[test_inds]

In [10]:
# Some tf-idf
vectorizer = TfidfVectorizer(max_df=0.95)  
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [11]:
vectorizer.vocabulary_

{'пережима': 13504,
 'услыш': 20335,
 'блик': 4063,
 'денежк': 6337,
 'l110': 2264,
 'включаеш': 4764,
 'высочен': 5603,
 'вскрыва': 5186,
 'вылезет': 5455,
 'продегустирова': 16041,
 'залог': 7509,
 '2a': 523,
 'топк': 19638,
 'n97': 2493,
 'дисков': 6488,
 'худеет': 20963,
 'пошив': 15280,
 'оче': 13164,
 'жироуловител': 7112,
 'txt': 3071,
 'удра': 20092,
 'ext1': 1878,
 'джостик': 6432,
 'трубк': 19823,
 'архаичн': 3667,
 'надп': 10876,
 'монстрик': 10559,
 'грудинк': 6106,
 'fs21': 1960,
 'принцип': 15768,
 'коллекторн': 8999,
 'micke': 2400,
 'помешива': 14810,
 'поинт': 14543,
 'богс': 4103,
 'заеха': 7360,
 'изыска': 8229,
 'чашечк': 21101,
 'перемалова': 13562,
 'достанет': 6741,
 'крик': 9453,
 'шум': 21485,
 'azurexcel': 1420,
 'триколоровск': 19797,
 'ном': 11905,
 'выбра': 5308,
 'гремет': 6063,
 'сутк': 19167,
 'blu': 1489,
 'негреет': 11362,
 'начальник': 11254,
 'двухсторон': 6263,
 'усбшник': 20312,
 'подставочк': 14397,
 'стиран': 18982,
 'прогон': 16002,
 'bolsho': 1

In [12]:
X_train

<10443x21854 sparse matrix of type '<class 'numpy.float64'>'
	with 394186 stored elements in Compressed Sparse Row format>

In [13]:
X_test

<5144x21854 sparse matrix of type '<class 'numpy.float64'>'
	with 187242 stored elements in Compressed Sparse Row format>

In [14]:
# We will solve the classification task as experiments showed that it has better performance over regression
y_train_clf = np.round(y_train)
b = y_train_clf.min()
y_train_clf -= b

In [15]:
def modelfit(alg, X_train, Y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = len(set(Y_train))
        xgtrain = xgb.DMatrix(X_train, label=Y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, 
                          stratified=True, early_stopping_rounds=early_stopping_rounds)
        print("CV results: ", cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X_train, Y_train)

In [16]:
clf = XGBClassifier(n_estimators=10000, learning_rate=0.04, objective='multi:softprob') 
modelfit(clf, X_train, y_train_clf)

CV results:        test-merror-mean  test-merror-std  train-merror-mean  train-merror-std
0             0.396148         0.004969           0.383918          0.001825
1             0.393754         0.003844           0.382146          0.003187
2             0.394809         0.001443           0.383438          0.002447
3             0.393467         0.002859           0.382601          0.003442
4             0.392608         0.002730           0.380709          0.002843
5             0.392703         0.002084           0.382720          0.002047
6             0.392511         0.002560           0.382649          0.003028
7             0.392989         0.002001           0.382912          0.003166
8             0.393181         0.001819           0.383486          0.002372
9             0.393181         0.001213           0.382816          0.003764
10            0.393564         0.001750           0.383414          0.002913
11            0.393756         0.001614           0.383869     

In [17]:
# A little tricky prediction (but still the best performance)
y_pred = clf.predict_proba(X_test).dot(np.arange(1, 6))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MedAE: ", median_absolute_error(y_test, y_pred))

MSE:  0.939456817954
MAE:  0.693058360245
MedAE:  0.479364490602
