In [28]:
import pandas as pd
import numpy as np

# Load CSV files

In [2]:
# Read the learn and test review sets from CSV files given by relative path.
df_learn = pd.read_csv('rozetka_learn.csv')
df_test = pd.read_csv('rozetka_test.csv')

# Look at the data

In [3]:
# Print the (rows, columns) shape of each frame.
print(f"Learn set shape: {df_learn.shape}")
print(f"Test set shape: {df_test.shape}")

Learn set shape: (3800, 5)
Test set shape: (1629, 5)


In [6]:
# Preview the first rows of the learn set (head() shows 5 rows by default).
df_learn.head()

Unnamed: 0,goods_code,stars,review,author,permalink
0,13252316,1,Шановні! Де обіцяна знижка 5% при оплаті карто...,Андрей Дмитриев,#tab=comments;id=34200034
1,61270053,5,"Дизайн зайде не кожному, но є аналог Xiaomi Re...",Виталий Александрович,#tab=comments;id=35938497
2,45499672,5,колір тільки синій чи чорний теж є?,Арт Лаз,#tab=comments;id=30229184
3,48116526,4,"Швидкий та зручний телефон, має презентабельни...",Дима Лисунов,#tab=comments;id=36726816
4,55379058,5,Купувався мамі на подарунок. Дуже задоволена. ...,Микола,#tab=comments;id=38966133


In [9]:
# Per star rating, count the non-null entries in every other column of the learn set.
df_learn.groupby('stars').count()

Unnamed: 0_level_0,goods_code,review,author,permalink
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,173,173,173,173
2,122,122,122,122
3,222,222,222,222
4,746,746,746,746
5,2537,2537,2537,2537


In [10]:
# Per star rating, count the non-null entries in every other column of the test set.
df_test.groupby('stars').count()

Unnamed: 0_level_0,goods_code,review,author,permalink
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,93,93,93,93
2,59,59,59,59
3,101,101,101,101
4,316,316,316,316
5,1060,1060,1060,1060


# Sentiment analyzer - baseline

In [30]:
# Split each frame into raw review texts (inputs) and star ratings (labels),
# both as NumPy arrays.
X_learn, y_learn = df_learn['review'].values, df_learn['stars'].values
X_test, y_test = df_test['review'].values, df_test['stars'].values

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [34]:
def classifier_nb_tfidf(X_learn, y_learn, X_test, y_test):
    """Fit a TF-IDF + Bernoulli naive Bayes baseline and predict test labels.

    The vectorizer is fitted on ``X_learn`` only and then applied to
    ``X_test``. The size of the fitted vocabulary is printed as a side effect.

    Args:
        X_learn: Raw documents used to fit the vectorizer and the classifier.
        y_learn: Labels aligned with ``X_learn``.
        X_test: Raw documents to classify.
        y_test: Not used in the body of this function.

    Returns:
        The labels predicted for ``X_test`` by the fitted ``BernoulliNB``.
    """
    tfidf = TfidfVectorizer()
    learn_matrix = tfidf.fit_transform(X_learn)
    test_matrix = tfidf.transform(X_test)
    print(f"Dictionary size = {len(tfidf.vocabulary_)}")
    # NOTE(review): per the scikit-learn docs BernoulliNB binarizes its input
    # by default, so the TF-IDF weights presumably act only as
    # presence/absence flags here -- confirm this is intended.
    model = BernoulliNB().fit(learn_matrix, y_learn)
    return model.predict(test_matrix)

In [37]:
# Run the baseline and keep its predicted star ratings for the test set.
predictions = classifier_nb_tfidf(X_learn, y_learn, X_test, y_test)

# Show the first 1000 true labels, then the first 1000 predictions, one array
# after the other for a visual comparison.
print(y_test[:1000], predictions[:1000], sep="\n")

Dictionary size = 18097
[5 1 3 5 5 5 4 5 4 5 5 5 5 5 5 4 4 5 5 5 2 5 5 3 5 5 5 1 4 5 3 5 4 3 5 5 5
 5 2 5 5 5 5 5 5 4 5 5 5 5 5 3 2 5 5 2 3 3 5 1 2 5 5 1 5 2 5 5 5 4 5 5 5 5
 5 1 5 4 5 5 5 5 5 1 5 5 4 4 5 5 5 5 5 5 3 5 3 5 5 5 3 5 5 4 1 5 5 5 5 5 4
 5 5 5 1 5 5 5 4 3 5 5 5 5 5 5 5 3 4 5 3 5 5 5 1 4 5 5 4 4 5 5 5 1 5 4 5 5
 5 3 4 5 5 5 5 5 5 5 4 1 3 3 5 2 5 5 5 5 5 5 5 5 5 4 5 3 4 5 5 5 5 5 5 4 5
 4 5 4 5 2 4 5 4 5 1 5 5 5 5 5 5 5 5 5 5 5 4 5 4 5 5 4 5 4 5 1 5 5 1 5 5 4
 4 5 5 5 5 5 5 4 5 5 5 5 5 5 4 5 5 5 4 4 5 4 4 4 5 5 5 5 5 5 5 5 1 5 4 1 4
 3 5 5 5 5 5 5 5 3 5 3 3 4 5 5 1 5 4 5 5 5 5 5 1 4 5 4 5 4 4 5 5 5 5 1 3 4
 5 1 4 5 5 5 1 5 5 5 5 5 5 5 5 5 5 1 4 3 5 1 4 5 5 5 5 4 5 3 5 5 3 5 5 4 5
 5 5 5 5 5 5 4 5 3 4 1 5 4 2 5 5 5 5 5 5 4 5 1 2 2 5 5 3 1 5 5 4 5 3 4 5 2
 5 2 5 5 5 5 5 4 5 4 5 4 5 5 3 4 1 4 5 5 4 5 4 2 5 1 5 5 1 5 5 4 5 5 4 5 5
 5 5 5 5 4 4 5 5 5 5 4 5 5 5 5 5 1 2 5 5 5 5 4 5 5 5 4 5 5 4 5 5 1 5 1 3 2
 5 5 5 4 5 5 2 5 5 1 5 2 4 5 5 5 3 5 5 5 5 5 5 1 5 5 5 3 1 5 5 5 5 4 5 5 5
 