In [1]:
import numpy as np
from pandas import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from tokenize_uk.tokenize_uk import tokenize_words
from sklearn.metrics import classification_report
import pymorphy2

In [2]:
df = pd.read_csv('data/input/comments.tsv', sep='\t', index_col=0)
df_tones = pd.read_csv('data/input/tone-dict-uk.tsv', sep='\t', index_col=0, names=['tone'])
df = df.fillna('')
print('rows count:', len(df))
df.head()

rows count: 14949


Unnamed: 0_level_0,rating,item_bought,review,pros,cons,upvotes,downvotes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23415273,5,1,"сервіс відмінний, морозильна камера працює добре.",,,0,0
33284346,5,1,гарна та якісна морозильна камера. при першому...,недорогий,нема,0,0
33040878,4,1,"камеру привезли, все працює, все добре, все як...",,немає,2,0
38902893,5,1,справляється із своєю роботою,ціна якість,,0,0
20357268,5,1,"користуюсь міс., дуже задоволений!","все чудово працює, тихий.",поки не виявлено.,0,0


In [3]:
df.groupby('rating').count()

Unnamed: 0_level_0,item_bought,review,pros,cons,upvotes,downvotes
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,730,730,730,730,730,730
2,475,475,475,475,475,475
3,752,752,752,752,752,752
4,2863,2863,2863,2863,2863,2863
5,10129,10129,10129,10129,10129,10129


In [4]:
stopwords = set(["а","або","б","би","бо","був","буде","була","були","було","бути","в","вам","вами","вас","ваш","ваша","ваше","вашим","вашими","ваших","ваші","вашій","вашого","вашої","вашому","вашою","вашу","вже","ви","від","він","вона","вони","воно","всі","де","для","до","дуже","є","з","за","зі","і","із","її","їй","їм","їх","й","його","йому","ким","кого","коли","кому","лише","має","мене","мені","ми","мій","мною","мого","моє","моєї","моєму","моєю","можна","мої","моїй","моїм","моїми","моїх","мою","моя","на","нам","нами","нас","наш","наша","наше","нашим","нашими","наших","наші","нашій","нашого","нашої","нашому","нашою","нашу","неї","нею","ним","ними","них","ній","нім","ну","нього","ньому","під","після","по","при","про","саме","себе","собі","та","так","також","там","твій","твого","твоє","твоєї","твоєму","твоєю","твої","твоїй","твоїм","твоїми","твоїх","твою","твоя","те","тебе","ти","тим","тими","тих","ті","тієї","тією","тій","тільки","тім","то","тобі","тобою","того","тоді","той","тому","ту","тут","у","хто","це","цей","ці","цього","цьому","через","чи","чиє","чиєї","чиєму","чиї","чиїй","чиїм","чиїми","чиїх","чий","чийого","чийому","чим","чию","чия","чого","чому","що","щоб","щодо","щось","я","як","яка","який","які","якщо"])

In [5]:
df_1 = df[df['rating'] == 1].sample(n=475, random_state=1)
df_2 = df[df['rating'] == 2].sample(n=475, random_state=1)
df_3 = df[df['rating'] == 3].sample(n=475, random_state=1)
df_4 = df[df['rating'] == 4].sample(n=475, random_state=1)
df_5 = df[df['rating'] == 5].sample(n=475, random_state=1)
df = pd.concat([df_1, df_2, df_3, df_4, df_5])
df.groupby('rating').count()

Unnamed: 0_level_0,item_bought,review,pros,cons,upvotes,downvotes
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,475,475,475,475,475,475
2,475,475,475,475,475,475
3,475,475,475,475,475,475
4,475,475,475,475,475,475
5,475,475,475,475,475,475


In [6]:
morph = pymorphy2.MorphAnalyzer(lang='uk')
def tokenizer(string):
    return [morph.parse(word)[0].normal_form for word in tokenize_words(string) if word and word not in stopwords]

def sentiment_tokenizer(tokens):
    result = []
    for token in tokens:
        if token in df_tones.index:
            result.append(df_tones.at[token, 'tone'])
        else:
            result.append(token)
    return result

In [7]:
def calc_accuracy(df):
    result = {}
    for rating in range(1, 6):
        filtered = df[df['truth'] == rating]
        result[rating] = np.mean(filtered.truth.values == filtered.prediction.values)
    return result

In [8]:
def split_n_predict(x, y, classifier):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    
    result = y_test.to_frame(name='truth')
    result['prediction'] = predicted
    print(classification_report(result.truth, result.prediction))

    return calc_accuracy(result)

In [9]:
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word')),
    ('clf', MultinomialNB()),
])

svm_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss = 'hinge', penalty = 'l2', 
                          alpha = 1e-4, random_state = 1,
                          max_iter = 50, tol = None)),
])

## Using tones

In [10]:
df_with_tones = df.copy()
df_with_tones['review_tones'] = df_with_tones['review'].apply(tokenizer).apply(sentiment_tokenizer).str.join('_rev ')
df_with_tones = df_with_tones.dropna(subset=['review_tones'])

split_n_predict(df_with_tones['review_tones'], df_with_tones['rating'], text_clf)

              precision    recall  f1-score   support

           1       0.47      0.56      0.51        55
           2       0.23      0.49      0.32        39
           3       0.28      0.26      0.27        46
           4       0.32      0.18      0.23        44
           5       0.57      0.11      0.18        38

   micro avg       0.33      0.33      0.33       222
   macro avg       0.37      0.32      0.30       222
weighted avg       0.38      0.33      0.31       222



{1: 0.5636363636363636,
 2: 0.48717948717948717,
 3: 0.2608695652173913,
 4: 0.18181818181818182,
 5: 0.10526315789473684}

In [11]:
split_n_predict(df_with_tones['review_tones'], df_with_tones['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.50      0.49      0.50        55
           2       0.24      0.31      0.27        39
           3       0.27      0.28      0.28        46
           4       0.21      0.20      0.21        44
           5       0.29      0.21      0.24        38

   micro avg       0.31      0.31      0.31       222
   macro avg       0.30      0.30      0.30       222
weighted avg       0.31      0.31      0.31       222





{1: 0.4909090909090909,
 2: 0.3076923076923077,
 3: 0.2826086956521739,
 4: 0.20454545454545456,
 5: 0.21052631578947367}

### adding pros and cons

In [12]:
pros = df_with_tones['pros'].apply(tokenizer).str.join('_pros ')
cons = df_with_tones['cons'].apply(tokenizer).str.join('_cons ')
with_adv = df_with_tones['review_tones'].str.cat([pros, cons], sep=' ')

split_n_predict(with_adv, df_with_tones['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.54      0.56      0.55        55
           2       0.19      0.26      0.22        39
           3       0.29      0.26      0.28        46
           4       0.20      0.20      0.20        44
           5       0.42      0.29      0.34        38

   micro avg       0.33      0.33      0.33       222
   macro avg       0.33      0.31      0.32       222
weighted avg       0.34      0.33      0.33       222



{1: 0.5636363636363636,
 2: 0.2564102564102564,
 3: 0.2608695652173913,
 4: 0.20454545454545456,
 5: 0.2894736842105263}

### adding votes

In [13]:
upvotes = df_with_tones['upvotes'].astype(str) + '_upv'
downvotes = df_with_tones['downvotes'].astype(str) + '_down'

with_votes = with_adv.str.cat([upvotes, downvotes], ' ')

split_n_predict(with_votes, df_with_tones['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.53      0.56      0.54        55
           2       0.23      0.31      0.26        39
           3       0.28      0.24      0.26        46
           4       0.16      0.16      0.16        44
           5       0.37      0.26      0.31        38

   micro avg       0.32      0.32      0.32       222
   macro avg       0.31      0.31      0.31       222
weighted avg       0.32      0.32      0.32       222



{1: 0.5636363636363636,
 2: 0.3076923076923077,
 3: 0.2391304347826087,
 4: 0.1590909090909091,
 5: 0.2631578947368421}

### considering whether the item was bought

In [14]:
item_bought = df_with_tones['item_bought'].astype(str) + '_bought'
with_bought = with_adv.str.cat(item_bought, sep=' ')

split_n_predict(with_bought, df_with_tones['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.48      0.53      0.50        55
           2       0.19      0.26      0.22        39
           3       0.35      0.30      0.33        46
           4       0.24      0.20      0.22        44
           5       0.45      0.37      0.41        38

   micro avg       0.34      0.34      0.34       222
   macro avg       0.34      0.33      0.33       222
weighted avg       0.35      0.34      0.34       222



{1: 0.5272727272727272,
 2: 0.2564102564102564,
 3: 0.30434782608695654,
 4: 0.20454545454545456,
 5: 0.3684210526315789}

## Without tones

In [15]:
reviews = df['review'].apply(tokenizer).str.join('_rev ')

In [16]:
split_n_predict(reviews, df['rating'], text_clf)

              precision    recall  f1-score   support

           1       0.37      0.55      0.45       132
           2       0.31      0.38      0.34       148
           3       0.39      0.21      0.28       163
           4       0.26      0.23      0.24       137
           5       0.42      0.41      0.41       133

   micro avg       0.35      0.35      0.35       713
   macro avg       0.35      0.36      0.34       713
weighted avg       0.35      0.35      0.34       713



{1: 0.553030303030303,
 2: 0.3783783783783784,
 3: 0.2147239263803681,
 4: 0.22627737226277372,
 5: 0.40601503759398494}

In [17]:
split_n_predict(reviews, df['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.39      0.46      0.42       132
           2       0.32      0.29      0.30       148
           3       0.34      0.25      0.29       163
           4       0.28      0.32      0.30       137
           5       0.39      0.44      0.41       133

   micro avg       0.35      0.35      0.35       713
   macro avg       0.34      0.35      0.35       713
weighted avg       0.34      0.35      0.34       713





{1: 0.4621212121212121,
 2: 0.2905405405405405,
 3: 0.24539877300613497,
 4: 0.32116788321167883,
 5: 0.43609022556390975}

### adding pros and cons

In [18]:
pros = df['pros'].apply(tokenizer).str.join('_pros ')
cons = df['cons'].apply(tokenizer).str.join('_cons ')
with_adv = reviews.str.cat([pros, cons], sep=' ')

split_n_predict(with_adv, df['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.41      0.46      0.44       132
           2       0.41      0.36      0.39       148
           3       0.39      0.30      0.34       163
           4       0.33      0.36      0.35       137
           5       0.52      0.62      0.57       133

   micro avg       0.42      0.42      0.42       713
   macro avg       0.41      0.42      0.41       713
weighted avg       0.41      0.42      0.41       713



{1: 0.4621212121212121,
 2: 0.36486486486486486,
 3: 0.3006134969325153,
 4: 0.35766423357664234,
 5: 0.6240601503759399}

### adding votes

In [19]:
upvotes = df['upvotes'].astype(str) + '_upv'
downvotes = df['downvotes'].astype(str) + '_down'

with_votes = with_adv.str.cat([upvotes, downvotes], ' ')

split_n_predict(with_votes, df['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.41      0.50      0.45       132
           2       0.44      0.36      0.39       148
           3       0.42      0.30      0.35       163
           4       0.34      0.38      0.36       137
           5       0.50      0.62      0.55       133

   micro avg       0.42      0.42      0.42       713
   macro avg       0.42      0.43      0.42       713
weighted avg       0.42      0.42      0.42       713



{1: 0.5,
 2: 0.3581081081081081,
 3: 0.3006134969325153,
 4: 0.3795620437956204,
 5: 0.6165413533834586}

### considering whether the item was bought

In [20]:
item_bought = df['item_bought'].astype(str) + '_bought'
with_bought = with_adv.str.cat(item_bought, sep=' ')

split_n_predict(with_bought, df['rating'], svm_clf)

              precision    recall  f1-score   support

           1       0.40      0.48      0.44       132
           2       0.40      0.34      0.37       148
           3       0.39      0.28      0.32       163
           4       0.35      0.39      0.37       137
           5       0.52      0.64      0.58       133

   micro avg       0.42      0.42      0.42       713
   macro avg       0.41      0.42      0.41       713
weighted avg       0.41      0.42      0.41       713



{1: 0.48484848484848486,
 2: 0.33783783783783783,
 3: 0.27607361963190186,
 4: 0.38686131386861317,
 5: 0.6390977443609023}