In [1]:
import numpy as np
from pandas import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from tokenize_uk.tokenize_uk import tokenize_words
import pymorphy2

In [2]:
df = pd.read_csv('data/input/comments.tsv', sep='\t', index_col=0)
df_tones = pd.read_csv('data/input/tone-dict-uk.tsv', sep='\t', index_col=0, names=['tone'])
df = df.fillna('')
print('rows count:', len(df))
df.head()

rows count: 14949


Unnamed: 0_level_0,rating,item_bought,review,pros,cons,upvotes,downvotes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23415273,5,1,"сервіс відмінний, морозильна камера працює добре.",,,0,0
33284346,5,1,гарна та якісна морозильна камера. при першому...,недорогий,нема,0,0
33040878,4,1,"камеру привезли, все працює, все добре, все як...",,немає,2,0
38902893,5,1,справляється із своєю роботою,ціна якість,,0,0
20357268,5,1,"користуюсь міс., дуже задоволений!","все чудово працює, тихий.",поки не виявлено.,0,0


In [3]:
df.groupby('rating').count()

Unnamed: 0_level_0,item_bought,review,pros,cons,upvotes,downvotes
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,730,730,730,730,730,730
2,475,475,475,475,475,475
3,752,752,752,752,752,752
4,2863,2863,2863,2863,2863,2863
5,10129,10129,10129,10129,10129,10129


In [4]:
stopwords = set(["а","або","б","би","бо","був","буде","була","були","було","бути","в","вам","вами","вас","ваш","ваша","ваше","вашим","вашими","ваших","ваші","вашій","вашого","вашої","вашому","вашою","вашу","вже","ви","від","він","вона","вони","воно","всі","де","для","до","дуже","є","з","за","зі","і","із","її","їй","їм","їх","й","його","йому","ким","кого","коли","кому","лише","має","мене","мені","ми","мій","мною","мого","моє","моєї","моєму","моєю","можна","мої","моїй","моїм","моїми","моїх","мою","моя","на","нам","нами","нас","наш","наша","наше","нашим","нашими","наших","наші","нашій","нашого","нашої","нашому","нашою","нашу","неї","нею","ним","ними","них","ній","нім","ну","нього","ньому","під","після","по","при","про","саме","себе","собі","та","так","також","там","твій","твого","твоє","твоєї","твоєму","твоєю","твої","твоїй","твоїм","твоїми","твоїх","твою","твоя","те","тебе","ти","тим","тими","тих","ті","тієї","тією","тій","тільки","тім","то","тобі","тобою","того","тоді","той","тому","ту","тут","у","хто","це","цей","ці","цього","цьому","через","чи","чиє","чиєї","чиєму","чиї","чиїй","чиїм","чиїми","чиїх","чий","чийого","чийому","чим","чию","чия","чого","чому","що","щоб","щодо","щось","я","як","яка","який","які","якщо"])

In [5]:
df_1 = df[df['rating'] == 1].sample(n=475, random_state=1)
df_2 = df[df['rating'] == 2].sample(n=475, random_state=1)
df_3 = df[df['rating'] == 3].sample(n=475, random_state=1)
df_4 = df[df['rating'] == 4].sample(n=475, random_state=1)
df_5 = df[df['rating'] == 5].sample(n=475, random_state=1)
df = pd.concat([df_1, df_2, df_3, df_4, df_5])
df.groupby('rating').count()

Unnamed: 0_level_0,item_bought,review,pros,cons,upvotes,downvotes
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,475,475,475,475,475,475
2,475,475,475,475,475,475
3,475,475,475,475,475,475
4,475,475,475,475,475,475
5,475,475,475,475,475,475


In [6]:
morph = pymorphy2.MorphAnalyzer(lang='uk')
def tokenizer(string):
    return [morph.parse(word)[0].normal_form for word in tokenize_words(string) if word and word not in stopwords]

def sentiment_tokenizer(tokens):
    result = []
    for token in tokens:
        if token in df_tones.index:
            result.append(df_tones.at[token, 'tone'])
        else:
            result.append(token)
    return result

In [7]:
def calc_accuracy(df):
    result = {}
    for rating in range(1, 6):
        filtered = df[df['truth'] == rating]
        result[rating] = np.mean(filtered.truth.values == filtered.prediction.values)
    return result

In [8]:
def split_n_predict(x, y, classifier):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    
    result = y_test.to_frame(name='truth')
    result['prediction'] = predicted

    return calc_accuracy(result)

In [21]:
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word')),
    ('clf', MultinomialNB()),
])

svm_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss = 'hinge', penalty = 'l2', 
                          alpha = 1e-4, random_state = 1,
                          max_iter = 50, tol = None)),
])

## Using tones

In [10]:
df_with_tones = df.copy()
df_with_tones['review_tones'] = df_with_tones['review'].apply(tokenizer).apply(sentiment_tokenizer).str.join('_rev ')
df_with_tones = df_with_tones.dropna(subset=['review_tones'])

split_n_predict(df_with_tones['review_tones'], df_with_tones['rating'], text_clf)

{1: 0.5636363636363636,
 2: 0.48717948717948717,
 3: 0.2608695652173913,
 4: 0.18181818181818182,
 5: 0.10526315789473684}

In [11]:
split_n_predict(df_with_tones['review_tones'], df_with_tones['rating'], svm_clf)



{1: 0.45454545454545453,
 2: 0.23076923076923078,
 3: 0.32608695652173914,
 4: 0.2727272727272727,
 5: 0.3157894736842105}

### adding pros and cons

In [12]:
pros = df_with_tones['pros'].apply(tokenizer).str.join('_pros ')
cons = df_with_tones['cons'].apply(tokenizer).str.join('_cons ')
with_adv = df_with_tones['review_tones'].str.cat([pros, cons], sep=' ')

split_n_predict(with_adv, df_with_tones['rating'], svm_clf)

{1: 0.45454545454545453,
 2: 0.4358974358974359,
 3: 0.2608695652173913,
 4: 0.11363636363636363,
 5: 0.42105263157894735}

### adding votes

In [13]:
upvotes = df_with_tones['upvotes'].astype(str) + '_upv'
downvotes = df_with_tones['downvotes'].astype(str) + '_down'

with_votes = with_adv.str.cat([upvotes, downvotes], ' ')

split_n_predict(with_votes, df_with_tones['rating'], svm_clf)

{1: 0.45454545454545453,
 2: 0.4358974358974359,
 3: 0.21739130434782608,
 4: 0.2727272727272727,
 5: 0.3157894736842105}

### considering whether the item was bought

In [14]:
item_bought = df_with_tones['item_bought'].astype(str) + '_bought'
with_bought = with_adv.str.cat(item_bought, sep=' ')

split_n_predict(with_bought, df_with_tones['rating'], svm_clf)

{1: 0.4909090909090909,
 2: 0.23076923076923078,
 3: 0.2391304347826087,
 4: 0.13636363636363635,
 5: 0.34210526315789475}

## Without tones

In [15]:
reviews = df['review'].apply(tokenizer).str.join('_rev ')

In [16]:
split_n_predict(reviews, df['rating'], text_clf)

{1: 0.553030303030303,
 2: 0.3783783783783784,
 3: 0.2147239263803681,
 4: 0.22627737226277372,
 5: 0.40601503759398494}

In [17]:
split_n_predict(reviews, df['rating'], svm_clf)



{1: 0.5227272727272727,
 2: 0.28378378378378377,
 3: 0.17791411042944785,
 4: 0.40875912408759124,
 5: 0.3684210526315789}

### adding pros and cons

In [18]:
pros = df['pros'].apply(tokenizer).str.join('_pros ')
cons = df['cons'].apply(tokenizer).str.join('_cons ')
with_adv = reviews.str.cat([pros, cons], sep=' ')

split_n_predict(with_adv, df['rating'], svm_clf)

{1: 0.4621212121212121,
 2: 0.3310810810810811,
 3: 0.2392638036809816,
 4: 0.2773722627737226,
 5: 0.6390977443609023}

### adding votes

In [19]:
upvotes = df['upvotes'].astype(str) + '_upv'
downvotes = df['downvotes'].astype(str) + '_down'

with_votes = with_adv.str.cat([upvotes, downvotes], ' ')

split_n_predict(with_votes, df['rating'], svm_clf)

{1: 0.5151515151515151,
 2: 0.28378378378378377,
 3: 0.34355828220858897,
 4: 0.30656934306569344,
 5: 0.6090225563909775}

### considering whether the item was bought

In [20]:
item_bought = df['item_bought'].astype(str) + '_bought'
with_bought = with_adv.str.cat(item_bought, sep=' ')

split_n_predict(with_bought, df['rating'], svm_clf)

{1: 0.45454545454545453,
 2: 0.3918918918918919,
 3: 0.25153374233128833,
 4: 0.30656934306569344,
 5: 0.6015037593984962}