In [2]:
import codecs

# Read dataset
review_data = []

f = codecs.open('reviews_rozetka.txt', "r", "utf-8")
text = f.read()
lines = text.splitlines()

for line in lines:
    item = line.split("/////")
    if (len(item) != 2):
        print 'wrong item: ', item
        continue
    item[0] = item[0].encode('utf-8')
    review_data.append(item)

In [3]:
# Split data to train and test set

train_set = sorted(review_data, key=lambda tup: tup[1])
test_set = []
for i, item in enumerate(train_set):
    if (i % 9 == 0) or (i % 8 == 0):
        test_set.append(item)
        train_set.remove(item)

print 'train set size:', str(len(train_set))
print 'test set size:', str(len(test_set))

train set size: 4832
test set size: 1074


In [4]:
# Check star frequencies

star_probability = {}
for star in range(1, 6):
    train_for_star = len(list(x for x in train_set if x[1] == str(star)))
    print 'train ', str(star), ': ', str(train_for_star)
    test_for_star = len(list(x for x in test_set if x[1] == str(star)))
    print 'test ', str(star), ': ', str(test_for_star)

    star_probability[star] = train_for_star / float(len(train_set))

print star_probability

train  1 :  157
test  1 :  35
train  2 :  180
test  2 :  41
train  3 :  277
test  3 :  61
train  4 :  909
test  4 :  202
train  5 :  3309
test  5 :  735
{1: 0.032491721854304635, 2: 0.037251655629139076, 3: 0.05732615894039735, 4: 0.18812086092715233, 5: 0.6848096026490066}


In [5]:
# form vocabulary

import polyglot
from polyglot.text import Text, Word
import numpy as np

vocabulary = {}
for item in train_set:
    text = Text(item[0])
    for word in text.words:
        word_text = word.lower()
        if not vocabulary.has_key(word_text):
            vocabulary[word_text] = 0
        vocabulary[word_text] += 1

print 'full vocabulary size:', len(vocabulary)


No handlers could be found for logger "polyglot.detect.base"


full vocabulary size: 15987


In [6]:
# improvements:
full_vocabulary = dict(vocabulary)
stop_keyword = u'розет'
for it, k in full_vocabulary.iteritems():
    if it.isdigit():
        del vocabulary[it]
    elif (it.find('.') >= 0) or (it.find(',') >= 0) or (it.find('!') >= 0) or (it.find(':') >= 0):
        if len(it) > 1:
            del vocabulary[it]
    elif it[0:5] == stop_keyword:
        del vocabulary[it]
        
print 'final vocabulary size:', len(vocabulary)



final vocabulary size: 15439


In [7]:
# Read tone dictionary
tone_dictionary = {}

f = codecs.open('tone-dict-uk.tsv', "r", "utf-8")
text = f.read()
lines = text.splitlines()
for line in lines:
    item = line.split("\t")
    if (len(item) != 2):
        print 'wrong item: ', item
        continue
    item[0] = item[0].encode('utf-8')
    tone_dictionary[item[0]] = item[1]

In [25]:
# calculate tones for words in our vocabulary
# use stemming to find words from reviews in tones dictionary
# if more than one dictionary item fits a stem, use average tone
word_tones = {}
vocab_words = vocabulary.keys()
for word in vocab_words:
    stem = ''
    if len(word) > 6:
        stem = word[0:len(word) - 2]
    else:
        stem = word[0:len(word) - 1]
        
    if len(stem) <= 1:
        continue
    tone_items = list(int(t) for word, t in tone_dictionary.iteritems() if word == stem.encode('utf-8'))  
    if len(tone_items) <= 0:
        tone_items = list(int(t) for word, t in tone_dictionary.iteritems() \
                          if (word[0:2*len(stem)] == stem.encode('utf-8')) \
                          and (len(stem.encode('utf-8')) / float(len(word)) > 0.8))
    if len(tone_items) > 0:
        av_tone = sum(tone_items) / float(len(tone_items))
        word_tones[word] = av_tone

In [26]:
# scale tones to be in range(0, 1]

min_tone = min(word_tones.values())
max_tone = max(word_tones.values())
print min_tone, max_tone

for key in word_tones:
    word_tones[key] = (word_tones[key] - min_tone + 1) / float(max_tone - min_tone + 1)

min_tone = min(word_tones.values())
max_tone = max(word_tones.values())
print min_tone, max_tone

-2.0 2.0
0.2 1.0


In [27]:
print len(word_tones)

for word in word_tones.keys()[0:20]:
    print word, word_tones[word]

375
чудовому 1.0
використати 0.8
чистими 0.8
поганий 0.4
дірками 0.4
поганим 0.4
морочитися 0.2
рекомендації 0.8
хорошийта 0.8
позитивного 0.8
міцними 0.8
захвата 1.0
адекватною 0.8
незручнихз 0.4
захвату 1.0
професійних 0.8
дивний 0.4
захваті 1.0
вигідний 0.8
чарівними 1.0


In [28]:
# Convert text to vectors 
# 0 - word is not in text, 0.2 - 1 word tone  (0.6 is neutral and means that word is in text)
# form train and test vector sets
vocab_words = vocabulary.keys()
vocab_word_tones = word_tones.keys()

train_vectors_tone = []
train_classes = []
for item in train_set:
    text = Text(item[0])
    vector_x = np.zeros(len(vocab_words))
    for word in text.words:
        word_text = word.lower()
        if word_text in vocab_word_tones:
            vector_x[vocab_words.index(word_text)] = word_tones[word_text]
        elif word_text in vocab_words:
            vector_x[vocab_words.index(word_text)] = 0.6 #3/5
    train_vectors_tone.append(vector_x)
    train_classes.append(int(item[1]))


test_vectors_tone = []
test_classes = []
for item in test_set:
    text = Text(item[0])
    vector_x = np.zeros(len(vocab_words))
    for word in text.words:
        word_text = word.lower()
        if word_text in vocab_word_tones:
            vector_x[vocab_words.index(word_text)] = word_tones[word_text]
        elif word_text in vocab_words:
            vector_x[vocab_words.index(word_text)] = 0.6 #3/5
    test_vectors_tone.append(vector_x)
    test_classes.append(int(item[1]))


In [29]:
# fit SVM model, train one-vs-rest classifiers for multi-class classification
# use balanced class weights to handle significant bias (68% of data are 5*)
from sklearn import svm
lin_clf_tone = svm.LinearSVC(class_weight='balanced')
lin_clf_tone.fit(train_vectors_tone, train_classes)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [30]:
# run classifier on test set

prediced_classes_tone = lin_clf_tone.predict(test_vectors_tone)
common = 0
errors_freqencies = {}
errors = {}
for i, clas in enumerate(test_classes):
    if (clas == prediced_classes_tone[i]):
        common += 1
    else:

        key = str(clas) + ' -> ' + str(prediced_classes_tone[i])
        
        if not errors_freqencies.has_key(key):
            errors_freqencies[key] = 0
        errors_freqencies[key] += 1
        
        if not errors.has_key(key):
            errors[key] = []
        errors[key].append(test_set[i])
        
print 'correct: ', str(common), '/', str(len(test_set))
print 'errors (correct -> classified : count)'
for key, freq in errors_freqencies.iteritems():
    print key, ':', freq 

correct:  825 / 1074
errors (correct -> classified : count)
1 -> 4 : 5
1 -> 5 : 10
1 -> 2 : 5
1 -> 3 : 3
4 -> 3 : 5
4 -> 2 : 3
4 -> 1 : 1
5 -> 3 : 13
5 -> 4 : 61
4 -> 5 : 80
3 -> 2 : 4
3 -> 4 : 7
5 -> 1 : 5
2 -> 5 : 8
2 -> 4 : 4
3 -> 1 : 3
2 -> 1 : 3
5 -> 2 : 5
2 -> 3 : 6
3 -> 5 : 18


In [None]:
# run this to display wrong classification examples
for key, e in errors.iteritems():
    print key
    for er in e:
        print er[0]
