In [3]:
import codecs

# Read dataset
review_data = []

f = codecs.open('reviews_rozetka.txt', "r", "utf-8")
text = f.read()
lines = text.splitlines()

for line in lines:
    item = line.split("/////")
    if (len(item) != 2):
        print 'wrong item: ', item
        continue
    item[0] = item[0].encode('utf-8')
    review_data.append(item)

In [4]:
# Split data to train and test set

train_set = sorted(review_data, key=lambda tup: tup[1])
test_set = []
for i, item in enumerate(train_set):
    if (i % 9 == 0) or (i % 8 == 0):
        test_set.append(item)
        train_set.remove(item)

print 'train set size:', str(len(train_set))
print 'test set size:', str(len(test_set))

train set size: 4832
test set size: 1074


In [5]:
# Check star frequencies

star_probability = {}
for star in range(1, 6):
    train_for_star = len(list(x for x in train_set if x[1] == str(star)))
    print 'train ', str(star), ': ', str(train_for_star)
    test_for_star = len(list(x for x in test_set if x[1] == str(star)))
    print 'test ', str(star), ': ', str(test_for_star)

    star_probability[star] = train_for_star / float(len(train_set))

print star_probability

train  1 :  157
test  1 :  35
train  2 :  180
test  2 :  41
train  3 :  277
test  3 :  61
train  4 :  909
test  4 :  202
train  5 :  3309
test  5 :  735
{1: 0.032491721854304635, 2: 0.037251655629139076, 3: 0.05732615894039735, 4: 0.18812086092715233, 5: 0.6848096026490066}


In [6]:
# form vocabulary

import polyglot
from polyglot.text import Text, Word
import numpy as np

vocabulary = {}
for item in train_set:
    text = Text(item[0])
    for word in text.words:
        word_text = word.lower()
        if not vocabulary.has_key(word_text):
            vocabulary[word_text] = 0
        vocabulary[word_text] += 1

print 'full vocabulary size:', len(vocabulary)


No handlers could be found for logger "polyglot.detect.base"


full vocabulary size: 15987


In [7]:
# remove digits
full_vocabulary = dict(vocabulary)
for it, k in full_vocabulary.iteritems():
    if it.isdigit():
        del vocabulary[it]

print 'remove digits vocabulary size:', len(vocabulary)


remove digits vocabulary size: 15837


In [8]:
# remove wrong tokens
full_vocabulary = dict(vocabulary)
for it, k in full_vocabulary.iteritems():
    if (it.find('.') >= 0) or (it.find(',') >= 0) or (it.find('!') >= 0) or (it.find(':') >= 0):
        if len(it) > 1:
            del vocabulary[it]

print 'remowe wrong words size:', len(vocabulary)

remowe wrong words size: 15447


In [9]:
# remove rozetk
full_vocabulary = dict(vocabulary)
keyword = u'розет'
for it, k in full_vocabulary.iteritems():
    if it[0:5] == keyword:
        del vocabulary[it]

print 'remowe rozet size:', len(vocabulary)

remowe rozet size: 15439


In [10]:
# Сonvert text to vectors (zero-one vectors of presence), form train and test vector sets

vocab_words = vocabulary.keys()

train_vectors = []
train_classes = []
for item in train_set:
    text = Text(item[0])
    vector_x = np.zeros(len(vocab_words))
    for word in text.words:
        word_text = word.lower()
        if word_text in vocab_words:
            vector_x[vocab_words.index(word_text)] = 1

    train_vectors.append(vector_x)
    train_classes.append(int(item[1]))


test_vectors = []
test_classes = []
for item in test_set:
    text = Text(item[0])
    vector_x = np.zeros(len(vocab_words))
    for word in text.words:
        word_text = word.lower()
        if word_text in vocab_words:
            vector_x[vocab_words.index(word_text)] = 1

    test_vectors.append(vector_x)
    test_classes.append(int(item[1]))

In [11]:
# fit SVM model, train one-vs-rest classifiers for multi-class classification
# use balanced class weights to handle significant bias (68% of data are 5*)

from sklearn import svm

lin_clf = svm.LinearSVC(class_weight='balanced')
lin_clf.fit(train_vectors, train_classes)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [13]:
# run classifier on test set

prediced_classes = lin_clf.predict(test_vectors)
common = 0
errors_freqencies = {}
errors = {}
for i, clas in enumerate(test_classes):
    if (clas == prediced_classes[i]):
        common += 1
    else:
        key = str(clas) + ' -> ' + str(prediced_classes[i])
        if not errors_freqencies.has_key(key):
            errors_freqencies[key] = 0
        errors_freqencies[key] += 1
        
        if not errors.has_key(key):
            errors[key] = []
        errors[key].append(test_set[i])
print 'correct: ', str(common), '/', str(len(test_set))
print 'errors (correct -> classified : count)'
for key, freq in errors_freqencies.iteritems():
    print key, ':', freq 



correct:  829 / 1074
errors (correct -> classified : count)
1 -> 4 : 5
1 -> 5 : 11
1 -> 2 : 5
1 -> 3 : 3
4 -> 3 : 4
4 -> 2 : 3
4 -> 1 : 1
5 -> 3 : 13
5 -> 4 : 63
4 -> 5 : 77
3 -> 2 : 3
3 -> 4 : 7
5 -> 1 : 4
2 -> 5 : 8
2 -> 4 : 4
3 -> 1 : 3
2 -> 1 : 2
5 -> 2 : 6
2 -> 3 : 6
3 -> 5 : 17
