In [6]:
# Data access and preparation

from sklearn.feature_extraction import DictVectorizer as dv
from sklearn.model_selection import train_test_split

import operator
import tokenize_uk
import pymorphy2

from pprint import pprint as pp

from persistent_index import PersistentKeyValueStorage as kvs


def freq_analysis(comments):
    """Pretty-print word frequencies, most frequent first.

    Words are obtained by splitting every comment on whitespace; no other
    normalization is applied. Nothing is returned - the sorted list of
    ``(word, count)`` pairs is only printed.
    """

    counts = {}
    for text in comments:
        for word in text.split():
            counts[word] = counts.get(word, 0) + 1

    # The sort is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    pp(ranked)

    
def generate_bows(comments, stop_words, morph):
    """Generate bags-of-words from comment texts.

    Every comment is split with ``tokenize_uk.tokenize_words``. Each token is
    replaced by the lower-cased ``normal_form`` of its first ``morph.parse``
    result, and lemmas listed in ``stop_words`` are skipped.

    Args:
        comments: iterable of comment strings.
        stop_words: iterable of strings to exclude; compared case-insensitively.
        morph: analyzer exposing ``parse(token)`` whose first result has a
            ``normal_form`` attribute.

    Returns:
        A list with one ``{lemma: count}`` dict per comment, in input order.
    """

    # Lemmas are lower-cased below, so the stop words must be lower-cased as
    # well - otherwise a capitalised stop word could never match anything.
    lowered_stop_words = {word.lower() for word in stop_words}

    bags = []

    for comment in comments:
        bag = {}

        for token in tokenize_uk.tokenize_words(comment):
            # Lemmatize
            lemma = morph.parse(token)[0].normal_form.lower()

            if lemma in lowered_stop_words:
                continue
            bag[lemma] = bag.get(lemma, 0) + 1

        bags.append(bag)

    return bags


def n_grams(tokens, n=1):
    """Return an iterator of n-gram tuples built from a list of tokens.

    The k-th component of every tuple comes from ``tokens`` with its first
    ``k`` elements skipped; zipping those shifted views yields consecutive
    windows of length ``n``. ``tokens`` must be re-iterable (e.g. a list).
    """
    def _skip_first(offset):
        # View of `tokens` starting at position `offset`.
        return (item for position, item in enumerate(tokens) if position >= offset)

    shifted_views = [_skip_first(offset) for offset in range(n)]
    # To get joined strings instead of tuples: (" ".join(g) for g in zip(...))
    return zip(*shifted_views)


def range_ngrams(tokens, ngramRange=(1,2)):
    """Returns an iterator over all n-grams for n in range(*ngramRange) given a list of tokens.

    ``ngramRange`` is unpacked into ``range``, so the upper bound is
    exclusive: the default ``(1, 2)`` yields unigrams only, ``(1, 3)`` yields
    unigrams followed by bigrams.
    """
    # Imported here because the visible module-level imports do not provide
    # `chain`; without it every call fails with a NameError.
    from itertools import chain

    return chain.from_iterable(n_grams(tokens, size) for size in range(*ngramRange))


def bag_of_ngrams(comment, stop_words, morph, n=2):
    """Count the n-grams of lemmas in a single comment.

    The comment is split with ``tokenize_uk.tokenize_words``. Each token is
    replaced by the lower-cased ``normal_form`` of its first ``morph.parse``
    result, and lemmas listed in ``stop_words`` are removed *before* the
    n-grams are formed, so an n-gram may span a removed stop word.

    Args:
        comment: the comment text.
        stop_words: iterable of strings to exclude; compared case-insensitively.
        morph: analyzer exposing ``parse(token)`` whose first result has a
            ``normal_form`` attribute.
        n: number of lemmas per n-gram.

    Returns:
        A dict mapping each n-gram (a tuple of lemmas) to its count.
    """

    # Lemmas are lower-cased below, so the stop words must be lower-cased as
    # well - otherwise a capitalised stop word could never match anything.
    lowered_stop_words = {word.lower() for word in stop_words}

    # Lemmatize, normalize text
    lemmas = []
    for token in tokenize_uk.tokenize_words(comment):
        lemma = morph.parse(token)[0].normal_form.lower()
        if lemma not in lowered_stop_words:
            lemmas.append(lemma)

    bag = {}
    for ngram in n_grams(lemmas, n):
        bag[ngram] = bag.get(ngram, 0) + 1

    return bag


def generate_bags_of_ngrams(comments, stop_words, morph, n=2):
    """Generate one bag of n-grams per comment text.

    Thin wrapper that calls ``bag_of_ngrams`` for every comment, forwarding
    ``stop_words``, ``morph`` and the requested n-gram size ``n``.

    Returns:
        A list of bags, in the same order as ``comments``.
    """
    # Forward the caller's `n`; it must not be pinned to a constant here.
    return [bag_of_ngrams(comment, stop_words, morph, n=n) for comment in comments]


def access_data(dataset_name):
    """Load (comment, score) entries for a dataset, keeping only scores 5, 2 and 1.

    Opens the key-value storage stored at
    ``./persistent_index/<dataset_name>.sqlite`` and returns two parallel
    lists: the comment texts and their scores.
    """

    db_path = f'./persistent_index/{dataset_name}.sqlite'
    storage = kvs(db_filename=db_path, table_name='kvs')

    eligible_scores = {5, 2, 1}

    comments, scores = [], []
    for entry in storage.get_all():
        # entry[0] is the comment text, entry[1] its score
        if entry[1] in eligible_scores:
            comments.append(entry[0])
            scores.append(entry[1])

    return comments, scores


def upsample_by_score(comments, scores, scores_to_upsample, multiplier=3):
    """Duplicate the samples whose score belongs to ``scores_to_upsample``.

    Samples with a matching score are emitted ``multiplier`` times in a row;
    all other samples are emitted once. Input order is preserved.

    Returns:
        A pair of new lists ``(upsampled_comments, upsampled_scores)``.
    """

    out_comments, out_scores = [], []

    for idx in range(len(comments)):
        label = scores[idx]
        sample = comments[idx]
        copies = multiplier if label in scores_to_upsample else 1
        for _ in range(copies):
            out_comments.append(sample)
            out_scores.append(label)

    return out_comments, out_scores
    

# freq_analysis(comments)

# Text substitutions; not referenced by any other code visible in this cell.
normalize_rules = {'\n\n': '\n'}

# Tokens excluded when building the bags (passed to generate_bows below).
stop_words = {
    # Russian words ("advantages" / "disadvantages") - presumably section
    # headers of the review form; TODO confirm
    'Достоинства', 'Недостатки',
    # short Ukrainian function words
    'і', 'з', 'на', 'в', 'що', 'як', 'то', 'у', 'це', 'для',
    # punctuation tokens
    '.', ',', ')', '(', ':',
}


# Fetch from storage
comments, scores = access_data('phones')

# Process comments into bags of words

morph = pymorphy2.MorphAnalyzer(lang='uk')

bags = generate_bows(comments, stop_words, morph)
# bags = generate_bags_of_ngrams(comments, stop_words, morph, n=2)

# Vectorize the bags into a dense feature matrix (one row per comment)
v = dv(sparse=False)
X = v.fit_transform(bags)

# Homogenize scores into two classes - positive (1) and negative (0).
# access_data only yields scores 5, 2 and 1, so "not 5" means negative.
# Exactly one label is produced per score, which keeps Y aligned with the
# rows of X even if the set of eligible scores changes later.
Y = [1 if score == 5 else 0 for score in scores]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

# Upsample negative tags (label 0). This happens after the split, so copies
# of one row never land on both sides of it.
# NOTE(review): the test split is upsampled as well, which changes its class
# balance and therefore the support/averages in the reports - confirm intended.
scores_to_upsample = {0}
mult = 5
X_train, Y_train = upsample_by_score(X_train, Y_train, scores_to_upsample, multiplier=mult)
X_test, Y_test = upsample_by_score(X_test, Y_test, scores_to_upsample, multiplier=mult)

# Print the share of positive (1) labels in the full, pre-upsampling label list
print(sum(Y)/len(Y))

0.904688463911166


In [7]:
# Classification and evaluation

from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn import svm

from sklearn.metrics import classification_report

def dtree_classify(X_train, X_test, Y_train, Y_test):
    """Fit a decision tree on the training split and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """

    model = tree.DecisionTreeClassifier().fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('DTree Classification:')
    print(classification_report(Y_test, predictions))

    return model

    
def nbayes_classify(X_train, X_test, Y_train, Y_test):
    """Fit Gaussian Naive Bayes on the training split and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """
    model = GaussianNB().fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('Gaussian Naive Bayes Classification:')
    print(classification_report(Y_test, predictions))

    return model


    
def knn_classify(X_train, X_test, Y_train, Y_test):
    """Fit a 3-nearest-neighbours classifier and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """
    model = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('kNN Classification:')
    print(classification_report(Y_test, predictions))

    return model

def logreg_classify(X_train, X_test, Y_train, Y_test):
    """Fit a multinomial logistic regression (lbfgs) and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """
    settings = dict(random_state=0, solver='lbfgs', multi_class='multinomial')
    model = LogisticRegression(**settings).fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('LogReg Classification:')
    print(classification_report(Y_test, predictions))

    return model

def svm_classify(X_train, X_test, Y_train, Y_test):
    """Fit an SVC with ``gamma='scale'`` and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """
    model = svm.SVC(gamma='scale').fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('SVM Classification:')
    print(classification_report(Y_test, predictions))

    return model

def perceptron_classify(X_train, X_test, Y_train, Y_test):
    """Fit a Perceptron (max 100 iterations, tol 1e-2) and print its test report.

    Returns whatever ``fit`` returns (the fitted estimator in scikit-learn).
    """
    model = Perceptron(max_iter=100, tol=1e-2).fit(X_train, Y_train)
    predictions = model.predict(X_test)

    print('Perceptron Classification:')
    print(classification_report(Y_test, predictions))

    return model

# Train and evaluate every classifier on the same train/test split; each call
# prints its own classification report. The final call is a bare expression,
# so its return value (the Perceptron) is what the cell displays as its result.
dtree_classify(X_train, X_test, Y_train, Y_test)
nbayes_classify(X_train, X_test, Y_train, Y_test)
knn_classify(X_train, X_test, Y_train, Y_test)
logreg_classify(X_train, X_test, Y_train, Y_test)
svm_classify(X_train, X_test, Y_train, Y_test)
perceptron_classify(X_train, X_test, Y_train, Y_test)


DTree Classification:
              precision    recall  f1-score   support

           0       0.75      0.41      0.53       540
           1       0.74      0.92      0.82       962

   micro avg       0.74      0.74      0.74      1502
   macro avg       0.74      0.67      0.67      1502
weighted avg       0.74      0.74      0.71      1502

Gaussian Naive Bayes Classification:
              precision    recall  f1-score   support

           0       0.39      0.24      0.30       540
           1       0.65      0.79      0.71       962

   micro avg       0.59      0.59      0.59      1502
   macro avg       0.52      0.52      0.51      1502
weighted avg       0.56      0.59      0.56      1502

kNN Classification:
              precision    recall  f1-score   support

           0       0.58      0.08      0.15       540
           1       0.65      0.97      0.78       962

   micro avg       0.65      0.65      0.65      1502
   macro avg       0.62      0.53      0.46      

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=100, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.01,
      validation_fraction=0.1, verbose=0, warm_start=False)

## Conclusions

* Larger and more diverse datasets guarantee better results. This is obvious, but still, it needs to be stated that smaller categories (proteins, laptops) yield worse model quality than larger ones (mobile phones).
* Among all classifiers tested above, decision trees and logistic regression appeared to have the highest overall scores.
* Experiments show that, counterintuitively, bags of words appear to perform better than collections of 2-grams. n-grams make learning and classification __much__ slower, though.
* Again, counterintuitively, in our case, both lemmatization and stopword filtering have been somewhat effective for bags of __words__ and removing these processes from the pipeline decreased model scores.
* Tweaking classifier parameters may be useful to improve model quality, but only up to a certain point.
* Finally, it has been observed that upsampling data by underrepresented tags __drastically__ improves recall and f1 for the underrepresented class. However, oversampling the same data negatively impacts quality for both classes.