In [64]:
import re
from collections import Counter, defaultdict
from spacy.lang.xx import MultiLanguage
import pymorphy2
import pandas as pd
import numpy as np
morph = pymorphy2.MorphAnalyzer(lang="uk")
nlp = MultiLanguage()
tokenizer = MultiLanguage().Defaults.create_tokenizer(nlp)
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
with open("stop-words.txt", "r") as fl:
    stop_words = fl.read().split("\n")
ADVANTANGE_TXT = "Переваги:\xa0"
DRAWBACK_TXT = "Недоліки:\xa0"
SCRAPE_FILE = "scrape_feedbacks.csv"

In [65]:
df = pd.read_csv(SCRAPE_FILE)
#remove duplicates
df = df.drop_duplicates()
df['stars'].value_counts()
# df['stars'] = df['stars'].map({1:1,2:1,3:2,4:2,5:3})

5    2353
4     998
3     239
1     135
2     128
Name: stars, dtype: int64

Data looks kinda biased. I tried to cut 5 and 4 but does not really help. Let's split it to train and test

In [66]:

from sklearn.model_selection import train_test_split
test_size = 0.2
x = df["text"] +" "+df["advantage"].astype(str)+" "+df["drawback"].astype(str)
y = df["stars"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, stratify=y)

NB methods:

In [67]:

tone_dict = {}
def count_occurences(x_train, y_train):
    occurences = defaultdict(Counter)
    
    for text, cls in zip(x_train, y_train):
        for token in tokenizer(text):
            normal_form = morph.parse(token.text)[0].normal_form
            occurences[cls].update([normal_form])
    print("Number of words by classes, {}".format([(cls, len(dc)) for cls, dc in occurences.items()]))
    return occurences

def get_metrics(y_true, prediction):
    print("RECALL: {}".format(recall_score(y_true, prediction, average='weighted')))
    print("PRECISION: {}".format(precision_score(y_true, prediction, average='weighted')))
    print("F1: {}".format(f1_score(y_true, prediction, average='weighted')))
    print("ACCURACY: {}".format(accuracy_score(y_true, prediction)))


def predict(vocabulary_pr, texts):
    prediction = []
    tone_val = [0.0001 for i in range(len(classes))]
    for text in texts:
        probs_stats = []
#         probs_tone = []
        for token in tokenizer(text):
            normal_form = morph.parse(token.text)[0].normal_form
            if not normal_form in stop_words:
                if normal_form in vocabulary_pr:
                    probs_stats.append(vocabulary_pr[normal_form])
                        
#                         print(normal_form, vocabulary_pr[normal_form])
                if normal_form in tone_dict:
                    tone_val_c = tone_val.copy()
                    tone_val_c[tone_dict[normal_form]-1] = 0.999
                    probs_stats.append(tone_val_c)
        probs_stats = np.prod(np.array(probs_stats), axis=0)
#         if probs_tone:
#             import ipdb; ipdb.set_trace()
#             probs_tone = np.prod(np.array(probs_tone), axis=0)
            
        prediction.append(np.argmax(probs_stats)+1)
    return prediction


In [18]:
occurences = count_occurences(x_train, y_train)

Number of words by classes, [(5, 8715), (4, 5789), (1, 1808), (2, 1954), (3, 2881)]


In [36]:
classes = sorted(occurences.keys())
print(classes)
occurences_by_cls = y_train.value_counts()
vocabulary = set([item for sublist in [occurences[cls].keys() for cls in classes] for item in sublist])
vocabulary_pr = {}
def train():
    for word in vocabulary:
        if word not in stop_words:
            occurences_word_by_cls = {cls: occurences[cls][word] for cls in classes}
            occurences_word_total = sum(occurences_word_by_cls.values())
            probs = []
            for cls in classes:
#                 print("occurences_word_by_cls", occurences_word_by_cls[cls])
#                 print("occurences_by_cls", occurences_by_cls[cls])
                pr = ((occurences_word_by_cls[cls]/occurences_by_cls[cls])*(occurences_by_cls[cls]/len(x_train)))/((occurences_word_total)/(len(x_train)))
                if pr == 0:
                    pr = 0.0001
                elif pr >= 1:
                    pr = 0.9999
                probs.append(pr)
            vocabulary_pr[word] = probs
#             print(word, probs)  
  
train()
len(vocabulary_pr.keys())

[1, 2, 3, 4, 5]


12567

In [37]:
print("TRAINING SET METRICS:")
get_metrics(y_train, predict(vocabulary_pr, x_train))
print("===================")
print("TEST SET METRICS:")
get_metrics(y_test, predict(vocabulary_pr, x_test))


TRAINING SET METRICS:
RECALL: 0.7495133030499675
PRECISION: 0.7748653466808507
F1: 0.7201803576103017
ACCURACY: 0.7495133030499675
TEST SET METRICS:
RECALL: 0.5330739299610895
PRECISION: 0.43824155181997665
F1: 0.4779723303069608
ACCURACY: 0.5330739299610895


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


It doesn't look very precise, let's try scikit tools:

In [78]:
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": stop_words,
#              "ngram_range": (3, 3),
#              "min_df": 1,
#              "max_df": 1.0,
#              "preprocessor": None,#Preprocessor(),
#              "max_features": 3500,
#              "norm": None,
             "use_idf": True
             }
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(x_train)
test = vectorizer.transform(x_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
print("TEST SET METRICS:")
get_metrics(y_test, pred)

TEST SET METRICS:
RECALL: 0.6108949416342413
PRECISION: 0.37319262971430306
F1: 0.4633357769882893
ACCURACY: 0.6108949416342413


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


It's pretty much the same result. Let'saAdd tones .)

In [63]:
tone_dict = pd.read_csv("tone-dict-uk.tsv", delimiter='\t', names=["word", "mark"])
#bad(1), neutral(2) and good(3):
tone_mapping = {-2: 1, -1: 2, 0:3, 1:4, 2:5}
tone_dict["mark"] = tone_dict["mark"].map(tone_mapping)
tone_dict = tone_dict.set_index('word').T.to_dict('int')['mark']

In [39]:
# import ipdb; ipdb.set_trace()
print("TRAINING SET METRICS:")
get_metrics(y_train, predict(vocabulary_pr, x_train))
print("===================")
print("TEST SET METRICS:")
get_metrics(y_test, predict(vocabulary_pr, x_test))

TRAINING SET METRICS:
RECALL: 0.7495133030499675
PRECISION: 0.7748653466808507
F1: 0.7201803576103017
ACCURACY: 0.7495133030499675
TEST SET METRICS:
RECALL: 0.5330739299610895
PRECISION: 0.43824155181997665
F1: 0.4779723303069608
ACCURACY: 0.5330739299610895


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [40]:
#try to make it better
prev_len = len(vocabulary_pr)
for i in vocabulary:
    #numbers
    try:
        float(i)
        vocabulary_pr.pop(i)
        continue
    except:
        continue
    #special symbols
    if i in (".", "?", "!", "=", "+", "/" ,";" ":", "*", "(", ")"):
        vocabulary_pr.pop(i)
    #short
    elif len(i) < 3:
        vocabulary_pr.pop(i)
        
#     elif sum([occurences[cls][i] for cls in classes]) <2:
#         print(i)
#         vocabulary_pr.pop(i)
#     print(sum([occurences[cls][i] for cls in classes]))
print("changed vocabulary length from {} to {}".format(prev_len, len(vocabulary_pr)))
print("TRAINING SET METRICS:")
get_metrics(y_train, predict(vocabulary_pr, x_train))
print("===================")
print("TEST SET METRICS:")
get_metrics(y_test, predict(vocabulary_pr, x_test))

changed vocabulary length from 12567 to 12298
TRAINING SET METRICS:
RECALL: 0.7498377676833226
PRECISION: 0.7739448156782516
F1: 0.7236799205156947
ACCURACY: 0.7498377676833226
TEST SET METRICS:
RECALL: 0.5291828793774319
PRECISION: 0.43813585715588804
F1: 0.4768450629441803
ACCURACY: 0.5291828793774319


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
