In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [5]:
import json
from pandas.io.json import json_normalize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score,
                             classification_report)
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler, MaxAbsScaler
import numpy as np
from nltk.corpus import stopwords
import xgboost as xgb
from pathlib import Path
from tokenize_uk.tokenize_uk import tokenize_words

In [7]:
with open("stop_words.txt") as f:
    stop = f.readlines()
stop = [el.strip() for el in stop]

In [80]:
tone = pd.read_csv("tone-dict.tsv", sep='\t', header=None, names=["word", "sentiment"])
tone["word"] = tone["word"].str.lower()
tone_map = tone.set_index("word").to_dict()["sentiment"]

In [9]:
def calc_metrics(y_test, pred, proba=None, labels=["1", "2", "3", "4", "5"], print_=True,
                 average="macro", report=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred, average=average)
    output["Precision"] = precision_score(y_test, pred, average=average)
    output["F1"] = f1_score(y_test, pred, average=average)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    if report:
        print(classification_report(y_test, pred, labels))
    return output

In [10]:
data = []
for file in Path().glob("items*.json"):
    with open(file, "r") as f:
        data.extend(json.load(f))

In [11]:
df = json_normalize(data, record_path="reviews", meta=["path", "price", "title"])

In [13]:
df["review"] = df.apply(lambda x: x['text'] + " " + x['pros']+ " " + x['cons'], axis=1)
#frq = df.groupby("stars")["text"].count()
#df["weights"] = df["stars"].map({"1": 1, "2": 1, "3": 1, "4": 5, "5": 3})

In [8]:
df.head(1)
df["stars"].describe()

Unnamed: 0,author,cons,date,link,pros,stars,text,path,price,title,review
0,Наталка,не виявили,2016-7-07,https://bt.rozetka.com.ua/2030437/p2030437/com...,"Якісний вироб, доступна ціна",5,"Чудова морозильна камера.Придбали ще взимку, в...","[Интернет-супермаркет №, Бытовая техника, инте...",11199,Встраиваемая морозильная камера Freggia LSB0010,"Чудова морозильна камера.Придбали ще взимку, в..."


count    2755.000000
mean        4.458802
std         0.956587
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: stars, dtype: float64

In [398]:
df.loc[df.stars==4, ["review", "pros", "cons"]]

Unnamed: 0,review,pros,cons
15,"Питання: чаша на скільки літрів, в описі 4 л, ...",Не знаю,Не знаю
33,Дуже стильний холодильник. Працює майже безшум...,"Ціна, зовнішній вигляд","Тонка пластмаса на ящичках, наш був зломаним."
35,"добрий хлолдильник, мені подобається ціна та о...",ціна та об'єм,немає підставки для яєць
38,Адмін скільки буде коштувати доставка в Любеші...,,
42,"Це моя друга мультиварка. Довго вибирала , але...",Гарна.недорога.,Якість зборки. Чаша ззовні стирається .
43,В загальному мультіваркою задоволений. Класний...,"Класний вигляд, добре готує, функціональне меню.","короткий сетевий кабель, але можна купити інши..."
52,"Прикро, що функція ""рис"" не запускається, а в ...","Стильна, та багатофункціональна.","Функція рис не запускається, табло показує ""пр..."
53,"Привіт! Придбав сьогодні, пока неможу сказати ...",Низька ціна.,
54,"В загальному мультварка хороша Чудова функція""...","Чудова функція""тушіння""","Довго відбувається смаження , книга рецептів с..."
55,Мені подобається. Жодних нарікань на роботу не...,,


#### Distribution of classes

In [17]:
s = df.stars.value_counts()
pd.concat([s, s / s.sum()], axis=1)

Unnamed: 0,stars,stars.1
5,1832,0.664973
4,610,0.221416
3,153,0.055535
1,95,0.034483
2,65,0.023593


In [19]:
# Analyze cons
df["cons"].value_counts().head(10)
patt = "не має|не знайш|немає|не вияв|відсутн|нема|--"
cond = ((df["cons"].str.lower().str.contains(patt, regex=True)) | (df["cons"]==""))
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

               700
Немає           92
немає           85
-               34
не виявлено     31
Не виявлено     20
нема            20
Відсутні        19
Не виявила      16
Нема            15
Name: cons, dtype: int64

4.088316467341306

4.700239808153477

In [20]:
df["is_cons"] = cond.astype(int)

In [21]:
# Analyze pros
df["pros"].value_counts().head(10)
patt = "ціна|якість"
cond1 = (df["pros"].str.lower().str.contains(patt, regex=True)) | (df["pros"].str.len() < 10)
cond2 = df["pros"]==""
cond = cond2
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

                  626
Ціна               27
ціна               13
Дизайн              7
дешевий             6
Тиха                5
Ціна та якість      5
немає               5
+                   5
Дешевий             5
Name: pros, dtype: int64

4.465476749647722

4.436102236421725

#### Train / test split

In [22]:
y = df["stars"]
X = df.loc[:, ~df.columns.isin(["stars"])]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 2204, Num. of test: 551


#### TF - IDF

In [160]:
def validate_array(x):
    if len(x) > 0:
        return x
    return np.array([0])

In [240]:
def f(x):
    x = validate_array(np.array(x))
    nonzero = np.nonzero(x)[0].shape[0]
    pos = validate_array(x[x > 0])
    neg = validate_array(x[x < 0])
    max_pos = np.max(pos)
    min_pos = np.min(pos)
    mean_pos = np.mean(pos)
    std_pos = np.std(pos)
    sum_pos = np.sum(pos)
    max_neg = np.max(neg)
    min_neg = np.min(neg)
    mean_neg = np.mean(neg)
    std_neg = np.std(neg)
    sum_neg = np.sum(neg)
    return pd.Series({"nonzero": nonzero, 
                      "mean": np.mean(x),
                      "max": np.max(x),
                      "min": np.min(x),
                      "std": np.std(x),
                      "sum": np.sum(x),
                      "max_pos": max_pos,
                      "min_pos": min_pos,
                      "mean_pos": mean_pos,
                      "std_pos": std_pos,
                      "sum_pos": sum_pos,
                      "max_neg": max_neg,
                      "min_neg": min_neg,
                      "mean_neg": mean_neg,
                      "std_neg": std_neg,
                      "sum_neg": sum_neg 
                       })

In [202]:
def add_tone(X, tone_map=tone_map, var="review"):
    temp = X[var].map(lambda x: [tone_map.get(word.lower(), 0) for word in tokenize_words(x)])
    return temp.apply(f)

In [250]:
def apply_transformer(train, test, transformer):
    if transformer is not None:
        train = transformer.fit_transform(train)
        test = transformer.transform(test)
        return train, test
    return train, test

In [402]:
def build_features(X_train, X_test, transformer=None, var="review", features=None,
                   vectorizer=None):
    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf_text":
            train = vectorizer.fit_transform(X_train["text"]).toarray()
            test = vectorizer.transform(X_test["text"]).toarray()
            f_train.append(train)
            f_test.append(test)
        elif feature == "tfidf_pros":
            train = vectorizer.fit_transform(X_train["pros"]).toarray()
            test = vectorizer.transform(X_test["pros"]).toarray()
            f_train.append(train)
            f_test.append(test)
        elif feature == "tfidf_cons":
            train = vectorizer.fit_transform(X_train["cons"]).toarray()
            test = vectorizer.transform(X_test["cons"]).toarray()
            f_train.append(train)
            f_test.append(test)
        elif feature == "sentiment":
            train = add_tone(X_train, var=var).values
            test = add_tone(X_test, var=var).values
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature =="is_cons":
            train = X_train["is_cons"].values[:, np.newaxis]
            test = X_test["is_cons"].values[:, np.newaxis]
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature == "len":
            train = X_train[["text", "pros", "cons"]].applymap(len).values
            test = X_test[["text", "pros", "cons"]].applymap(len).values
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
    return np.concatenate((f_train), axis=1), np.concatenate((f_test), axis=1)

In [227]:
quant = QuantileTransformer(n_quantiles=10, output_distribution="uniform")
minmax = MinMaxScaler()
std = StandardScaler()
maxabs = MaxAbsScaler()
transformer = minmax

In [405]:
tf_params = {"lowercase": 1,
             "analyzer": "word",
             "stop_words": stop,
             "ngram_range": (1, 1),
             "min_df": 1,
             "max_df": 0.4,
             "preprocessor": None,
             "max_features": 1500*1 or None,
             "norm": 'l2'*1,
             "use_idf": 1,
             "smooth_idf": 0,
             "sublinear_tf": 0, 
             "tokenizer": None#tokenize_words
             }
var = "review"
features = [
            "tfidf_text",
            "tfidf_pros",
            "tfidf_cons",
            'sentiment', 
            'len', 
            'is_cons'
            ]
vectorizer = TfidfVectorizer(**tf_params)
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var)
print(f"Features: {train.shape[1]}")

Features: 4520


In [406]:
clf = MultinomialNB(alpha=2.4, class_prior=[0.2]*5)
#clf = GaussianNB([0.2]*5)
clf.fit(train, y_train)
pred = clf.predict(test)
proba = clf.predict_proba(test)
metrics = calc_metrics(y_test, pred, proba=None, average="weighted")

MultinomialNB(alpha=2.4, class_prior=[0.2, 0.2, 0.2, 0.2, 0.2],
       fit_prior=True)

Recall: 0.668
Precision: 0.555
F1: 0.540
accuracy: 0.668
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0       1      18
2       0       0       0       0      13
3       0       0       0       1      30
4       0       0       0       2     120
5       0       0       0       0     366
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        31
          4       0.50      0.02      0.03       122
          5       0.67      1.00      0.80       366

avg / total       0.56      0.67      0.54       551



#### Fit XGBoost

In [386]:
params = {}
params['learning_rate'] = 0.1
params['n_estimators'] = 1000
params['max_depth'] = 5
params['min_child_weight'] = 100
params['gamma'] = 0
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['objective'] = 'multi:softmax'
params['seed'] = 27
params['n_jobs'] = -1
params["eval_metric"] = ["mlogloss", "merror"]
params["early_stopping_rounds"] = 50
params['num_class'] = df["stars"].nunique()

In [407]:
dtrain = xgb.DMatrix(train, y_train-1)#, weight=X_train["weights"])
dtest = xgb.DMatrix(test, y_test-1)#, weight=X_test["weights"])
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [408]:
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-mlogloss:1.50747	train-merror:0.334846	eval-mlogloss:1.50713	eval-merror:0.335753
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 50 rounds.
[50]	train-mlogloss:0.850492	train-merror:0.321234	eval-mlogloss:0.878203	eval-merror:0.319419
Stopping. Best iteration:
[27]	train-mlogloss:0.886996	train-merror:0.326679	eval-mlogloss:0.908064	eval-merror:0.315789



In [409]:
pred_xgb = (model.predict(dtest)+1).astype(int)
xgb_metrics = calc_metrics(y_test, pred_xgb, average="weighted")

Recall: 0.681
Precision: 0.565
F1: 0.604
accuracy: 0.681
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0       9      10
2       0       0       0       4       9
3       0       0       0       7      24
4       0       0       0      25      97
5       0       0       0      16     350
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        31
          4       0.41      0.20      0.27       122
          5       0.71      0.96      0.82       366

avg / total       0.57      0.68      0.60       551

