In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [5]:
import json
from pandas.io.json import json_normalize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score,
                             classification_report)
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler, MaxAbsScaler
import numpy as np
from nltk.corpus import stopwords
import xgboost as xgb
from pathlib import Path
from tokenize_uk.tokenize_uk import tokenize_words
from operator import itemgetter
import pymorphy2

In [107]:
%aimport config
%aimport target_encoding
from config import params
from target_encoding import TargetEncoder

In [7]:
def calc_metrics(y_test, pred, proba=None, labels=["1", "2", "3", "4", "5"], print_=True,
                 average="macro", report=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred, average=average)
    output["Precision"] = precision_score(y_test, pred, average=average)
    output["F1"] = f1_score(y_test, pred, average=average)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    if report:
        output["report"] = classification_report(y_test, pred, labels)
        print(output["report"])
    return output

In [8]:
with open("stop_words.txt") as file:
    stop = file.readlines()
stop = [el.strip() for el in stop]

In [9]:
tone = pd.read_csv("tone-dict.tsv", sep='\t', header=None, names=["word", "sentiment"])
tone["word"] = tone["word"].str.lower()
tone_map = tone.set_index("word").to_dict()["sentiment"]

In [264]:
data = []
for file in Path().glob("smart*.json"):
    with open(file, "r") as f_in:
        data.extend(json.load(f_in))

In [265]:
df = json_normalize(data, record_path="reviews", meta=["path", "price", "title"])

In [266]:
df["review"] = df.apply(lambda x: x['text'] + " " + x['pros']+ " " + x['cons'], axis=1)
df["id"] = df["link"].str.extract(r"/p(?P<id>[0-9]+)")
df["category"] = df["path"].map(lambda x: x[-2])
#frq = df.groupby("stars")["text"].count()
#df["weights"] = df["stars"].map({"1": 1, "2": 1, "3": 1, "4": 5, "5": 3})

In [267]:
df.head(1)
df["stars"].describe()

Unnamed: 0,author,cons,date,link,pros,stars,text,path,price,title,review,id,category
0,балацька лариса,До телефона треба зразу брати і захисник скло ...,2018-3-24,https://rozetka.com.ua/zte_blade_v8_gray/p2573...,Камера і батарея,5,"Купили телефон в подарунок дітям, зразу два зо...","[Интернет-супермаркет №, Смартфоны, ТВ и элект...",4499,ZTE Blade V8 Gray,"Купили телефон в подарунок дітям, зразу два зо...",25738433,Мобильные телефоны


count    3797.000000
mean        4.359494
std         1.004769
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: stars, dtype: float64

#### Distribution of classes

In [268]:
s = df.stars.value_counts()
print(pd.concat([s, s*100 / s.sum()], axis=1).to_string())

   stars      stars
5   2293  60.389781
4    988  26.020543
3    243   6.399789
1    139   3.660785
2    134   3.529102


So, we deal with an imbalanced multiclass classification problem

In [13]:
# Analyze cons
df["cons"].value_counts().head(10)
patt = "не має|не знайш|немає|не вияв|відсутн|нема|--"
cond = ((df["cons"].str.lower().str.contains(patt, regex=True)) | (df["cons"]==""))
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

                                                                                     1045
Немає                                                                                  82
немає                                                                                  40
-                                                                                      32
Нема                                                                                   20
Ціна, батарея,вимикається на морозі,швидко дряпається без захисного скла і чохла.      17
Немає.                                                                                 17
не виявив                                                                              16
Поки що не виявлено недоліків!                                                         16
Крім "серйозної" ціни не знайшов.                                                      16
Name: cons, dtype: int64

4.074193548387097

4.63345379452762

Ratings differ considerably for reviews with words in section Недостатки  as "немає", "не виявив", "не знайшов", "відсутні", тощо

In [14]:
# Analyze pros
df["pros"].value_counts().head(10)
patt = "ціна|якість"
cond1 = (df["pros"].str.lower().str.contains(patt, regex=True)) | (df["pros"].str.len() < 10)
cond2 = df["pros"]==""
cond = cond2
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

                                                                                         1064
Ціна                                                                                       44
ціна                                                                                       25
Батарея                                                                                    19
Зв‘язок, камера, чіткість роботи, зручний.                                                 16
Це супер-машина, а не телефон                                                              16
Прекрасна камера, достатньо пам'яті (я про 128 GB) donating в фонд боротьби зі СНІДом      16
Прекрасна камера, зручний в управлінні!                                                    16
-                                                                                          14
Дизайн                                                                                     10
Name: pros, dtype: int64

4.311745334796926

4.482142857142857

#### Train / test split

Use 20% of data for testing and stratify according to the target variable

In [269]:
y = df["stars"]
X = df.loc[:, ~df.columns.isin(["stars"])]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 3037, Num. of test: 760


#### Baseline

Let's evaluate performance of the very simple baseline prediction algorithm - <b>everything is a 5 star review (just take a majority class)</b>

In [270]:
base_pred = np.full(y_test.shape, 5)
base_metrics = calc_metrics(y_test, base_pred, proba=None, average="weighted")

Recall: 0.604
Precision: 0.365
F1: 0.455
accuracy: 0.604
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0       0      28
2       0       0       0       0      27
3       0       0       0       0      48
4       0       0       0       0     198
5       0       0       0       0     459
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        28
          2       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        48
          4       0.00      0.00      0.00       198
          5       0.60      1.00      0.75       459

avg / total       0.36      0.60      0.45       760



#### TF - IDF & other fesatures

In [35]:
def validate_array(x):
    if len(x) > 0:
        return x
    return np.array([0])

In [36]:
def f(x):
    x = validate_array(np.array(x))
    nonzero = np.nonzero(x)[0].shape[0]
    n_pos = (x > 0).sum()
    n_neg = (x < 0).sum()
    pos = validate_array(x[x > 0])
    neg = validate_array(x[x < 0])
    max_pos = np.max(pos)
    min_pos = np.min(pos)
    mean_pos = np.mean(pos)
    std_pos = np.std(pos)
    sum_pos = np.sum(pos)
    max_neg = np.max(neg)
    min_neg = np.min(neg)
    mean_neg = np.mean(neg)
    std_neg = np.std(neg)
    sum_neg = np.sum(neg)
    return pd.Series({"nonzero": nonzero, 
                      "nonzero_ratio": nonzero / len(x) if len(x)>0 else 0,
                      "n_pos": n_pos,
                      "n_pos_ratio": n_pos / nonzero if nonzero>0 else 0,
                      "n_neg": n_neg,
                      "n_neg_ratio": n_neg / nonzero if nonzero>0 else 0,
                      "mean": np.mean(x),
                      "max": np.max(x),
                      "min": np.min(x),
                      "std": np.std(x),
                      "sum": np.sum(x),
                      "max_pos": max_pos,
                      "min_pos": min_pos,
                      "mean_pos": mean_pos,
                      "std_pos": std_pos,
                      "sum_pos": sum_pos,
                      "max_neg": max_neg,
                      "min_neg": min_neg,
                      "mean_neg": mean_neg,
                      "std_neg": std_neg,
                      "sum_neg": sum_neg 
                       })

In [37]:
def condition_cons(df, var="cons", patt="не має|не знайш|немає|не вияв|відсутн|нема|--"):
    cond = ((df[var].str.lower().str.contains(patt, regex=True)) | (df[var]==""))
    return cond

In [38]:
def add_tone(X, tone_map=tone_map, var="review"):
    temp = X[var].map(lambda x: [tone_map.get(word.lower(), 0) for word in tokenize_words(x)])
    return temp.apply(f)

In [39]:
def apply_transformer(train, test, transformer):
    if transformer is not None:
        train = transformer.fit_transform(train)
        test = transformer.transform(test)
        return train, test
    return train, test

In [85]:
def build_length_features(df):
    df = df[["text", "pros", "cons"]].applymap(len)
    df["text_cons"] = df.apply(lambda x: x["text"]/x["cons"], axis=1)
    df["text_pros"] = df.apply(lambda x: x["text"]/x["cons"], axis=1)
    df["pros_cons"] = df.apply(lambda x: x["pros"]/x["cons"], axis=1)
    df = df.values
    df[~np.isfinite(df)] = 0
    return df

In [230]:
def build_features(X_train, X_test, y_train=None, transformer=None, var="review", features=None,
                   vectorizer=None, encoder=None):
    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            train = vectorizer.fit_transform(X_train[var]).toarray()
            test = vectorizer.transform(X_test[var]).toarray()
            f_train.append(train)
            f_test.append(test)
        elif feature == "sentiment":
            train = add_tone(X_train, var=var).values
            test = add_tone(X_test, var=var).values
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature == "len":
            train = build_length_features(X_train)
            test = build_length_features(X_test)
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature == "is_cons":
            train = condition_cons(X_train).astype(int)[:, np.newaxis]
            test = condition_cons(X_test).astype(int)[:, np.newaxis]
            f_train.append(train)
            f_test.append(test)
        elif feature == "target_encode":
            train = encoder.fit_transform(X_train[["id", "category"]], y_train).values
            test = encoder.transform(X_test[["id", "category"]]).values
            f_train.append(train)
            f_test.append(test)
    return np.concatenate((f_train), axis=1), np.concatenate((f_test), axis=1)

In [231]:
def fit_nb(train, y_train, test, y_test, alpha=2.5, priors=None):
    clf = MultinomialNB(alpha=alpha, class_prior=priors)
    clf.fit(train, y_train)
    pred = clf.predict(test)
    proba = clf.predict_proba(test)
    metrics = calc_metrics(y_test, pred, proba=None, average="weighted")
    return pred, proba, metrics

In [232]:
quant = QuantileTransformer(n_quantiles=10, output_distribution="uniform")
minmax = MinMaxScaler()
std = StandardScaler()
maxabs = MaxAbsScaler()
transformer = minmax
encoder = TargetEncoder(columns=["id", "category"], feature_names={"id": "id_mean",
                                                                   "category": "category_mean"})

Try only TF-IDF (with stop words if we use analyzer=word)

In [271]:
tf_params = {"lowercase": 1,
             "analyzer": "word",
             "stop_words": stop,
             "ngram_range": (1, 1),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 3500*1 or None,
             "norm": 'l2'*0,
             "use_idf": 0,
             "smooth_idf": 0,
             "sublinear_tf": 0, 
             "tokenizer": None#tokenize_words
             }
var = "review"
features = [
            "tfidf",
            #"target_encode",
            #"sentiment", 
            #"len", 
            #"is_cons"
            ]
vectorizer = TfidfVectorizer(**tf_params)
train, test = build_features(X_train, X_test, y_train, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var, encoder=encoder)
print(f"Features: {train.shape[1]}")

Features: 3500


In [272]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.634
Precision: 0.561
F1: 0.586
accuracy: 0.634
   pred_1  pred_2  pred_3  pred_4  pred_5
1       2       0       2      13      11
2       1       0       3      13      10
3       2       0       1      27      18
4       0       0       4      73     121
5       0       1       1      51     406
             precision    recall  f1-score   support

          1       0.40      0.07      0.12        28
          2       0.00      0.00      0.00        27
          3       0.09      0.02      0.03        48
          4       0.41      0.37      0.39       198
          5       0.72      0.88      0.79       459

avg / total       0.56      0.63      0.59       760



##### Add sentiments

In [273]:
features = [
            "tfidf",
            "sentiment", 
            #"len", 
            #"is_cons",
            #"target_encode"
            ]
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var)
print(f"Features: {train.shape[1]}")

Features: 3521


In [274]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.646
Precision: 0.558
F1: 0.586
accuracy: 0.646
   pred_1  pred_2  pred_3  pred_4  pred_5
1       1       0       2      11      14
2       1       0       2      15       9
3       0       0       0      30      18
4       0       0       0      68     130
5       0       0       0      37     422
             precision    recall  f1-score   support

          1       0.50      0.04      0.07        28
          2       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        48
          4       0.42      0.34      0.38       198
          5       0.71      0.92      0.80       459

avg / total       0.56      0.65      0.59       760



Accuracy as well as other metrics <b>increased</b> after adding sentiment features.

<b>Add more features</b> - target encoding of the id field, length for review/pros/cons sections, divisions of lengths, whether cons section match pattern "не має|не знайш|немає|не вияв|відсутн|нема|--"

In [275]:
features = [
            "tfidf",
            "target_encode",
            "sentiment", 
            "len", 
            "is_cons"
            ]
train, test = build_features(X_train, X_test, y_train, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var, encoder=encoder)
print(f"Features: {train.shape[1]}")

Features: 3530


In [277]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.646
Precision: 0.537
F1: 0.580
accuracy: 0.646
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       1      12      15
2       0       0       0      17      10
3       0       0       0      23      25
4       0       0       2      64     132
5       0       0       1      31     427
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        28
          2       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        48
          4       0.44      0.32      0.37       198
          5       0.70      0.93      0.80       459

avg / total       0.54      0.65      0.58       760



##### Add <b>lemmatization</b>

In [278]:
morph = pymorphy2.MorphAnalyzer()

In [279]:
def get_lemma(word, idx=0, analyzer=morph):
    p = morph.parse(word)
    if p:
        return p[idx].normal_form
    return word

In [280]:
def preprocessor(text, lower=True, idx=0, analyzer=morph, stop=stop):
    words = tokenize_words(text)
    if lower:
        words = [el.lower() for el in words]
    lemmas = [get_lemma(word, idx, analyzer) for word in words]
    if stop:
        lemmas = [lemma for lemma in lemmas if lemma not in stop]
    return " ".join(lemmas)

In [104]:
#df[var] = df[var].map(preprocessor)

In [296]:
tf_params_prep = tf_params.copy()
tf_params_prep["preprocessor"] = preprocessor
features = [
            "tfidf",
            "target_encode",
            "sentiment", 
            "len", 
            "is_cons"
            ]
vectorizer = TfidfVectorizer(**tf_params_prep)
train, test = build_features(X_train, X_test, y_train, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var, encoder=encoder)
print(f"Features: {train.shape[1]}")

Features: 3530


In [297]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.643
Precision: 0.571
F1: 0.581
accuracy: 0.643
   pred_1  pred_2  pred_3  pred_4  pred_5
1       1       0       1      12      14
2       0       0       0      18       9
3       0       0       0      28      20
4       0       0       1      65     132
5       0       0       0      36     423
             precision    recall  f1-score   support

          1       1.00      0.04      0.07        28
          2       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        48
          4       0.41      0.33      0.36       198
          5       0.71      0.92      0.80       459

avg / total       0.57      0.64      0.58       760



It works a bit worse when lemmatization is used

#### Fit XGBoost

In [298]:
tf_params_xgb = tf_params.copy()
tf_params_prep["use_idf"] = 1
tf_params_xgb["norm"] = "l2"
tf_params_xgb["preprocessor"] = preprocessor
features = [
            "tfidf",
            "target_encode",
            "sentiment", 
            "len", 
            "is_cons"
            ]
vectorizer = TfidfVectorizer(**tf_params_xgb)
train, test = build_features(X_train, X_test, y_train, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var, encoder=encoder)
print(f"Features: {train.shape[1]}")

Features: 3530


In [299]:
params['num_class'] = df["stars"].nunique()

In [300]:
dtrain = xgb.DMatrix(train, y_train-1)#, weight=X_train["weights"])
dtest = xgb.DMatrix(test, y_test-1)#, weight=X_test["weights"])
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [305]:
model = xgb.train(dtrain=dtrain, num_boost_round=77,#params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-mlogloss:1.52437	train-merror:0.382614	eval-mlogloss:1.52473	eval-merror:0.384211
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 50 rounds.
[50]	train-mlogloss:0.911149	train-merror:0.362858	eval-mlogloss:0.931587	eval-merror:0.367105
[76]	train-mlogloss:0.888813	train-merror:0.351992	eval-mlogloss:0.922265	eval-merror:0.351316


In [306]:
pred_xgb = (model.predict(dtest)+1).astype(int)
print(f"Best score (accuracy): {1-model.best_score}")
xgb_metrics = calc_metrics(y_test, pred_xgb, average="weighted")

Best score (accuracy): 0.648684
Recall: 0.649
Precision: 0.564
F1: 0.573
accuracy: 0.649
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0      11      17
2       0       0       1      12      14
3       0       0       3      17      28
4       0       0       1      50     147
5       0       0       1      18     440
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        28
          2       0.00      0.00      0.00        27
          3       0.50      0.06      0.11        48
          4       0.46      0.25      0.33       198
          5       0.68      0.96      0.80       459

avg / total       0.56      0.65      0.57       760

