In [131]:
%load_ext autoreload
%autoreload 1

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [41]:
import json
from pandas.io.json import json_normalize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score,
                             classification_report)
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler, MaxAbsScaler
import numpy as np
from nltk.corpus import stopwords
import xgboost as xgb
from pathlib import Path
from tokenize_uk.tokenize_uk import tokenize_words
from operator import itemgetter
from scipy import stats

In [133]:
%aimport config
from config import params

In [35]:
def calc_metrics(y_test, pred, proba=None, labels=["1", "2", "3", "4", "5"], print_=True,
                 average="macro", report=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred, average=average)
    output["Precision"] = precision_score(y_test, pred, average=average)
    output["F1"] = f1_score(y_test, pred, average=average)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    if report:
        output["report"] = classification_report(y_test, pred, labels)
        print(output["report"])
    return output

In [3]:
with open("stop_words.txt") as f:
    stop = f.readlines()
stop = [el.strip() for el in stop]

In [6]:
tone = pd.read_csv("tone-dict.tsv", sep='\t', header=None, names=["word", "sentiment"])
tone["word"] = tone["word"].str.lower()
tone_map = tone.set_index("word").to_dict()["sentiment"]

In [7]:
data = []
for file in Path().glob("items*.json"):
    with open(file, "r") as f:
        data.extend(json.load(f))

In [8]:
df = json_normalize(data, record_path="reviews", meta=["path", "price", "title"])

In [9]:
df["review"] = df.apply(lambda x: x['text'] + " " + x['pros']+ " " + x['cons'], axis=1)
#frq = df.groupby("stars")["text"].count()
#df["weights"] = df["stars"].map({"1": 1, "2": 1, "3": 1, "4": 5, "5": 3})

In [10]:
df.head(1)
df["stars"].describe()

Unnamed: 0,author,cons,date,link,pros,stars,text,path,price,title,review
0,Наталка,не виявили,2016-7-07,https://bt.rozetka.com.ua/2030437/p2030437/com...,"Якісний вироб, доступна ціна",5,"Чудова морозильна камера.Придбали ще взимку, в...","[Интернет-супермаркет №, Бытовая техника, инте...",11199,Встраиваемая морозильная камера Freggia LSB0010,"Чудова морозильна камера.Придбали ще взимку, в..."


count    2755.000000
mean        4.458802
std         0.956587
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: stars, dtype: float64

#### Distribution of classes

In [11]:
s = df.stars.value_counts()
pd.concat([s, s / s.sum()], axis=1)

Unnamed: 0,stars,stars.1
5,1832,0.664973
4,610,0.221416
3,153,0.055535
1,95,0.034483
2,65,0.023593


So, we deal with an imbalanced multiclass classification problem

In [65]:
# Analyze cons
df["cons"].value_counts().head(10)
patt = "не має|не знайш|немає|не вияв|відсутн|нема|--"
cond = ((df["cons"].str.lower().str.contains(patt, regex=True)) | (df["cons"]==""))
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

               700
Немає           92
немає           85
-               34
не виявлено     31
Не виявлено     20
нема            20
Відсутні        19
Не виявила      16
Нема            15
Name: cons, dtype: int64

4.088316467341306

4.700239808153477

In [67]:
# Analyze pros
df["pros"].value_counts().head(10)
patt = "ціна|якість"
cond1 = (df["pros"].str.lower().str.contains(patt, regex=True)) | (df["pros"].str.len() < 10)
cond2 = df["pros"]==""
cond = cond2
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

                  626
Ціна               27
ціна               13
Дизайн              7
дешевий             6
Ціна-якість         5
Дешевий             5
Тиха                5
Ціна та якість      5
немає               5
Name: pros, dtype: int64

4.465476749647722

4.436102236421725

#### Train / test split

Use 20% of data for testing and stratify according to the target variable

In [12]:
y = df["stars"]
X = df.loc[:, ~df.columns.isin(["stars"])]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 2204, Num. of test: 551


#### Baseline

Let's evaluate performance of the very simple baseline prediction algorithm - <b>everything is a 5 star review (just take a majority class)</b>

In [37]:
base_pred = np.full(y_test.shape, 5)
base_metrics = calc_metrics(y_test, base_pred, proba=None, average="weighted")

Recall: 0.664
Precision: 0.441
F1: 0.530
accuracy: 0.664
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0       0      19
2       0       0       0       0      13
3       0       0       0       0      31
4       0       0       0       0     122
5       0       0       0       0     366
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        31
          4       0.00      0.00      0.00       122
          5       0.66      1.00      0.80       366

avg / total       0.44      0.66      0.53       551



#### TF - IDF & other fesatures

In [62]:
def validate_array(x):
    if len(x) > 0:
        return x
    return np.array([0])

In [352]:
def f(x):
    x = validate_array(np.array(x))
    nonzero = np.nonzero(x)[0].shape[0]
    n_pos = (x > 0).sum()
    n_neg = (x < 0).sum()
    pos = validate_array(x[x > 0])
    neg = validate_array(x[x < 0])
    max_pos = np.max(pos)
    min_pos = np.min(pos)
    mean_pos = np.mean(pos)
    std_pos = np.std(pos)
    sum_pos = np.sum(pos)
    max_neg = np.max(neg)
    min_neg = np.min(neg)
    mean_neg = np.mean(neg)
    std_neg = np.std(neg)
    sum_neg = np.sum(neg)
    return pd.Series({"nonzero": nonzero, 
                      "nonzero_ratio": nonzero / len(x) if len(x)>0 else 0,
                      "n_pos": n_pos,
                      "n_pos_ratio": n_pos / nonzero if nonzero>0 else 0,
                      "n_neg": n_neg,
                      "n_neg_ratio": n_neg / nonzero if nonzero>0 else 0,
                      "mean": np.mean(x),
                      "max": np.max(x),
                      "min": np.min(x),
                      "std": np.std(x),
                      "sum": np.sum(x),
                      "max_pos": max_pos,
                      "min_pos": min_pos,
                      "mean_pos": mean_pos,
                      "std_pos": std_pos,
                      "sum_pos": sum_pos,
                      "max_neg": max_neg,
                      "min_neg": min_neg,
                      "mean_neg": mean_neg,
                      "std_neg": std_neg,
                      "sum_neg": sum_neg 
                       })

In [340]:
def condition_cons(df, var="cons", patt="не має|не знайш|немає|не вияв|відсутн|нема|--"):
    cond = ((df[var].str.lower().str.contains(patt, regex=True)) | (df[var]==""))
    return cond

In [341]:
def add_tone(X, tone_map=tone_map, var="review"):
    temp = X[var].map(lambda x: [tone_map.get(word.lower(), 0) for word in tokenize_words(x)])
    return temp.apply(f)

In [58]:
def apply_transformer(train, test, transformer):
    if transformer is not None:
        train = transformer.fit_transform(train)
        test = transformer.transform(test)
        return train, test
    return train, test

In [342]:
def build_features(X_train, X_test, transformer=None, var="review", features=None,
                   vectorizer=None):
    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            train = vectorizer.fit_transform(X_train[var]).toarray()
            test = vectorizer.transform(X_test[var]).toarray()
            f_train.append(train)
            f_test.append(test)
        elif feature == "sentiment":
            train = add_tone(X_train, var=var).values
            test = add_tone(X_test, var=var).values
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature == "len":
            train = X_train[["text", "pros", "cons"]].applymap(len).values
            test = X_test[["text", "pros", "cons"]].applymap(len).values
            train, test = apply_transformer(train, test, transformer)
            f_train.append(train)
            f_test.append(test)
        elif feature == "is_cons":
            train = condition_cons(X_train).astype(int)[:, np.newaxis]
            test = condition_cons(X_test).astype(int)[:, np.newaxis]
            f_train.append(train)
            f_test.append(test)
    return np.concatenate((f_train), axis=1), np.concatenate((f_test), axis=1)

In [221]:
def fit_nb(train, y_train, test, y_test, alpha=2.5, priors=None):
    clf = MultinomialNB(alpha=alpha, class_prior=priors)
    clf.fit(train, y_train)
    pred = clf.predict(test)
    proba = clf.predict_proba(test)
    metrics = calc_metrics(y_test, pred, proba=None, average="weighted")
    return pred, proba, metrics

In [60]:
quant = QuantileTransformer(n_quantiles=10, output_distribution="uniform")
minmax = MinMaxScaler()
std = StandardScaler()
maxabs = MaxAbsScaler()
transformer = minmax

Try only TF-IDF (with stop words if we use analyzer=word)

In [420]:
tf_params = {"lowercase": 1,
             "analyzer": "word",
             "stop_words": stop,
             "ngram_range": (1, 1),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 3500*1 or None,
             "norm": 'l2'*0,
             "use_idf": 0,
             "smooth_idf": 0,
             "sublinear_tf": 0, 
             "tokenizer": None#tokenize_words
             }
var = "review"
features = [
            "tfidf",
            #"sentiment", 
            #"len", 
            #"is_cons"
            ]
vectorizer = TfidfVectorizer(**tf_params)
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var)
print(f"Features: {train.shape[1]}")

Features: 3500


In [421]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.693
Precision: 0.625
F1: 0.617
accuracy: 0.693
   pred_1  pred_2  pred_3  pred_4  pred_5
1       1       0       1       6      11
2       0       0       0       5       8
3       0       0       1       7      23
4       1       0       0      24      97
5       0       0       0      10     356
             precision    recall  f1-score   support

          1       0.50      0.05      0.10        19
          2       0.00      0.00      0.00        13
          3       0.50      0.03      0.06        31
          4       0.46      0.20      0.28       122
          5       0.72      0.97      0.83       366

avg / total       0.63      0.69      0.62       551



##### Let's add sentiments

In [430]:
features = [
            "tfidf",
            #"sentiment", 
            "len", 
            "is_cons"
            ]
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer,
                            transformer=transformer, var=var)
print(f"Features: {train.shape[1]}")

Features: 3504


In [431]:
pred, proba, metrics = fit_nb(train, y_train, test, y_test, alpha=2.6)

Recall: 0.690
Precision: 0.623
F1: 0.614
accuracy: 0.690
   pred_1  pred_2  pred_3  pred_4  pred_5
1       2       0       1       6      10
2       0       0       0       5       8
3       0       0       1       7      23
4       1       0       0      22      99
5       0       0       0      11     355
             precision    recall  f1-score   support

          1       0.67      0.11      0.18        19
          2       0.00      0.00      0.00        13
          3       0.50      0.03      0.06        31
          4       0.43      0.18      0.25       122
          5       0.72      0.97      0.82       366

avg / total       0.62      0.69      0.61       551



#### Fit XGBoost

In [380]:
params['num_class'] = df["stars"].nunique()

In [377]:
dtrain = xgb.DMatrix(train, y_train-1)#, weight=X_train["weights"])
dtest = xgb.DMatrix(test, y_test-1)#, weight=X_test["weights"])
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [381]:
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-mlogloss:1.50738	train-merror:0.325771	eval-mlogloss:1.50779	eval-merror:0.333938
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 50 rounds.
[50]	train-mlogloss:0.847278	train-merror:0.323503	eval-mlogloss:0.87719	eval-merror:0.319419
[100]	train-mlogloss:0.825416	train-merror:0.317151	eval-mlogloss:0.865001	eval-merror:0.317604
Stopping. Best iteration:
[52]	train-mlogloss:0.845629	train-merror:0.323049	eval-mlogloss:0.875408	eval-merror:0.315789



In [382]:
pred_xgb = (model.predict(dtest)+1).astype(int)
print(f"Best score (accuracy): {1-model.best_score}")
xgb_metrics = calc_metrics(y_test, pred_xgb, average="weighted")

Best score (accuracy): 0.684211
Recall: 0.679
Precision: 0.564
F1: 0.605
accuracy: 0.679
   pred_1  pred_2  pred_3  pred_4  pred_5
1       0       0       0      10       9
2       0       0       0       5       8
3       0       0       0       7      24
4       0       0       0      26      96
5       0       0       0      18     348
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        31
          4       0.39      0.21      0.28       122
          5       0.72      0.95      0.82       366

avg / total       0.56      0.68      0.60       551

