In [412]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [281]:
import json
from pandas.io.json import json_normalize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score,
                             classification_report)
import numpy as np
from nltk.corpus import stopwords

In [274]:
with open("stop_words.txt") as f:
    stop = f.readlines()
stop = [el.strip() for el in stop]

In [410]:
def calc_metrics(y_test, pred, proba=None, labels=["1", "2", "3", "4", "5"], print_=True,
                 average="macro", report=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred, average=average)
    output["Precision"] = precision_score(y_test, pred, average=average)
    output["F1"] = f1_score(y_test, pred, average=average)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    if report:
        print(classification_report(y_test, pred, labels))
    return output

In [24]:
filename = "items.jl"
with open(filename, "r") as f:
    data = json.load(f)

In [25]:
df = json_normalize(data, record_path="reviews", meta=["path", "price", "title"])

In [227]:
df["review"] = df.apply(lambda x: x['text'] + " " + x['pros']+ " " + x['cons'], axis=1)

In [228]:
df.head(1)
df["stars"].describe()

Unnamed: 0,author,cons,date,link,pros,stars,text,path,price,title,review
0,Наталка,не виявили,2016-7-07,https://bt.rozetka.com.ua/2030437/p2030437/com...,"Якісний вироб, доступна ціна",5,"Чудова морозильна камера.Придбали ще взимку, в...","[Интернет-супермаркет №, Бытовая техника, инте...",11199,Встраиваемая морозильная камера Freggia LSB0010,"Чудова морозильна камера.Придбали ще взимку, в..."


count    2755.000000
mean        4.458802
std         0.956587
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: stars, dtype: float64

#### Distribution of classes

In [138]:
s = df.stars.value_counts()
pd.concat([s, s / s.sum()], axis=1)

Unnamed: 0,stars,stars.1
5,1832,0.664973
4,610,0.221416
3,153,0.055535
1,95,0.034483
2,65,0.023593


In [326]:
# Analyze cons
df["cons"].value_counts().head(10)
patt = "не має|не знайш|немає|не вияв|відсутн|нема|--"
cond = ((df["cons"].str.lower().str.contains(patt, regex=True)) | (df["cons"]==""))
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

               700
Немає           92
немає           85
-               34
не виявлено     31
Не виявлено     20
нема            20
Відсутні        19
Не виявила      16
Нема            15
Name: cons, dtype: int64

4.088316467341306

4.700239808153477

In [333]:
df["is_cons"] = cond.astype(int)

In [137]:
# Analyze pros
df["pros"].value_counts().head(10)
patt = "ціна|якість"
cond1 = (df["pros"].str.lower().str.contains(patt, regex=True)) | (df["pros"].str.len() < 10)
cond2 = df["pros"]==""
cond = cond2
df.loc[~cond, "stars"].mean()
df.loc[cond, "stars"].mean()

                  626
Ціна               27
ціна               13
Дизайн              7
дешевий             6
Ціна та якість      5
+                   5
Ціна-якість         5
немає               5
Тиха                5
Name: pros, dtype: int64

4.465476749647722

4.436102236421725

#### Train / test split

In [334]:
y = df["stars"]
X = df.loc[:, ~df.columns.isin(["stars"])]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 2204, Num. of test: 551


#### TF - IDF

In [527]:
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler, MaxAbsScaler

In [528]:
quant = QuantileTransformer(n_quantiles=10, output_distribution="uniform")
minmax = MinMaxScaler()
std = StandardScaler()
maxabs = MaxAbsScaler()
transformer = maxabs

In [520]:
def add_features(train, X, type_="train", transformer=None):
    is_cons = X["is_cons"].values[:, np.newaxis]
    if type_ == "train":
        func = transformer.fit_transform
    else:
        func = transformer.transform
    len_text = func(X["text"].map(len).values[:, np.newaxis])
    len_pros = func(X["pros"].map(len).values[:, np.newaxis])
    len_cons = func(X["cons"].map(len).values[:, np.newaxis])
    return np.concatenate((train, is_cons, len_text, len_pros, len_cons), axis=1)

In [521]:
tf_params = {"lowercase": 1,
             "analyzer": "word",
             "stop_words": stop,
             "ngram_range": (1, 1),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 4500,
             "norm": 'l2'*0,
             "use_idf": 0,
             "smooth_idf": 0,
             "sublinear_tf": 0
             }

In [529]:
var = "review"
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train[var]).toarray()
test = vectorizer.transform(X_test[var]).toarray()
print(f"Features: {len(vectorizer.get_feature_names())}")

Features: 4500


In [530]:
train = add_features(train, X_train, "train", transformer)#[:, -2:]
test = add_features(test, X_test, "test", transformer)#[:, -2:]

In [531]:
clf = MultinomialNB(alpha=2)#, class_prior=[0.2]*5)
#clf = GaussianNB([0.2]*5)
clf.fit(train, y_train)
pred = clf.predict(test)
proba = clf.predict_proba(test)
metrics = calc_metrics(y_test, pred, proba=None, average="weighted")

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

Recall: 0.693
Precision: 0.624
F1: 0.615
accuracy: 0.693
   pred_1  pred_2  pred_3  pred_4  pred_5
1       1       0       1       6      11
2       0       0       0       6       7
3       0       0       1       6      24
4       1       0       0      23      98
5       0       0       0       9     357
             precision    recall  f1-score   support

          1       0.50      0.05      0.10        19
          2       0.00      0.00      0.00        13
          3       0.50      0.03      0.06        31
          4       0.46      0.19      0.27       122
          5       0.72      0.98      0.83       366

avg / total       0.62      0.69      0.62       551

