# 0. Resumo da Avaliação

**Abordagem**: modelagem de classificação multiclasse utilizando variáveis dummies criadas a partir do texto original, seguindo as seguintes etapas:
- verificação de nulos
- tratamento de acentuação, pontuação e stopwords
- aplicação do método TF-IDF

**Otimização do algoritmo**: Os algoritmos testados foram Naive Bayes (NB, usual em problemas de textos) e Random Forest (RF, ajuste paralelizável e mais rápido). Há três versões de modelos ao final (uma primeira usando apenas o texto, a segunda acrescentando algumas variáveis do reviewer e a terceira após balanceamento de classes). Todas seguem as etapas:
- separação entre treino e teste
- manipulações na base de treino (fit transform) e apenas aplicação na base de teste (transform)
- otimização de hiperparâmetros da RF utilizando grid search com validação cruzada apenas na base de treino
- ajuste final da RF e do NB na base de treino
- cálculo da acurácia global e visualização da matriz de confusão 5x5 para ambos os algoritmos

# 1. Leitura e características da base

In [1]:
from src import my_module

my_module.helper_function()

hello_world


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
orig_df = pd.read_csv('data/train_df.csv')

In [4]:
orig_df.shape

(105898, 15)

In [5]:
orig_df.head(1)

Unnamed: 0,index,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state,overall_rating
0,95851,2018-04-20 11:56:28,c951f3a4511b554a1f34330903c320f34cfccbdf8de357...,111586438,Depilador Elétrico Philips Satinelle HP6403/30,philips,Beleza e Perfumaria,Depilação,Depilador,Yes,Muito eficiente e prático! Depilação rápida em...,1978.0,F,ES,5


In [6]:
orig_df['overall_rating'].value_counts().sort_index()

1    21998
2     6726
3    13004
4    25855
5    38315
Name: overall_rating, dtype: int64

In [7]:
orig_df.isna().sum() #sem nulos nos textos e notas finais; poucos em categ-lvl1

index                        0
submission_date              0
reviewer_id                  0
product_id                   0
product_name                65
product_brand            73194
site_category_lv1            3
site_category_lv2         3212
review_title                 0
recommend_to_a_friend       10
review_text                  0
reviewer_birth_year       4793
reviewer_gender           3316
reviewer_state            3207
overall_rating               0
dtype: int64

# 2. Tratamento do texto

In [8]:
#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
import string
import re
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
def remove_stopwords(text, stopwords):
    output= [word for word in re.split("\W+",text) if word not in stopwords]  # filter out empty words
    return output
def remove_accents(text):
    text = re.sub(u"[àáâãäå]", 'a', text)
    text = re.sub(u"[èéêë]", 'e', text)
    text = re.sub(u"[ìíîï]", 'i', text)
    text = re.sub(u"[òóôõö]", 'o', text)
    text = re.sub(u"[ùúûü]", 'u', text)
    return text 

#remove punctuation and accents
orig_df['clean_text'] = orig_df['review_text'].apply(lambda x:remove_punctuation(x))
orig_df['clean_text'] = orig_df['clean_text'].apply(lambda x:remove_accents(x))

#lower case
orig_df['clean_text'] = orig_df['clean_text'].apply(lambda x: x.lower())

#remove stopwords
pt_stopwords = stopwords.words('portuguese')
orig_df['clean_text'] = orig_df['clean_text'].apply(lambda x: remove_stopwords(x, stopwords = pt_stopwords))

#list to string once again
orig_df['clean_text'] = orig_df['clean_text'].apply(lambda x: ' '.join([str(elem) for elem in x]))

orig_df.head(1)

Unnamed: 0,index,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state,overall_rating,clean_text
0,95851,2018-04-20 11:56:28,c951f3a4511b554a1f34330903c320f34cfccbdf8de357...,111586438,Depilador Elétrico Philips Satinelle HP6403/30,philips,Beleza e Perfumaria,Depilação,Depilador,Yes,Muito eficiente e prático! Depilação rápida em...,1978.0,F,ES,5,eficiente pratico depilaçao rapida qualquer lu...


In [9]:
X_train, X_test, y_train, y_test = train_test_split(orig_df['clean_text'], orig_df['overall_rating'], random_state = 42)

In [10]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=50, norm='l2', 
                        ngram_range=(1, 1), stop_words=pt_stopwords)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [11]:
X_train_tfidf.shape, X_test_tfidf.shape

((79423, 2307), (26475, 2307))

# 3. Modelagem

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

## 3.1 Random Forest Grid

In [13]:
# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(100, 300, 5)]

# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 21, 9)]

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}

In [14]:
# Create the model to be tuned
rf_base = RandomForestClassifier()

# Create the random search Random Forest
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 30, cv = 3, verbose = 10, random_state = 42, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train_tfidf, y_train)

# View the best parameters from the random search
rf_random.best_params_

Fitting 3 folds for each of 30 candidates, totalling 90 fits


{'n_estimators': 250,
 'min_samples_split': 6,
 'min_impurity_decrease': 0.0,
 'max_depth': 21,
 'bootstrap': True}

## 3.2 Naive Bayes

In [15]:
#sem grid

## 3.3 Ajuste

In [16]:
# Create the final Naive Bayes
nbayes_final = MultinomialNB()

# Create the final Random Forest
rf_final = RandomForestClassifier(n_estimators = 250,
                                  min_samples_split = 6,
                                  min_impurity_decrease = 0.0,
                                  max_depth = 21,
                                  bootstrap = True)

In [17]:
rf_final.fit(X_train_tfidf, y_train)
y_pred_rf = rf_final.predict(X_test_tfidf)
conf_mat_rf = confusion_matrix(y_test.values, y_pred_rf)
conf_mat_rf

#o desbalanceamento da var resposta pode ser o responsável pela 'escassez' de previsões de notas de 2 a 4

array([[4486,    0,    0,    4, 1078],
       [ 842,    0,    9,   19,  785],
       [ 689,    0,   35,   88, 2508],
       [ 346,    0,    8,   61, 6005],
       [ 360,    0,    2,   21, 9129]], dtype=int64)

In [18]:
np.diag(conf_mat_rf).sum()/conf_mat_rf.sum() #0.53 ao medir no treino, sem sinais de overfit

0.5178847969782814

In [19]:
nbayes_final.fit(X_train_tfidf, y_train)
y_pred_nb = nbayes_final.predict(X_test_tfidf)
conf_mat_nb = confusion_matrix(y_test.values, y_pred_nb)
conf_mat_nb

array([[4914,  124,  178,  105,  247],
       [ 843,  121,  304,  153,  234],
       [ 706,   66,  630,  788, 1130],
       [ 370,   20,  384, 1405, 4241],
       [ 403,    8,  194,  892, 8015]], dtype=int64)

In [20]:
np.diag(conf_mat_nb).sum()/conf_mat_nb.sum() #0.58 ao medir no treino, sem sinais de overfit

0.5697828139754485

# 4. Melhorias no ajuste

## 4.1 Uso das outras variáveis

In [21]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [22]:
df_feats = orig_df.copy()
df_feats['birth_year_binned'] = pd.qcut(df_feats['reviewer_birth_year'], 5)
df_feats.head(1)

Unnamed: 0,index,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state,overall_rating,clean_text,birth_year_binned
0,95851,2018-04-20 11:56:28,c951f3a4511b554a1f34330903c320f34cfccbdf8de357...,111586438,Depilador Elétrico Philips Satinelle HP6403/30,philips,Beleza e Perfumaria,Depilação,Depilador,Yes,Muito eficiente e prático! Depilação rápida em...,1978.0,F,ES,5,eficiente pratico depilaçao rapida qualquer lu...,"(1976.0, 1983.0]"


In [23]:
#df_feats.groupby('birth_year_binned')['overall_rating'].value_counts(normalize=True).sort_index()

In [24]:
dummy_feats = ['site_category_lv1', 'reviewer_gender', 'birth_year_binned']
X_train, X_test, y_train, y_test = train_test_split(df_feats[dummy_feats], df_feats['overall_rating'], random_state = 42)

In [25]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_enc = ohe.fit_transform(X_train[dummy_feats])
test_enc = ohe.transform(X_test[dummy_feats])

train_enc.shape, test_enc.shape

#comentário pós-teste: um NaiveBayes com este conjunto tem 36% de acurácia

((79423, 63), (26475, 63))

In [26]:
X_train_final = hstack((X_train_tfidf, train_enc))
X_test_final = hstack((X_test_tfidf, test_enc))

X_train_final.shape, X_test_final.shape

((79423, 2370), (26475, 2370))

## 4.1.1 Grid / Ajustes

In [27]:
# Create the model to be tuned
rf_base = RandomForestClassifier()

# Create the random search Random Forest
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 30, cv = 3, verbose = 10, random_state = 42, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train_final, y_train)

# View the best parameters from the random search
rf_random.best_params_

Fitting 3 folds for each of 30 candidates, totalling 90 fits


{'n_estimators': 250,
 'min_samples_split': 6,
 'min_impurity_decrease': 0.0,
 'max_depth': 21,
 'bootstrap': True}

In [28]:
# Create the final Naive Bayes
nbayes_final = MultinomialNB()

# Create the final Random Forest
rf_final = RandomForestClassifier(n_estimators = 250,
                                  min_samples_split = 6,
                                  min_impurity_decrease = 0.0,
                                  max_depth = 21,
                                  bootstrap = True)

In [29]:
rf_final.fit(X_train_final, y_train)
y_pred_rf = rf_final.predict(X_test_final)
conf_mat_rf = confusion_matrix(y_test.values, y_pred_rf)
conf_mat_rf

array([[4557,    0,    0,    1, 1010],
       [ 899,    0,    6,   14,  736],
       [ 728,    0,   23,   73, 2496],
       [ 369,    0,    4,   47, 6000],
       [ 379,    0,    0,   15, 9118]], dtype=int64)

In [30]:
np.diag(conf_mat_rf).sum()/conf_mat_rf.sum() 

0.5191690273843248

In [31]:
nbayes_final.fit(X_train_final, y_train)
y_pred_nb = nbayes_final.predict(X_test_final)
conf_mat_nb = confusion_matrix(y_test.values, y_pred_nb)
conf_mat_nb

array([[4810,  152,  195,  142,  269],
       [ 810,  155,  307,  162,  221],
       [ 685,   93,  643,  924,  975],
       [ 359,   28,  414, 1896, 3723],
       [ 389,   17,  206, 1361, 7539]], dtype=int64)

In [32]:
np.diag(conf_mat_nb).sum()/conf_mat_nb.sum()

0.5681964117091596

In [33]:
# NB nem RF não melhoraram com as novas features

## 4.2 Balanceamento da variável resposta no treino

In [None]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler

In [34]:
pd.Series(y_train).value_counts().sort_index()

1    16430
2     5071
3     9684
4    19435
5    28803
Name: overall_rating, dtype: int64

In [35]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train_final, y_train)

In [36]:
X_resampled.shape

(25355, 2370)

In [37]:
pd.Series(y_resampled).value_counts()

1    5071
2    5071
3    5071
4    5071
5    5071
Name: overall_rating, dtype: int64

## 4.2.1 Grid / Ajustes

In [38]:
# Create the model to be tuned
rf_base = RandomForestClassifier()

# Create the random search Random Forest
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 30, cv = 3, verbose = 10, random_state = 42, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_resampled, y_resampled)

# View the best parameters from the random search
rf_random.best_params_

Fitting 3 folds for each of 30 candidates, totalling 90 fits


{'n_estimators': 250,
 'min_samples_split': 6,
 'min_impurity_decrease': 0.0,
 'max_depth': 21,
 'bootstrap': True}

In [39]:
# Create the final Naive Bayes
nbayes_final = MultinomialNB()

# Create the final Random Forest
rf_final = RandomForestClassifier(n_estimators = 250,
                                  min_samples_split = 6,
                                  min_impurity_decrease = 0.0,
                                  max_depth = 21,
                                  bootstrap = True)

In [40]:
rf_final.fit(X_resampled, y_resampled)
y_pred_rf = rf_final.predict(X_test_final)
conf_mat_rf = confusion_matrix(y_test.values, y_pred_rf)
conf_mat_rf

#previsoes bem mais balanceadas; resultou entretanto em perda de acurácia global

array([[3958, 1259,  211,   70,   70],
       [ 503,  797,  261,   54,   40],
       [ 408,  839, 1026,  684,  363],
       [ 285,  621, 1105, 2272, 2137],
       [ 416,  672, 1073, 2095, 5256]], dtype=int64)

In [41]:
np.diag(conf_mat_rf).sum()/conf_mat_rf.sum() 

0.5027006610009442

In [42]:
nbayes_final.fit(X_resampled, y_resampled)
y_pred_nb = nbayes_final.predict(X_test_final)
conf_mat_nb = confusion_matrix(y_test.values, y_pred_nb)
conf_mat_nb

array([[3889, 1326,  227,   54,   72],
       [ 457,  776,  307,   69,   46],
       [ 386,  733, 1039,  785,  377],
       [ 228,  387, 1167, 2682, 1956],
       [ 318,  368,  903, 2787, 5136]], dtype=int64)

In [43]:
np.diag(conf_mat_nb).sum()/conf_mat_nb.sum() #queda de acurácia para o NaiveBayes também

0.5107459867799811