# Bibliotecas

In [1]:
#Libs ############################################
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import pickle
import nltk
import os
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data extraction
Loads a dataset with product data from a specified path available in the environment variable DATASET_PATH.

In [2]:
try:
    DATASET_PATH = os.environ['DATASET_PATH']
except:
    DATASET_PATH = 'data/sample_products.csv'

df_file = pd.read_csv(DATASET_PATH, sep=',')
df_file.head(2)

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia


# Data formatting
Processes the dataset to use it for training and validation.

In [3]:
df_data = df_file.copy()
df_data.isnull().sum()
import re
import unicodedata
def dedup_esp_caract(txt):
    x = str(txt)
    nfkd = unicodedata.normalize('NFKD', x)
    palavraSemAcento = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    x = re.sub('[^a-zA-Z0-9 \\\]','', palavraSemAcento)
    x = x.split(' ')
    x = ' '.join([str(elem) for elem in set(x)])
    return x

In [4]:
#Label encoder, using only text columns
LE = LabelEncoder()
df_data['cod_category'] = LE.fit_transform(df_data['category'])
df_data['text_train'] = df_data['query'].str.lower() +' ' + df_data['title'].str.lower() + ' ' + df_data['concatenated_tags'].str.lower()
df_data['text_train'] = df_data.apply(lambda row: dedup_esp_caract(row['text_train']), axis=1)

# Modeling
Specifies a model to handle the categorization problem.

In [5]:
train, test = train_test_split(df_data, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text_train
X_test = test.text_train
y_train = train['cod_category']
y_test = test['cod_category']
print(X_train.shape)
print(X_test.shape)

(25460,)
(12540,)


In [6]:
#Logistic Regression #########
LR_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1))
           ])
LR_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred = LR_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.83      0.86      2283
           1       0.97      0.89      0.93       297
           2       0.88      0.91      0.89      2856
           3       0.87      0.95      0.91      5805
           4       0.91      0.49      0.63       367
           5       0.85      0.63      0.72       932

    accuracy                           0.88     12540
   macro avg       0.89      0.78      0.82     12540
weighted avg       0.88      0.88      0.88     12540

0.8792663476874003


In [7]:
#Random Forest ###############
RF_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('RF', RandomForestClassifier(n_estimators=100,random_state=2)),
           ])


RF_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred2 = RF_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred2))
print(accuracy_score(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89      2283
           1       0.96      0.95      0.95       297
           2       0.90      0.90      0.90      2856
           3       0.88      0.96      0.92      5805
           4       0.93      0.58      0.71       367
           5       0.89      0.66      0.75       932

    accuracy                           0.90     12540
   macro avg       0.91      0.82      0.85     12540
weighted avg       0.90      0.90      0.89     12540

0.8964114832535885


In [8]:
#Decision Treet ###############
DT_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('DT', DecisionTreeClassifier(criterion="entropy",max_depth=60)),
           ])


DT_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred3 = DT_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred3))
print(accuracy_score(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.86      0.78      0.82      2283
           1       0.95      0.82      0.88       297
           2       0.65      0.90      0.76      2856
           3       0.90      0.84      0.87      5805
           4       0.70      0.33      0.45       367
           5       0.80      0.55      0.65       932

    accuracy                           0.81     12540
   macro avg       0.81      0.70      0.74     12540
weighted avg       0.82      0.81      0.81     12540

0.8064593301435407


In [9]:
#choose best model by acc
best_model = []
models = {'RF_Pipe' : accuracy_score(y_test, y_pred2),'LR_Pipe' : accuracy_score(y_test, y_pred), 'DT_Pipe' : accuracy_score(y_test, y_pred3)}
models = {k: v for k, v in sorted(models.items(), key=lambda item: item[1])}
best_model = list(models.keys())[-1]

# Model validation
Generates metrics about the model accuracy (precision, recall, F1, etc.) for each category and exports them to a specified path available in the environment variable METRICS_PATH.

In [10]:
try:
    METRICS_PATH = os.environ['METRICS_PATH']
except:
    METRICS_PATH = 'metrics.txt'

In [11]:
df_categorias = df_data[['cod_category', 'category']].copy()
df_categorias = df_categorias.drop_duplicates()
categories = df_categorias['category'].tolist()

def test_predict(Model, Save, txt):    
    metric_file =''
    metric_file += '####################################\n'
    metric_file += txt + '\n'
    metric_file += '####################################\n'
        
    for cat in categories:    
        x_cat_test = test['text_train'][test['category']==cat]
        y_cat_test = test['cod_category'][test['category']==cat]
        y_pred = Model.predict(x_cat_test)
        print('ACC: ',cat, accuracy_score(y_cat_test, y_pred))        
        metric_file += 'Category: ' + cat + ' - ACC:' + str(accuracy_score(y_cat_test, y_pred))  + '\n'
        metric_file += str(classification_report(y_cat_test,y_pred)) + '\n'
        print(classification_report(y_cat_test,y_pred))
    
    if Save == 1:                        
        f = open(METRICS_PATH, "a")        
        f.truncate(0) # need '0' when using r+
        f.write(metric_file)
        f.close()            

In [12]:
if best_model == 'RF_Pipe':
    print('############### Random Forest Metrics for categories #################')
    test_predict(RF_Pipe, 1, 'Random Forest - Metrics for categories')
elif best_model == 'DT_Pipe':
    print('############### Logistic Regression Metrics for categories #################')
    test_predict(LR_Pipe, 1, 'Decision Tree - Metrics for categories')
elif best_model == 'LR_Pipe':
    print('############### Logistic Regression Metrics for categories #################')
    test_predict(LR_Pipe, 1, 'Logistic Regression - Metrics for categories')    

############### Random Forest Metrics for categories #################
ACC:  Decoração 0.9005602240896359
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.90      0.95      2856
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.90      2856
   macro avg       0.17      0.15      0.16      2856
weighted avg       1.00      0.90      0.95      2856

ACC:  Papel e Cia 0.6555793991416309
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Bebê 0.8611476127901884
              precision    recall  f1-score   support

           0       1.00      0.86      0.93      2283
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.86      2283
   macro avg       0.25      0.22      0.23      2283
weighted avg       1.00      0.86      0.93      2283



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Lembrancinhas 0.9645133505598622
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       1.00      0.96      0.98      5805
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.96      5805
   macro avg       0.17      0.16      0.16      5805
weighted avg       1.00      0.96      0.98      5805

ACC:  Bijuterias e Jóias 0.9461279461279462
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.95      0.97       297
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0

    accuracy                           0.95       297
   macro avg       0.25      0.24      0.24     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model exportation
Exports a candidate model to a specified path available in the environment variable MODEL_PATH.

In [13]:
#Random forest
def save_model(model):
    try:
        MODEL_PATH = os.environ['MODEL_PATH']
    except:
        MODEL_PATH = 'model.pkl'
    with open(MODEL_PATH, 'wb') as picklefile:
        pickle.dump(model,picklefile)
    
#Save Best model    
if best_model == 'RF_Pipe':
    save_model(RF_Pipe)
elif best_model == 'DT_Pipe':
    save_model(LR_Pipe)
elif best_model == 'LR_Pipe':
    save_model(LR_Pipe)