# Bibliotecas

In [1]:
#Libs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
import pickle
import nltk
import os
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rsilvei7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data extraction
Loads a dataset with product data from a specified path available in the environment variable DATASET_PATH.

In [2]:
try:
    DATASET_PATH = os.environ['DATASET_PATH']
except:
    DATASET_PATH = 'data/sample_products.csv'

df_file = pd.read_csv(DATASET_PATH, sep=',')
df_file.head(2)

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia


# Data formatting
Processes the dataset to use it for training and validation.

In [3]:
df_data = df_file.copy()
df_data.isnull().sum()

product_id               0
seller_id                0
query                    0
search_page              0
position                 0
title                    0
concatenated_tags        2
creation_date            0
price                    0
weight                  58
express_delivery         0
minimum_quantity         0
view_counts              0
order_counts         20105
category                 0
dtype: int64

In [4]:
#Label encoder, using only text columns
LE = LabelEncoder()
df_data['cod_category'] = LE.fit_transform(df_data['category'])
df_data['text_train'] = df_data['query'] +' ' + df_data['title'] + ' ' + df_data['concatenated_tags']
df_data.head(2)

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category,cod_category,text_train
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração,2,espirito santo Mandala Espírito Santo mandala mdf
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia,5,cartao de visita Cartão de Visita cartao visit...


# Modeling
Specifies a model to handle the categorization problem.

In [5]:
train, test = train_test_split(df_data, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text_train
X_test = test.text_train
y_train = train['cod_category']
y_test = test['cod_category']
print(X_train.shape)
print(X_test.shape)

(25460,)
(12540,)


In [6]:
#Logistic Regression #########
LR_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1))
           ])
LR_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred = LR_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      2283
           1       0.97      0.88      0.92       297
           2       0.88      0.90      0.89      2856
           3       0.86      0.95      0.90      5805
           4       0.91      0.51      0.66       367
           5       0.85      0.61      0.71       932

    accuracy                           0.87     12540
   macro avg       0.89      0.78      0.82     12540
weighted avg       0.87      0.87      0.87     12540

0.8741626794258374


In [7]:
#Random Forest ###############
RF_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=10, random_state=0), n_jobs=1)),
           ])
RF_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred2 = LR_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred2))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      2283
           1       0.97      0.88      0.92       297
           2       0.88      0.90      0.89      2856
           3       0.86      0.95      0.90      5805
           4       0.91      0.51      0.66       367
           5       0.85      0.61      0.71       932

    accuracy                           0.87     12540
   macro avg       0.89      0.78      0.82     12540
weighted avg       0.87      0.87      0.87     12540

0.8741626794258374


# Model validation
Generates metrics about the model accuracy (precision, recall, F1, etc.) for each category and exports them to a specified path available in the environment variable METRICS_PATH.

In [23]:
try:
    METRICS_PATH = os.environ['METRICS_PATH']
except:
    METRICS_PATH = 'metrics.txt'

In [29]:
df_categorias = df_data[['cod_category', 'category']].copy()
df_categorias = df_categorias.drop_duplicates()
categories = df_categorias['category'].tolist()

def test_predict(Model, Save, txt):    
    metric_file =''
    metric_file += '####################################\n'
    metric_file += txt + '\n'
    metric_file += '####################################\n'
        
    for cat in categories:    
        x_cat_test = test['text_train'][test['category']==cat]
        y_cat_test = test['cod_category'][test['category']==cat]
        y_pred = Model.predict(x_cat_test)
        print('ACC: ',cat, accuracy_score(y_cat_test, y_pred))        
        metric_file += 'Category: ' + cat + ' - ACC:' + str(accuracy_score(y_cat_test, y_pred))  + '\n'
        metric_file += str(classification_report(y_cat_test,y_pred)) + '\n'
        print(classification_report(y_cat_test,y_pred))
    
    if Save == 1:                        
        f = open(METRICS_PATH, "a")        
        f.truncate(0) # need '0' when using r+
        f.write(metric_file)
        f.close()            

In [30]:
print('############### Random Forest Metrics for categories #################')
test_predict(RF_Pipe, 0, 'Random Forest - Metrics for categories')

print('############### Logistic Regression Metrics for categories #################')
test_predict(LR_Pipe, 1, 'Logistic Regression - Metrics for categories')

############### Random Forest Metrics for categories #################
ACC:  Decoração 0.8967086834733894
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.90      0.95      2856
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.90      2856
   macro avg       0.17      0.15      0.16      2856
weighted avg       1.00      0.90      0.95      2856



  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Papel e Cia 0.7274678111587983
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       1.00      0.73      0.84       932

    accuracy                           0.73       932
   macro avg       0.17      0.12      0.14       932
weighted avg       1.00      0.73      0.84       932

ACC:  Outros 0.670299727520436
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       1.00      0.67      0.80       367
           5       0.00      0.00      0.00         0

    accur

# Model exportation
Exports a candidate model to a specified path available in the environment variable MODEL_PATH.

In [11]:
#Random forest
classifier = RF_Pipe

try:
    MODEL_PATH = os.environ['MODEL_PATH']
except:
    MODEL_PATH = 'model.pkl'

with open(MODEL_PATH, 'wb') as picklefile:
    pickle.dump(classifier,picklefile)