# Libs

In [2]:
#Libs ############################################
import pandas as pd
import numpy as np
import re
import unicodedata
import pickle
import nltk
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data extraction
Loads a dataset with product data from a specified path available in the environment variable DATASET_PATH.

In [3]:
try:
    DATASET_PATH = os.environ['DATASET_PATH']
except:
    DATASET_PATH = 'data/sample_products.csv'

df_file = pd.read_csv(DATASET_PATH, sep=',')
df_file.head(2)

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia


In [4]:
df_file.groupby(['category']).size()

category
Bebê                   6930
Bijuterias e Jóias      940
Decoração              8723
Lembrancinhas         17524
Outros                 1133
Papel e Cia            2750
dtype: int64

# Data formatting
Processes the dataset to use it for training and validation.

In [5]:
#copy dataset
df_data = df_file.copy()
df_data.isnull().sum()


#remove duplicate itens and especial caracters
def dedup_esp_caract(txt):
    x = str(txt)
    nfkd = unicodedata.normalize('NFKD', x)
    palavraSemAcento = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    x = re.sub('[^a-zA-Z0-9 \\\]','', palavraSemAcento)
    x = x.split(' ')
    x = ' '.join([str(elem) for elem in set(x)])
    return x

In [7]:
#Data Aug for Outros category
df_tmp = df_file[df_file['category']=='Outros']
df_tmp = df_tmp.loc[df_tmp.index.repeat(3)]
df_data = df_data.append(df_tmp)

df_tmp = df_file[df_file['category']=='Papel e Cia']
df_tmp = df_tmp.loc[df_tmp.index.repeat(2)]
df_data = df_data.append(df_tmp)

In [8]:
df_data.groupby(['category']).size()

category
Bebê                   6930
Bijuterias e Jóias      940
Decoração              8723
Lembrancinhas         17524
Outros                 4532
Papel e Cia            8250
dtype: int64

In [9]:
#Label encoder
LE = LabelEncoder()
df_data['cod_category'] = LE.fit_transform(df_data['category'])

#generate a text column combining query + title + concatenated_tags
df_data['text_train'] = df_data['query'].str.lower() +' ' + df_data['title'].str.lower() + ' ' + df_data['concatenated_tags'].str.lower()
df_data['text_train'] = df_data.apply(lambda row: dedup_esp_caract(row['text_train']), axis=1)

# Modeling
Specifies a model to handle the categorization problem.

In [10]:
#Split dataset into train and test
train, test = train_test_split(df_data, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text_train
X_test = test.text_train
y_train = train['cod_category']
y_test = test['cod_category']
print(X_train.shape)
print(X_test.shape)

(31422,)
(15477,)


In [11]:
#Logistic Regression #########
LR_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1))
           ])
LR_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred = LR_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.84      0.86      2291
           1       0.94      0.85      0.89       302
           2       0.88      0.89      0.89      2901
           3       0.86      0.92      0.89      5776
           4       0.92      0.87      0.90      1463
           5       0.88      0.83      0.86      2744

    accuracy                           0.88     15477
   macro avg       0.90      0.87      0.88     15477
weighted avg       0.88      0.88      0.88     15477

0.8807262389351942


In [13]:
#Random Forest ###############
RF_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('RF', RandomForestClassifier(n_estimators=200,random_state=2)),
           ])


RF_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred2 = RF_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred2))
print(accuracy_score(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89      2291
           1       0.94      0.89      0.92       302
           2       0.92      0.90      0.91      2901
           3       0.92      0.95      0.94      5776
           4       0.97      0.99      0.98      1463
           5       0.95      0.98      0.96      2744

    accuracy                           0.93     15477
   macro avg       0.94      0.93      0.93     15477
weighted avg       0.93      0.93      0.93     15477

0.9342249790010984


In [12]:
#Decision Treet ###############
DT_Pipe = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('DT', DecisionTreeClassifier(criterion="entropy",max_depth=60)),
           ])


DT_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred3 = DT_Pipe.predict(X_test.values.astype('U'))

print(classification_report(y_test,y_pred3))
print(accuracy_score(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      2291
           1       0.94      0.74      0.83       302
           2       0.57      0.89      0.70      2901
           3       0.91      0.83      0.87      5776
           4       0.92      0.61      0.73      1463
           5       0.90      0.83      0.87      2744

    accuracy                           0.81     15477
   macro avg       0.86      0.77      0.80     15477
weighted avg       0.84      0.81      0.81     15477

0.8076500613814047


In [14]:
#choose best model by acc
best_model = []
models = {'RF_Pipe' : accuracy_score(y_test, y_pred2),'LR_Pipe' : accuracy_score(y_test, y_pred), 'DT_Pipe' : accuracy_score(y_test, y_pred3)}
models = {k: v for k, v in sorted(models.items(), key=lambda item: item[1])}
best_model = list(models.keys())[-1]
print(models)

{'DT_Pipe': 0.8076500613814047, 'LR_Pipe': 0.8807262389351942, 'RF_Pipe': 0.9342249790010984}


# Model validation
Generates metrics about the model accuracy (precision, recall, F1, etc.) for each category and exports them to a specified path available in the environment variable METRICS_PATH.

In [15]:
try:
    METRICS_PATH = os.environ['METRICS_PATH']
except:
    METRICS_PATH = 'metrics.txt'

In [16]:
#generate a categories list
df_categorias = df_data[['cod_category', 'category']].copy()
df_categorias = df_categorias.drop_duplicates()
categories = df_categorias['category'].tolist()

#predict function to each category and save it to file
def test_predict(Model, Save, txt):    
    metric_file =''
    metric_file += '####################################\n'
    metric_file += txt + '\n'
    metric_file += '####################################\n'
        
    for cat in categories:    
        x_cat_test = test['text_train'][test['category']==cat]
        y_cat_test = test['cod_category'][test['category']==cat]
        y_pred = Model.predict(x_cat_test)
        print('ACC: ',cat, accuracy_score(y_cat_test, y_pred))        
        metric_file += 'Category: ' + cat + ' - ACC:' + str(accuracy_score(y_cat_test, y_pred))  + '\n'
        metric_file += str(classification_report(y_cat_test,y_pred)) + '\n'
        print(classification_report(y_cat_test,y_pred))
    
    if Save == 1:                        
        f = open(METRICS_PATH, "a")        
        f.truncate(0)
        f.write(metric_file)
        f.close()            

In [17]:
#choose the best model and call test_predict function
if best_model == 'RF_Pipe':
    print('############### Random Forest Metrics for categories #################')
    test_predict(RF_Pipe, 1, 'Random Forest - Metrics for categories')
elif best_model == 'DT_Pipe':
    print('############### Logistic Regression Metrics for categories #################')
    test_predict(LR_Pipe, 1, 'Decision Tree - Metrics for categories')
elif best_model == 'LR_Pipe':
    print('############### Logistic Regression Metrics for categories #################')
    test_predict(LR_Pipe, 1, 'Logistic Regression - Metrics for categories')    

############### Random Forest Metrics for categories #################
ACC:  Decoração 0.9000344708721131
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.90      0.95      2901
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.90      2901
   macro avg       0.17      0.15      0.16      2901
weighted avg       1.00      0.90      0.95      2901



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Papel e Cia 0.9752186588921283
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           5       1.00      0.98      0.99      2744

    accuracy                           0.98      2744
   macro avg       0.25      0.24      0.25      2744
weighted avg       1.00      0.98      0.99      2744

ACC:  Outros 0.9856459330143541
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       1.00      0.99      0.99      1463

    accuracy                           0.99      1463
   macro avg       0.25      0.25      0.25      1463
weighted avg       1.00      0.99      0.99      1463



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Bebê 0.8542121344391096
              precision    recall  f1-score   support

           0       1.00      0.85      0.92      2291
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.85      2291
   macro avg       0.20      0.17      0.18      2291
weighted avg       1.00      0.85      0.92      2291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ACC:  Lembrancinhas 0.9527354570637119
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       1.00      0.95      0.98      5776
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0

    accuracy                           0.95      5776
   macro avg       0.17      0.16      0.16      5776
weighted avg       1.00      0.95      0.98      5776

ACC:  Bijuterias e Jóias 0.8940397350993378
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.89      0.94       302
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

    accuracy                           0.89     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model exportation
Exports a candidate model to a specified path available in the environment variable MODEL_PATH.

In [18]:
#save best model as pickle file
def save_model(model):
    try:
        MODEL_PATH = os.environ['MODEL_PATH']
    except:
        MODEL_PATH = 'model.pkl'
    with open(MODEL_PATH, 'wb') as picklefile:
        pickle.dump(model,picklefile)
    
#choose the best model and call save_model function   
if best_model == 'RF_Pipe':
    save_model(RF_Pipe)
elif best_model == 'DT_Pipe':
    save_model(LR_Pipe)
elif best_model == 'LR_Pipe':
    save_model(LR_Pipe)