# Libraries

In [None]:
#Libs ############################################
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import pickle
import nltk
import os
# Download the NLTK 'stopwords' corpus so it can be read below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Portuguese stop words, de-duplicated into a set; handed to the TF-IDF
# vectorizers of the pipelines defined later in this file.
stop_words = set(stopwords.words('portuguese'))

# Data extraction
Loads a dataset with product data from a specified path available in the environment variable DATASET_PATH.

In [None]:
# Dataset location: the DATASET_PATH environment variable wins, otherwise the
# bundled sample file is used. A plain lookup with a default replaces the
# bare ``except:``, which would also have hidden unrelated errors.
DATASET_PATH = os.environ.get('DATASET_PATH', 'data/sample_products.csv')

# Load the comma-separated dataset and preview its first two rows.
df_file = pd.read_csv(DATASET_PATH, sep=',')
df_file.head(2)

# Data formatting
Processes the dataset to use it for training and validation.

In [None]:
# Work on a separate copy so the frame loaded from disk stays untouched.
df_data = df_file.copy(deep=True)
# Count of missing values per column.
df_data.isna().sum()

In [None]:
# Label encoder, using only text columns
LE = LabelEncoder()
# Integer id for every category name; this is the training target.
df_data['cod_category'] = LE.fit_transform(df_data['category'])
# Join the three text columns into one document per row. Missing values are
# replaced with '' first: in pandas, str + NaN is NaN, so a single empty
# column would otherwise wipe out the whole row's text.
df_data['text_train'] = (
    df_data['query'].fillna('') + ' '
    + df_data['title'].fillna('') + ' '
    + df_data['concatenated_tags'].fillna('')
)
df_data.head(2)

# Modeling
Specifies a model to handle the categorization problem.

In [None]:
# Hold out 33% of the shuffled rows for evaluation; the seed is fixed.
train, test = train_test_split(df_data, test_size=0.33, shuffle=True, random_state=42)

# Features are the concatenated text, targets are the encoded category ids.
X_train, y_train = train['text_train'], train['cod_category']
X_test, y_test = test['text_train'], test['cod_category']

print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Logistic Regression #########
# TF-IDF features feeding a one-vs-rest logistic regression.
LR_Pipe = Pipeline([
    # stop_words is a set; the vectorizer documents 'english', a list or None
    # for this parameter, so hand it over as a (sorted) list.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])
# astype('U') coerces every document to a unicode string before vectorizing.
LR_Pipe.fit(X_train.values.astype('U'), y_train)
y_pred = LR_Pipe.predict(X_test.values.astype('U'))

# Overall metrics on the held-out split.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# Random Forest ###############
# TF-IDF features feeding a one-vs-rest random forest.
RF_Pipe = Pipeline([
    # stop_words is a set; the vectorizer documents 'english', a list or None
    # for this parameter, so hand it over as a (sorted) list.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=10, random_state=0), n_jobs=1)),
])
# astype('U') coerces every document to a unicode string before vectorizing.
RF_Pipe.fit(X_train.values.astype('U'), y_train)
# Predict with the random-forest pipeline itself (previously the logistic
# regression pipeline was used here, so these metrics never described RF).
y_pred2 = RF_Pipe.predict(X_test.values.astype('U'))

# Overall metrics on the held-out split, both computed from y_pred2.
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

# Model validation
Generates metrics about the model accuracy (precision, recall, F1, etc.) for each category and exports them to a specified path available in the environment variable METRICS_PATH.

In [None]:
# Destination of the metrics report: the METRICS_PATH environment variable
# wins, otherwise 'metrics.txt' in the working directory. A lookup with a
# default replaces the bare ``except:`` that hid every kind of error.
METRICS_PATH = os.environ.get('METRICS_PATH', 'metrics.txt')

In [None]:
# One row per distinct (encoded id, category name) pair.
df_categorias = df_data[['cod_category', 'category']].drop_duplicates()
# Category names, in order of first appearance in the dataset.
categories = df_categorias['category'].tolist()

def test_predict(Model, Save, txt):
    """Print per-category test metrics for ``Model`` and optionally save them.

    For every name in the module-level ``categories`` list, the rows of the
    module-level ``test`` frame that belong to that category are scored with
    ``Model.predict``. The accuracy and the classification report are
    printed and collected into a text report headed by ``txt``.

    Args:
        Model: Fitted estimator exposing ``predict`` over text documents.
        Save: When equal to ``1`` the report is written to ``METRICS_PATH``,
            replacing any previous content; any other value only prints.
        txt: Title line placed at the top of the report.
    """
    banner = '####################################\n'
    report_parts = [banner, txt + '\n', banner]

    for cat in categories:
        in_category = test['category'] == cat
        if not in_category.any():
            # The category did not land in the held-out split, so there is
            # nothing to score; record that instead of calling predict on
            # an empty selection.
            print('ACC: ', cat, 'no test samples')
            report_parts.append('Category: ' + cat + ' - no test samples\n')
            continue

        # Same unicode coercion that is applied when the pipelines are fitted.
        x_cat_test = test.loc[in_category, 'text_train'].values.astype('U')
        y_cat_test = test.loc[in_category, 'cod_category']
        y_pred = Model.predict(x_cat_test)

        # Compute each metric once and reuse it for console and report.
        accuracy = accuracy_score(y_cat_test, y_pred)
        report = classification_report(y_cat_test, y_pred)

        print('ACC: ', cat, accuracy)
        report_parts.append('Category: ' + cat + ' - ACC:' + str(accuracy) + '\n')
        report_parts.append(str(report) + '\n')
        print(report)

    if Save == 1:
        # Mode 'w' truncates on open and the context manager closes the file
        # even if the write fails; UTF-8 keeps accented names portable.
        with open(METRICS_PATH, 'w', encoding='utf-8') as metrics_file:
            metrics_file.write(''.join(report_parts))

In [None]:
# Per-category metrics for both pipelines, as (console banner, fitted
# pipeline, save flag, report title). Only the logistic-regression report is
# saved (flag 1); the random-forest one is printed only (flag 0).
for banner, pipeline, save_flag, title in (
    ('############### Random Forest Metrics for categories #################',
     RF_Pipe, 0, 'Random Forest - Metrics for categories'),
    ('############### Logistic Regression Metrics for categories #################',
     LR_Pipe, 1, 'Logistic Regression - Metrics for categories'),
):
    print(banner)
    test_predict(pipeline, save_flag, title)

# Model exportation
Exports a candidate model to a specified path available in the environment variable MODEL_PATH.

In [None]:
# Random forest
# The random-forest pipeline is the candidate model that gets exported.
# NOTE(review): its predictions are encoded category ids; the fitted label
# encoder (LE) is not exported here -- confirm consumers can decode them.
classifier = RF_Pipe

# Export destination: the MODEL_PATH environment variable wins, otherwise
# 'model.pkl' in the working directory. A lookup with a default replaces the
# bare ``except:`` that hid every kind of error.
MODEL_PATH = os.environ.get('MODEL_PATH', 'model.pkl')

# Serialize the whole pipeline (vectorizer + classifier) in binary mode.
with open(MODEL_PATH, 'wb') as picklefile:
    pickle.dump(classifier, picklefile)