# Treinamento de modelo para dataset Elo7

In [1]:
import os
from multiprocessing import cpu_count

import cloudpickle
import pandas as pd
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline

## 1. Carregamento de dados

Carregando apenas as colunas que serão usadas pelo modelo.

In [2]:
# The CSV location is taken from the environment rather than hard-coded.
dataset_path = os.environ["DATASET_PATH"]

# Read only the columns the model uses: two text inputs plus the target label.
model_columns = ["title", "concatenated_tags", "category"]
df = pd.read_csv(dataset_path, usecols=model_columns)

df.shape

(38507, 3)

Removendo as entradas que possuem valores em branco.

In [3]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna(how="any")
df.shape

(38505, 3)

### Divisão em conjuntos de teste e de treino

In [4]:
# Inputs are the two text columns; the target is the category label.
feature_columns = ["title", "concatenated_tags"]
X = df[feature_columns]
y = df["category"]

In [5]:
# Number of rows per target class, to check how imbalanced the labels are.
y.value_counts()

Lembrancinhas         17759
Decoração              8845
Bebê                   7026
Papel e Cia            2777
Outros                 1147
Bijuterias e Jóias      951
Name: category, dtype: int64

In [6]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=123,
)

## 2. Transformação dos dados, modelagem e validação do modelo

Aqui crio uma pipeline de treinamento que aplica um count vectorizer a cada uma das colunas necessárias e, em seguida, utiliza o Naive Bayes multinomial para classificar os dados.

Utilizei a pipeline do scikit-learn com um [helper](https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html) para selecionar as colunas e aplicar os vectorizers individualmente a cada feature.


Além disso, é feito um grid-search com cross-validation (5 folds) para encontrar os hiperparâmetros que resultam na melhor performance.

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a single entry from a keyed container.

    It learns nothing from the data: ``fit`` is a no-op and ``transform``
    simply indexes its input with the configured key. Placed at the head of
    a sub-pipeline, it lets the following steps work on one entry only.

    Parameters
    ----------
    key : hashable, required
        Key used to index the object passed to ``transform``.
    """

    def __init__(self, key):
        # Kept as-is under the same name as the constructor argument.
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return this instance; both arguments are ignored."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected


In [8]:
# Columns that each get their own bag-of-words branch. The column name is
# also used as the branch name inside the FeatureUnion, so parameters are
# addressed as "union__<column>__countvectorizer__<param>".
text_columns = ["title", "concatenated_tags"]

pipeline = Pipeline([

    # Use FeatureUnion to combine the features from title and concatenated tags
    ('union', FeatureUnion(
        transformer_list=[
            # One branch per column: select the column, then count its tokens
            (column, Pipeline([
                ('selector', ColumnSelector(key=column)),
                ('countvectorizer', CountVectorizer()),
            ]))
            for column in text_columns
        ],

    )),

    # Use a multinomial Naive Bayes classifier on the combined features
    ('multinomialnb', MultinomialNB()),
])

In [9]:
# Hyperparameter grid. The n-gram range is pinned to a single value (1 to 5)
# for both text branches, so the only dimension actually searched is the
# Naive Bayes smoothing parameter alpha.
param_grid = {
    "union__title__countvectorizer__ngram_range": [(1, 5)],
    "union__concatenated_tags__countvectorizer__ngram_range": [(1, 5)],
    "multinomialnb__alpha": [1.0e-5, 1.0e-2, 1]
}

# Leave one core free, but never request fewer than one worker: on a
# single-core machine `cpu_count() - 1` is 0, which is not a valid n_jobs
# value and makes the search fail before any fitting happens.
n_workers = max(1, cpu_count() - 1)

# 5-fold cross-validated search over the grid, fitted on the training split.
classifier = GridSearchCV(pipeline, cv=5, param_grid=param_grid, n_jobs=n_workers)
classifier.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('title',
                                                                        Pipeline(steps=[('selector',
                                                                                         ColumnSelector(key='title')),
                                                                                        ('countvectorizer',
                                                                                         CountVectorizer())])),
                                                                       ('concatenated_tags',
                                                                        Pipeline(steps=[('selector',
                                                                                         ColumnSelector(key='concatenated_tags')),
                                                                                      

### Verificando a performance do classificador obtido

O resultado foi bom, dentro do possível, considerando o desbalanceamento das classes. As métricas são salvas em um arquivo para possíveis consultas futuras.

In [10]:
# Predict on the held-out split and summarise per-class metrics.
# print() is used so the report's line breaks render as a table.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                    precision    recall  f1-score   support

              Bebê       0.92      0.91      0.91      2108
Bijuterias e Jóias       0.92      0.95      0.93       285
         Decoração       0.92      0.92      0.92      2654
     Lembrancinhas       0.93      0.94      0.93      5328
            Outros       0.82      0.81      0.82       344
       Papel e Cia       0.80      0.82      0.81       833

          accuracy                           0.92     11552
         macro avg       0.89      0.89      0.89     11552
      weighted avg       0.92      0.92      0.92     11552



In [11]:
# The report contains accented class names, so the file encoding is pinned
# to UTF-8 instead of depending on the platform's default text encoding.
with open(os.environ["METRICS_PATH"], "w", encoding="utf-8") as metrics_file:
    metrics_file.write(report)

## 3. Exportação do modelo


In [12]:
# Serialize the fitted GridSearchCV object (not only the best estimator) to
# the path given by the MODEL_PATH environment variable.
# NOTE(review): cloudpickle is presumably used because ColumnSelector is
# defined inside this notebook — confirm against the code that loads the model.
with open(os.environ["MODEL_PATH"], "wb") as model_file:
    cloudpickle.dump(classifier, model_file)

In [13]:
# Hyperparameter combination selected by the grid search.
classifier.best_params_

{'multinomialnb__alpha': 0.01,
 'union__concatenated_tags__countvectorizer__ngram_range': (1, 5),
 'union__title__countvectorizer__ngram_range': (1, 5)}