# Product Classification (Category)
**Scenario**: I have an online marketplace platform, and when my users create a new listing, they sometimes put the item in the wrong Category. As a side effect, that product will not be surfaced by the recommendation engine, nor will customers find it when they browse the Category taxonomy.

**Business Challenge**: How can I alert my users when they are about to register an item in the wrong Category?

**Data Set**: I have a cleaned list of short descriptions and the actual category id of some products

|Short description | Category |
|---|---|
|legging mulher elastica karen stp v verde|6|
|rasteira via mia lace up preta|2|


**Solution**: I need an ML classifier that receives the short description of a product as input and predicts its correct Category. So, I will use a basic technique to encode the text field into a vector, and then train a RandomForest to predict the correct Category.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn import preprocessing

### Get the dataset

In [None]:
# Fetch and unpack the workshop archive only when the dataset file is not
# already present, so re-running the cell does not download it again.
if not os.path.isfile('data/dataset.csv'):
    # curl -s suppresses progress output; tar extracts (-x) the gzip stream (-z)
    # into the current directory (-C .).
    !curl -s https://spock.cloud/ai-workshop/product_classification.tar.gz | tar -xz -C .

In [None]:
# Read the comma-separated, UTF-8 encoded dataset into a DataFrame and
# display its first rows as the cell output.
dataset_path = 'data/dataset.csv'
data = pd.read_csv(dataset_path, encoding='utf-8', sep=',')
data.head()

In [None]:
# Count the rows of each category and draw the counts as a bar chart.
rows_per_category = data.groupby(['category'])['category'].count()
rows_per_category.plot(kind='bar', figsize=(20, 10))

### Prepare the data and split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw product names; labels are the integer codes of the
# `category` column.
X = data.product_name.values
y = data.category.astype('category').cat.codes.values

# Stratified 67/33 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Write each partition to its own CSV (no index column) for the training script.
for csv_path, names, labels in (
    ('data/products_cat_train.csv', X_train, y_train),
    ('data/products_cat_test.csv', X_test, y_test),
):
    partition = pd.DataFrame(data={'product_name': names, 'category': labels})
    partition.to_csv(csv_path, index=None)

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the sub-categories that have more than two rows.
clean_data = data.groupby('sub_category').filter(lambda group: len(group) > 2)

# Features are the product names; labels are the integer codes of `sub_category`.
X_sub = clean_data.product_name.values
y_sub = clean_data.sub_category.astype('category').cat.codes.values

# Stratified 67/33 split with a fixed seed.
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    X_sub, y_sub, test_size=0.33, random_state=42, stratify=y_sub
)

# Write each partition to its own CSV (no index column).
for csv_path, names, labels in (
    ('data/products_subcat_train.csv', X_train_sub, y_train_sub),
    ('data/products_subcat_test.csv', X_test_sub, y_test_sub),
):
    partition = pd.DataFrame(data={'product_name': names, 'sub_category': labels})
    partition.to_csv(csv_path, index=None)

### Here we'll create a scikit-learn script to train our classifier with Random Forest
As you will see, the data input is in string format. We'll vectorize it with a scikit-learn feature called **TfidfVectorizer**.

In [None]:
%%writefile src/products.py
import argparse
import os
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

import argparse
import os
import json
import numpy as np
import pandas as pd

from sklearn.externals import joblib
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

# Module-level cache for the fitted vectorizer; stays None until it is loaded.
word_vectorizer = None


# TODO: When using inference pipelines, it's important
# to reduce the size of the data that will be transferred
# from one container to another
def to_sparse(text):
    """Parse a compact ``"v1;v2;...|c1;c2;..."`` string.

    The part before the ``|`` holds ``;``-separated float values and the
    part after it holds ``;``-separated integer column indices.

    Returns:
        A ``(values, columns)`` tuple of two lists.

    Raises:
        ValueError: if ``text`` does not contain exactly one ``|`` or a
            field cannot be converted to a number.
    """
    values_field, columns_field = text.split('|')
    values = list(map(float, values_field.split(';')))
    columns = list(map(int, columns_field.split(';')))
    return values, columns

# inference functions ---------------
def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in ``model_dir``."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

def _vectorizer_path():
    """Return the path of the persisted ``word_vectorizer.joblib``.

    The original cwd-relative ``opt/ml/model`` location is tried first, then
    the directory named by ``SM_MODEL_DIR`` (when set), then the absolute
    ``/opt/ml/model``. When none of them exists the relative path is returned
    so the subsequent load fails with the same error as before.
    """
    filename = 'word_vectorizer.joblib'
    relative = os.path.join('opt', 'ml', 'model', filename)
    candidates = [relative]
    env_model_dir = os.environ.get('SM_MODEL_DIR')
    if env_model_dir:
        candidates.append(os.path.join(env_model_dir, filename))
    candidates.append(os.path.join(os.sep, 'opt', 'ml', 'model', filename))
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate
    return relative


def input_fn(request_body, request_content_type):
    """Turn a plain-text request into a TF-IDF feature matrix.

    The fitted vectorizer is loaded from disk on the first call and cached in
    the module-level ``word_vectorizer`` for later requests.

    Args:
        request_body: the product description to classify.
        request_content_type: must be ``"text/plain"``.

    Returns:
        The result of ``word_vectorizer.transform([request_body])`` (one row).

    Raises:
        ValueError: if ``request_content_type`` is not ``"text/plain"``.
    """
    global word_vectorizer

    # Fail loudly instead of returning None, which would only surface later
    # as an obscure error inside model.predict.
    if request_content_type != "text/plain":
        raise ValueError("input_fn: Invalid content-type: %s" % request_content_type)

    if word_vectorizer is None:
        word_vectorizer = joblib.load(_vectorizer_path())

    return word_vectorizer.transform([request_body])
    
# This has a better performance for post processing the data
def output_fn(prediction, content_type):
    """Serialize a prediction as a JSON list.

    Args:
        prediction: object exposing ``tolist()`` (e.g. a NumPy array).
        content_type: requested response type; only ``"application/json"``
            is supported.

    Raises:
        Exception: for any other content type.
    """
    if content_type != "application/json":
        raise Exception( "output_fn: Invalid content-type: %s" % content_type )
    return json.dumps(prediction.tolist())

def _load_stopwords(filename="stopwords.txt"):
    """Read the stop-word list (one word per line) with accents removed.

    The file is looked up in the current working directory first and, when it
    is not there, next to this script. It is decoded explicitly as UTF-8
    because it contains accented characters and the platform default encoding
    is not guaranteed to be UTF-8.
    """
    # translation table for removing accents
    accents = str.maketrans("áàãâéêíóôõúüçÁÀÃÂÉÊÍÓÔÕÚÜÇ", "aaaaeeiooouucAAAAEEIOOOUUC")

    path = filename
    script_file = globals().get('__file__')
    if not os.path.isfile(path) and script_file:
        beside_script = os.path.join(os.path.dirname(os.path.abspath(script_file)), filename)
        if os.path.isfile(beside_script):
            path = beside_script

    with open(path, "r", encoding="utf-8") as handle:
        # blank lines carry no stop word, so they are skipped
        return [line.strip().translate(accents) for line in handle if line.strip()]


def train(args):
    """Fit the TF-IDF vectorizer and the RandomForest classifier, then persist both.

    Args:
        args: parsed command-line namespace providing ``train``/``train_file``
            and ``validation``/``validation_file`` (CSV locations with
            ``product_name`` and ``category`` columns), ``model_dir`` (output
            directory), ``n_estimators`` and ``min_samples_leaf``.

    Side effects:
        Writes ``word_vectorizer.joblib`` and ``model.joblib`` to
        ``args.model_dir`` and prints progress and validation metrics. The
        ``AE-at-<q>th-percentile`` lines are kept because the training job's
        metric definitions parse them.
    """
    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    validation_df = pd.read_csv(os.path.join(args.validation, args.validation_file))

    print('Tokenizing...')
    stopwords = _load_stopwords()

    word_vectorizer = TfidfVectorizer(ngram_range=(1,2), analyzer='word', stop_words=stopwords, token_pattern='[a-zA-Z]+')
    # NOTE(review): the vocabulary/IDF statistics are fitted on train AND
    # validation text, so validation scores are slightly optimistic. Kept
    # as-is because it defines the feature space of already-trained models.
    word_vectorizer.fit(
        np.concatenate((train_df.product_name.values, validation_df.product_name.values), axis=0)
    )
    joblib.dump(word_vectorizer, os.path.join(args.model_dir, 'word_vectorizer.joblib'))

    X_train = word_vectorizer.transform(train_df.product_name.values)
    X_val = word_vectorizer.transform(validation_df.product_name.values)
    print("Shapes (train/val)",X_train.shape, X_val.shape)

    # Encode validation labels with the categories learned from the training
    # set. Encoding each frame independently would assign different codes to
    # the same label whenever the validation set is missing a class.
    train_labels = train_df.category.astype('category')
    y_train = train_labels.cat.codes
    y_val = pd.Categorical(validation_df.category, categories=train_labels.cat.categories).codes
    if (y_val == -1).any():
        print('warning: validation set contains categories unseen in training')

    print('building training and testing datasets')

    # train
    print('training model')
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        n_jobs=-1,verbose=True)

    model.fit(X_train, y_train)

    print('validating model')
    predicted = model.predict(X_val)

    # Widen to int64 before subtracting: the category codes are small
    # integers (int8 for few classes) and their difference could overflow.
    abs_err = np.abs(predicted.astype(np.int64) - np.asarray(y_val, dtype=np.int64))

    # print couple perf metrics
    for q in [10, 50, 90]:
        print('AE-at-' + str(q) + 'th-percentile: '
              + str(np.percentile(a=abs_err, q=q)))

    # The code difference above is not meaningful for unordered classes, so
    # also report plain accuracy.
    print('validation-accuracy: ' + str(float(np.mean(predicted == np.asarray(y_val)))))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)
    
        
if __name__ =='__main__':

    print('extracting arguments')
    cli = argparse.ArgumentParser()

    # Each entry is (flag, type, default).
    option_table = (
        # hyperparameters sent by the client are passed as command-line arguments to the script.
        # to simplify the demo we don't use all sklearn RandomForest hyperparameters
        ('--n-estimators', int, 10),
        ('--min-samples-leaf', int, 3),
        # Data, model, and output directories
        ('--model-dir', str, os.environ.get('SM_MODEL_DIR')),
        ('--train', str, os.environ.get('SM_CHANNEL_TRAIN')),
        ('--validation', str, os.environ.get('SM_CHANNEL_VALIDATION')),
        ('--train-file', str, 'train.csv'),
        ('--validation-file', str, 'test.csv'),
    )
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)

    # parse_known_args ignores any extra command-line flags instead of failing
    known_args, _ignored = cli.parse_known_args()

    train(known_args)

### A basic local test to see if the training process is ok

In [None]:
# Create the local directory that will receive the model artifacts
# (-p: no error if it already exists).
!mkdir -p opt/ml/model
# Run the training script locally, passing the hyperparameters plus the local
# data and model directories on the command line.
!python src/products.py --n-estimators 100 \
                    --min-samples-leaf 2 \
                    --model-dir opt/ml/model/ \
                    --train data/ \
                    --validation data/ \
                    --train-file products_cat_train.csv \
                    --validation-file products_cat_test.csv

In [None]:
import sys
import os
# Make src/products.py importable without piling up duplicate path entries
# when the cell is re-run.
if 'src' not in sys.path:
    sys.path.insert(0,'src')
from importlib import reload
import products as p
import random
# Pick up edits made to src/products.py since the module was first imported.
p = reload(p)

# randrange excludes the upper bound; randint(0, len(X_test)) could return
# len(X_test) and raise IndexError.
sample = X_test[random.randrange(len(X_test))]

# Load the classifier from the directory the local training run wrote to.
model_dir = 'opt/ml/model'
model = p.model_fn(model_dir)

# Replay the serving path: vectorize, predict, serialize.
payload = p.input_fn(sample, 'text/plain')
pred = model.predict(payload)
out = p.output_fn(pred,"application/json")
print(out)

### Ok, now let's wrap everything and Train our model using SageMaker

In [None]:
import sagemaker
import boto3

from sagemaker.sklearn import SKLearn
from sagemaker import get_execution_role

# SageMaker session used by the following cells to upload data
sagemaker_session = sagemaker.Session()

# Execution role passed to the estimator
role = get_execution_role()

# Key prefix under which this notebook stores its data
prefix = 'products'

In [None]:
# Upload the dataset to an S3 bucket
# Both CSV partitions are uploaded under the same '<prefix>/data' key prefix.
s3_data_prefix = '%s/data' % prefix
input_train = sagemaker_session.upload_data(path='data/products_cat_train.csv', key_prefix=s3_data_prefix)
input_test = sagemaker_session.upload_data(path='data/products_cat_test.csv', key_prefix=s3_data_prefix)

In [None]:
# Wrap each uploaded location in an input channel description with content type "csv".
train_data, test_data = (
    sagemaker.session.s3_input(s3_data=s3_uri, content_type="csv")
    for s3_uri in (input_train, input_test)
)

In [None]:
# Train my estimator
# Metric extracted from the training log: the median line printed by the
# training script.
training_metrics = [
    {'Name': 'median-AE',
     'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"},
]

# Forwarded to src/products.py as command-line arguments.
training_hyperparameters = {
    'n-estimators': 100,
    'min-samples-leaf': 2,
    'train-file': 'products_cat_train.csv',
    'validation-file' : 'products_cat_test.csv',
}

sklearn_estimator = SKLearn(
    entry_point='products.py',
    source_dir='src',
    role=role,
    framework_version='0.20.0',
    train_instance_type='ml.c5.4xlarge',
    metric_definitions=training_metrics,
    hyperparameters=training_hyperparameters,
)

In [None]:
# Start the training job with one input channel per dataset partition.
training_channels = {'train': train_data, 'validation': test_data}
sklearn_estimator.fit(training_channels)

In [None]:
# Deploy my estimator to a SageMaker Endpoint and get a Predictor
# No explicit endpoint name is given (presumably the SDK then picks one - confirm).
endpoint_name = None

# Host the trained model on a single ml.c5.xlarge instance and keep the
# returned Predictor.
deployment_options = dict(
    endpoint_name=endpoint_name,
    instance_type='ml.c5.xlarge',
    initial_instance_count=1,
)
predictor = sklearn_estimator.deploy(**deployment_options)

### We now have a trained model and a real-time endpoint deployed.
Let's do a basic test and then check the model performance

In [None]:
import json
import random
# Send the description as raw text and read back the raw JSON body.
predictor.serializer = None
predictor.deserializer = None
predictor.content_type = 'text/plain'
predictor.accept = 'application/json'

# randrange excludes the upper bound; randint(0, len(X_test)) could return
# len(X_test) and raise IndexError on the lookups below.
idx = random.randrange(len(X_test))
sample = X_test[idx]
resp = json.loads(predictor.predict(sample))
print("Prod[{}], Cat[{}], Correct? {}".format( sample, resp[0], resp[0]==y_test[idx]) )

#### Test the whole 'TEST' portion of the dataset to create the confusion matrix

In [None]:
%%time
predictions = [json.loads(predictor.predict(i))[0] for i in X_test]

In [None]:
from sklearn.metrics import f1_score
# Micro-averaged F1 over all test samples
score = f1_score(y_true=y_test, y_pred=predictions, average='micro')

In [None]:
# Report the score scaled to a percentage.
f1_percent = score * 100.0
print('F1 Score(micro): %.4f' % f1_percent)

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix

# Row-normalize with the L1 norm so every row sums to 1 and each cell is the
# fraction of a true class predicted as that column. The default (L2) norm
# yields values that cannot be read as proportions.
cnf_matrix = normalize(confusion_matrix(y_test, predictions), norm='l1')
f, ax = plt.subplots(figsize=(15, 8))

# The builtin bool replaces np.bool, an alias removed from recent NumPy
# releases. The all-False mask hides no cell.
sns.heatmap(cnf_matrix, annot=True, fmt="f", mask=np.zeros_like(cnf_matrix, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)