# Setup

In [1]:
import os
import pickle
import time

import numpy as np
from numpy import random
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import keras

from gensim.models import Word2Vec

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' stylesheet to the figures drawn by this notebook.
matplotlib.style.use('ggplot')

Using TensorFlow backend.


In [2]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
from keras import losses, models, optimizers
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard, EarlyStopping
from keras.optimizers import SGD, Adam, Adamax, Adagrad, Adadelta, Nadam, RMSprop

# Dataset Load

In [4]:
# Relative folder used for GloVe files.
PATH_GLOVE   = 'GloVe-1.2/'
# Short dataset tag embedded in the artefact file names built later in the notebook.
DATASET = 'amazon'

# Folder containing the raw CSV. Set the PRODUCTS_CATALOG_PATH environment variable to
# run the notebook on another machine; when it is unset the original local folder is used.
# os.path.join(..., '') guarantees a trailing separator, which the later
# `PATH_DATASET + <file name>` concatenation relies on.
PATH_DATASET = os.path.join(
    os.environ.get('PRODUCTS_CATALOG_PATH',
                   '/Users/rafalencar/Documents/Datasets/Products_Catalog/'),
    '')

In [5]:
# Read the product catalogue and replace every NaN cell with an empty string.
csv_path = PATH_DATASET + 'amazon_co-ecommerce_sample_Dataset.csv'
dataset = pd.read_csv(csv_path).replace(np.nan, '', regex=True)
print('Dataset Shape: ', dataset.shape)

Dataset Shape:  (9310, 5)


In [6]:
# Column used as the classification target.
CATEGORY = "amazon_category_and_sub_category"

# Distinct target values over the whole dataset, and their count.
labels = dataset[CATEGORY].unique()
LABELS = len(labels)
print("Total Labels : ", LABELS)

Total Labels :  255


In [7]:
# Fixed seed so the 90/10 split is identical on every Restart & Run All; without it
# each run trains and evaluates on different rows and the reported numbers drift.
SPLIT_SEED = 42
dataTrain, dataTest = train_test_split(dataset, test_size=0.1, random_state=SPLIT_SEED)

# Target column of each partition.
y_train = dataTrain[CATEGORY]
y_test  = dataTest[CATEGORY]
print('Trainset Size: ', dataTrain.shape[0])
print('Testset Size: ', dataTest.shape[0])

Trainset Size:  8379
Testset Size:  931


In [8]:
# Distinct categories present in each partition; the counts show how many of the
# dataset's categories each side of the split actually contains.
labelsTrain, labelsTest = (part[CATEGORY].unique() for part in (dataTrain, dataTest))
print("Trainset Labels : ", len(labelsTrain))
print("Testset Labels : ", len(labelsTest))

Trainset Labels :  243
Testset Labels :  114


# Embedding

### Paths and Constants

In [9]:
# Embedding vector size; also embedded in the embedding file names below.
EMBEDDING_DIM = 100

# Folder holding the pickled preprocessing objects and the word2vec model.
PATH_DATA_MODELS   = 'data_models/'

FILE_WORD2VEC      = PATH_DATA_MODELS + DATASET +'_word2vec_s' + str(EMBEDDING_DIM) + '.model'
FILE_TOKENIZER     = PATH_DATA_MODELS + DATASET +'_tokenizer.sav'
FILE_LABEL_ENCODER = PATH_DATA_MODELS + DATASET +'_label_encoder.sav'

# Built from PATH_GLOVE rather than a second copy of the 'GloVe-1.2/' literal, so the
# GloVe folder is configured in exactly one place.
FILE_GEN_GLOVE     = PATH_GLOVE + DATASET + '_genglove_s' + str(EMBEDDING_DIM) + '.txt'

In [10]:
# Text column fed to the model.
X_used = 'product_name'

X_data     = dataset[X_used]
dataTrainX = dataTrain[X_used]
dataTestX  = dataTest[X_used]

# Whitespace-separated word count of every text in the full dataset.
X_size = [len(text.split()) for text in X_data]
# Common sequence length: 1.5x the longest text (by word count), truncated to an int.
MAX_SEQUENCE_LENGTH = int(max(X_size) * 1.5)

### Tokenizer

In [11]:
# Load the previously fitted tokenizer. The `with` block closes the file handle even if
# unpickling fails (the bare open() left it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code stored in the file; only load files
# produced by this project.
with open(FILE_TOKENIZER, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Convert each text into its sequence of integer word ids.
token_train = tokenizer.texts_to_sequences(dataTrainX)
token_test  = tokenizer.texts_to_sequences(dataTestX)

# Bring every sequence to the common length MAX_SEQUENCE_LENGTH.
pad_train = pad_sequences(token_train, maxlen=MAX_SEQUENCE_LENGTH)
pad_test = pad_sequences(token_test, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index

FileNotFoundError: [Errno 2] No such file or directory: 'data_models/amazon_tokenizer.sav'

### Label Encoder

In [None]:
# Load the previously fitted label encoder; the `with` block guarantees the file handle
# is closed. NOTE: pickle.load can execute arbitrary code stored in the file; only load
# files produced by this project.
with open(FILE_LABEL_ENCODER, 'rb') as encoder_file:
    le = pickle.load(encoder_file)

# Encode the category values of both partitions with the loaded encoder.
y_encode_train = le.transform(y_train)
y_encode_test  = le.transform(y_test)

# Load Model

In [None]:
# Folder and name of the model definition to load.
PATH_MODELS = 'not_trained_models/'
# NOTE(review): this name mentions 'product_description' while X_used above is
# 'product_name' - confirm the intended model/feature pairing.
MODEL = 'sepcnn__product_description_word2vec_s100'

# Architecture (.json) and weights (.h5) share one file stem and differ only by extension.
_untrained_stem    = PATH_MODELS + DATASET + '_model_' + MODEL
FILE_MODEL         = _untrained_stem + '.json'
FILE_MODEL_WEIGHTS = _untrained_stem + '.h5'

In [None]:
# Read the serialized architecture; the `with` block closes the file even if read() raises.
with open(FILE_MODEL, 'r') as json_file:
    loaded_model_json = json_file.read()

# Rebuild the network from its JSON description, then load its weights from the .h5 file.
model = model_from_json(loaded_model_json)
model.load_weights(FILE_MODEL_WEIGHTS)

# Train Model

In [None]:
# Each execution of this cell logs to its own directory: model name plus the current time.
run_label = model.name + '(' + time.asctime() + ')'
tensorboard = TensorBoard(log_dir="logs/" + run_label)

In [None]:
# Early-stopping settings: watch validation accuracy, count only improvements of at
# least 0.01, wait 5 epochs without one, and restore the best weights on stop.
early_stopping_options = {
    'monitor': 'val_acc',
    'mode': 'auto',
    'min_delta': 0.01,
    'patience': 5,
    'restore_best_weights': True,
}
earlyStopping = EarlyStopping(**early_stopping_options)

In [None]:
# Integer-encoded targets -> sparse categorical cross-entropy; accuracy is tracked as 'acc'.
rmsprop = RMSprop(lr=0.005, decay=0.0001)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=rmsprop,
              metrics=['acc'])

In [None]:
# Train with the epoch counter running from 20 up to 60, holding out 10% of the
# training data for validation.
# `earlyStopping` is configured in an earlier cell but was never handed to fit(), so it
# had no effect; it is registered here alongside the TensorBoard logger.
hist = model.fit(pad_train, y_encode_train,
                 batch_size=128, epochs=60, initial_epoch=20,
                 verbose=1, callbacks=[tensorboard, earlyStopping],
                 validation_split=0.1)

# Evaluation

### Metrics

In [None]:
# evaluate() result; index 0 is read as the loss and index 1 as the accuracy further down.
ev_test = model.evaluate(pad_test, y_encode_test, verbose=0)

# Pick the highest-scoring class per sample, then map the class ids back to category names.
class_scores = model.predict(pad_test)
y_pred = le.inverse_transform(class_scores.argmax(axis=1))

In [None]:
# Final-epoch training/validation figures and the test-set figures:
# accuracies as percentages with one decimal, losses with three decimals.
history = hist.history
test_loss, test_acc = ev_test[0], ev_test[1]

acc_tr   = round(100 * history['acc'][-1], 1)
loss_tr  = round(history['loss'][-1], 3)
acc_val  = round(100 * history['val_acc'][-1], 1)
loss_val = round(history['val_loss'][-1], 3)
acc_te   = round(100 * test_acc, 1)
loss_te  = round(test_loss, 3)

summary_lines = (
    ('Acc:', acc_tr, '%'),
    ('Loss:', loss_tr),
    ('Val Acc:', acc_val, '%'),
    ('Val Loss:', loss_val),
    ('Test Acc:', acc_te, '%'),
    ('Test Loss:', loss_te),
)
for summary_line in summary_lines:
    print(*summary_line)

### Classification Report

**Precision** is the percentage of samples predicted as a category that actually belong to that category<br>
P = tp / (tp + fp), tp = true positives, fp = false positives <br><br>

**Recall** is the percentage of samples belonging to a category that were predicted as that category<br>
R = tp / (tp + fn), tp = true positives, fn = false negatives <br><br>

In [None]:
def _parse_report_line(line):
    """Split one line of the text classification report into table cells.

    Returns ``[label, precision, recall, f1_score, support]``. The label is every
    token before the last four (labels may contain spaces); a line with exactly four
    tokens is the header and gets the placeholder label ``'label'``. A blank line
    yields ``['']``.

    Lines with fewer than four tokens (for example the ``accuracy`` summary row emitted
    by newer scikit-learn versions, which has only an f1-score and a support value) are
    right-aligned: the first token is the label and the missing leading metric cells are
    filled with ``''`` instead of raising ``IndexError``.
    """
    tokens = line.split()  # split() already collapses any run of whitespace
    if not tokens:
        return ['']
    if len(tokens) < 4:
        values = tokens[1:]
        return [tokens[0]] + [''] * (4 - len(values)) + values
    label = ' '.join(tokens[:-4]) or 'label'
    return [label] + tokens[-4:]


class_report = metrics.classification_report(y_test, y_pred)
# One parsed row per report line; consumed by the DataFrame cell below.
split = [_parse_report_line(line) for line in class_report.split('\n')]

In [None]:
# Turn the parsed report rows into a table: row 0 supplies the column names, and rows
# with missing cells (the blank report lines) are discarded.
report_rows = pd.DataFrame(split)
report_header = report_rows.loc[0]
df = report_rows.drop([0]).dropna()
df.columns = report_header

print('Classification Report')
df.tail(10)

### Confusion Matrix

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred)

# Draw the matrix as a heat map in a new figure, then decorate it through the
# figure/axes handles of the returned image.
heatmap = plt.matshow(cm)
heatmap_axes, heatmap_figure = heatmap.axes, heatmap.figure
heatmap_figure.colorbar(heatmap)
heatmap_axes.set_title('Confusion matrix')
heatmap_axes.set_ylabel('True label')
heatmap_axes.set_xlabel('Predicted label')
heatmap_figure.set_size_inches(8, 8)
plt.show()

# Save Trained Model

In [None]:
# Destination folder for the trained model.
PATH_MODELS        = 'trained_models/'

# The file names carry the train and test accuracy, e.g. '...(<acc_tr>_<acc_te>).json'.
_score_tag         = '(' + str(acc_tr) + '_' + str(acc_te) + ')'
_trained_stem      = PATH_MODELS + DATASET + '_model_' + model.name + _score_tag
FILE_MODEL         = _trained_stem + '.json'
FILE_MODEL_WEIGHTS = _trained_stem + '.h5'

In [None]:
# Save Model
# Architecture goes to JSON, weights to HDF5. The `with` block closes the JSON file even
# if the write fails part-way.
model_json = model.to_json()
with open(FILE_MODEL, "w") as json_file:
    json_file.write(model_json)
model.save_weights(FILE_MODEL_WEIGHTS)