# Setup

In [1]:
import numpy as np
from numpy import random
import re
import pickle
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import keras

from gensim.models import Word2Vec

%matplotlib inline
matplotlib.style.use('ggplot')

Using TensorFlow backend.


In [2]:
from keras import losses, models, optimizers, initializers, regularizers

from keras.models import Sequential, Model, model_from_json

from keras.preprocessing.text import Tokenizer

from keras.layers import (Layer, Input, Flatten, Dropout, BatchNormalization, Reshape, Embedding,
                          MaxPool1D, AveragePooling1D, GlobalAveragePooling1D,
                          Conv1D, SeparableConv1D, Dense, LeakyReLU, ReLU, Activation,
                          LSTM, SimpleRNNCell, Bidirectional)

# Dataset Load

In [3]:
# Directory prefix used when building the generated-GloVe file path (relative path).
PATH_GLOVE   = 'GloVe-1.2/'
# Short dataset tag; it is embedded in the artifact file names built further down.
DATASET = 'amazon'

# Directory the raw CSV is read from.
# NOTE(review): absolute, user-specific path — the notebook cannot run on another
# machine without editing this line.
PATH_DATASET = '/Users/rafalencar/Documents/Datasets/Products_Catalog/'

In [4]:
# Read the product catalogue and turn every missing value into an empty string
# so that the text columns can be processed as plain strings later on.
dataset = (
    pd.read_csv(PATH_DATASET + 'amazon_co-ecommerce_sample_Dataset.csv')
      .replace(np.nan, '', regex=True)
)
print('Dataset Shape: ', dataset.shape)

Dataset Shape:  (9310, 5)


In [5]:
CATEGORY = "amazon_category_and_sub_category"

# Distinct values of the category column; LABELS is how many there are and is
# used as the width of the softmax output of every model below.
labels = dataset[CATEGORY].unique()
LABELS = len(labels)
print("Total Labels : ", LABELS)

Total Labels :  255


# Embedding

### Paths and Consts

In [6]:
X_used = 'product_description'
X_data = dataset[X_used]

# Number of whitespace-separated tokens in each description.
X_size = [len(text.split()) for text in X_data]

# Samples-to-words ratio: number of rows divided by the mean words per row.
SW_RATIO            = dataset.shape[0] / np.mean(X_size)
# True when the ratio is below 15000.
EMBEDDING_TRAINABLE = SW_RATIO < 15000
# Input length of the models: 1.5x the longest description, truncated to int.
MAX_SEQUENCE_LENGTH = int(max(X_size) * 1.5)

In [7]:
# Size of each word vector; also part of the embedding file names below.
EMBEDDING_DIM = 100

PATH_DATA_MODELS = 'data_models/'
# PATH_GLOVE is defined once, next to DATASET in the dataset-load section, and
# reused here; it is intentionally not reassigned so there is a single place
# to change it.

# Artifacts stored under PATH_DATA_MODELS, keyed by dataset tag and text column.
FILE_WORD2VEC      = PATH_DATA_MODELS + DATASET + '_' + X_used + '_word2vec_s' + str(EMBEDDING_DIM) + '.model'
FILE_TOKENIZER     = PATH_DATA_MODELS + DATASET + '_' + X_used + '_tokenizer.sav'
FILE_LABEL_ENCODER = PATH_DATA_MODELS + DATASET + '_label_encoder.sav'

# Generated GloVe vectors (plain-text file) for this dataset.
FILE_GEN_GLOVE     = PATH_GLOVE + DATASET + '_genglove_s' + str(EMBEDDING_DIM) + '.txt'

### Tokenizer

In [8]:
# Restore the previously saved tokenizer. The context manager closes the file
# handle as soon as unpickling finishes (or fails).
# NOTE: pickle can execute arbitrary code while loading — FILE_TOKENIZER must be
# a file produced by this project, never one from an untrusted source.
with open(FILE_TOKENIZER, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# word -> integer index mapping; the indices select rows of the embedding matrix.
word_index = tokenizer.word_index

### Word Vectorizer

In [9]:
def set_embedding_index(file_Name):
    """Build a ``{word: vector}`` lookup from a saved embedding file.

    Parameters
    ----------
    file_Name : str
        Path of the embedding file. If the path contains ``".model"`` it is
        loaded as a gensim ``Word2Vec`` model. Otherwise it is read as UTF-8
        text with one entry per line: the word, then its coefficients, all
        separated by single spaces.

    Returns
    -------
    dict
        Maps every word to its vector. Vectors read from a text file are
        ``float32`` NumPy arrays; vectors from a Word2Vec model are whatever
        ``word2vec.wv[word]`` returns.

    Raises
    ------
    ValueError
        If a coefficient in a text file cannot be parsed as a float.
    """
    embeddings_index = {}
    if ".model" not in file_Name:
        # ``with`` guarantees the file is closed even when a line fails to parse.
        with open(file_Name, encoding='utf8') as f:
            for line in f:
                entry = line.rstrip('\n')
                if not entry:
                    # Blank line: nothing to register (avoids a bogus '' / '\n' key).
                    continue
                values = entry.split(' ')
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    else:
        word2vec = Word2Vec.load(file_Name)
        for word in list(word2vec.wv.vocab):
            # Look the vector up through ``.wv``; indexing the model object
            # itself (``word2vec[word]``) is deprecated in gensim.
            embeddings_index[word] = word2vec.wv[word]
    return embeddings_index

def set_embedding_weights(word_index, embeddings_index, embedding_dim):
    """Assemble the initial weight matrix for the embedding layer.

    The matrix has ``len(word_index) + 1`` rows and ``embedding_dim`` columns.
    It is first filled with values drawn by ``np.random.random``; then, for
    every word that has an entry in ``embeddings_index``, the row given by the
    word's index is overwritten with that vector. Rows without a pretrained
    vector therefore keep their random values (they are NOT zeroed).

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer row index.
    embeddings_index : dict
        Maps words to pretrained vectors of length ``embedding_dim``.
    embedding_dim : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray
        The ``(len(word_index) + 1, embedding_dim)`` weight matrix.
    """
    weights = np.random.random((len(word_index) + 1, embedding_dim))
    for token, row in word_index.items():
        pretrained = embeddings_index.get(token)
        if pretrained is None:
            # No pretrained vector: the row keeps its random initialisation.
            continue
        weights[row] = pretrained
    return weights

### Embedding Layer

In [10]:
# Embedding file used to initialise the layer (the Word2Vec model here).
FILE_EMBEDDING = FILE_WORD2VEC

embedding_index = set_embedding_index(FILE_EMBEDDING)

embedding_matrix = set_embedding_weights(word_index, embedding_index, EMBEDDING_DIM)

# Embedding layer shared by every model cell below. Trainability follows the
# EMBEDDING_TRAINABLE flag computed from the samples/words ratio instead of a
# hard-coded True; bool() converts the NumPy boolean to a plain Python bool.
embedding_layer = Embedding(input_dim    = len(word_index) + 1,
                            output_dim   = EMBEDDING_DIM,
                            weights      = [embedding_matrix],
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable    = bool(EMBEDDING_TRAINABLE))

  from ipykernel import kernelapp as app


In [11]:
# Reduce the embedding file path to a short tag used in the model names:
# drop the data-models directory, the dataset tag, any remaining leading
# directories and finally the file extension.
# Literal fragments go through re.escape and the extensions are written as
# r'\.txt$' / r'\.model$', so '.' is matched literally (not as "any character")
# and only a trailing extension is removed.
FILE_EMBEDDING = re.sub(re.escape(PATH_DATA_MODELS), '', FILE_EMBEDDING)
FILE_EMBEDDING = re.sub(re.escape(DATASET), '', FILE_EMBEDDING)
FILE_EMBEDDING = re.sub(r'^(.*[\\/])', '', FILE_EMBEDDING)
FILE_EMBEDDING = re.sub(r'\.txt$', '', FILE_EMBEDDING)
FILE_EMBEDDING = re.sub(r'\.model$', '', FILE_EMBEDDING)

print('Embedding Used:', FILE_EMBEDDING)

Embedding Used: _product_description_word2vec_s100


# CNN

### Jatana CNN

This model is based on the models described in this [article](https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f). The reference implementation is available in this [GitHub repository](https://github.com/jatana-research/Text-Classification/blob/master/CNN.ipynb).

In [12]:
# Set to True to build the three-stage Conv1D classifier in this cell.
use_jatana_cnn = False
if use_jatana_cnn:
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = embedding_layer(input_layer)
    # Three identical stages: Conv1D(128 filters, kernel 5) followed by MaxPool1D(3).
    for _ in range(3):
        x = Conv1D(128, 5, padding='same', activation='relu')(x)
        x = MaxPool1D(3)(x)
    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    output_layer = Dense(LABELS, activation='softmax')(x)

    # The name is passed to the constructor: assigning `model.name` afterwards
    # fails on Keras versions where `name` is a read-only property.
    model = Model(input_layer, output_layer, name='cnn_jatana_' + FILE_EMBEDDING)
    model.summary()

### SepCNN

In [13]:
# Set to True to build the separable-convolution classifier in this cell.
use_sepcnn = True
if use_sepcnn:

    # Hyper Parameters
    blocks       = 1
    filters      = 32
    kernel_size  = 5
    dropout_rate = 0.2
    pool_size    = 3

    # Keyword arguments shared by every SeparableConv1D layer of this model.
    sepconv_kwargs = dict(kernel_size=kernel_size, activation='relu',
                          bias_initializer='random_uniform',
                          depthwise_initializer='random_uniform',
                          padding='same')

    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = embedding_layer(input_layer)

    # (blocks - 1) repetitions of: dropout -> 2x separable conv -> max-pool.
    # With blocks == 1 this loop does not run at all.
    for _ in range(blocks - 1):
        x = Dropout(rate=dropout_rate)(x)
        x = SeparableConv1D(filters=filters, **sepconv_kwargs)(x)
        x = SeparableConv1D(filters=filters, **sepconv_kwargs)(x)
        x = MaxPool1D(pool_size=pool_size)(x)

    # Final block: twice the filters, then global average pooling and dropout.
    x = SeparableConv1D(filters=filters * 2, **sepconv_kwargs)(x)
    x = SeparableConv1D(filters=filters * 2, **sepconv_kwargs)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(rate=dropout_rate)(x)

    output_layer = Dense(LABELS, activation='softmax')(x)

    # The name is passed to the constructor: assigning `model.name` afterwards
    # fails on Keras versions where `name` is a read-only property.
    model = Model(input_layer, output_layer, name='sepcnn_' + FILE_EMBEDDING)
    model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 9246)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 9246, 100)         3203700   
_________________________________________________________________
separable_conv1d_1 (Separabl (None, 9246, 64)          6964      
_________________________________________________________________
separable_conv1d_2 (Separabl (None, 9246, 64)          4480      
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 255)               16575     
Total para

# RNN

### Jatana LSTM 

This model is based on the models described in this [article](https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f). The reference implementation is available in this [GitHub repository](https://github.com/jatana-research/Text-Classification/blob/master/RNN.ipynb).

In [14]:
# Set to True to build the bidirectional-LSTM classifier in this cell.
use_jatana_lstm = False
if use_jatana_lstm:
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = embedding_layer(input_layer)
    # One bidirectional LSTM with 100 units per direction.
    x = Bidirectional(LSTM(100))(x)
    output_layer = Dense(LABELS, activation='softmax')(x)

    # The name is passed to the constructor: assigning `model.name` afterwards
    # fails on Keras versions where `name` is a read-only property.
    model = Model(input_layer, output_layer, name='lstm_jatana_' + FILE_EMBEDDING)
    model.summary()

### Kaggle Simple LSTM

In [None]:
# Set to True to build the single-LSTM classifier in this cell.
# NOTE(review): while both this flag and `use_sepcnn` are True, this cell
# replaces the `model` built by the SepCNN cell, so the save cells below
# persist this LSTM instead. Keep only one flag enabled if that is not intended.
use_kaggle_lstm = True
if use_kaggle_lstm:
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x = embedding_layer(input_layer)
    x = LSTM(64)(x)
    x = Dense(256, name='FC1')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(LABELS, activation='softmax')(x)

    # `inputs` must be the Input tensor created above (the original referenced
    # an undefined name `inputs`). The name is passed to the constructor because
    # assigning `model.name` afterwards fails where `name` is read-only.
    model = Model(inputs=input_layer, outputs=output_layer,
                  name='lstm_kaggle_' + FILE_EMBEDDING)
    # summary is a method: it has to be called to print the layer table.
    model.summary()

# Save Model Architecture

In [15]:
PATH_MODELS = 'not_trained_models/'

# Architecture (.json) and weights (.h5) share the same stem:
# <PATH_MODELS><DATASET>_model_<model.name>
FILE_MODEL, FILE_MODEL_WEIGHTS = (
    PATH_MODELS + DATASET + '_model_' + model.name + extension
    for extension in ('.json', '.h5')
)

In [16]:
# Serialise the architecture as JSON; the context manager closes the file even
# if the write fails.
model_json = model.to_json()
with open(FILE_MODEL, "w") as json_file:
    json_file.write(model_json)

# Store the model's current weights next to the architecture file.
model.save_weights(FILE_MODEL_WEIGHTS)