# Setup

In [1]:
import os
import time
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import model_from_json
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences

from dataset import Dataset

# Dataset Load

In [2]:
# CSV that feeds this experiment.
filename = 'datasets/amazon_co-ecommerce_sample.csv'

# Wrap the file in the project-local Dataset helper, then read it using the
# 'name' column as the text and the 'category' column as the label.
dataset = Dataset(filename)
dataset.load(label_field='category', text_field='name')

690 lines skipped (not labeled)
0 lines skipped (not text)


# Preprocessing

In [3]:
# Obtain the text tokenizer first and the label encoder second (evaluation
# order is left to right, so the helpers run in the same sequence as before).
tokenizer, label_encoder = dataset.tokenize(), dataset.label_encode()

Number of Tokens: 13038


In [4]:
# Fixed seed so the train/test partition is the same after every
# "Restart Kernel -> Run All"; without it each run evaluates on different rows.
SEED = 42

# Hold out 20% of the rows for testing.
data_train, data_test = train_test_split(dataset.data, test_size=0.2,
                                         random_state=SEED)

# Common sequence length used for both partitions, computed once.
# NOTE(review): presumably the length of the longest text -- confirm against
# Dataset.max_text().
max_len = dataset.max_text()

# Training partition: texts -> integer sequences -> fixed-length arrays,
# labels -> encoded values.
token_train = tokenizer.texts_to_sequences(data_train.text)
text_train = pad_sequences(token_train, maxlen=max_len)
label_train = label_encoder.transform(data_train.label)

# Test partition goes through exactly the same transformation.
token_test = tokenizer.texts_to_sequences(data_test.text)
text_test = pad_sequences(token_test, maxlen=max_len)
label_test = label_encoder.transform(data_test.label)

# Load Model

In [None]:
# Directory holding the untrained model definitions.
PATH_MODELS = 'not_trained_models/'

# DATASET was used below without ever being assigned, which raises NameError
# on a fresh kernel.
# NOTE(review): assumed to be the base name of the CSV loaded above
# ('amazon_co-ecommerce_sample') -- confirm against the file names that
# actually exist in PATH_MODELS.
DATASET = Path(filename).stem

MODEL = 'sepcnn__product_description_word2vec_s100'

# Architecture (.json) and weights (.h5) share one file-name stem.
FILE_MODEL         = PATH_MODELS + DATASET + '_model_' + MODEL + '.json'
FILE_MODEL_WEIGHTS = PATH_MODELS + DATASET + '_model_' + MODEL + '.h5'

In [None]:
# Read the architecture description; the context manager closes the file even
# if reading fails.
with open(FILE_MODEL, 'r') as json_file:
    loaded_model_json = json_file.read()

# Rebuild the model from its JSON description, then load the stored weights.
model = model_from_json(loaded_model_json)
model.load_weights(FILE_MODEL_WEIGHTS)

# Train Model

In [None]:
# One log directory per run: the model name followed by the current timestamp
# in parentheses.
tensorboard = TensorBoard(log_dir=f"logs/{model.name}({time.asctime()})")

In [None]:
# Early-stopping callback watching validation accuracy: an improvement must be
# at least 0.01 to count, training halts after 5 epochs without one, and the
# best weights seen are restored.
earlyStopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0.01,
    patience=5,
    mode='auto',
    restore_best_weights=True,
)

In [None]:
# `learning_rate` replaces the deprecated `lr` alias.
# NOTE(review): `decay` is only accepted by the older/legacy Keras optimizers;
# confirm it is still supported by the installed TensorFlow version.
# The metric is named 'acc' (not 'accuracy') because the history keys
# 'acc' / 'val_acc' are read by the early-stopping monitor and the metrics cell.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.005, decay=0.0001),
    metrics=['acc'],
)

In [None]:
# Train on the padded sequences and encoded labels built in the preprocessing
# cell (the previous names `pad_train` / `y_encode_train` were never defined).
# 10% of the training data is held out for validation.
# NOTE(review): initial_epoch=20 makes training start counting at epoch 20, so
# only epochs 20..59 run -- confirm this resume point is intended for a model
# loaded from 'not_trained_models/'.
# NOTE(review): `earlyStopping` is defined above but not passed here; add it to
# `callbacks` if early stopping is wanted.
hist = model.fit(text_train, label_train,
                 batch_size=128, epochs=60, initial_epoch=20,
                 verbose=1, callbacks=[tensorboard],
                 validation_split=0.1)

# Evaluation

### Metrics

In [None]:
# Evaluate on the held-out arrays built in the preprocessing cell (the previous
# names `pad_test`, `y_encode_test` and `le` were never defined).
ev_test = model.evaluate(text_test, label_test, verbose=0)

# Most probable class index per sample, mapped back to the original labels.
y_pred_idx = np.argmax(model.predict(text_test), axis=1)
y_pred = label_encoder.inverse_transform(y_pred_idx)

In [None]:
# Last-epoch training/validation figures from the fit history.
acc_tr   = hist.history['acc'][-1]
loss_tr  = hist.history['loss'][-1]
acc_val  = hist.history['val_acc'][-1]
loss_val = hist.history['val_loss'][-1]

# With metrics=['acc'], model.evaluate returns [loss, acc].
acc_te   = ev_test[1]
loss_te  = ev_test[0]

# The arguments must be a tuple: `fmt % 100*acc, loss` applied `%` to 100 alone
# and raised TypeError. A literal percent sign is written as `%%`.
print('Train)      Acc: %.2f%%, Loss: %.3f' % (100*acc_tr,  loss_tr))
print('Validation) Acc: %.2f%%, Loss: %.3f' % (100*acc_val, loss_val))
print('Test)       Acc: %.2f%%, Loss: %.3f' % (100*acc_te,  loss_te))

### Classification Report

**Precision** is the percentage of samples predicted as a category that actually belong to that category<br>
P = tp / (tp + fp), tp = true positive, fp = false positive <br><br>

**Recall** is the percentage of samples belonging to a category that were predicted as that category<br>
R = tp / (tp + fn), tp = true positive, fn = false negative <br><br>

In [None]:
# True labels of the held-out rows (`y_real` was never defined before this
# cell); `y_pred` holds the decoded predictions from the evaluation cell.
y_real = data_test.label

# Ask scikit-learn for a structured report instead of parsing its text layout:
# the text parser raised IndexError on the 'accuracy' row, which has only two
# numeric columns.
class_report = metrics.classification_report(y_real, y_pred, output_dict=True)

# 'accuracy' is a bare float rather than a per-row dict, so it is taken out
# before tabulating and kept separately.
report_accuracy = class_report.pop('accuracy', None)

# One row per label (plus the averaged rows), with the same column layout as
# before: label, precision, recall, f1-score, support.
df = (
    pd.DataFrame.from_dict(class_report, orient='index')
    .rename_axis('label')
    .reset_index()
)

In [None]:
# Heading, followed by the last ten rows of the report table as the cell output.
print('Classification Report')
df.tail(10)

# Save Trained Model

In [None]:
# Save Model
# `trained_models_dir` was never defined, so the save raised NameError.
# NOTE(review): the directory name is assumed by analogy with
# 'not_trained_models/' -- adjust if the project uses a different location.
trained_models_dir = Path('trained_models')

# Create the target directory up front so the save cannot fail on a missing
# folder.
trained_models_dir.mkdir(parents=True, exist_ok=True)

# Pass a plain string so the call does not rely on path-like support in
# model.save.
model.save(str(trained_models_dir / (model.name + '.h5')))