# Intent prediction model

Model for predicting the intent of an utterance, based on a dense neural network.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import shuffle
import os
from pathlib import Path
import pickle

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, SimpleRNN, LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

## Training dataset

In [None]:
### Model hyperparameters
# Source CSV and the name of the column that holds the intent label.
dataset_path = 'data/snips_utterances.csv'
intent_column = 'Intention'

# Name of the dataframe column that holds the utterance text to train on.
language = 'English'
# language = 'Spanish'

# Length (in token indices) every utterance is padded or truncated to.
max_len = 100
# To inspect the longest utterance of the dataset (in characters, not tokens):
# all_intents_df[language].str.len().max()

# Vocabulary cap: only the `max_words` most frequent words are kept.
max_words = 10_000

# Dimensionality of the pre-trained embedding vectors.
embedding_dim = 100
# embedding_dim = 300

In [None]:
# Read the whole CSV at `dataset_path` into a dataframe.
print('Loading Dataset', dataset_path)
all_intents_df = pd.read_csv(dataset_path)

In [None]:
# Print a summary of the loaded dataframe (columns, dtypes, non-null counts).
all_intents_df.info()

In [None]:
# Number of rows per intent label; the counts are the cell's displayed output.
print('Checking balancing of classes')
all_intents_df[intent_column].value_counts()

In [None]:
# Rows come grouped by intent, so they are shuffled before any split.
# The seed is fixed so that repeated runs produce the same row order; without
# it the seeded train/test split further down would still differ between runs.
print('Shuffling the dataset (intents come ordered)')
all_intents_df = shuffle(all_intents_df, random_state=1)

In [None]:
# Display one randomly chosen row as a quick visual check of the data.
print('Random sentence')
all_intents_df.sample(1)

In [None]:
# Extract the utterance column and the intent column as two parallel lists.
print(f'Converting dataframe columns "{language}" and "{intent_column}" into lists')
sentences_list, intents_list = (
    all_intents_df[column].tolist() for column in (language, intent_column)
)

In [None]:
# Both lists come from the same dataframe, so their lengths should match.
print('Checking the size of the lists', (len(sentences_list), len(intents_list)))

## Tokenization and padding of data

In [None]:
def fit_tokenizer(sentences_list, max_words=10000, test_word='book'):
    """
        Fit a Keras Tokenizer based on sentences_list and save it to disk.

        :sentences_list: Texts used to build the vocabulary.
        :max_words: Passed to the Tokenizer as `num_words`.
        :test_word: Word whose index is printed as a sanity check. A word that
            is not in the vocabulary is reported instead of raising KeyError.
        :return: The fitted Tokenizer (also pickled to dist/tokenizer.pickle).
    """
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(sentences_list)

    # Internal word -> index mapping built by the tokenizer
    word_index = tokenizer.word_index

    print('Vocabulary of the corpora', len(word_index))
    # .get() so a probe word absent from the corpus (e.g. when training on
    # another language) does not abort the run.
    print(f'Index of the word {test_word}', word_index.get(test_word, 'not in vocabulary'))

    # Persist the tokenizer so prediction code can reuse the same word indices.
    # The output directory is created on demand; open() fails if it is missing.
    os.makedirs('dist', exist_ok=True)
    with open('dist/tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Tokenizer saved')

    return tokenizer

In [None]:
# Fit the tokenizer on every utterance, capped at `max_words` words.
tokenizer = fit_tokenizer(sentences_list, max_words)

In [None]:
def vectorize_and_pad_sequences(sentences_list, tokenizer, maxlen=None):
    """
        Convert sentences into fixed-length sequences of word indices.

        :sentences_list: Texts to convert.
        :tokenizer: Fitted tokenizer providing `texts_to_sequences`.
        :maxlen: Target sequence length. Defaults to the notebook-level
            `max_len` when omitted.
        :return: 2D array of shape (len(sentences_list), maxlen).
    """
    if maxlen is None:
        maxlen = max_len

    print('Converting {} sentences into indices with given tokenizer'.format(len(sentences_list)))
    sequences = tokenizer.texts_to_sequences(sentences_list)
    # Guarded so an empty input does not raise IndexError on the preview
    if sequences:
        print('Checking indices of first word', sequences[0][:10])

    # Transforms the sequences into a 2D tensor of shape (samples, maxlen).
    # pad_sequences defaults to padding='pre' and truncating='pre': zeros are
    # added on the LEFT and over-long sequences lose their first tokens.
    data = pad_sequences(sequences, maxlen=maxlen)
    print('Shape of padded sequences', data.shape)

    return data

In [None]:
# Model input: one row of word indices per utterance.
data = vectorize_and_pad_sequences(sentences_list, tokenizer)

## Encoding and one hot of targets

In [None]:
def fit_encoder(intents_list):
    """
        Fit a Sklearn LabelEncoder based on intents_list and save its classes.

        :intents_list: Intent labels, one per sample.
        :return: The fitted LabelEncoder (its classes_ array is also saved to
            dist/classes.npy).
    """
    encoder = LabelEncoder()
    print('Fitting a LabelEncoder with given target')
    encoder.fit(intents_list)

    print('Found classes', encoder.classes_)
    # Round trip: every class should map to its own integer code
    print('Testing encoder', encoder.transform(encoder.classes_))

    # np.save does not create missing directories, so make sure it exists.
    os.makedirs('dist', exist_ok=True)
    np.save('dist/classes.npy', encoder.classes_)
    print('Encoder saved')
    return encoder

In [None]:
# Fit the label encoder on every intent label of the dataset.
encoder = fit_encoder(intents_list)

In [None]:
def encode_and_one_hot_target(intents_list, encoder):
    """
        Encode intent labels as integers and expand them to one-hot rows.

        :intents_list: Intent labels, one per sample.
        :encoder: Fitted LabelEncoder that knows every label in intents_list.
        :return: 2D array of shape (len(intents_list), number of classes
            known to the encoder).
    """
    print('Encoding target with given encoder')
    intents_encoded = encoder.transform(intents_list)

    print('Convert encoded classes integers to dummy variables')
    # The width is fixed to the encoder's class count. Without num_classes,
    # to_categorical infers it from the largest label present, so a list that
    # lacks the last class(es) would produce too few columns.
    intents_one_hot = to_categorical(intents_encoded, num_classes=len(encoder.classes_))

    print('Target final shape', intents_one_hot.shape)
    return intents_one_hot

In [None]:
# Model target: one one-hot row per utterance.
intents_one_hot = encode_and_one_hot_target(intents_list, encoder)

In [None]:
# Hold out 33% of the samples for the final evaluation; the fixed seed keeps
# the split repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    intents_one_hot,
    test_size=.33,
    random_state=1,
)

In [None]:
# Shapes of the train/test feature matrices (displayed as the cell output).
X_train.shape, X_test.shape

In [None]:
# Shapes of the train/test one-hot targets (displayed as the cell output).
y_train.shape, y_test.shape

## Embedding initialization

In [None]:
# cc.es.300.vec
def get_embedding(dim_size = 100, test_word='book'):
    """
        Builds a dictionary {'token': [embedding vector]}

        The file is read from `<cwd>/embeddings/glove.6B.<dim_size>d.txt`, one
        token per line followed by its `dim_size` float components.

        :dim_size: Could be 50, 100, 200, 300
        :test_word: Token whose vector shape is printed as a sanity check; a
            token missing from the file is reported instead of raising.
        :return: dict mapping each token to a float32 numpy vector.
    """
    embedding_file_name = f'glove.6B.{dim_size}d.txt'
    # embedding_file_name = f'cc.es.{dim_size}.vec'
    embedding_path = os.path.join(os.getcwd(), 'embeddings', embedding_file_name)
    print('Will load the following embedding', embedding_file_name)

    embeddings_index = {}
    # Explicit UTF-8: the platform default encoding may not decode the file.
    with open(embedding_path, encoding='utf-8') as embedding_file:
        # Iterate lazily instead of readlines() so the whole file is never
        # held in memory next to the dictionary being built.
        for embedding_line in embedding_file:
            values = embedding_line.split()
            # Skip blank lines, header lines and any malformed row that does
            # not carry exactly one token plus dim_size components.
            if len(values) != dim_size + 1:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found {} word vectors.'.format(len(embeddings_index)))
    # Probe the requested word rather than a hard-coded token, and tolerate
    # its absence (different languages/embeddings have different vocabularies).
    test_vector = embeddings_index.get(test_word)
    if test_vector is None:
        print(f'Word {test_word} not found in the embedding')
    else:
        print('Checking shape', test_vector.shape)

    return embeddings_index

In [None]:
# %%time
# First version
# Builds a dictionary {'token': [embedding values]}
# embeddings_index = {}
# f = open(embedding_path)
# for line in f:
#     values = line.split()
#     word = values[0] # The token itself
#     coefs = np.asarray(values[1:], dtype='float32') # Its full embedding representation
#     embeddings_index[word] = coefs
# f.close()

In [None]:
%%time
# Load the pre-trained vectors whose dimensionality matches `embedding_dim`.
embeddings_index = get_embedding(dim_size = embedding_dim, test_word = 'libro')

In [None]:
def build_embedding_matrix(max_words, tokenizer, embeddings=None, dim=None):
    """
        Build the weight matrix used to initialise the Embedding layer.

        Row i holds the pre-trained vector of the word whose tokenizer index
        is i; words without a pre-trained vector keep an all-zero row.

        :max_words: Number of rows; words with index >= max_words are ignored.
        :tokenizer: Fitted tokenizer exposing `word_index` ({word: index}).
        :embeddings: {word: vector} lookup. Defaults to the notebook-level
            `embeddings_index` when omitted.
        :dim: Vector size (number of columns). Defaults to the notebook-level
            `embedding_dim` when omitted.
        :return: numpy array of shape (max_words, dim).
    """
    if embeddings is None:
        embeddings = embeddings_index
    if dim is None:
        dim = embedding_dim

    # The matrix fed to the embedding has to be of shape (max_words, dim)
    embedding_matrix = np.zeros((max_words, dim))
    print('Initalizing zeros matrix of shape', embedding_matrix.shape)

    # tokenizer.word_index is the (token -> index) dictionary built on fit
    for word, i in tokenizer.word_index.items():
        # Stay inside the matrix: indices at or beyond max_words have no row
        if i >= max_words:
            continue
        # Words missing from the pre-trained embedding stay all zeros
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Sanity peek at one row; skipped when the matrix is too small to have it
    probe_index = 123
    if probe_index < max_words:
        print('Checking representation of word 123', embedding_matrix[probe_index][:100])
    return embedding_matrix

In [None]:
# Weight matrix with one row per word index below `max_words`.
embedding_matrix = build_embedding_matrix(max_words, tokenizer)

## Model

In [None]:
# The softmax layer needs one unit per intent class. The width is taken from
# the one-hot targets instead of being hard-coded, so the notebook keeps
# working when the dataset has a different number of intents.
num_classes = y_train.shape[1]

# Embedding -> Flatten -> Dense(32, relu) -> Dense(num_classes, softmax)
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

In [None]:
# Seed the embedding layer with the pre-trained vectors (row i is the vector
# of the word with index i) and freeze it so training leaves them untouched.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# Multi-class setup: categorical cross-entropy over the one-hot targets.
# The metric is named 'acc', which is the key the plotting helpers read.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Summary again, now that the embedding layer has been frozen.
model.summary()

In [None]:
%%time
# Train for 10 epochs, setting aside 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Make sure the output directory exists; neither open() nor save_weights()
# is relied on to create it.
os.makedirs("dist", exist_ok=True)

# Serialize model architecture to JSON
model_json = model.to_json()
with open("dist/model.json", "w") as json_file:
    json_file.write(model_json)

# Serialize weights to HDF5
model.save_weights("dist/model.h5")
print("Saved model to disk")

## Performance

In [None]:
def print_figure(training_values, validation_values, metric):
    """
        Plot a training curve and a validation curve against the epoch number.

        :training_values: Per-epoch values measured on the training data.
        :validation_values: Per-epoch values measured on the validation data.
        :metric: Display name used in the title, the y label and the legend.
    """
    # Integer x axis: one position per epoch, numbered from 1
    epochs = range(1, len(training_values) + 1)

    plt.clf()
    # 'bo' draws blue dots (training), 'b' a solid blue line (validation)
    curves = (
        (training_values, 'bo', 'Training '),
        (validation_values, 'b', 'Validation '),
    )
    for values, line_format, label_prefix in curves:
        plt.plot(epochs, values, line_format, label=label_prefix + metric)

    plt.title('Training and validation ' + metric)
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()

    plt.show()

In [None]:
def print_loss(history):
    """
        Plot training vs. validation loss per epoch.

        :history: Object returned by model.fit; its `history` dict must hold
            the 'loss' and 'val_loss' series.
    """
    recorded = history.history
    print_figure(recorded['loss'], recorded['val_loss'], 'Loss')

In [None]:
def print_acc(history):
    """
        Plot training vs. validation accuracy per epoch.

        :history: Object returned by model.fit; its `history` dict must hold
            the accuracy series under 'acc'/'val_acc' or, failing that,
            'accuracy'/'val_accuracy'.
    """
    history_dict = history.history
    # The history key follows the metric name given to compile(): prefer
    # 'acc' (what this notebook uses) and fall back to 'accuracy'.
    acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
    acc_values = history_dict[acc_key]
    val_acc_values = history_dict['val_' + acc_key]

    print_figure(acc_values, val_acc_values, 'Accuracy')

In [None]:
# Loss curves of the training run above.
print_loss(history)

In [None]:
# Accuracy curves of the training run above.
print_acc(history)

In [None]:
# Evaluate on the held-out test split. With one loss and one metric compiled,
# the result is expected to be [test_loss, test_acc]; it is displayed as the
# cell output.
results = model.evaluate(X_test, y_test)
results