<a href="https://colab.research.google.com/github/sidneyaguirre/ml-chat/blob/master/practical_intro_talk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A Practical Intro to Productionizing NLP Models (Modeling Workbook)

# gather and augment intents data¶

In [0]:
get_url = lambda intent, training: 'https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/{0}/{1}_{0}{2}.json'.format(intent, 'train' if training else 'validate', '_full' if training else '')
training_urls = []
validation_urls = []
intents = ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'SearchCreativeWork', 'SearchScreeningEvent']
intents_to_ix = {intent:ix for ix,intent in enumerate(intents)}
for intent in intents:
  training_urls.append((intent, get_url(intent, training=True)))
  validation_urls.append((intent, get_url(intent, training=False)))

In [0]:
import requests
get_json = lambda url: requests.get(url).json()
get_texts = lambda url: [' '.join(' '.join([d['text'] for d in j['data']]).split()) for j in list(get_json(url).values())[0]]

In [0]:
# we need to also handle greet, goodbye, and "None" intents:
intents.extend(['Greet', 'Bye','None'])
intents_to_ix = {intent:ix for ix,intent in enumerate(intents)}

augment the dataset with three manually-labeled intents: "greet", "bye",  and "no intent"

In [0]:
# number of intents to augment small data to
import itertools
n = 300
print('generating around',n,'observations of each manually added intent')
augment_data = lambda texts, n=n:list(itertools.chain.from_iterable(itertools.repeat(x, int(n/len(texts))) for x in texts if x))
get_manual_data = lambda fname: 'https://{0}.github.io/{1}/{2}.txt'.format('bkvillalobos', '.github.io',fname)
# add greet utterances
greet_texts = augment_data(requests.get(get_manual_data('greet')).text.split('\n'))
# add bye utterances
bye_texts = augment_data(requests.get(get_manual_data('bye')).text.split('\n'))
# add random utterances
random_texts = augment_data(requests.get(get_manual_data('misc_utterences')).text.split('\n'))

generating around 300 observations of each manually added intent


In [0]:
to_intent_text_tuple = lambda intent, text_list: list(zip([intent,]*len(text_list),text_list))
manual_intents = []

# append greet intents
manual_intents.extend(to_intent_text_tuple('Greet', greet_texts))
print('addded', len(greet_texts),'new greet utterances')

# append goodbye intents
manual_intents.extend(to_intent_text_tuple('Bye', bye_texts))
print(len(bye_texts),'new bye utterances')

# append random intents
manual_intents.extend(to_intent_text_tuple('None', random_texts))
print(len(random_texts),'new random, no-intent utterances')

print('total new manually-added intents:', len(manual_intents))

addded 219 new greet utterances
219 new bye utterances
219 new random, no-intent utterances
total new manually-added intents: 657


In [0]:
import pandas as pd
def get_randomized_df_from_urls(urls, manual_intents=[]):
  return_data = []
  for intent, url in urls:
    print('getting text for url:',url)
    for text in get_texts(url):
      return_data.append((intent,intents_to_ix[intent],text))
  print('adding texts for manually-generated intents' if manual_intents else 'not adding any manual intents')
  for intent, text in manual_intents:
    return_data.append((intent,intents_to_ix[intent],text))
  return pd.DataFrame(return_data, columns=['intent','intent_ix','text']).sample(frac=1).reset_index(drop=True)

training_df = get_randomized_df_from_urls(training_urls, manual_intents=manual_intents)
test_df = get_randomized_df_from_urls(validation_urls) # not adding manually-generated intents to testing data

getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/AddToPlaylist/train_AddToPlaylist_full.json
getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/BookRestaurant/train_BookRestaurant_full.json
getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/GetWeather/train_GetWeather_full.json
getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/PlayMusic/train_PlayMusic_full.json
getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/SearchCreativeWork/train_SearchCreativeWork_full.json
getting text for url: https://raw.githubusercontent.com/snipsco/nlu-benchmark/master/2017-06-custom-intent-engines/SearchScreeningEvent/train_SearchScreeningEvent_full.json
adding texts for manually-generate

# train/test split

create column of data without stop words
(this is a slow, but easy to follow implementation)

In [0]:
# load spacy language model
import spacy

nlp = spacy.load('en')

In [0]:
remove_stopwords = lambda sentence: ' '.join([str(token) for token in nlp(sentence) if not token.is_stop])
test_df['text_no_stop'] = [remove_stopwords(s) for s in test_df['text'].values]
training_df['text_no_stop'] = [remove_stopwords(s) for s in training_df['text'].values]

In [0]:
training_df.head()

Unnamed: 0,intent,intent_ix,text,text_no_stop
0,SearchCreativeWork,4,Find Live at Bearsville Theater,Find Live Bearsville Theater
1,AddToPlaylist,0,Add this track to the classical music for smar...,Add track classical music smart kids playlist
2,GetWeather,2,"Will it be chilly in Milligan College , AK on ...","Will chilly Milligan College , AK law day"
3,SearchCreativeWork,4,Find HeroQuest II: Legacy of Sorasil .,Find HeroQuest II : Legacy Sorasil .
4,SearchCreativeWork,4,play the TV series BET Awards 2013,play TV series BET Awards 2013


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(training_df.text_no_stop.values, 
                                                      training_df.intent_ix.values, 
                                                      test_size=500)
X_test, y_test = test_df.text_no_stop.values, test_df.intent_ix.values

In [0]:
max_len = 20
num_words = 1000
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the tokenizer on the training data
t = Tokenizer(num_words=num_words, oov_token='__OOV__')
t.fit_on_texts(X_train)

In [0]:
from keras.preprocessing.sequence import pad_sequences
# training data
x_train_seq = t.texts_to_sequences(X_train)
x_train_padded = pad_sequences(x_train_seq, maxlen=max_len, padding='post')

# validation data
x_valid_seq = t.texts_to_sequences(X_valid)
x_valid_padded = pad_sequences(x_valid_seq, maxlen=max_len, padding='post')

# test data
x_test_seq = t.texts_to_sequences(X_test)
x_test_padded = pad_sequences(x_test_seq, maxlen=max_len, padding='post')
print(x_train_padded)

[[  4   1 378 ...   0   0   0]
 [109 324   1 ...   0   0   0]
 [640   1 228 ...   0   0   0]
 ...
 [  4  22   1 ...   0   0   0]
 [  7  57 813 ...   0   0   0]
 [  5 116  27 ...   0   0   0]]


# Import Pre-trained Embeddings Matrix (optional)


In [0]:
top_n_words = tuple(t.word_index.keys())[:num_words]
print('top 5 words: {}'.format(top_n_words[:5]))

top 5 words: ('__OOV__', 'play', 'i', 'add', 'find')


In [0]:
vector_dict = {w: nlp(w).vector for w in top_n_words}

create an embedding matrix with pretrained GloVe word vectors from SpaCy

In [0]:
import numpy as np
emb_matrix = np.zeros((len(vector_dict.keys()), len(vector_dict[top_n_words[0]])))
for i, w in enumerate(vector_dict.keys()):
  emb_matrix[i] = vector_dict[w]

In [0]:
print('embedding matrix shape: {}'.format(emb_matrix.shape))
emb_matrix

embedding matrix shape: (1000, 384)


array([[ 1.80208778,  3.32538462,  2.25258517, ...,  0.05187614,
         0.69417703,  0.07564092],
       [ 0.1234386 ,  1.07958233,  5.19028759, ..., -0.19546871,
         0.15194739,  0.13699253],
       [ 0.04693624, -1.03170252,  6.53575373, ...,  0.08946218,
         0.46413276, -0.04817876],
       ...,
       [-1.0435518 ,  0.92175484,  1.72832751, ..., -0.16411856,
         0.77037418, -0.76295686],
       [ 0.6416406 ,  1.30268419,  3.57998562, ...,  0.03111946,
         0.83241338,  0.35189229],
       [-0.04768187, -0.89021552,  0.3576569 , ..., -0.46991587,
         0.25018063,  0.26320362]])



# Define the Models

## Define the LSTM

In [0]:
embedding_size = 8
n_classes = 3
epochs = 10

import tensorflow as tf
from tensorflow.keras import layers

lstm_model = tf.keras.Sequential()
lstm_model.add(layers.Embedding(num_words, embedding_size, input_shape=(max_len,)))
lstm_model.add(layers.LSTM(128, return_sequences=False))
lstm_model.add(layers.Dropout(.5))
lstm_model.add(layers.Dense(len(intents_to_ix.keys()), activation='softmax'))
lstm_model.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 8)             8000      
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               70144     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 9)                 1161      
Total params: 79,305
Trainable params: 79,305
Non-trainable params: 0
_________________________________________________________________


## Define the CNN

In [0]:
cnn_model = tf.keras.Sequential()
cnn_model.add(layers.Embedding(num_words, 
                                embedding_size, 
                                input_shape=(max_len,)))
cnn_model.add(layers.Conv1D(filters=64, 
                            kernel_size=3, 
                            activation='relu'))
cnn_model.add(layers.Conv1D(filters=64, 
                            kernel_size=3, 
                            activation='relu'))
cnn_model.add(layers.Dropout(0.5))
cnn_model.add(layers.MaxPooling1D(pool_size=2))
cnn_model.add(layers.Flatten())
cnn_model.add(layers.Dense(100, activation='relu'))
cnn_model.add(layers.Dense(len(intents_to_ix.keys()), activation='softmax'))
cnn_model.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 20, 384)           384000    
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 18, 64)            73792     
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 16, 64)            12352     
_________________________________________________________________
dropout_17 (Dropout)         (None, 16, 64)            0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 100)               51300     
__________

## Define a LSTM with pre-trained embeddings

In [0]:
embedding_size = emb_matrix.shape[1]

lstm_pretrained_embeddings = tf.keras.Sequential()
lstm_pretrained_embeddings.add(layers.Embedding(num_words, 
                                            embedding_size, 
                                            weights=[emb_matrix],
                                            input_shape=(max_len,),
                                            trainable=False))
lstm_pretrained_embeddings.add(layers.LSTM(128, return_sequences=False))
lstm_pretrained_embeddings.add(layers.Dropout(.5))
lstm_pretrained_embeddings.add(layers.Dense(len(intents_to_ix.keys()), 
                                            activation='softmax'))
lstm_pretrained_embeddings.compile('adam', 
                                   'sparse_categorical_crossentropy', 
                                   metrics=['accuracy'])
lstm_pretrained_embeddings.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 20, 384)           384000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               262656    
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 9)                 1161      
Total params: 647,817
Trainable params: 263,817
Non-trainable params: 384,000
_________________________________________________________________


# Train the Models

In [0]:
lstm_model.fit(x_train_padded, y_train, 
          epochs=3, 
          validation_data=(x_valid_padded, y_valid))

Train on 11985 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f637925c908>

In [0]:
cnn_model.fit(x_train_padded, y_train, 
          epochs=3, 
          validation_data=(x_valid_padded, y_valid))

Train on 11985 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f6367e14dd8>

In [0]:
lstm_pretrained_embeddings.fit(x_train_padded, y_train, 
          epochs=3, 
          validation_data=(x_valid_padded, y_valid))

Train on 11985 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f636a1eab38>

# An Exercise for the Reader

### Try this

***Stop here and iterate over the performance of your models to increase your accuracy on the validation data set!***

***Feel free to try out different features and featureizations (i.e. stop words, character embeddings, sntactical information, named entities, other metadata), hyperparameters, deep learning models, etc***

***DO NOT move on the next section (evaluating on the test data) until you are done-done! Otherwise you will not get a good estimate of your out-of-sample performance***

### Where do I start?

A good practice is manually inspecting the predictions in order to prove that your model is using some kind of lazy heuristic. 

Look at the predictions your model is most confident about and wrong. Look at the ones that it is most confident about and right - do they all look similar (i.e. is your model is only picking up on the "easy" ones?) Does the model's performance degrade substantially if you take out the easy ones?

Look at the ones your model is very uncertain about, the ones for which it is basically guessing - can we encode some heuristics as features that could fix those?

# Evaluate the Models

***only do this once, or your model is cheating. If you change your model after evaluating it on the test data, you won't know how it does in the real world***

In [0]:
loss, acc = lstm_model.evaluate(x_test_padded, y_test)
print("lstm_model's out-of-sample accuracy {:.3f}%".format(acc*100))

lstm_model's out-of-sample accuracy 96.833%


In [0]:
loss, acc = cnn_model.evaluate(x_test_padded, y_test)
print("cnn_model's out-of-sample accuracy {:.3f}%".format(acc*100))

cnn_model's out-of-sample accuracy 97.333%


In [0]:
loss, acc = lstm_pretrained_embeddings.evaluate(x_test_padded, y_test)
print("lstm_pretrained_embeddings's out-of-sample accuracy {:.3f}%".format(acc*100))

lstm_pretrained_embeddings's out-of-sample accuracy 95.667%


# Persist the models

In [0]:
def to_disk(keras_model, model_name):
  keras_model.save('{}.h5'.format(model_name))

In [0]:
from tensorflow.keras.models import load_model

def from_disk(model_name):
  return load_model('{}.h5'.format(model_name))

save the models

In [0]:
to_disk(lstm_model,'lstm')

In [0]:
to_disk(cnn_model,'cnn')

In [0]:
to_disk(lstm_pretrained_embeddings, 'lstm_pretrained_embeddings')

load and use the models

In [0]:
model_from_disk = from_disk('cnn')

In [0]:
# use the model
loss, acc = model_from_disk.evaluate(x_test_padded, y_test)
print("lstm_pretrained_embeddings's out-of-sample accuracy {:.3f}%".format(acc*100))

lstm_pretrained_embeddings's out-of-sample accuracy 97.333%


if you want to download these models to your local machine

In [0]:
from google.colab import files

In [0]:
files.download('lstm.h5')

In [0]:
files.download('cnn.h5')

In [0]:
files.download('lstm_pretrained_embeddings.h5')