In [None]:
import warnings
warnings.filterwarnings("ignore")
import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import numpy as np
np.random.seed(42)
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from keras.preprocessing import text, sequence
import pandas as pd
import keras_models
from scipy.sparse import hstack
from tqdm import tqdm_notebook as tqdm

In [None]:
# Load the first N_ROWS rows of the training set; missing values are replaced by a single space ' '
N_ROWS = 50000
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
dataset = pd.read_csv('train.csv', nrows=N_ROWS)
dataset = dataset.fillna(' ')

# Random split: a row goes to train when its uniform draw is below 0.8, otherwise to validation
msk = np.random.rand(len(dataset)) < 0.8
train, val = dataset[msk], dataset[~msk]

# Label matrices, one column per entry of LABELS
y_train, y_val = train[LABELS].values, val[LABELS].values
print("Train size:", len(train), ", Val size:", len(val))

# Introduction to embeddings
In this tutorial we will have a totally different approach from the previous one: let's try to do the minimum feature engineering possible and focus on architecture engineering. 

This means that we will no longer handcraft features from text but will let the network learn the best representations for them. Word embeddings come in handy here.

As you previously learned, word embeddings are a way to represent words as vectors with a fixed dimension that are able to serve as input to a neural network. When trained correctly, and with enough examples, the embeddings will be able to provide meaningful information to the network.

![this](https://cdn-images-1.medium.com/max/2400/1*sXNXYfAqfLUeiDXPCo130w.png)

Below you can visualize how a distributed representation with word embeddings can be way more meaningful than just a one-hot representation (sort of what you end up having in the Bag of Words approach).

![this2](https://i.imgur.com/goEdlez.png)

## Tokenizer
In practice, word embeddings are nothing more than a lookup table. 

In the example below you can see how two words are converted to word embeddings by using the embedding matrix: both `ride` and `a` are indexed in certain positions of the embedding table. All we have to do is to get the vectors corresponding to those positions and feed them to the network. During training, the network treats these embeddings as 'normal' weights and updates them (through backpropagation) in order to minimize the loss.

![this3](https://image.slidesharecdn.com/kpisummerschool2015wordembeddingsandneurallanguagemodeling1-150828091027-lva1-app6892/95/kpi-summer-school-2015-word-embeddings-and-neural-language-modeling-28-638.jpg?cb=1440753116)

Now the only feature engineering we have to do (if we can call it that) is to convert the words to numbers so that they point to different indexes in the embedding table.

Keras' tokenizer does that job for us, and we only have to specify how many words we want to represent (`num_words`). This `num_words` will define how many entries we have in our embedding matrix. The most frequent words will be picked first, until `num_words` is reached; the rest will share a single index that corresponds to an unknown word (note that Keras' `Tokenizer` only does this when an `oov_token` is passed; with the default settings used below, out-of-vocabulary words are simply dropped from the sequences). It's useful to always have an 'unknown word index' in our embedding matrix to deal with words that are not indexed in the table.

In [None]:
num_words = 30000  # max words to represent

# Build the word -> id dictionary from the training comments only
# (the validation comments must not influence the vocabulary: no cheating!)
tokenizer = text.Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train['comment_text'])

# Convert every comment of each split into its list of word ids
train_words_indexes, val_words_indexes = (
    tokenizer.texts_to_sequences(split['comment_text']) for split in (train, val)
)

In [None]:
# Show the same training comment twice: as raw text, then as the word ids produced by the tokenizer
print(train['comment_text'].iloc[1])  # raw text
print(train_words_indexes[1])  # word ids

A problem with standard feedforward neural networks is that we need to feed them a fixed length input. 

There are many ways to solve this problem. Here we'll take the simplest solution, which is to define a fixed length limit for the text, `maxlen`. Comments with fewer than `maxlen` words will be padded with zeros and comments with more than `maxlen` words will be truncated to that size.

In [None]:
maxlen = 100  # max number of words in a comment input

# Bring every sequence of both splits to exactly maxlen ids, padding at the end ('post')
train_words_indexes, val_words_indexes = (
    sequence.pad_sequences(ids, maxlen=maxlen, padding='post')
    for ids in (train_words_indexes, val_words_indexes)
)

In [None]:
# Every sample should now have the same length, with 0s filling the padded positions
print(np.shape(train_words_indexes))
train_words_indexes[0:2]

We now have our input ready to pass to the neural network.

All we have to do is to call the `keras_models.get_embeddings_model` function and define what dimension we want our embeddings to have, `embeddings_dim`. Feel free to play with the parameters and see what differences it makes to the model's performance.

In [None]:
embeddings_dim = 100  # dimension of each word embedding vector

# Construct the model from the sequence length, vocabulary size and embedding
# dimension defined above, then print its layer summary
emb_model = keras_models.get_embeddings_model(maxlen, num_words, embeddings_dim)
emb_model.summary()

In [None]:
# Train for one epoch, passing the validation split as validation_data.
# Check https://keras.io/models/model/#fit for more options on the arguments accepted by fit.
history = emb_model.fit(
    train_words_indexes,
    y_train,
    batch_size=128,
    epochs=1,
    validation_data=(val_words_indexes, y_val),
)

In [None]:
# Plot the AUC and loss curves (train and validation) recorded in `history`
epochs = list(range(len(history.history['auc'])))
trace1, trace2, trace3, trace4 = (
    {"y": history.history[metric], "x": epochs, "name": label, "type": "scatter"}
    for metric, label in (
        ('auc', "train_auc"),
        ('val_auc', "val_auc"),
        ('loss', "train_loss"),
        ('val_loss', "val_loss"),
    )
)

data = [trace1, trace2, trace3, trace4]
layout = {"title": "Embeddings + MLP model"}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='embmlp-model')

# Pretrained embeddings
Training embeddings from scratch is very time consuming and you may not have enough data to do so.

Fortunately, nowadays there are a lot of pretrained word embeddings that you can use to train your model! Most of the time it is way easier to initialize your embeddings with ones that were previously trained on a large corpus. You can choose to freeze (not train) or finetune those embeddings in your model.

In this tutorial we will finetune the pretrained fastText embeddings, but feel free to download any other English embeddings and try them out!

Download the fastText embeddings [here](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip), extract the file `crawl-300d-2M.vec` and place it in the root of this directory.

In [None]:
# Read the pretrained embeddings file and build the matrix passed to the model's
# embedding layer: row i receives the vector of the word whose tokenizer id is i.
# Rows of words that are not found in the file stay all-zero.
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_dim = 300
embeddings_matrix = np.zeros((num_words, embeddings_dim))


def get_coefs(word, *arr):
    """Turn one parsed line of the embeddings file into ``(word_id, vector)``.

    Args:
        word: first token of the line (the word itself).
        *arr: remaining tokens of the line (the vector components, as strings).

    Returns:
        ``(word_id, vector)`` with ``vector`` a float32 array, or ``None`` when
        the line must be skipped: the word is unknown to the tokenizer, its id
        does not fit in the matrix (``>= num_words``), or the line does not
        carry exactly ``embeddings_dim`` components.
    """
    word_id = tokenizer.word_index.get(word)
    if word_id is None or word_id >= num_words:
        return None
    # The .vec header line ("<n_words> <dim>") and any malformed line have the
    # wrong number of components; without this check a short vector would be
    # silently broadcast over the whole matrix row.
    if len(arr) != embeddings_dim:
        return None
    return (word_id, np.asarray(arr, dtype='float32'))


# The file is UTF-8 text; do not rely on the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as fp:
    for o in tqdm(fp, desc="Reading embeddings..."):
        aux = get_coefs(*o.rstrip().rsplit(' '))
        if aux is not None:
            embeddings_matrix[aux[0]] = aux[1]

In [None]:
# Inspect the tokenizer id of one word and the matrix row stored at that id
toxic_idx = tokenizer.word_index['toxic']
print("Word toxic:")
print("Id:", toxic_idx)
print("Embedding:", embeddings_matrix[toxic_idx])

In [None]:
# Construct the model, handing it the pretrained embeddings_matrix in addition to
# the sequence length, vocabulary size and embedding dimension, then print its summary
pretrained_emb_model = keras_models.get_pretrained_embeddings_model(maxlen, num_words, embeddings_dim, embeddings_matrix)
pretrained_emb_model.summary()

In [None]:
# Train for five epochs, passing the validation split as validation_data.
# Check https://keras.io/models/model/#fit for more options on the arguments accepted by fit.
history = pretrained_emb_model.fit(
    train_words_indexes,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(val_words_indexes, y_val),
)

In [None]:
# Plot the AUC and loss curves (train and validation) recorded in `history`
epochs = list(range(len(history.history['auc'])))
trace1, trace2, trace3, trace4 = (
    {"y": history.history[metric], "x": epochs, "name": label, "type": "scatter"}
    for metric, label in (
        ('auc', "train_auc"),
        ('val_auc', "val_auc"),
        ('loss', "train_loss"),
        ('val_loss', "val_loss"),
    )
)

data = [trace1, trace2, trace3, trace4]
layout = {"title": "Pretrained Embeddings + MLP model"}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pretrainedembmlp-model')

# Recurrent networks
We have been using a feedforward multilayer perceptron so far. However, we have learned that text representations are better handled by recurrent networks.

![this5](https://www.researchgate.net/profile/Wim_De_mulder/publication/266204519/figure/fig5/AS:270318480654346@1441460356163/Recurrent-versus-feedforward-neural-network.png)

In this assignment, you are challenged to follow the example of `keras_models.get_embeddings_model`, implement a **bidirectional recurrent** network and try to get a good score on the final Kaggle test (You'll have to load the data yourself, pass it through the tokenizer and call `keras_models.get_score_on_kaggle_test_set` on it). 

Go to `keras_models.py` and implement the `get_recurrent_model` function.

Tip: Have a look at Keras' [documentation on recurrent layers](https://keras.io/layers/recurrent/). 

In [None]:
# Build the recurrent model with the function you implemented in keras_models.py
# and print its layer summary.
# NOTE: embeddings_dim holds whatever value was assigned last (300 if the
# pretrained-embeddings cell above was executed).
recurrent_model = keras_models.get_recurrent_model(maxlen, num_words, embeddings_dim)
recurrent_model.summary()

In [None]:
# Train for five epochs, passing the validation split as validation_data
history = recurrent_model.fit(
    train_words_indexes,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(val_words_indexes, y_val),
)

In [None]:
# Plot the AUC and loss curves (train and validation) recorded in `history`
epochs = list(range(len(history.history['auc'])))
trace1, trace2, trace3, trace4 = (
    {"y": history.history[metric], "x": epochs, "name": label, "type": "scatter"}
    for metric, label in (
        ('auc', "train_auc"),
        ('val_auc', "val_auc"),
        ('loss', "train_loss"),
        ('val_loss', "val_loss"),
    )
)

data = [trace1, trace2, trace3, trace4]
layout = {"title": "Recurrent model"}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='recurrent-model')

**(Possible) solution:**

```
def get_recurrent_model(input_len, num_words, embeddings_dim):
    """Build and compile a bidirectional LSTM classifier over word-id sequences.

    Args:
        input_len: number of word ids in each input sequence.
        num_words: vocabulary size; the embedding table gets num_words + 2 rows.
        embeddings_dim: dimension of each word embedding vector.

    Returns:
        A compiled Keras model with six sigmoid outputs, trained with binary
        cross-entropy and reporting accuracy and AUC.
    """
    # Inputs (named `inputs` so the builtin `input` is not shadowed)
    inputs = Input(shape=(input_len,), name="comment_words_idx")

    # Embedding layer
    x = Embedding(num_words + 2, embeddings_dim)(inputs)

    # Spatial dropout layer
    x = SpatialDropout1D(0.2)(x)

    # Recurrent bi-directional layer
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)

    # Flatten: the layer must be instantiated first and then applied to the tensor
    x = Flatten()(x)

    # Classification head
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)

    # output
    output = Dense(6, activation='sigmoid')(x)

    # model
    model = Model([inputs], output)
    model.compile(loss="binary_crossentropy",
                  optimizer='adam', metrics=['accuracy', auc])
    return model
```

# (Optional) Contextual embeddings
Congratulations if you made it this far!
See if you can find a way to add ELMo, BERT or other contextual embeddings to your model. Good luck!