# Bidirectional LSTM in Keras with GloVe embeddings

In this quick kernel I'm going to use a multilayered bidirectional LSTM to classify text. Rather than using random embeddings for words I'm going to use GloVe embeddings.

This has the benefit that words with similar meanings end up, in some sense, close to one another in the embedding space.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) lists the entries of that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
import re

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the labelled training phrases (tab-separated file).
df = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/train.tsv', delimiter='\t')
# Keep only the raw phrase and its sentiment label; nothing else is used below.
df = df[['Phrase', 'Sentiment']]

# Show whole phrases instead of truncating them in the preview.
# `None` is the current way to disable truncation: pandas >= 2.0 rejects the
# legacy value -1, whereas very old pandas releases only accept an int, so
# fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
df.head(3)

Here's a tip I learnt the hard way: ideally you should try to clean your sentences in exactly the same way that the text was cleaned for the word embeddings. If you don't do this, then later, when we try to match words up with vectors, we won't find a match!

Sadly, I'm not sure how the text behind the embeddings used in this data source was cleaned, but this seems fairly close (I've also replaced URLs with 'url').

In [None]:
# Typographic quote/backtick variants and the plain ASCII character each one becomes.
replace_puncts = {'`': "'", '′': "'", '“':'"', '”': '"', '‘': "'"}

# Characters that are replaced by a single space.
strip_chars = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '[', ']', '>', '=', '+', '\\', '•',  '~', '@', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# Punctuation that is kept, but padded with a space on each side.
puncts = ['!', '?', '$', '&', '/', '%', '#', '*','£']

# Matches http(s):// links and bare www. links; every match is replaced by "url".
_URL_PATTERN = re.compile(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})")


def clean_str(x):
    """Return a lower-cased, punctuation-normalised copy of ``x``.

    ``x`` is converted with ``str()`` first, so non-string values are accepted.
    The steps run in this order:

    1. lower-case the text;
    2. replace every URL with the literal token ``url``;
    3. swap the quote variants in ``replace_puncts`` for their ASCII form,
       padded with spaces;
    4. replace every entry of ``strip_chars`` with a space;
    5. pad every entry of ``puncts`` with a space on each side;
    6. drop apostrophes that touch a space.

    Runs of spaces produced along the way are not collapsed.
    """
    text = _URL_PATTERN.sub("url", str(x).lower())

    # One ordered list of (old, new) pairs; the order matters because later
    # rules act on the output of earlier ones.
    substitutions = [(src, f' {dst} ') for src, dst in replace_puncts.items()]
    substitutions += [(char, ' ') for char in strip_chars]
    substitutions += [(char, f' {char} ') for char in puncts]
    substitutions += [(" '", " "), ("' ", " ")]

    for old, new in substitutions:
        text = text.replace(old, new)

    return text


# Cleaned copy of every phrase, stored next to the raw text.
df['text'] = df['Phrase'].map(clean_str)

It's important to have a balanced training set for our classifier. Here's a simplistic way to get one: down-sample every class to the size of the smallest class.

In [None]:
# Shuffle the rows of each sentiment class (0 to 4) independently.
df_0, df_1, df_2, df_3, df_4 = (
    df[df['Sentiment'] == label].sample(frac=1) for label in range(5)
)
class_frames = [df_0, df_1, df_2, df_3, df_4]

# Every class is cut down to the size of the smallest one so the training set
# is balanced (the original author notes there are 7072 `0` examples).
sample_size = min(map(len, class_frames))

# Stack the equally sized class samples, then shuffle once more so the classes are interleaved.
data = pd.concat([frame.head(sample_size) for frame in class_frames]).sample(frac=1)

In [None]:
# Number of space-separated pieces in each raw phrase, stored in column 'l'.
data['l'] = [len(str(phrase).split(' ')) for phrase in data['Phrase']]

# Summary statistics of those lengths; they inform the sequence length chosen next.
lengths = data.l
print(f"mean length of sentence: {lengths.mean()!s}")
print(f"max length of sentence: {lengths.max()!s}")
print(f"std dev length of sentence: {lengths.std()!s}")

In [None]:
# Length every token sequence is padded (or truncated) to by `pad_sequences` below.
# These sentences aren't that long, so we may as well use the whole string.
# NOTE(review): 52 presumably equals the max length printed by the previous cell - confirm.
sequence_length = 52

Let's tokenize our text.

Note, there are a couple of details around the `oov_token` that are worth knowing about. Firstly if you don't declare an oov token, Keras will ignore the word, so for example if the word "brown" isn't in our tokenizer dictionary

"the quick brown fox"

might become

[1, 312, 21479]

where 1 -> "the", 312 -> "quick", 21479 -> "fox". "brown" isn't in the dictionary, so it gets ignored.

I've not seen any research on this, but ignoring the fact there was a word there feels weird - you can compensate for this with an `oov_token`. If the `oov_token` is 20000, then this would now become `[1, 312, 20000, 21479]`

Secondly, there's a bit of oddity around what keras picks as the `oov_token`. If your corpus has 14,281 unique words, then the `oov_token` will be given the 14,282nd index. That's something you've got to remember when quoting `max_features` later

In [None]:
max_features = 20000 # this is the number of words we care about

# Split on single spaces only; the space is also the only character filtered
# out, so punctuation handling is left entirely to `clean_str`.
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')

# Fit and encode the *cleaned* text rather than the raw 'Phrase' column.
# The competition test set is encoded from its cleaned 'text' column further
# down, so training on raw phrases would give the model a different vocabulary
# at train and inference time and would leave `clean_str` unused for training.
train_texts = data['text'].values
tokenizer.fit_on_texts(train_texts)

# this takes our sentences and replaces each word with an integer
X = tokenizer.texts_to_sequences(train_texts)

# we then pad the sequences so they're all the same length (sequence_length)
X = pad_sequences(X, maxlen=sequence_length)

# one-hot encode the sentiment labels, one column per class
y = pd.get_dummies(data['Sentiment']).values

# hold back 10% of the balanced samples as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print("test set size " + str(len(X_test)))

Now we need to get the word GloVe embeddings into a format ready to query later.

Side note: I did a bit of experimentation here and found I got slightly better results using the 100d word embeddings for this task and architecture. I assume the benefit of using 200d is outweighed by the extra features - this could be an interesting thing to investigate.

In [None]:
# Maps each GloVe token to its vector (a float32 numpy array).
embeddings_index = {}
glove_path = os.path.join('../input/glove-global-vectors-for-word-representation', 'glove.6B.100d.txt')

# `with` closes the file even if parsing a line raises, and the explicit
# encoding keeps non-ASCII tokens from being decoded with a platform default.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # each line holds the token followed by its whitespace-separated coefficients
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# The fitted tokenizer's vocabulary: token -> integer index.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Now for a bit of fun - let's go through all the words in our tokenizer and find the word embedding (vector) for each.

Which leaves us with a question - what should we do with the words we can't find? In this example I'm going to give each its own unique random vector, which we can make trainable later (i.e. allow to move about during training).

Another lesson I learnt the hard way: don't do what I did and give them all the same vector

In [None]:
embedding_dim = 100

# Number of rows in the embedding matrix.
# NOTE(review): the +1 presumably leaves row 0 for padding, since Keras word
# indices start at 1 - confirm.
num_words = 1 + min(max_features, len(word_index))
print(num_words)

# Start from all zeros; each row is filled in for the token that owns that index.
embedding_matrix = np.zeros((num_words, embedding_dim))

# Walk the tokenizer vocabulary and look every token up in the GloVe index.
for token, row in word_index.items():
    if row <= max_features:
        vector = embeddings_index.get(token)
        # Known token: copy its pretrained vector. Unknown token: give it its
        # own random vector so that missing words do not all share one point.
        embedding_matrix[row] = np.random.randn(embedding_dim) if vector is None else vector

Now our model.

The important bit here is the `Embedding`, we need to specify the embeddings matrix (the set of vectors representing our words) and for this example I'm going to make them trainable - this means they can be modified during training if something more accurate is found

I'm going to use Keras' `CuDNNLSTM` layer, which is an LSTM implementation ready to work on GPUs (it doesn't run on CPUs). If we're going to stack RNNs it's also important to return the sequences.

Keras makes making this bidirectional very easy - you just need to wrap the LSTM in `Bidirectional`!

In [None]:
# Two stacked bidirectional LSTMs on top of the pretrained embeddings,
# ending in a 5-way softmax (one unit per sentiment class).
model = Sequential([
    # Embedding weights start from `embedding_matrix` and are fine-tuned during training.
    Embedding(num_words,
              embedding_dim,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=sequence_length,
              trainable=True),
    SpatialDropout1D(0.2),
    # The first recurrent layer returns the full sequence so that the second one can consume it.
    Bidirectional(CuDNNLSTM(64, return_sequences=True)),
    Bidirectional(CuDNNLSTM(32)),
    Dropout(0.25),
    Dense(units=5, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
batch_size = 128

# Train for 5 epochs; a tenth of the training split is set aside for validation each run.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Older Keras releases record the accuracy metric under 'acc', newer ones
# under 'accuracy'; pick whichever key this run produced so the plot does not
# fail with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# One figure per metric: training curve and validation curve over the epochs.
for metric_key, label in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

So how does this perform on the held-out test split?

In [None]:
# Model outputs (one softmax row per sample) for the held-out test split.
y_hat = model.predict(X_test)

In [None]:
# Compare the arg-max class of each one-hot label row with the arg-max class of each prediction row.
accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hat, axis=1))

In [None]:
# Confusion matrix over the hold-out split: true class first, predicted class second.
true_classes = np.argmax(y_test, axis=1)
predicted_classes = np.argmax(y_hat, axis=1)
conf = confusion_matrix(true_classes, predicted_classes)
conf

In [None]:
# Render the confusion matrix as an image.
plt.imshow(conf)

In [None]:
# Load the unlabelled competition test set (tab-separated file).
df_test = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/test.tsv', delimiter='\t')

# Clean the test phrases with the same function that was applied to the training frame.
df_test['text'] = df_test['Phrase'].map(clean_str)

In [None]:
# Encode the cleaned test phrases with the fitted tokenizer and pad them to the model's input length.
x = pad_sequences(tokenizer.texts_to_sequences(df_test['text'].values), sequence_length)

# Model outputs for the competition test set (this replaces the earlier hold-out predictions).
y_hat = model.predict(x)

In [None]:
# Submission frame: one row per test phrase with the arg-max class as the predicted sentiment.
df_results = pd.DataFrame({
    'PhraseId': df_test['PhraseId'].values,
    'Sentiment': np.argmax(y_hat, axis=1),
})

In [None]:
# Preview the first three rows of the submission frame.
df_results.head(3)

In [None]:
# Write the submission file to the current directory, without the index column.
df_results.to_csv('bilstm.csv', index=False)