# Going beyond Jeremy's [LSTM baseline](https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout)


In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Seed NumPy's global random generator so NumPy-based random draws in this
# notebook are repeatable. Only NumPy's generator is seeded here.
np.random.seed(100)

In [None]:
# Input locations: every file path below is built relative to `path`, and the
# competition files additionally live under the `comp` sub-directory.
path = '../input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{path}glovetwitter27b50d/glove.twitter.27B.50d.txt' # GloVe vectors trained on Twitter data (50-dimensional file)
TRAIN_DATA_FILE=f'{path}{comp}train.csv'
TEST_DATA_FILE=f'{path}{comp}test.csv'

In [None]:
embed_size = 50 # length of each word vector; also the column count of the embedding matrix
max_features = 20000 # vocabulary cap passed to the Tokenizer as `num_words` (upper bound on embedding-matrix rows)
maxlen = 100 # number of word indexes each comment is padded or truncated to

Read in our data and replace missing values:

In [None]:
# Label columns predicted by the models, in submission order.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Uncomment to experiment on a shuffled subset of at most 50,000 rows.
# train = train.sample(frac=1)[:50000]
# test = test.sample(frac=1)[:50000]

# Raw comment text as arrays, with missing comments replaced by "_na_".
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
)

# Target matrix: one column per entry of `list_classes`.
y = train[list_classes].values

In [None]:
# Show the first rows of the training frame as a quick sanity check.
train.head()

Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [None]:
# Fit the vocabulary on the training comments only, capped at `max_features`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training comments -> word-index sequences -> fixed-length matrix.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Same transformation for the test comments, reusing the fitted tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    ``word`` is returned unchanged; the remaining positional arguments are
    packed into a one-dimensional float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build the word -> vector lookup from the GloVe text file. The file is read
# inside a `with` block so the handle is closed as soon as parsing finishes,
# and it is decoded explicitly as UTF-8 instead of the platform default.
# Line parsing itself is unchanged: strip, split on whitespace, first field
# is the word and the remaining fields are the coefficients.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in glove_file)

In [None]:
# Collect the distinct vector shapes present in the lookup and print them.
s = {vector.shape for vector in embeddings_index.values()}
print(s)

In [None]:
# Print the position of every vector that has 49 components.
for position, vector in enumerate(embeddings_index.values()):
    if vector.shape[0] != 49:
        continue
    print(position)

In [None]:
# Show the key stored at position 38522 of the lookup
# (presumably the position printed by the search for 49-component vectors; confirm against that output).
list(embeddings_index.keys())[38522]

In [None]:
# Remove every entry whose vector does not have exactly `embed_size`
# components, instead of deleting one hard-coded key. Such rows cannot be
# stacked with the rest. Unlike a bare `del` on a fixed key, this is safe to
# re-run and keeps working if the embedding file changes.
malformed_words = [
    word for word, vector in embeddings_index.items()
    if vector.shape != (embed_size,)
]
for malformed_word in malformed_words:
    del embeddings_index[malformed_word]

In [None]:
# Re-check the distinct vector shapes after the clean-up.
{vector.shape for vector in embeddings_index.values()}

Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. The random values are drawn using the same mean and standard deviation as the GloVe embeddings.

In [None]:
# Stack all vectors into one 2-D array to compute global statistics.
# `np.stack` expects a real sequence: passing the dict view directly is
# deprecated in older NumPy releases and rejected by newer ones, so
# materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index
# Tokenizer word indexes start at 1 (0 is reserved), so a vocabulary of N
# words needs N + 1 rows whenever it is smaller than `max_features`.
# When the vocabulary is larger than the cap, this is still `max_features`.
nb_words = min(max_features, len(word_index) + 1)
# Start from random rows drawn with the GloVe mean/std; rows for words that
# have a pre-trained vector are overwritten below.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the actual row count so an index can never fall outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

Using CuDNNLSTM instead of LSTM.

In [None]:
# Stop training once `val_loss` has not improved for 2 consecutive epochs and
# roll the model back to the weights of its best epoch. Without the rollback
# the model keeps the weights from the epochs trained after the best one,
# and those are the weights later used for prediction.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=2,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

In [None]:
# Bidirectional CuDNN LSTM classifier on top of the frozen pre-trained
# embeddings: the layers are created in order and applied one after another.
inp = Input(shape=(maxlen,))
x = inp
for layer in (
    Embedding(nb_words, embed_size, weights=[embedding_matrix], trainable=False),
    Bidirectional(CuDNNLSTM(50, return_sequences=True)),
    Dropout(0.3),
    GlobalMaxPool1D(),
    Dense(50, activation="relu"),
    Dropout(0.3),
    Dense(6, activation="sigmoid"),  # one sigmoid output per label
):
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# model.summary()

Now we're ready to fit our model! Use `validation_split` when not submitting.

In [None]:
# Train for up to 6 epochs with 10% of the rows held out for validation;
# the early-stopping callback may end training sooner.
model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=6,
    validation_split=0.1,
    callbacks=[early_stopping],
);

As pointed out by Jeremy, the model seems to overfit after 2 epochs. `val_loss` goes lower, but `val_acc` increases.

In [None]:
def model_with_gru():
    """Build the (uncompiled) bidirectional CuDNN GRU classifier.

    The network reads ``maxlen`` word indexes, looks them up in the frozen
    pre-trained embedding matrix, runs one bidirectional GRU layer, max-pools
    over the sequence and ends in six sigmoid outputs.
    """
    tokens = Input(shape=(maxlen,))
    layer_stack = (
        Embedding(nb_words, embed_size, weights=[embedding_matrix], trainable=False),
        Bidirectional(CuDNNGRU(50, return_sequences=True)),
        Dropout(0.3),
        GlobalMaxPool1D(),
        Dense(50, activation="relu"),
        Dropout(0.3),
        Dense(6, activation="sigmoid"),
    )

    # Apply the layers in order, feeding each one the previous output.
    hidden = tokens
    for layer in layer_stack:
        hidden = layer(hidden)

    return Model(inputs=tokens, outputs=hidden)

In [None]:
# Build the GRU variant and compile it with the same loss, optimizer and
# metric as the LSTM model.
model_gru = model_with_gru()
model_gru.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the GRU variant for up to 6 epochs with a 10% validation split.
model_gru.fit(
    X_t,
    y,
    batch_size=32,
    epochs=6,
    validation_split=0.1,
    callbacks=[early_stopping],
);

Not much improvement on using GRU. 

Trying a model with two LSTM layers.

In [None]:
def model_with_2_lstms():
    """Build the (uncompiled) classifier with two stacked bidirectional LSTMs.

    The network reads ``maxlen`` word indexes, looks them up in the frozen
    pre-trained embedding matrix, runs two bidirectional CuDNN LSTM layers
    (each followed by dropout), max-pools over the sequence and ends in six
    sigmoid outputs.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(nb_words, embed_size, weights=[embedding_matrix],trainable=False)(inp)
    # Both recurrent layers are LSTMs, as the function name says; the previous
    # body stacked CuDNNGRU layers here.
    x = Bidirectional(CuDNNLSTM(50, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(CuDNNLSTM(50, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

In [None]:
# Build the two-layer recurrent variant and compile it with the same loss,
# optimizer and metric as the other models.
model_2_lstms = model_with_2_lstms()
model_2_lstms.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the two-layer variant for up to 6 epochs with a 10% validation split.
model_2_lstms.fit(
    X_t,
    y,
    batch_size=32,
    epochs=6,
    validation_split=0.1,
    callbacks=[early_stopping],
);

In [None]:
# Smoke-test the trained model on two hand-written comments, pushing them
# through the same tokenise-and-pad steps as the training data.
comments_test = [
    "hey you suck big time buddy",
    "that's a lovely way to look at it",
]
sequences_test = tokenizer.texts_to_sequences(comments_test)
data_test = pad_sequences(sequences_test, maxlen=maxlen)
preds = model_2_lstms.predict(data_test)

# One boolean column per label: True where the predicted score exceeds 0.5.
df_pred = pd.DataFrame(preds > 0.5, columns=list_classes)
print(df_pred)

In [None]:
# Load the submission template, then overwrite its label columns with the
# model's scores for the padded test comments and save the result.
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
y_test = model_2_lstms.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)