# Improved LSTM baseline

This kernel is a somewhat improved version of [Keras - Bidirectional LSTM baseline](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051) 

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU,SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [None]:
# Input locations: the shared dataset root and the competition sub-folder.
path = '../input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'

# Pre-trained word-vector files (GloVe 6B 50d and FastText crawl 300d).
EMBEDDING_FILE = path + 'glove6b50d/glove.6B.50d.txt'
FAST_TEXT = path + 'FastText crawl 300d 2M/crawl-300d-2M.vec'

# Competition train / test CSV files.
TRAIN_DATA_FILE = path + comp + 'train.csv'
TEST_DATA_FILE = path + comp + 'test.csv'

In [None]:
# Print the first line of the FastText vector file as a quick sanity check.
# The context manager closes the handle once the line has been read (the
# original left it open), and the explicit encoding keeps decoding
# independent of the platform default.
with open(FAST_TEXT, 'r', encoding='utf-8') as file:
    print(file.readline())

In [None]:
# Preprocessing / model hyperparameters.
embed_size = 50         # length of each word vector
max_features = 100_000  # vocabulary cap (number of rows in the embedding matrix)
maxlen = 50             # number of tokens kept per comment

In [None]:
# Label columns the model predicts.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load the training and test frames.
train, test = pd.read_csv(TRAIN_DATA_FILE), pd.read_csv(TEST_DATA_FILE)

# Target matrix: the label columns of the training frame.
y = train[list_classes].values

# Raw comment texts; missing comments are replaced by the "_na_" placeholder.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
)

In [None]:
# Fit the vocabulary on the training comments only; `num_words` is set to
# `max_features` to bound the vocabulary the tokenizer will use.
# Converting texts to padded index sequences is handled by `preprocess`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(
    list(list_sentences_train)
)

In [None]:
def preprocess(list_sentences):
    """Turn raw texts into fixed-length sequences of word indices.

    The texts are encoded with the module-level fitted ``tokenizer`` and the
    resulting index lists are passed to ``pad_sequences`` with the
    module-level ``maxlen``.

    Args:
        list_sentences: Iterable of strings to encode.

    Returns:
        The value returned by ``pad_sequences`` for the encoded texts.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(list_sentences),
        maxlen=maxlen,
    )

In [None]:
# Encode the training and test comments with the same pipeline
# (training split first, then test split).
X_t, X_te = map(preprocess, (list_sentences_train, list_sentences_test))

In [None]:
# Notebook display: shape of the encoded training data.
X_t.shape

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token (first field of a parsed embedding line).
        *arr: The remaining fields, i.e. the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding text file into a {word: float32 vector} dict.
# Reading inside a `with` block closes the file once parsing is done (the
# original never closed it); the explicit UTF-8 encoding keeps decoding
# independent of the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.strip().split()) for o in embedding_file
    )

Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. The random vectors are drawn from a normal distribution with the same mean and standard deviation as the GloVe embeddings.

In [None]:
# Mean and standard deviation over every component of every pre-trained
# vector; the embedding matrix uses them for its random initialization.
# `dict.values()` is a view rather than a sequence, so it is materialized as
# a list first: recent NumPy versions reject non-sequence iterables passed
# to `np.stack`.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index

# Keras word indices start at 1 (index 0 is never assigned to a word), so a
# vocabulary of N words needs N + 1 rows. Without the "+ 1" the highest index
# falls outside the matrix whenever the vocabulary is smaller than
# `max_features`.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the pre-trained embeddings' mean and
# standard deviation, then overwrite the rows of words that have a
# pre-trained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index does not fit in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
def get_model():
    """Build and compile the bidirectional-LSTM classifier.

    Architecture: frozen pre-trained embedding -> bidirectional LSTM returning
    the full sequence -> global max pooling over time -> Dense(50, relu) ->
    dropout -> one sigmoid output per label in ``list_classes``.

    The embedding layer's dimensions are taken from ``embedding_matrix``
    itself instead of ``max_features`` / ``embed_size``, so the layer always
    matches the weights it is initialized with.

    Returns:
        A compiled Keras ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    vocab_rows, vector_dim = embedding_matrix.shape

    model = Sequential()
    model.add(Embedding(vocab_rows, vector_dim, weights=[embedding_matrix],
                        trainable=False, input_shape=(maxlen,)))
    # NOTE(review): the LSTM width reuses `maxlen` (the sequence length) as
    # its number of units; kept as in the original -- confirm this is intended.
    model.add(Bidirectional(LSTM(maxlen, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(50, activation="relu"))
    model.add(Dropout(0.1))
    # One independent sigmoid output per label column.
    model.add(Dense(len(list_classes), activation="sigmoid"))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Build a throwaway model just to print its layer summary.
get_model().summary()

In [None]:
def MyCallback(lr_patience=10, checkpoint_path=None):
    """Build the list of Keras callbacks used for training.

    The original returned an undefined name ``mc`` and therefore raised
    ``NameError`` on every call. The checkpoint callback is now created only
    when a destination path is supplied.

    Args:
        lr_patience: ``patience`` passed to ``ReduceLROnPlateau``.
        checkpoint_path: Optional file path. When given, a ``ModelCheckpoint``
            monitoring the training loss and keeping only the best model is
            placed first in the returned list.

    Returns:
        A list of callbacks: ``[ModelCheckpoint (only if checkpoint_path is
        given), EarlyStopping, ReduceLROnPlateau]``.
    """
    # NOTE(review): with patience=0 here and lr_patience=10 below, early
    # stopping looks likely to end training before the learning rate is ever
    # reduced -- confirm these settings are intended.
    es = EarlyStopping(monitor='loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None, restore_best_weights=False)
    rlr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=lr_patience, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

    callbacks = [es, rlr]
    if checkpoint_path is not None:
        mc = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=0, save_best_only=True)
        callbacks.insert(0, mc)

    return callbacks

In [None]:
# Disabled 5-fold cross-validation loop, kept as a string literal so none of
# it executes. For each fold it would train a fresh model for 2 epochs and
# record the ROC AUC on the held-out part.
# NOTE(review): recent scikit-learn versions may reject `random_state`
# without `shuffle=True` in KFold -- confirm before re-enabling.
'''
skf = KFold(n_splits=5,random_state=1)
i = 0
scores=np.zeros((5,))
for tr_index,val_index in skf.split(X_t,y):
    print('Fold ', i)
    x_tr_cv = X_t[tr_index]
    y_tr_cv = y[tr_index]
    x_val_cv = X_t[val_index]
    y_val_cv = y[val_index]
    model = get_model()
    model.fit(x=x_tr_cv,y=y_tr_cv,batch_size=128,epochs=2,callbacks=None,shuffle=True)
    print('proba.')
    y_pred = model.predict(x_val_cv)
    print('roc_auc...')
    score = roc_auc_score(y_true=y_val_cv,y_score=y_pred)
    print('auc_roc: ', score)
    scores[i]=score
    i = i+1
'''

In [None]:
def tester():
    """Train a freshly built model on the full training set and return it.

    Fits the model from ``get_model()`` on ``X_t`` / ``y`` with a batch size
    of 128; the number of epochs is left at the ``fit`` default.

    Returns:
        The fitted model.
    """
    fitted = get_model()
    fitted.fit(X_t, y, batch_size=128)
    return fitted

In [None]:
# Train the model on the full training set; `m` holds the fitted model.
m = tester()

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
m.save('toxic-model.h5')


In [None]:
# Disabled single-sentence prediction check, kept as a string literal so none
# of it executes.
# NOTE(review): the snippet refers to `model`, while the trained model in this
# notebook is bound to `m` -- adjust before re-enabling.
'''
sample = preprocess(["You should be added to the list I was creating, ergo you are a cunt."])
print(sample.shape)
print(model.predict(sample))
print(model.predict_proba(sample))
'''

In [None]:
# Disabled submission writer, kept as a string literal so none of it executes.
# It would predict on the encoded test set and write the per-label scores into
# a copy of the sample submission file.
# NOTE(review): the snippet refers to `model`, while the trained model in this
# notebook is bound to `m` -- adjust before re-enabling.
'''
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)
'''