### Quora Insincere Questions: RNN LSTM

In [1]:
import numpy as np
import pandas as pd
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.layers import *
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import csv
import sys
import warnings
# Silence library warnings unless warning options were supplied explicitly
# (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import os
# Work from the data directory so the relative file names used below
# ('train.csv', 'embedding_matrix.npy', 'best_model.hdf5') resolve.
# The location can be overridden with the QUORA_DATA_DIR environment
# variable; the original author's path is kept as the default.
os.chdir(os.environ.get('QUORA_DATA_DIR', '/home/roman/Documents/Projects/Quora/data'))

Using TensorFlow backend.


In [3]:
# Load data and tokenize (words to index).
# filters='' disables the tokenizer's character filtering; lower=True
# lower-cases the text before the vocabulary is built.
train = pd.read_csv('train.csv')
train_text = train['question_text'].tolist()
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(train_text)
# Encode the same questions as sequences of integer word indices.
train_tokenized = tk.texts_to_sequences(train_text)

In [49]:
# Pad input data for input in RNN, define embedding size.
max_len = 60          # sequence length handed to pad_sequences / the Input layer
max_features = 30000  # largest word index kept in X_train
embed_size = 100      # embedding width passed to the Embedding layer

# Fixed-length (n_samples, max_len) integer matrix of word indices.
X_train = pad_sequences(train_tokenized, maxlen=max_len)

# Pre-computed embedding weights loaded from disk.
# NOTE(review): presumably shaped (max_features + 1, embed_size) to match the
# Embedding layer built later - confirm against how the .npy file was produced.
embedding_matrix = np.load('embedding_matrix.npy')

In [40]:
# Make sure X_train doesn't have indices > max_features: the Embedding layer
# is built with max_features + 1 rows, so any larger index would be out of
# range. Offending indices are overwritten with 0.
# A single boolean-mask assignment updates the array in place and replaces
# the element-by-element Python loop, which is very slow on a matrix of this size.
X_train[X_train > max_features] = 0

In [42]:
# Create one-hot labels for y (one column per distinct target value).
# NOTE(review): the `sparse` keyword is what this notebook's scikit-learn
# accepts; newer releases spell it `sparse_output` - confirm before upgrading.
target_2d = train['target'].values.reshape(-1, 1)  # encoder expects a 2-D column
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(target_2d)

In [43]:
# v1: one LSTM layer, avg pool
def build_model(lr=0.0, units=0, spatial_dr=0.0, kernel_size1=3,
                dense_units=128, dr=0.1, epochs=20):
    """Build (but do not compile) the single-LSTM classifier.

    Architecture: frozen pre-trained Embedding -> SpatialDropout1D -> LSTM
    (full sequence) -> global average pooling over time -> BatchNorm ->
    Dense(dense_units) -> BatchNorm -> Dense(dense_units / 2) ->
    Dense(2, sigmoid).

    Reads the notebook globals ``max_len``, ``max_features``, ``embed_size``
    and ``embedding_matrix``.

    Parameters
    ----------
    units : int
        Width of the LSTM layer. The default of 0 is only a placeholder;
        callers are expected to pass a positive value.
    spatial_dr : float
        Rate for the SpatialDropout1D applied to the embedded sequence.
    dense_units : int
        Width of the first dense layer; the second uses ``int(dense_units / 2)``.
    dr : float
        Dropout rate applied after each of the two hidden dense layers.
    lr, kernel_size1, epochs
        Accepted for call-site compatibility but not used by this function;
        the optimizer and training schedule are configured by the caller.

    Returns
    -------
    keras.models.Model
        The uncompiled model mapping ``(max_len,)`` index sequences to two
        sigmoid outputs.
    """
    inp = Input(shape = (max_len,))
    # Pre-trained vectors are kept fixed (trainable=False); the +1 makes
    # index max_features itself a valid row.
    x = Embedding(max_features + 1, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)
    #x_lstm = Bidirectional(LSTM(units, return_sequences = True))(x1)
    # return_sequences=True keeps every timestep so they can be averaged below.
    x_lstm = LSTM(units, return_sequences = True)(x1)
    avg_pool1_lstm = GlobalAveragePooling1D()(x_lstm)
    x = BatchNormalization()(avg_pool1_lstm)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    x = Dense(2, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    return model

In [44]:
# Train the model, keeping only the checkpoint with the lowest validation
# loss and stopping once val_loss has not improved for 3 epochs.
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

model = build_model(
    lr=1e-4, units=40, spatial_dr=0, kernel_size1=4,
    dense_units=32, dr=0, epochs=5,
)
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(lr=1e-4, decay=1e-7),
    metrics=["accuracy"],
)
model.summary()
model.fit(
    X_train, y_ohe,
    batch_size=512,
    epochs=5,
    validation_split=0.1,
    verbose=1,
    callbacks=[check_point, early_stop],
)
# Replace the in-memory model with the checkpoint saved by ModelCheckpoint.
model = load_model(file_path)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 60, 100)           3000100   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 60, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 60, 40)            22560     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 40)                0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 40)                160       
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1312      
__________

In [45]:
# Predict results
# Rebuild the train/validation partition by slicing off the final `val_split`
# fraction of rows. This is meant to mirror validation_split=0.1 in model.fit.
# NOTE(review): relies on Keras taking the validation rows from the end of
# the arrays - confirm against the Keras version in use.
val_split = 0.1
n_train = int(X_train.shape[0] * (1 - val_split))
y_train = train['target'][:n_train]
y_val = train['target'][n_train:]
x_train = X_train[:n_train, :]
x_val = X_train[n_train:, :]
y_train_pred = model.predict(x_train, verbose = 1)
y_val_pred = model.predict(x_val, verbose = 1)
# Collapse the two-column output to a 0/1 label by thresholding the
# positive-class column at 0.5. For 0/1 targets this equals the earlier
# round-then-dot with the encoder's active_features_, without depending on
# that deprecated scikit-learn attribute.
# Assumes train['target'] is binary 0/1, so column 1 is class 1.
y_train_pred_1d = (y_train_pred[:, 1] > 0.5).astype(int)
y_val_pred_1d = (y_val_pred[:, 1] > 0.5).astype(int)



In [48]:
# Evaluate results
def _score(metric_fn, y_true, y_pred):
    """Apply a sklearn metric function and round the result to 3 decimals."""
    return np.round(metric_fn(y_true, y_pred), 3)


def _triple(a, b, c):
    """Format three values as 'a / b / c' using their str() form."""
    return ' / '.join(map(str, (a, b, c)))


# Random baseline: Bernoulli draws using the model's own positive-prediction
# rate on the training slice, scored against the training labels.
predicted_positive = np.round(np.mean(y_train_pred_1d), 3)
y_pred_random = np.random.binomial(1, predicted_positive, y_train.shape[0])
actual_positve = np.round(np.mean(y_train), 3)

precision_train = _score(metrics.precision_score, y_train, y_train_pred_1d)
precision_val = _score(metrics.precision_score, y_val, y_val_pred_1d)
precision_random = _score(metrics.precision_score, y_train, y_pred_random)

recall_train = _score(metrics.recall_score, y_train, y_train_pred_1d)
recall_val = _score(metrics.recall_score, y_val, y_val_pred_1d)
recall_random = _score(metrics.recall_score, y_train, y_pred_random)

f1score_train = _score(metrics.f1_score, y_train, y_train_pred_1d)
f1score_val = _score(metrics.f1_score, y_val, y_val_pred_1d)
f1score_random = _score(metrics.f1_score, y_train, y_pred_random)

# The two rates below are printed unrounded.
print('actual positive:    ' + str(np.mean(y_train)))
print('predicted positive: ' + str(np.mean(y_train_pred_1d)))
print('precision (train/val/random): ' + _triple(precision_train, precision_val, precision_random))
print('recall (train/val/random):    ' + _triple(recall_train, recall_val, recall_random))
print('f1 score (train/val/random):  ' + _triple(f1score_train, f1score_val, f1score_random))

actual positive:    0.06186681684274642
predicted positive: 0.038300004508685176
precision (train/val/random): 0.67 / 0.658 / 0.064
recall (train/val/random):    0.415 / 0.405 / 0.039
f1 score (train/val/random):  0.512 / 0.501 / 0.048
