In [1]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from gensim.models import KeyedVectors
from sklearn.preprocessing import StandardScaler

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Reshape, Flatten, LSTM, Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [None]:
# -----------------------------------
# functions for pre-processing texts 
# -----------------------------------

def preprocess(text):
    
    text = text.lower().split()
    text = " ".join(text)
    
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    return(text)   

# ----

def read_train_data(file):
        texts = [] 
        labels = []
        df_train = pd.read_csv(file)  
        line_num = 0
        for idx in range(len(df_train)):
            texts.append(preprocess(df_train['question_text'][idx]))
            labels.append(df_train['target'][idx])
            line_num += 1
        return texts, labels
    
def read_test_data(file):
        texts = [] 
        ids = []
        df_test = pd.read_csv(file)
        line_num = 0
        for idx in range(len(df_test)):
            texts.append(preprocess(df_test['question_text'][idx]))
            ids.append(df_test['qid'][idx])
            line_num += 1
        return texts, ids

# ---

def preprocess_data(train_data_file, test_data_file, max_seq_len, split_ratio):

        # 1) load train and test datasets
        texts, labels= read_train_data(train_data_file)  
        print('Finished loading train.csv: %s samples' % len(texts))
        
        test_texts, test_ids = read_test_data(test_data_file)
        print('Finished loading test.csv: %s samples' % len(test_texts))
                      
        # 2) train the tokenizer
        tokenizer = Tokenizer(num_words=200000)
        tokenizer.fit_on_texts(texts + test_texts)        
        word_index = tokenizer.word_index
        print('%s tokens in total' % len(word_index))

        # 3) sentences to sequences
        train_sequences = tokenizer.texts_to_sequences(texts)
        test_sequences = tokenizer.texts_to_sequences(test_texts)
        x = pad_sequences(train_sequences, maxlen=max_seq_len, padding='post', truncating='post')
        test_x = pad_sequences(test_sequences, maxlen=max_seq_len, padding='post', truncating='post')
    
        # 4) final step
        num_samples = len(x)
        perm = np.random.permutation(num_samples)
        idx = int(num_samples*split_ratio)
        idx_train = perm[:idx]
        idx_val = perm[idx:]
        
        train_x = x[idx_train]
        val_x = x[idx_val]
        
        y = np.array(labels)
        train_y = y[idx_train]
        val_y = y[idx_val]
            
    
        return train_x, train_y, val_x, val_y, test_x, test_ids, word_index

# ------------------------------------------
# function for building the model
# ------------------------------------------

def build_model(max_seq_len, word_index, embedding_dim, embedding_matrix):
    
    # 1) Embedding layer
    inp = Input(shape=(max_seq_len,), dtype='int32')

    x = Embedding(len(word_index)+1,
                  embedding_dim,
                  input_length=max_seq_len,
                  weights=[embedding_matrix],
                  trainable=False)(inp)

    # 2) LSTM Layer
    x = LSTM(64,dropout=0.2, recurrent_dropout=0.2)(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)

    # 3) Dense Layer
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)

    # 4) Output Layer
    preds = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=preds)
    return model

In [3]:
# --- Step 1 Preprocessing texts (texts to numerical values)

max_seq_len = 30
split_ratio = 0.8
train_file = '../input/quora-insincere-questions-classification/train.csv'
test_file = '../input/quora-insincere-questions-classification/test.csv'
train_x, train_y, val_x, val_y, test_x, test_ids, word_index = \
preprocess_data(train_file, test_file, max_seq_len, split_ratio)

Finished loading train.csv: 1306122 samples
Finished loading test.csv: 375806 samples
220853 tokens in total
Shape of training data: (1044897, 30)
Shape of training label: (1044897,)
Shape of val data: (261225, 30)
Shape of val label: (261225,)
Shape of test data: (375806, 30)


In [4]:
# --- Step 2 Prepare embedding matrix

embedding_dim = 300
embedding_matrix = np.zeros((max(list(word_index.values())) + 1, embedding_dim), dtype = 'float32')
embedding_file='../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
f = open(embedding_file)
for line in tqdm(f):
    values = line.split(" ")
    word = values[0]
    if word not in word_index:
       continue
    embedding_matrix[word_index[word]] = np.asarray(values[1:], dtype='float32')
f.close()

2196017it [03:28, 10531.77it/s]


In [5]:
# --- Step 3 Build and train model

#keras.backend.clear_session()
model = build_model(max_seq_len, word_index, embedding_dim, embedding_matrix)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 300)           66256200  
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0   

In [6]:
nb_epoches = 200

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_name = 'model_best.h5'
model_checkpoint = ModelCheckpoint(model_name, save_best_only=True)

hist = model.fit(train_x, train_y, \
                 validation_data=(val_x, val_y), \
                 epochs=nb_epoches, batch_size=2048, shuffle=True, verbose=2, \
                 callbacks=[early_stopping, model_checkpoint])

model.load_weights(model_name)
best_val_score = min(hist.history['val_loss']) 
print('Min val loss is', best_val_score)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/200
 - 26s - loss: 0.2872 - acc: 0.9151 - val_loss: 0.1295 - val_acc: 0.9522
Epoch 2/200
 - 25s - loss: 0.1371 - acc: 0.9470 - val_loss: 0.1155 - val_acc: 0.9540
Epoch 3/200
 - 25s - loss: 0.1239 - acc: 0.9503 - val_loss: 0.1112 - val_acc: 0.9557
Epoch 4/200
 - 25s - loss: 0.1183 - acc: 0.9525 - val_loss: 0.1089 - val_acc: 0.9571
Epoch 5/200
 - 25s - loss: 0.1150 - acc: 0.9541 - val_loss: 0.1068 - val_acc: 0.9580
Epoch 6/200
 - 26s - loss: 0.1132 - acc: 0.9550 - val_loss: 0.1056 - val_acc: 0.9586
Epoch 7/200
 - 25s - loss: 0.1108 - acc: 0.9562 - val_loss: 0.1046 - val_acc: 0.9588
Epoch 8/200
 - 25s - loss: 0.1091 - acc: 0.9569 - val_loss: 0.1039 - val_acc: 0.9589
Epoch 9/200
 - 25s - loss: 0.1079 - acc: 0.9570 - val_loss: 0.1026 - val_acc: 0.9595
Epoch 10/200
 - 25s - loss: 0.1064 - acc: 0.9576 - val_loss: 0.1017 - val_acc: 0.9596
Epoch 11/200
 - 25s - loss: 0.1054 - acc: 0.9580 - val_loss: 0.1012 - val_acc: 0.9594
Epoch 12/2

In [7]:
# --- Final step: submission

preds = model.predict(test_x, batch_size=1024, verbose=1)
preds = (preds > 0.35).astype(int)

sub = pd.DataFrame({'qid':test_ids, 'prediction':preds.ravel()})
sub.to_csv('submission.csv', index=False)

