Through this notebook, I am trying to give a very basic introduction to the natural language processing pipeline. This notebook accompanies this [blog](http://) post on Medium. 

# Introduction to competition

Quora is a platform that empowers people to learn from each other. On this platform, people can ask questions and any member can answer them. However, some questions are intended to make a statement rather than to look for answers. These questions are labeled as 'insincere'.

In this kernel we use the dataset provided in the Quora Insincere Questions Classification [competition](https://www.kaggle.com/c/quora-insincere-questions-classification), where we are supposed to label each question as 'insincere' or not. The dataset contains 1.31 million questions, each labeled 0 or 1 (1 is for 'insincere' and 0 is for 'sincere'). Out of the 1.31 million questions, about 80k are labeled as 'insincere'.



# Planning

In order to solve this problem, we need to build a model that can classify whether a question is sincere or not.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Show which files are available in the input directory.
print(os.listdir("../input"))
import operator 


In [None]:
from gensim.models import KeyedVectors

In [None]:
import re, string
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import keras

from keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, Dense
from keras.layers import concatenate, CuDNNGRU, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import load_model
from keras.models import Model


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tqdm
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the test questions and display the first rows.
test_df = pd.read_csv('../input/test.csv')
test_df.head()


In [None]:
# Load the labelled training questions and display the first rows.
train_df = pd.read_csv('../input/train.csv')
train_df.head()

In [None]:
# Length in characters of every training question; the cell then displays
# the mean, standard deviation and maximum of those lengths.
lens = train_df.question_text.str.len()
lens.mean(), lens.std(), lens.max()

In [None]:
# Stack train and test rows into one frame to report the overall question count.
all_df = pd.concat([train_df ,test_df])

print("Total number of questions: ", all_df.shape[0])

In [None]:
# Vocabulary cap: given to the Tokenizer as num_words and used to bound the
# number of rows in the embedding matrix.
max_features = 100000
# Fixed sequence length passed to pad_sequences as maxlen; also the width
# of the model's input layer.
ques_len= 72

## Preprocessing

In [None]:
# Placeholder tokens.
# UNKNOWN_WORD is handed to the Tokenizer as its out-of-vocabulary token.
UNKNOWN_WORD = "_UNK_"
# NOTE(review): END_WORD is not referenced anywhere in the visible notebook.
END_WORD = "_END_"
# NAN_WORD replaces missing question text before cleaning.
NAN_WORD = "_NAN_"

In [None]:
# Replace missing question text with a placeholder token so the string
# cleaning steps that follow always receive a str.
train_df["question_text"] = train_df["question_text"].fillna(NAN_WORD)
test_df["question_text"] = test_df["question_text"].fillna(NAN_WORD)
# Submission frame holding only the question ids. Take an explicit copy:
# a 'prediction' column is assigned to this frame later in the notebook, and
# assigning to a selection of test_df triggers pandas' SettingWithCopyWarning.
sub = test_df[['qid']].copy()

In [None]:
# Characters that become stand-alone tokens: all ASCII punctuation plus a few
# typographic symbols. The ASCII set goes through re.escape because it contains
# `\`, `]`, `^` and `-`, which are special inside a character class; left
# unescaped, the `\` merely escapes the following `]` and is never matched itself.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def clean_text(s):
    """Surround each punctuation character with spaces and lowercase the text.

    Args:
        s: Raw question text.

    Returns:
        The lowercased text in which every character matched by ``re_tok``
        has a single space inserted on both sides.
    """
    return re_tok.sub(r' \1 ', s).lower()


def clean_numbers(x):
    """Mask multi-digit numbers with '#' placeholders.

    Runs of five or more digits collapse to '#####'; remaining runs of
    four, three and two digits become '####', '###' and '##'. Single
    digits are left untouched.

    Args:
        x: Text to process.

    Returns:
        The text with digit runs replaced as described above.
    """
    # Order matters: the longest runs must be masked first so that shorter
    # patterns never split them.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, placeholder in masks:
        x = re.sub(digit_pattern, placeholder, x)
    return x

In [None]:
%%time
# Step 1: space out punctuation and lowercase (clean_text), applied in place
# to the question column of both frames.
print("    Cleaning train questions")
train_df["question_text"] = train_df["question_text"].apply(clean_text)
print("    Cleaning test questions")
test_df["question_text"] = test_df["question_text"].apply(clean_text)

# Step 2: mask multi-digit numbers with '#' placeholders (clean_numbers).
print("    Removing numbers from train questions")
train_df["question_text"] = train_df["question_text"].apply(clean_numbers)
print("    Removing numbers from test questions")
test_df["question_text"] = test_df["question_text"].apply(clean_numbers)

# Tokenize text

In [None]:
%%time
# Build the vocabulary from the training questions only; the test questions
# are not used for fitting.
tokenizer = Tokenizer(num_words=max_features, oov_token=UNKNOWN_WORD)
tokenizer.fit_on_texts(list(train_df["question_text"]))

In [None]:
%%time
# Convert each cleaned question into a list of integer word indices using
# the fitted tokenizer.
train_X = tokenizer.texts_to_sequences(train_df["question_text"])
test_X = tokenizer.texts_to_sequences(test_df["question_text"])

In [None]:
# Bring every index sequence to the same length (ques_len) so the questions
# can be fed to the model as a fixed-width array.
train_X = pad_sequences(train_X, maxlen=ques_len)
test_X = pad_sequences(test_X, maxlen=ques_len)

In [None]:
# Training labels as a NumPy array, taken from the 'target' column.
train_y = train_df['target'].values
# test_y = test_df['target'].values

## Loading Embedding file



In [None]:
# Path of the pretrained Paragram embedding text file.
embd_file =  '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

In [None]:
def load_embed(file):
    """Read a text embedding file into a word -> vector dictionary.

    Every line is split on single spaces: the first field is the word and
    the remaining fields are its vector components.

    Args:
        file: Path of the embedding text file (read with 'latin' encoding).

    Returns:
        dict mapping each word to a float32 NumPy array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the file handle is closed once parsing is
    # finished (or fails) instead of being left open.
    with open(file, encoding='latin') as fh:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in fh)

    return embeddings_index

In [None]:
%%time
# Parse the whole embedding file into a word -> vector dictionary.
print("Extracting Paragram embedding")
embeddings_index = load_embed(embd_file)

# Creating the embedding matrix

In [None]:
%%time
# Stack every pretrained vector into one 2-D array. dict.values() is a view,
# not a sequence, and np.stack expects a sequence of arrays, so build a list
# first.
all_embs = np.stack(list(embeddings_index.values()))
# Mean and standard deviation over all vector components; used to draw the
# random initial vectors of the embedding matrix.
emb_mean,emb_std = all_embs.mean(), all_embs.std()
# Dimensionality of a single word vector.
embed_size = all_embs.shape[1]

In [None]:
## Rebuilding the embedding matrix
# Tokenizer word indices start at 1, so the matrix needs one more row than the
# largest index that is kept; without the +1 the last word would fall outside
# the matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(tokenizer.word_index) + 1)
# Start from random vectors drawn with the pretrained mean/std so that words
# missing from the embedding file still get a vector of comparable scale.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in tokenizer.word_index.items():
    # Bound by the actual number of rows rather than by max_features.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Overwrite the random row only when a pretrained vector exists.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

# Building classification model

In [None]:
# Input: one padded sequence of ques_len word indices per question.
input_layer = Input(shape=(ques_len,))
# Embedding lookup initialised from the pretrained matrix and frozen
# (trainable=False), so the vectors are not updated during training.
embedding_layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                            weights=[embedding_matrix], trainable=False)(input_layer)
x = SpatialDropout1D(0.2)(embedding_layer)
# Two stacked bidirectional GRU layers, both returning the full sequence so
# the pooling layers below can summarise over all time steps.
# NOTE(review): CuDNNGRU presumably requires a GPU runtime — confirm.
x = Bidirectional(CuDNNGRU(90, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(90, return_sequences=True))(x)
# Summarise the sequence with both average and max pooling, then join them.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
x = Dense(256, activation="relu")(x)
# Single sigmoid unit: the model outputs one value in (0, 1) per question.
output_layer = Dense(1, activation="sigmoid")(x)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=['accuracy']
)

model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint

# Save the model to disk whenever the monitored validation accuracy improves.
# NOTE(review): the filename embeds the training `acc` value while the
# monitored quantity is `val_acc`, and `{acc:03f}` sets a width of 3 rather
# than a precision of 3 — confirm both are intended.
checkpoint = ModelCheckpoint('saved-dmodel-{acc:03f}.h5', verbose=1, monitor='val_acc',save_best_only=True, mode='auto')  

In [None]:
# Train for 8 epochs in batches of 128, holding out 10% of the training data
# for validation; the checkpoint callback runs during training.
model.fit(train_X, train_y, batch_size=128, validation_split=0.1, callbacks=[checkpoint], epochs=8)

# Generating the prediction

In [None]:
# Score the padded test questions with the trained model.
preds = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
# Force the predictions into a single-column 2-D array.
preds = preds.reshape((-1, 1))

In [None]:
# Threshold the model outputs at 0.5 to obtain 0/1 labels and store them in
# the submission frame.
pred_test_y = (preds>0.5).astype(int)
sub['prediction'] = pred_test_y

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("submission.csv", index=False)