# Import

In [None]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Read Data

In [None]:
# Load the labelled training questions, report the frame's dimensions and
# preview its first rows.
train_csv_path = '../input/quora-insincere-questions-classification/train.csv'
df_train = pd.read_csv(train_csv_path)
print(df_train.shape)
df_train.head()

In [None]:
# Load the test questions, report the frame's dimensions and preview its
# first rows.
test_csv_path = '../input/quora-insincere-questions-classification/test.csv'
df_test = pd.read_csv(test_csv_path)
print(df_test.shape)
df_test.head()

# Pre-processing

In [None]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible.
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=2018)

embed_size = 300      # size of each word vector
max_features = 50000  # number of unique words to use (rows in the embedding matrix)
maxlen = 100          # maximum number of words of a question to use


def _question_texts(frame):
    """Return the question strings of *frame*, with missing values set to "_NA_"."""
    return frame["question_text"].fillna("_NA_").values


def _encode(texts):
    """Turn raw question strings into integer sequences of length ``maxlen``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


raw_train = _question_texts(df_train)
raw_val = _question_texts(df_val)
raw_test = _question_texts(df_test)

# The vocabulary is fitted on the training split only; the validation and
# test questions are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(raw_train))

X_train = _encode(raw_train)
X_val = _encode(raw_val)
X_test = _encode(raw_test)

# Binary labels for the two labelled splits.
y_train = df_train['target'].values
y_val = df_val['target'].values

# DNN without pretrained embeddings

In [None]:
# Baseline classifier: an embedding layer (no pretrained weights are passed
# in) feeding a bidirectional GRU, max-pooled over the sequence axis and
# followed by a small dense head with a single sigmoid output.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)  # keep per-token outputs for pooling
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)  # one probability-like score per question
clf = Model(inputs=inp, outputs=x)
clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
clf.summary()

In [None]:
# Train for two epochs, evaluating on the held-out validation split after
# each one.
clf.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=2,
    validation_data=(X_val, y_val),
)

In [None]:
# Score the validation split, then report the F1 score obtained at each
# candidate decision threshold from 0.10 to 0.50 in steps of 0.01.
pred_val_y = clf.predict([X_val], batch_size=1024, verbose=1)
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_val_y > thresh).astype(int)
    val_f1 = metrics.f1_score(y_val, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, val_f1))

In [None]:
# Score the test questions with the trained model.
pred_test_y = clf.predict(
    [X_test],
    batch_size=1024,
    verbose=1,
)

# Submit

In [None]:
# Start from the sample submission file and overwrite its target column with
# the model's binary predictions.
df_submission = pd.read_csv('../input/quora-insincere-questions-classification/sample_submission.csv')

# Label a question 1 when its predicted score exceeds 0.34, the threshold
# noted as best for F1. The earlier `(0.34 * pred_test_y).astype(int)` scaled
# the sigmoid outputs into [0, 0.34] and truncated them, so every row was
# labelled 0. ravel() flattens the prediction array to one value per row.
df_submission['target'] = (pred_test_y > 0.34).astype(int).ravel()
df_submission.head()

In [None]:
# Write the predictions to disk without the DataFrame index column.
df_submission.to_csv(
    'submission.csv',
    index=False,
)