In [None]:
# Extract the pretrained-embeddings archive into the current working directory.
# `-n` tells unzip never to overwrite files that already exist, so re-running
# this cell does not extract them again.
!unzip -n ../input/quora-insincere-questions-classification/embeddings.zip

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.engine import InputSpec, Layer

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

import re
import random
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
tqdm.pandas()

In [None]:
# Print the installed TensorFlow and Keras versions so the run records which
# library versions produced its results.
import tensorflow as tf
import keras
print(f"tf version: {tf.__version__}")
print(f"keras version: {keras.__version__}")

In [None]:
# Sequence length: questions are padded/truncated to this many tokens and the
# model's input layer is declared with this shape.
max_length = 30 # max number of words in a question to use

# Output dimension passed to the Embedding layer via build_model().
embedding_size = 300
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001
# Batch size used for both model.fit() and model.predict().
batch_size = 128
# Number of training epochs passed to model.fit().
num_epoch = 20

In [None]:
# Read the competition's training and test tables from the input directory.
# Later cells use the "question_text" and "target" columns of `train` and the
# "question_text" and "qid" columns of `test`.
train = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
test = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")

In [None]:
# Pull the raw question text out of each frame as an array of values.
# NOTE: the tokenisation cell rebinds train_X / test_X to padded integer
# sequences, so these names only hold raw text until that cell runs.
train_X = train["question_text"].values
test_X = test["question_text"].values

In [None]:
# Fit the vocabulary on the training questions only; the test questions are
# encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_X))

# Encode each split as integer sequences and pad/truncate to `max_length`.
# The tuple of raw-text arrays is captured before either name is rebound.
train_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=max_length)
    for questions in (train_X, test_X)
)

# Labels for the training questions.
train_y = train['target'].values

In [None]:
# Maps an embedding name (the `embedding` argument of load_data) to the path of
# its vector file. The path is relative to the working directory; presumably
# the file comes from the extracted embeddings.zip — verify if the layout changes.
embedding_files={"glove":r"glove.840B.300d/glove.840B.300d.txt"}

def get_coefs(word, *arr):
    """Split one parsed embedding line into its token and its vector.

    Args:
        word: The token (first field of the line).
        *arr: The remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        holding the values of ``arr`` in order.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

def load_data(embedding, word_index):
    """Build an embedding matrix whose rows line up with ``word_index``.

    The vector file registered under ``embedding`` in ``embedding_files`` is
    read as text, one ``token v1 v2 ...`` entry per line, fields separated by
    single spaces. Every row of the result is first drawn from a normal
    distribution with the mean and standard deviation of all loaded values;
    rows whose word has a pretrained vector are then overwritten with it.

    Args:
        embedding: Name of the embedding to load. Only ``"glove"`` is supported.
        word_index: Mapping from word to integer row index. Indices must lie
            in ``[0, len(word_index)]``.

    Returns:
        A ``(len(word_index) + 1, dim)`` array, where ``dim`` is the vector
        length found in the file. Rows that no word maps to keep their random
        initialisation (this includes row 0 when indices start at 1).

    Raises:
        ValueError: If ``embedding`` is not a supported/registered name, or if
            the vector file contains no entries.
    """
    embedding_file = embedding_files.get(embedding)
    # Fail loudly instead of hitting an unbound `embeddings_index` further down.
    if embedding != "glove" or embedding_file is None:
        raise ValueError(
            f"Unsupported embedding {embedding!r}; only 'glove' can be loaded"
        )

    # Context manager closes the (multi-GB) file promptly; the explicit encoding
    # keeps parsing independent of the platform default.
    with open(embedding_file, encoding="utf8") as vector_file:
        embeddings_index = dict(
            get_coefs(*line.rstrip("\n").split(" "))
            for line in vector_file
            if line.strip()  # a blank line would yield an empty, unstackable vector
        )
    if not embeddings_index:
        raise ValueError(f"No embedding vectors found in {embedding_file!r}")

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # One extra row because the indices in word_index may run up to len(word_index).
    nb_words = len(word_index) + 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
def build_model(embedding_matrix, nb_words, embedding_size=300):
    """Assemble and compile the stacked bidirectional-LSTM binary classifier.

    Args:
        embedding_matrix: Initial weights for the embedding layer; it is
            wrapped in a list and passed as the layer's ``weights``.
        nb_words: Vocabulary size given to the embedding layer.
        embedding_size: Embedding output dimension.

    Returns:
        A compiled Keras ``Model`` that takes sequences of ``max_length``
        token ids and outputs a single sigmoid unit. It is compiled with Adam
        at the module-level ``learning_rate``, binary cross-entropy loss and
        the accuracy metric.
    """
    tokens_in = Input(shape=(max_length,))

    # Embedding weights come from the supplied matrix and are frozen.
    embedding_layer = Embedding(
        nb_words,
        embedding_size,
        weights=[embedding_matrix],
        trainable=False,
    )
    hidden = embedding_layer(tokens_in)

    # The first recurrent layer keeps the whole sequence so it can feed the second.
    hidden = Bidirectional(LSTM(64, return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(64))(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)

    classifier = Model(inputs=tokens_in, outputs=probability)
    classifier.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Zero-filled placeholder with one float32 slot per test question; the
# training cell replaces it with the model's predictions.
pred_prob = np.zeros(len(test_X), dtype=np.float32)

# Build the GloVe embedding matrix for the tokenizer's vocabulary.
print("Loading embedding matrix...")
embedding_matrix_glove = load_data(
    embedding="glove",
    word_index=tokenizer.word_index,
)

In [None]:
# Build the classifier on top of the GloVe matrix; the vocabulary size is the
# number of rows in that matrix.
embedding_matrix = embedding_matrix_glove
model = build_model(
    embedding_matrix,
    len(embedding_matrix),
    embedding_size,
)

# Train, with 1% of the training data set aside for validation.
model.fit(
    train_X,
    train_y,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_split=0.01,
    verbose=1,
)

# Score the test questions and drop the size-1 output axis.
test_scores = model.predict(test_X, batch_size=batch_size, verbose=1)
pred_prob = np.squeeze(test_scores)

In [None]:
# Score the training questions with the fitted model and drop size-1 axes;
# the next cell uses these probabilities to choose the decision threshold.
train_pred=np.squeeze(model.predict(train_X, batch_size=batch_size, verbose=1))

In [None]:
# Choose the decision threshold that maximises F1 on the training predictions.
# NOTE(review): train_pred is computed on train_X, the same array the model was
# fit on, so the chosen threshold and F1 are likely optimistic. Consider tuning
# on a held-out split instead.
maxi = 0.99   # best threshold so far; kept if no candidate reaches F1 > 0
maxf1 = 0     # best F1 so far

# Candidates are 0.01, 0.02, ..., 0.98. They are derived from an integer counter
# so repeated float addition cannot drift (e.g. 0.060000000000000005).
for step in range(1, 99):
    i = step / 100
    # Built-in `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    pred_train = (train_pred > i).astype(int)
    f1 = f1_score(train_y, pred_train)
    if f1 > maxf1:
        maxi = i
        maxf1 = f1

print(f"threshold: {maxi}")
print(f"max f1 score: {maxf1}")

In [None]:
# Turn the test probabilities into 0/1 labels using the threshold selected
# above, then write them next to the question ids.
predicted_labels = (pred_prob>maxi).astype(int)

submission = pd.DataFrame({'qid': test['qid']})
submission['prediction'] = predicted_labels
submission.to_csv('submission.csv', index=False)