In [None]:
import zipfile

In [None]:
# Extract only the word2vec binary from the competition's embeddings archive.
# The context manager closes the archive handle once extraction finishes
# (the previous version left the ZipFile open for the life of the kernel).
with zipfile.ZipFile('../input/quora-insincere-questions-classification/embeddings.zip') as zp:
    zp.extract('GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin')

In [None]:
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Read the data
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
texts = train_df['question_text']
labels = train_df['target']

# Fixed seed so the train/validation partition is identical on every
# Restart & Run All; without it the validation scores are not comparable
# between runs.
SPLIT_SEED = 42

# Hold out 20% of the rows for validation. `stratify=labels` keeps the class
# proportions of `target` the same in both partitions.
train_x, valid_x, train_y, valid_y = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=SPLIT_SEED)

In [None]:
from gensim.models import KeyedVectors
from tensorflow.keras import layers

# Read the pre-trained word embeddings.
# NOTE: despite the "glove" naming used throughout this notebook, the file
# loaded here is the word2vec-format GoogleNews binary.
def get_glove_vecs(
    path='./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
    limit=500000,
):
    """Load pre-trained word vectors and the vocabulary that indexes them.

    The name is kept for backward compatibility; the loader used is
    ``KeyedVectors.load_word2vec_format`` with ``binary=True``.

    Args:
        path: Location of the word2vec-format binary file. Defaults to the
            GoogleNews file extracted earlier in this notebook.
        limit: Passed through to gensim as ``limit``; caps how many vectors
            are read from the file. Defaults to 500000.

    Returns:
        A ``(vectors, vocab)`` tuple: the ``vectors`` array of the loaded
        model and a list of its keys.
    """
    word_vectors = KeyedVectors.load_word2vec_format(
        path,
        binary=True,
        limit=limit
    )
    # Construct vocab.
    # NOTE(review): downstream code assumes vocab[i] names row i of the
    # returned matrix, i.e. that key_to_index iterates in row order - confirm
    # for the installed gensim version.
    vocab = list(word_vectors.key_to_index.keys())
    return word_vectors.vectors, vocab

# Load the embedding matrix together with its vocabulary list, then report
# the matrix dimensions.
(glove_vecs, vocab) = get_glove_vecs()
print(np.shape(glove_vecs))

# Ask the garbage collector to reclaim whatever the loader left behind.
gc.collect()

In [None]:
def lowercase_text(tx):
    """Lower-case a string tensor; used as the encoder's standardization step."""
    return tf.strings.lower(tx)


# Turns raw question strings into fixed-length (50) sequences of integer token
# ids, splitting on whitespace and looking tokens up in `vocab`.
# NOTE(review): input is lower-cased while `vocab` is used as loaded, and
# punctuation is not stripped - confirm this matching behaviour is intended.
encoder = layers.TextVectorization(
    vocabulary=vocab,
    standardize=lowercase_text,
    split='whitespace',
    output_mode='int',
    output_sequence_length=50,
)

In [None]:
# Number of (question, label) pairs per batch for both partitions.
batch_size = 1000

# Wrap the training split as a tf.data pipeline of batches.
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
train_dataset = train_dataset.batch(batch_size)

# Same treatment for the validation split.
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))
valid_dataset = valid_dataset.batch(batch_size)

In [None]:
# Prepend two all-zero rows so that row i of the matrix lines up with the
# integer id i produced by the encoder.
# NOTE(review): this assumes the TextVectorization layer reserves ids 0 and 1
# (padding and out-of-vocabulary) ahead of `vocab` - confirm for the installed
# Keras version.
NUM_RESERVED_IDS = 2

# Only pad while the matrix still has exactly one row per vocab entry, so that
# re-running this cell does not keep stacking extra rows on top.
if glove_vecs.shape[0] == len(vocab):
    # Match the embedding dtype: the default float64 zeros would upcast the
    # whole stacked matrix and double its memory footprint.
    reserved_rows = np.zeros(
        shape=(NUM_RESERVED_IDS, glove_vecs.shape[1]),
        dtype=glove_vecs.dtype,
    )
    glove_vecs = np.vstack([reserved_rows, glove_vecs])

# A bare `glove_vecs.shape` in the middle of a cell displays nothing; print it.
print(glove_vecs.shape)
gc.collect()

In [None]:
def vectorize_batch(x, y):
    """Encode a batch of raw question strings as token ids; labels pass through."""
    return encoder(x), y


# Apply the encoder inside the input pipeline and keep up to 10 batches
# prepared ahead of the consumer.
train_vec_dataset = train_dataset.map(vectorize_batch).prefetch(10)
valid_vec_dataset = valid_dataset.map(vectorize_batch).prefetch(10)

In [None]:
# Construct the model
from tensorflow.keras import Model, Input

# The model consumes already-vectorized questions: sequences of int64 token
# ids of unspecified length.
inputs = Input(shape=(None,), dtype='int64')

# Frozen embedding lookup initialised from the pre-trained matrix.
# The matrix is supplied through `embeddings_initializer` rather than the
# legacy `weights=[...]` constructor argument, and the deprecated
# `input_length` argument is dropped (it also disagreed with the
# variable-length Input above).
x = layers.Embedding(
    glove_vecs.shape[0],
    glove_vecs.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(glove_vecs),
    trainable=False
)(inputs)

# Read the sequence in both directions; only the final state of each
# direction is passed on (return_sequences is left at its default).
x = layers.Bidirectional(
    layers.LSTM(128)
)(x)

# Single sigmoid unit: one probability-like score per question.
outputs = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)
model.summary()

In [None]:
# Configure training: binary cross-entropy objective optimised with Adam,
# reporting accuracy alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for six epochs on the vectorized training batches, evaluating on the
# vectorized validation batches after each epoch.
model.fit(
    x=train_vec_dataset,
    epochs=6,
    validation_data=valid_vec_dataset,
)

In [None]:
# Obtain the optimal cutoff on the validation dataset
from sklearn.metrics import f1_score

# Strip the labels so predict() receives inputs only.
valid_predictions = model.predict(valid_vec_dataset.map(lambda x, y: x))

# The model ends in Dense(1), so predictions come back with a trailing axis of
# size 1; flatten once so the metric receives a plain 1-D vector.
valid_scores = np.ravel(valid_predictions)

# linspace gives exactly five evenly spaced cutoffs from 0.1 to 0.5; a float
# step in np.arange accumulates rounding error (0.30000000000000004) and can
# change the number of points. Format to one decimal for a clean report.
for threshold in np.linspace(0.1, 0.5, 5):
    score = f1_score(valid_y, (valid_scores > threshold).astype(int))
    print(f'F1 Score at thresh {threshold:.1f}: {score}')

In [None]:
# Obtain the predictions for the test dataset
test_df = pd.read_csv('../input/quora-insincere-questions-classification/test.csv')

# Batch size used for inference only.
TEST_BATCH_SIZE = 100
# Score above which a question is labelled 1.
# NOTE(review): presumably picked from the validation F1 sweep - re-check
# after retraining.
DECISION_THRESHOLD = 0.3

# Batch the raw questions, then vectorize each batch with the same encoder
# used for training.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_df['question_text'])
    .batch(TEST_BATCH_SIZE)
    .map(lambda x: encoder(x))
)

# Flatten the (n, 1) model output so a plain 1-D integer vector is assigned to
# the column instead of a 2-D array.
test_scores = np.ravel(model.predict(test_dataset))
test_predictions = (test_scores > DECISION_THRESHOLD).astype(int)
test_df['prediction'] = test_predictions

# Save the results and submit. `index` is a boolean flag; pass False
# explicitly rather than relying on None being falsy.
test_df[['qid', 'prediction']].to_csv('submission.csv', index=False)