This project comes from [this Kaggle competition](https://www.kaggle.com/c/quora-insincere-questions-classification). The aim of the project is to build a prediction model to identify whether a Quora question is insincere. The definition of an insincere question can be found [here](https://www.kaggle.com/c/quora-insincere-questions-classification/data).

**Import Packages**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.python.client.device_lib import list_local_devices

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

import os, gc
import io
import csv
import unicodedata
from functools import partial
from itertools import islice
from hyperopt import hp, fmin, tpe, space_eval, Trials
from tqdm import tqdm
import spacy
from zipfile import ZipFile

**Global Variable Definitions**

In [None]:
# Locations of the competition data and of the generated submission file
INPUT_DIR = "../input/"

PATH_TRAIN = os.path.join(INPUT_DIR, "train.csv")
PATH_TEST = os.path.join(INPUT_DIR, "test.csv")
PATH_RESULT = "submission.csv"

# Model constants
MAX_SEQ_LEN = 50      # Length every token sequence is padded/truncated to
MAX_NUM_WORDS = None  # Vocabulary cap; None keeps every word (previously tried: 7*10**4)

# Use the cuDNN-backed LSTM layer when a GPU device is visible, the generic LSTM otherwise.
# NOTE(review): CuDNNLSTM may not be exported by newer tensorflow.keras.layers - confirm
# against the TensorFlow version this notebook runs on.
if any(device.device_type == "GPU" for device in list_local_devices()):
    LSTMLayer = CuDNNLSTM
else:
    LSTMLayer = LSTM

NUM_EPOCH = 20

RANDOM_SEED = 0

**Functions for Performing Text Preprocessing and Feature Extraction**

In [None]:
def preprocess(df):
    """
    Preprocess question texts in place

    The "question_text" column of ``df`` is rewritten as follows:
      1. every ``[math]...[/math]`` span becomes the placeholder "math formula",
      2. the text is Unicode-normalized with the NFKD form,
      3. typographic apostrophes and backticks become ``'``,
      4. typographic double quotes become ``"``.

    Args:
        df: DataFrame holding a string column "question_text"; modified in place.

    Returns:
        None
    """
    texts = df["question_text"]
    # regex=True is spelled out because the default of Series.str.replace differs
    # between pandas versions; without it the patterns may be matched literally.
    # The non-greedy ".+?" keeps two formulas in one question from being merged
    # together with the text that lies between them.
    texts = texts.str.replace(r"\[math\].+?\[/math\]", "math formula", regex=True)
    texts = texts.apply(partial(unicodedata.normalize, "NFKD"))
    texts = texts.str.replace(r"[’`]", "'", regex=True)
    texts = texts.str.replace(r"[“”„]", '"', regex=True)
    df["question_text"] = texts
    # Drop the temporary Series eagerly to keep peak memory low
    del texts
    gc.collect()

def fit_tokenizer(*dfs, max_num_words=None):
    """
    Create a tokenizer and fit it on the question texts of every given frame

    Words outside the vocabulary are mapped to the "<UNK>" token. When
    ``max_num_words`` is truthy the tokenizer is capped at ``max_num_words + 2``
    (presumably to leave room for the padding index and the OOV token).
    """
    tokenizer_kwargs = {"oov_token": "<UNK>"}
    if max_num_words:
        tokenizer_kwargs["num_words"] = max_num_words + 2
    fitted = Tokenizer(**tokenizer_kwargs)

    for frame in dfs:
        fitted.fit_on_texts(frame["question_text"])

    vocabulary_size = len(fitted.word_index)
    print(f"Number of tokenizer words: {vocabulary_size}")
    return fitted

def compute_model_input(df, tokenizer):
    """
    Compute model input from df using provided tokenizer

    Args:
        df: DataFrame with a string column "question_text".
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.

    Returns:
        A tuple ``(encoded_text, df_features)``:
          * encoded_text: token id sequences post-padded to MAX_SEQ_LEN,
          * df_features: DataFrame indexed like ``df`` with the columns
            "seq_len" (tokens per question before padding), "num_qmarks",
            "num_commas" and "cnt_qwords" (True when the question contains at
            least one question word; a flag despite the column name).
    """
    texts = df["question_text"]
    sequences = tokenizer.texts_to_sequences(texts)
    encoded_text = pad_sequences(sequences, padding="post", maxlen=MAX_SEQ_LEN)

    df_features = pd.DataFrame(index=df.index)
    df_features["seq_len"] = [len(seq) for seq in sequences]
    df_features["num_qmarks"] = [s.count('?') for s in texts]
    df_features["num_commas"] = [s.count(',') for s in texts]

    word_index = tokenizer.word_index
    qwords = ["how", "what", "when", "where", "which", "who", "whom", "whose", "why"]
    qwords.extend([w.capitalize() for w in qwords])
    # Token ids of the question words known to the tokenizer. This must be a real
    # set of ids: wrapping the lazy filter object in braces would create a
    # one-element set that never matches any token id.
    qword_index = {word_index[w] for w in qwords if word_index.get(w)}
    df_features["cnt_qwords"] = [any(w in qword_index for w in seq) for seq in sequences]

    return encoded_text, df_features

**Utility Class for Loading the Embedding Matrix**

In [None]:
class EmbeddingLoader:
    """
    Utility class for loading pretrained word embeddings and aligning them
    with the vocabulary of a fitted tokenizer
    """
    # Stemmers tried one after another on words that have no embedding yet
    STEM_FUNCS = [SnowballStemmer("english").stem, PorterStemmer().stem, LancasterStemmer().stem]
    # Zip archive that holds all embedding files
    EMBEDDING_FILE = "../input/embeddings.zip"
    # Embedding name -> path of the embedding file inside the archive
    # NOTE(review): the "google-news" entry points to a .bin file, which the
    # space-separated text parser used in load() presumably cannot read - confirm.
    PATH_DICT = {"glove": "glove.840B.300d/glove.840B.300d.txt",
                 "fasttext": "wiki-news-300d-1M/wiki-news-300d-1M.vec",
                 "google-news": "GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"}
    DIM_EMBEDDING = 300

    @staticmethod
    def load(name, tokenizer):
        """
        Load word embeddings

        Reads the embedding file registered under ``name`` and builds a matrix
        whose rows follow the tokenizer vocabulary: row 0 belongs to the empty
        string, the following rows follow the order of ``tokenizer.word_index``.
        Words without an embedding are looked up again after stripping
        surrounding quotes, after stemming with each of STEM_FUNCS, and finally
        in capitalized form. Rows that stay unresolved are filled with zeros.

        Args:
            name: key of PATH_DICT selecting the embedding file.
            tokenizer: fitted tokenizer providing ``word_index`` and ``num_words``.

        Returns:
            The embedding matrix as a NumPy array, one row per vocabulary slot.
        """
        with ZipFile(EmbeddingLoader.EMBEDDING_FILE) as embeddings_file:
            with embeddings_file.open(EmbeddingLoader.PATH_DICT[name]) as embedding_file:
                embedding = pd.read_csv(embedding_file, sep=' ', index_col=0, header=None,
                                        dtype={i: np.float32 for i in range(1, EmbeddingLoader.DIM_EMBEDDING + 1)},
                                        quoting=csv.QUOTE_NONE)
        embedding_words = embedding.index
        print(f"Loaded embedding file for {name}. Computing embedding matrix.")

        # Empty frame whose index lists the vocabulary; "" occupies row 0
        word_count = tokenizer.num_words
        if word_count:
            df_word_idx = pd.DataFrame(index=["", *islice(tokenizer.word_index, word_count)])
        else:
            df_word_idx = pd.DataFrame(index=["", *tokenizer.word_index])
        df_embedding_matrix = df_word_idx.join(embedding, how="left").reindex(df_word_idx.index)

        # The first two rows ("" and the OOV token) are excluded from the OOV statistics
        df_uncovered = df_embedding_matrix.iloc[2:, 1].isna()
        df_uncovered_idx = df_uncovered[df_uncovered].index
        print(f"{len(df_uncovered_idx)} out-of-vocabulary (OOV) words")
        df_stemmed = pd.DataFrame(None, index=df_uncovered_idx)
        df_stemmed["stemmed"] = [s.strip("'“”„ ") for s in df_uncovered_idx]
        df_stemmed["covered"] = df_stemmed["stemmed"].isin(embedding_words)

        # Each stemmer only processes the words that are still uncovered
        for stem_func in EmbeddingLoader.STEM_FUNCS:
            df_uncovered_idx = df_stemmed[~df_stemmed["covered"]].index
            stemmed = df_uncovered_idx.map(stem_func)
            df_stemmed.loc[df_uncovered_idx, "stemmed"] = stemmed
            df_stemmed.loc[df_uncovered_idx, "covered"] = stemmed.isin(embedding_words)

        # Last resort: look the remaining words up in capitalized form
        df_uncovered_idx = df_stemmed[~df_stemmed["covered"]].index
        df_stemmed.loc[df_uncovered_idx, "stemmed"] = df_uncovered_idx.str.capitalize()
        # The inner join keeps only words whose substitute form has an embedding
        df_embedding_matrix.fillna(df_stemmed.join(embedding, on="stemmed", how="inner"), inplace=True)
        # Counted over the same rows and column as the first OOV figure so that
        # the two printed numbers are directly comparable
        print(f"After processing: {df_embedding_matrix.iloc[2:, 1].isna().sum()} OOV words")
        embedding_matrix = df_embedding_matrix.fillna(0).to_numpy()

        print(f"Embedding matrix computed (shape: {embedding_matrix.shape}).")
        # Release the large intermediate frames before returning
        del embedding, df_word_idx, df_embedding_matrix, df_stemmed, df_uncovered, df_uncovered_idx
        gc.collect()

        return embedding_matrix

**Attention Layer**

In [None]:
class Attention(Layer):
    """
    Attention layer

    Collapses the step axis of a 3-D input ``x`` into a weighted sum:
    ``e = tanh(x . W + b)``, ``a = exp(e)`` (masked if a mask is given),
    ``a`` is normalized over the steps, and the output is ``sum_t a_t * x_t``.

    Args:
        step_dim: number of steps (size of axis 1) of the input.
        W_regularizer, b_regularizer: regularizers for the weight vector / bias.
        W_constraint, b_constraint: constraints for the weight vector / bias.
        has_bias: whether a per-step bias is added to the scores.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 has_bias=True, **kwargs):
        from tensorflow.keras import initializers, regularizers, constraints

        # Initialise the base layer first so that attribute assignment below
        # happens on a fully set-up Layer and is not overwritten by its defaults
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.has_bias = has_bias
        self.step_dim = step_dim
        self.feature_dim = 0  # Set in build() once the input shape is known

    def build(self, input_shape):
        """
        Create the weight vector (one entry per feature) and, if enabled,
        the bias (one entry per step)
        """
        assert len(input_shape) == 3

        # The shape is passed by keyword: the first positional parameter of
        # add_weight is not the shape in tf.keras
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.feature_dim = input_shape[-1]

        if self.has_bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The step axis is reduced away, so no mask is passed downstream
        return None

    def call(self, x, mask=None):
        """
        Return the attention-weighted sum of ``x`` over its step axis
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # Score of every step: flatten to (batch * steps, features), project
        # onto W and fold back to (batch, steps)
        eij = K.reshape(K.dot(K.reshape(x, (-1, feature_dim)),
                        K.reshape(self.W, (feature_dim, 1))), (-1, step_dim))

        if self.has_bias:
            eij += self.b

        eij = K.tanh(eij)
        a = K.exp(eij)

        # Masked steps get zero weight
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalize the weights over the steps; epsilon guards against division by zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.feature_dim

**Model Building Function**

In [None]:
def build_model(input_data, embedding_weights, hparams):
    """
    Assemble the two-input question classifier with the Keras functional API

    One branch embeds the token ids with the frozen pretrained weights and runs
    them through a bidirectional LSTM, a 1-D convolution and global max pooling;
    the other branch is a small dense layer over the hand-crafted features.
    Both branches are concatenated and reduced to a single sigmoid output.
    ``hparams`` is accepted but not read here.
    """
    num_extra_features = input_data[1].shape[1]
    vocab_size, embedding_dim = embedding_weights.shape

    text_in = Input((MAX_SEQ_LEN, ), name="TextInput")
    features_in = Input((num_extra_features, ), name="FeatureInput")

    # Text branch
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=MAX_SEQ_LEN,
                                weights=[embedding_weights], trainable=False)
    text_branch = embedding_layer(text_in)
    recurrent_layer = Bidirectional(LSTMLayer(128, return_sequences=True))
    text_branch = recurrent_layer(text_branch)
    text_branch = Conv1D(64, 3, activation='relu')(text_branch)
    text_branch = GlobalMaxPool1D()(text_branch)
    text_branch = BatchNormalization()(text_branch)
    text_branch = Dense(32, activation='relu', name="dense1")(text_branch)

    # Hand-crafted feature branch
    feature_branch = Dense(4, activation='relu', name="dense_features")(features_in)

    # Classification head
    merged = concatenate([text_branch, feature_branch])
    merged = Dense(8, activation='relu', name="dense2")(merged)
    merged = BatchNormalization()(merged)
    probability = Dense(1, activation="sigmoid", name="output")(merged)

    return Model(inputs=[text_in, features_in], outputs=probability,
                 name="QuestionClassificationModel")

**Model Training and Prediction Functions**

In [None]:
def train_model(input_data, target, embedding_weights, hparams):
    """
    Build a fresh model and fit it on the given data

    With early stopping enabled, 10% of the data is used as validation split
    and training stops once the validation loss stops improving; otherwise the
    model trains on all data for NUM_EPOCH epochs.
    """
    net = build_model(input_data, embedding_weights, hparams)

    use_early_stopping = hparams["early_stopping"]
    fit_callbacks = (
        [EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10)]
        if use_early_stopping else None
    )
    holdout_fraction = 0.1 if use_early_stopping else 0

    net.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=hparams["learning_rate"]),
        metrics=["accuracy"],
    )

    # The negative class is weighted by hparams["neg_class_weight"]; the positive class by 1
    sample_weights_per_class = {0: hparams["neg_class_weight"], 1: 1}
    net.fit(
        input_data,
        target,
        validation_split=holdout_fraction,
        callbacks=fit_callbacks,
        epochs=NUM_EPOCH,
        batch_size=hparams["batch_size"],
        class_weight=sample_weights_per_class,
        shuffle=False,
        verbose=2,
    )
    return net

def compute_predictions(model, input_data, hparams):
    """
    Classify the input: True where the predicted probability exceeds
    hparams["pos_threshold"]
    """
    probabilities = model.predict(input_data)
    threshold = hparams["pos_threshold"]
    return probabilities > threshold

def run_cross_validation(input_data, target, embedding_weights, hparams, kfold=5):
    """
    Perform cross validation using provided input data, labels, as well as hyperparameters

    Args:
        input_data: tuple ``(texts, features)`` - the encoded texts (indexed
            positionally with ``[]``) and the feature DataFrame.
        target: binary labels, one per row of the input.
        embedding_weights: embedding matrix handed on to the model builder.
        hparams: hyperparameter dict used for training and thresholding.
        kfold: number of stratified folds.

    Returns:
        The F1 score of the out-of-fold predictions over the whole data set.
    """
    splitter = StratifiedKFold(kfold)
    texts, features = input_data
    # Work on a plain array so that the fold indices are always positional,
    # no matter whether a Series or an array was passed in
    labels = np.asarray(target)
    # The builtin bool is used because the np.bool alias no longer exists in NumPy
    predictions = np.zeros(len(labels), dtype=bool)

    for fold_idx, (train_index, test_index) in enumerate(splitter.split(features, labels)):
        print(f"Running on Fold {fold_idx}")
        input_train = texts[train_index], features.iloc[train_index]
        target_train = labels[train_index]
        model = train_model(input_train, target_train, embedding_weights, hparams)
        input_test = texts[test_index], features.iloc[test_index]
        fold_predictions = compute_predictions(model, input_test, hparams)
        # Flatten the per-sample predictions; indexing with [0] would take only
        # the first sample and broadcast it over the whole fold
        predictions[test_index] = np.asarray(fold_predictions).ravel()

    score = f1_score(labels, predictions)
    print(f"F1 Score: {score}")
    return score

**Pipeline**

In [None]:
def run(test_output_path, tune_params=True):
    """
    Run the whole pipeline: preprocessing, optional hyperparameter tuning,
    model training and prediction on the test set

    Args:
        test_output_path: path of the CSV file the test predictions are written
            to; when falsy, the final training and prediction step is skipped.
        tune_params: when True, hyperparameters are searched with hyperopt using
            cross-validated F1; otherwise a fixed set of hyperparameters is used.

    Returns:
        None
    """
    HP_TUNING_EVALS = 100  # Number of evaluations in hyperparameter tuning

    # The builtin bool is used because the np.bool alias no longer exists in NumPy
    df_train = pd.read_csv(PATH_TRAIN, usecols=[1, 2], dtype={"target": bool})
    df_test = pd.read_csv(PATH_TEST, index_col=0)

    # Text preprocessing
    print("Preprocessing and fitting tokenizers")
    preprocess(df_train)
    preprocess(df_test)

    # Fit tokenizers
    tokenizer = fit_tokenizer(df_train, df_test, max_num_words=MAX_NUM_WORDS)

    # Load embedding
    print("Loading embedding")
    embedding_weights = EmbeddingLoader.load("glove", tokenizer)

    # Compute model input and build model
    print("Computing input for model training")
    input_train = compute_model_input(df_train, tokenizer)
    target_train = df_train["target"]
    del df_train
    gc.collect()

    if tune_params:
        print("Tuning hyperparameters")
        # Hyperparameter search space
        hp_space = {
            "learning_rate": hp.loguniform("learning_rate", -8, 0),
            "batch_size": hp.choice("batch_size", [64, 256, 1024]),
            "early_stopping": hp.randint("early_stopping", 2),
            "neg_class_weight": hp.uniform("neg_class_weight", 0, 1),
            "pos_threshold": hp.uniform("pos_threshold", 0.2, 1)     # Threshold for classification
        }

        def objective(candidate):
            # fmin minimizes its objective, so the F1 score is negated to make
            # the search look for the highest score
            return -run_cross_validation(input_train, target_train, embedding_weights, candidate)

        best_idx = fmin(objective, space=hp_space, algo=tpe.suggest, max_evals=HP_TUNING_EVALS)
        # fmin returns indices/raw values; space_eval maps them back to actual hyperparameters
        hparams = space_eval(hp_space, best_idx)
        print(f"Best hyperparameters: {hparams}")
    else:
        hparams = {
            "learning_rate": 0.004,
            "batch_size": 1024,
            "early_stopping": 1,
            "neg_class_weight": 0.35,
            "pos_threshold": 0.5
        }

    if test_output_path:
        print("Training model")
        model = train_model(input_train, target_train, embedding_weights, hparams)
        model.summary()

        print("Classify test set")
        input_test = compute_model_input(df_test, tokenizer)
        result = compute_predictions(model, input_test, hparams)
        df_result = pd.DataFrame(result, columns=["prediction"], index=df_test.index, dtype=np.int8)
        df_result.to_csv(test_output_path)

In [None]:
# Train with the fixed hyperparameters (no tuning) and write the submission file
run(PATH_RESULT, tune_params=False)