## Importations

In [None]:
import re
import time
import gc
import random
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from tabulate import tabulate

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.utils.data

In [None]:
#unzip embedding files
import zipfile
import shutil
import os

# Extract only the GloVe vectors from the competition archive.
# The guard tests the extracted file itself rather than the directory, so a run
# that was interrupted after the directory was created is retried instead of
# being skipped with a missing or truncated file.
glove_member = 'glove.840B.300d/glove.840B.300d.txt'
glove_target = './embeddings/' + glove_member

if not os.path.exists(glove_target):
    os.makedirs(os.path.dirname(glove_target), exist_ok=True)
    # Copy to a temporary name first and rename once complete: the final path
    # only ever exists when the whole member has been written.
    glove_partial = glove_target + '.part'
    with zipfile.ZipFile('../input/quora-insincere-questions-classification/embeddings.zip', 'r') as z:
        with z.open(glove_member) as zf, open(glove_partial, 'wb') as f:
            shutil.copyfileobj(zf, f)
    os.replace(glove_partial, glove_target)

## Parameters

In [None]:
embed_size = 300 # size of each word vector (number of columns in the embedding matrix)
max_features = 30000 # how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 72 # max number of words in a question to use (questions are padded to this length)

batch_size = 512 # batch size of the train/validation/test DataLoaders
train_epochs = 5 # NOTE: assigned again at the top of the training cell

SEED = 1029 # seed shared by the data shuffle, the fold split and seed_torch


def seed_torch(seed=1029):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and CUDA), exports
    PYTHONHASHSEED, and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.deterministic = True

In [None]:
# Punctuation marks and special symbols that clean_text isolates as separate tokens.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# Translation table built once: every symbol maps to itself surrounded by spaces.
# A single translate() pass replaces one str.replace() call per symbol; the result
# is the same because every entry of `puncts` is a distinct single character and
# the inserted padding (spaces) is never itself a key.
_punct_table = str.maketrans({punct: f' {punct} ' for punct in puncts})

def clean_text(x):
    """Return `x` as a string with every symbol of `puncts` padded by one space on each side.

    `x` is converted with `str()` first, so non-string values are accepted.
    """
    return str(x).translate(_punct_table)

def clean_numbers(x):
    """Mask every run of ASCII digits in `x` with '#' characters.

    A run of k digits becomes k hashes, capped at five ('#####') for runs of
    five digits or more.
    """
    def mask(run):
        return '#' * min(len(run.group(0)), 5)

    return re.sub('[0-9]+', mask, x)

# Replacement table for the misspelling regex: each key (contraction, British
# spelling or frequent typo) maps to the text substituted for it by
# replace_typical_misspell.
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", 
                "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
                "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", 
                "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", 
                "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", 
                "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", 
                "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", 
                "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", 
                "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 
                'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 
                'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 
                'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 
                'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its regex once; replace_typical_misspell and the statistics cell reuse them.
mispellings, mispellings_re = _get_mispell(mispell_dict)
# Patterns used by the statistical analysis. Raw strings keep the backslashes
# for the regex engine instead of relying on invalid string escapes ('\s', '\?').
spaces_re = re.compile(r'\s')  # any single whitespace character
number_re = re.compile(r'(?<![0-9])[0-9]{1}(?![0-9])')  # an isolated single digit
numbers_re = re.compile(r'[0-9]{2,}')  # a run of two or more digits
CAPS_re = re.compile(r'[A-Z]{2,}')  # two or more consecutive capital letters
punct_re = re.compile(r'(\?|!){2,}')  # two or more consecutive '?' / '!'

def prop_pattern(text, pattern_re):
    """Return the number of `pattern_re` matches in `text` divided by its word count.

    The word count is taken as the number of whitespace characters plus one.
    """
    hit_count = len(pattern_re.findall(text))
    word_count = len(spaces_re.findall(text)) + 1
    return hit_count / word_count

def replace_typical_misspell(text):
    """Return `text` with every match of the misspelling regex replaced by its entry in `mispellings`."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

## Statistic Analysis

In [None]:
#Load the raw train and test CSV files (used only by the statistical analysis cells below)

train_df = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")

In [None]:
#Check the proportion of sincere/insincere questions

# value_counts(sort=False) does not promise any particular label order, so sort
# by label: the counts and the column headers below are then read in the same
# order and cannot end up swapped.
description = train_df['target'].value_counts(sort=False).sort_index()
total = description.sum()

print(tabulate([['Count', *description.tolist()],
                ['Proportion', *("{0:.0%}".format(count / total) for count in description)]],
               headers=['', *description.index]))

del description, total

The dataset is strongly imbalanced. <br>
For the rest of the code, we assume that the test set has the same distribution of sincere/insincere questions.

In [None]:
#Get a look at the number of words in each category

# Only the target column is needed next to the word counts: no full copy of train_df.
words_nb_df = train_df[['target']].copy()
# Number of words = number of whitespace separators + 1 (same convention as prop_pattern).
words_nb_df['word_nb'] = train_df["question_text"].apply(lambda x : len(spaces_re.findall(x)) + 1)

# Filter each category once instead of once per statistic.
sincere_nb = words_nb_df.loc[words_nb_df['target']==0, 'word_nb']
insincere_nb = words_nb_df.loc[words_nb_df['target']==1, 'word_nb']

print(tabulate([['Mean', sincere_nb.mean(), insincere_nb.mean()],
                ['25%', sincere_nb.quantile(0.25), insincere_nb.quantile(0.25)],
                ['Med', sincere_nb.median(), insincere_nb.median()],
                ['75%', sincere_nb.quantile(0.75), insincere_nb.quantile(0.75)]],
               headers=['','Sincere', 'Insincere']))

del words_nb_df, sincere_nb, insincere_nb

The number of words differs slightly between sincere and insincere questions. <br>
_The average number of words is small compared to the padding length (`maxlen`)._

In [None]:
#Get a look at the proportion of misspelled words, CAPS WORDS, punctuation, number in each category

pattern_list = [number_re, numbers_re, CAPS_re, punct_re, mispellings_re]
legends = ["Number", "Numbers", "CAPS", "Punctuation", "Mispellings"]
# Deliberately not called `display`: that name would shadow IPython's display()
# for every later cell of the notebook.
stat_rows = []
sincere_df = train_df[train_df['target']==0]["question_text"].copy()
insincere_df = train_df[train_df['target']==1]["question_text"].copy()

for legend, pattern in zip(legends, pattern_list):
    if legend == "Mispellings":
        # The misspelling pattern is evaluated on lower-cased text. It is the
        # last entry, so the case-sensitive patterns above are not affected.
        sincere_df = sincere_df.str.lower()
        insincere_df = insincere_df.str.lower()

    metrics_sincere = sincere_df.apply(prop_pattern, args=(pattern,))
    metrics_insincere = insincere_df.apply(prop_pattern, args=(pattern,))
    # Average share of matching words per question
    stat_rows.append([legend + " - Prop(Words)",
                      "{0:.1%}".format(metrics_sincere.mean()),
                      "{0:.1%}".format(metrics_insincere.mean())])
    # Share of questions with at least one match (vectorised mean of a boolean mask)
    stat_rows.append([legend + " - Prop(Texts)",
                      "{0:.1%}".format((metrics_sincere > 0).mean()),
                      "{0:.1%}".format((metrics_insincere > 0).mean())])

print(tabulate(stat_rows, headers=['','Sincere', 'Insincere']))

del sincere_df, insincere_df, metrics_sincere, metrics_insincere, train_df, test_df

**The statistical analysis justifies (or not) the following preprocessing steps:**
* Number cleaning: no significant difference
* Numbers cleaning: no significant difference
* Lowercasing: no significant difference
* Punctuation cleaning: too few examples
* Misspelling cleaning: large gap between the Sincere and Insincere categories. Cleaning misspellings could delete useful information. For the basic version, we will not consider this effect.

## Pre-Processing

In [None]:
def load_and_prec():
    """Load the train/test CSV files and turn the questions into padded token sequences.

    The question text is lower-cased, cleaned (punctuation, numbers,
    misspellings), tokenized with a vocabulary limited to `max_features` words
    fitted on the training set, and padded to `maxlen`. The training rows are
    then shuffled with a permutation seeded by `SEED`.

    Requires `tqdm.pandas()` to have been called (for `progress_apply`).

    Returns:
        (train_X, test_X, train_y, word_index): the padded train and test
        sequences, the shuffled training targets, and the tokenizer's
        word -> index dictionary.
    """
    train_df = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
    test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")

    # Cleaning steps, applied in this order to both splits.
    # NOTE(review): replace_typical_misspell runs after clean_text, which has
    # already padded apostrophes with spaces, so the contraction keys can no
    # longer match at that point. Order kept as-is to preserve the results;
    # confirm whether it is intended.
    cleaning_steps = (str.lower, clean_text, clean_numbers, replace_typical_misspell)

    for df in (train_df, test_df):
        questions = df["question_text"]
        # Missing values must be handled BEFORE cleaning: str.lower raises on a
        # NaN. They are cleaned as empty strings and then set to the "_##_"
        # placeholder, so every non-missing row is processed exactly as before.
        missing = questions.isna()
        questions = questions.fillna("")
        for step in cleaning_steps:
            questions = questions.progress_apply(step)
        df["question_text"] = questions.mask(missing, "_##_")

    train_X = train_df["question_text"].values
    test_X = test_df["question_text"].values

    ## Tokenize the sentences (the vocabulary is fitted on the training set only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values

    # Shuffle the training data (the file is ordered); the same permutation is
    # applied to the inputs and the targets so they stay aligned.
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]

    return train_X, test_X, train_y, tokenizer.word_index

In [None]:
import pickle
import os
from tqdm import tqdm
tqdm.pandas()

# Every file the cached branch reads back. All of them must be present, otherwise
# the preprocessing is run again (a partial cache would crash the `else` branch).
cache_files = ['./train_X.npy', './test_X.npy', './train_y.npy', './word_index.pickle']

if not all(os.path.exists(path) for path in cache_files):

    start_time = time.time()

    train_X, test_X, train_y, word_index = load_and_prec()

    total_time = (time.time() - start_time) / 60

    print("Took {:.2f} minutes".format(total_time))

    np.save('./train_X', train_X, allow_pickle=True)
    np.save('./test_X', test_X, allow_pickle=True)
    np.save('./train_y', train_y, allow_pickle=True)

    # saving
    with open('./word_index.pickle', 'wb') as handle:
        pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    train_X = np.load('./train_X.npy', allow_pickle=True)
    test_X = np.load('./test_X.npy', allow_pickle=True)
    train_y = np.load('./train_y.npy', allow_pickle=True)

    # loading (the pickle is the cache written by the branch above, not external data)
    # It must be bound to `word_index`: that is the name the embedding cell passes
    # to load_glove.
    with open('./word_index.pickle', 'rb') as handle:
        word_index = pickle.load(handle)

gc.collect()

## Embeddings

In [None]:
def load_glove(word_index):
    """Build the embedding matrix for the vocabulary from the GloVe vectors.

    Args:
        word_index: dictionary mapping each word to its integer index.

    Returns:
        A `(nb_words, embed_size)` array where `nb_words` is
        `min(max_features, len(word_index) + 1)`. Row `i` holds the GloVe
        vector of the word with index `i` when the word is found in the file;
        other rows keep a random draw from a normal distribution with the mean
        and standard deviation of all GloVe coefficients.
    """
    EMBEDDING_FILE = '../working/embeddings/glove.840B.300d/glove.840B.300d.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # `with` closes the file as soon as it has been parsed; the encoding is
    # stated explicitly instead of depending on the platform default.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

    # np.stack needs a real sequence: a dict_values view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Indices beyond the vocabulary limit have no row in the matrix
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# Reuse the embedding matrix saved by a previous run when there is one;
# otherwise build it from the GloVe file and save it for the next run.
if os.path.exists('./embedding_matrix.npy'):
    embedding_matrix = np.load('./embedding_matrix.npy', allow_pickle=True)
else:
    start_time = time.time()
    embedding_matrix = load_glove(word_index)
    print(np.shape(embedding_matrix))
    total_time = (time.time() - start_time) / 60
    print("Took {:.2f} minutes".format(total_time))
    np.save('./embedding_matrix', embedding_matrix, allow_pickle=True)

gc.collect()

## Neural Networks

In [None]:
#Model Basic

class BasicNet(nn.Module):
    """Bidirectional LSTM classifier returning one raw score (logit) per question.

    Pipeline: embedding lookup -> 2-layer bidirectional LSTM -> sigmoid ->
    per-word linear layer -> sigmoid -> dropout -> flatten -> linear layer.
    No sigmoid is applied to the final output.
    """

    def __init__(self):
        super().__init__()

        # Hyper-parameters
        lstm_units = 60
        lstm_depth = 2
        drop_prob = 0.1
        self.word_layer = 16  # size of the per-word representation

        # Embedding table; its weights are replaced by the pre-computed matrix.
        self.embedding = nn.Embedding(max_features, embed_size, padding_idx=0)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)

        # Layers (created in this order so parameter initialisation is unchanged)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=lstm_units,
            num_layers=lstm_depth,
            dropout=drop_prob,
            bidirectional=True,
            batch_first=True,
        )
        self.sigmoid = nn.Sigmoid()
        # The LSTM is bidirectional, hence twice the hidden size per word
        self.linear_word = nn.Linear(2 * lstm_units, self.word_layer)
        # Consumes the flattened per-word outputs; assumes inputs are padded to
        # `maxlen` tokens — TODO confirm for any new caller
        self.linear_sentence = nn.Linear(self.word_layer * maxlen, 1)
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        """Return the output of the final linear layer for the batch of token ids `x`."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        per_word = self.sigmoid(self.linear_word(self.sigmoid(lstm_out)))
        per_word = self.dropout(per_word)

        # Flatten everything but the batch dimension
        flattened = per_word.reshape(per_word.size(0), -1)
        return self.linear_sentence(flattened)

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of `x`, element-wise for arrays."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

In [None]:
#Search the best threshold for the training set
def threshold_search(y_true, y_proba):
    """Scan the thresholds 0.00, 0.01, ..., 0.99 and keep the one with the highest F1 score.

    A prediction is positive when `y_proba` is strictly greater than the
    threshold. On ties the lowest threshold is kept.

    Returns:
        A dictionary with keys 'threshold' and 'f1'.
    """
    best = {'threshold': 0, 'f1': 0}
    candidates = [step * 0.01 for step in range(100)]
    for candidate in tqdm(candidates):
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > candidate)
        if candidate_f1 > best['f1']:
            best = {'threshold': candidate, 'f1': candidate_f1}
    return best

## Training

In [None]:
#Basic Model

train_epochs = 5

# Use the GPU when one is available, otherwise fall back to the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED).split(train_X, train_y))

# Out-of-fold predictions for the training set and fold-averaged predictions for the test set
train_preds = np.zeros((len(train_X)))
test_preds = np.zeros((len(test_X)))

seed_torch(SEED)

x_test_cuda = torch.tensor(test_X, dtype=torch.long).to(device)
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

# `fold_nb` / `batch_nb` are distinct names: the validation loop used to reuse
# the fold index `i` as its batch counter.
for fold_nb, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).to(device)
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).to(device)
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).to(device)
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).to(device)

    # A fresh model for every fold
    model = BasicNet()
    model.to(device)

    #Loss combines a Sigmoid layer and the binary cross-entropy loss (summed over the batch)
    loss_fn = torch.nn.BCEWithLogitsLoss(weight=None, reduction="sum")
    optimizer = torch.optim.Adam(model.parameters())

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

    print(f'Fold {fold_nb + 1}')

    # Every epoch overwrites all entries, so one allocation per fold is enough;
    # the values kept at the end are those of the last epoch.
    valid_preds_fold = np.zeros((x_val_fold.size(0)))

    for epoch in range(train_epochs):
        start_time = time.time()

        model.train() #Training mode (dropout active)
        avg_loss = 0.
        for x_batch, y_batch in tqdm(train_loader, disable=True):
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad() #Clear the gradients of the previous step
            loss.backward() #Back-propagate the loss
            optimizer.step() #Update the parameters of the NN
            avg_loss += loss.item() / len(train_loader)

        model.eval() # Evaluation mode (dropout disabled)
        avg_val_loss = 0.

        # No gradients are needed for validation: skip building the autograd graph
        with torch.no_grad():
            for batch_nb, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = model(x_batch)
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                valid_preds_fold[batch_nb * batch_size:(batch_nb + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, train_epochs, avg_loss, avg_val_loss, elapsed_time))

    #Dataset is too large to be tested without batch
    model.eval()
    test_preds_fold = np.zeros(len(test_X))
    with torch.no_grad():
        for j, (x_batch,) in enumerate(test_loader):
            y_pred = model(x_batch)
            test_preds_fold[j * batch_size:(j + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits) #Average over the folds

## Submission

In [None]:
# Pick the decision threshold that maximises F1 on the out-of-fold training predictions
search_result = threshold_search(train_y, train_preds)
search_result

In [None]:
sub = pd.read_csv('../input/quora-insincere-questions-classification/sample_submission.csv')
# Binarise the averaged test probabilities with the selected threshold.
# Item assignment plus a vectorised cast replaces attribute-style column
# assignment followed by a per-row apply(int).
sub['prediction'] = (test_preds > search_result['threshold']).astype(int)
sub.to_csv("./submission.csv", index=False)

In [None]:
# Preview the first lines of the submission file (IPython shell escape)
!head ./submission.csv