In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
print (os.listdir('../input'))

In [None]:
import nltk
import operator
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
data = '../input/quora-insincere-questions-classification/'

In [None]:
# Loading data
train_df = pd.read_csv(data + 'train.csv')
print (train_df.shape)
train_df.head()

In [None]:
train_df_len = train_df.shape[0]
print ('Train data length: ',train_df_len)

In [None]:
# statistics of target 1 vs target 0
t0, t1 = len(train_df[train_df.target == 0]), len(train_df[train_df.target == 1])
t0_pct, t1_pct = t0/train_df_len * 100, t1/train_df_len * 100
print (f'Target 0 vs Target 1 = {t0} vs {t1} ,{t0_pct:.2f}% vs {t1_pct:.2f}%')

In [None]:
test_df = pd.read_csv(data + 'test.csv')
test_df_len = test_df.shape[0]
print ('Test data length: ',test_df_len)
test_df.head()

In [None]:
sample_df = pd.read_csv(data + 'sample_submission.csv')
print ('sample sub length: ', sample_df.shape[0])
sample_df.head()

In [None]:
del sample_df

## Preprocessing
#### From the reference, Paragram will be used as pre-trained embeddings.
#### Preprocessing steps - 
1. lower
2. clean contractions
3. replace special characters
4. tokenize
5. remove stopwords


In [None]:
# contraction corrections
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [None]:
def clean_contractions(text, contraction_mapping):
    """Normalise apostrophe look-alikes and expand known contractions.

    Args:
        text: the string to clean.
        contraction_mapping: dict mapping a contraction (e.g. "can't") to its
            expanded form (e.g. "cannot"). Lookup is exact and case-sensitive.

    Returns:
        The text with "’", "‘", "´" and "`" replaced by "'", and with every
        single-space-delimited word that is a key of `contraction_mapping`
        replaced by its expansion. Words with attached punctuation
        (e.g. "can't.") are left untouched because they do not match a key.
    """
    for apostrophe_variant in ("’", "‘", "´", "`"):
        text = text.replace(apostrophe_variant, "'")

    expanded_words = []
    for word in text.split(" "):
        expanded_words.append(contraction_mapping.get(word, word))
    return ' '.join(expanded_words)

In [None]:
# example
text = "I can`t go to work today. I'd rather stay home."
text = clean_contractions(text, contraction_mapping)
text

In [None]:
# special characters
# Every character in this string is surrounded with spaces by clean_special_chars.
# The backslash in front of '×' is written as an explicit '\\': the previous '\×'
# was an invalid escape sequence, which Python only tolerates with a warning.
# The resulting string value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
punct

In [None]:
# Maps typographic / special symbols to the plain replacement that
# clean_special_chars substitutes before it pads punctuation with spaces.
# (The key '“' used to be listed twice with the same value; the duplicate is
# removed, which leaves the resulting dict unchanged.)
punct_mapping = {
    "‘": "'",    "₹": "e",      "´": "'", "°": "",         "€": "e",
    "™": "tm",   "√": " sqrt ", "×": "x", "²": "2",        "—": "-",
    "–": "-",    "’": "'",      "_": "-", "`": "'",        '“': '"',
    '”': '"',    "£": "e",      '∞': 'infinity',           'θ': 'theta',
    '÷': '/',    'α': 'alpha',  '•': '.', 'à': 'a',        '−': '-',
    'β': 'beta', '∅': '',       '³': '3', 'π': 'pi'
}

In [None]:
def clean_special_chars(text, punct, punct_mapping):
    """Normalise special symbols and isolate punctuation with spaces.

    Three passes are applied in this order:
      1. every key of `punct_mapping` found in `text` is replaced by its value;
      2. every character of `punct` is replaced by itself padded with one space
         on each side (a character listed twice in `punct` is padded twice);
      3. a fixed set of leftover fragments (zero-width space, ellipsis, BOM and
         two Hindi words) is replaced or removed.

    Args:
        text: the string to clean.
        punct: iterable of characters to pad with spaces.
        punct_mapping: dict of symbol -> replacement string.

    Returns:
        The cleaned string.
    """
    for symbol, replacement in punct_mapping.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    # Fragments handled last, after the generic passes above.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for fragment, replacement in leftovers:
        text = text.replace(fragment, replacement)
    return text

In [None]:
# example
text = "I have $20. So, I can buy an awesome watch!!"
text = clean_special_chars(text,punct,punct_mapping)
text

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords

In [None]:
def preprocess(df, contraction_mapping, punct, punct_mapping):
    """Tokenise `df.question_text` and store the tokens in `df['processed_text']`.

    Each question is lower-cased, run through `clean_contractions` and
    `clean_special_chars`, split on runs of non-word characters, and stripped
    of English stopwords (the module-level `stopwords` list).

    Args:
        df: DataFrame with a `question_text` column of strings. It is modified
            in place: a `processed_text` column holding a list of tokens per
            row is added (or overwritten).
        contraction_mapping: passed through to `clean_contractions`.
        punct: passed through to `clean_special_chars`.
        punct_mapping: passed through to `clean_special_chars`.

    Returns:
        None.
    """
    # Membership tests against a set are O(1); the nltk stopwords are a list.
    stopword_set = set(stopwords)

    def _tokenize(text):
        """Turn one raw question into its list of cleaned tokens."""
        text = clean_contractions(text.lower(), contraction_mapping)
        text = clean_special_chars(text, punct, punct_mapping)
        # re.split yields '' when the text starts or ends with a separator
        # (almost every question ends with ' ? '); those empty strings are
        # not words, so they are dropped together with the stopwords.
        return [
            token
            for token in re.split(r'\W+', text)
            if token and token not in stopword_set
        ]

    df['processed_text'] = df.question_text.apply(_tokenize)

## Choose data from trainset
Only a portion of the training data is used. Negative examples (target 1) are far fewer than positive examples (target 0).
Because a trial run with a large share of positive examples made the results worse, the positive examples are down-sampled. How many of them to keep remains an open question, though.
* test data: 56370
* target 0/1 ratio: 93.81/6.19%, 1225312/80810 (very skewed)

The total amount of sampled data is set to be 10x the test data after the 0.9/0.1 train/validation split.

In [None]:
sample_rows_t0 = 639190    # rows drawn from target == 0 (called "positive" in this notebook)
sample_rows_t1 = 80810     # rows drawn from target == 1 (called "negative" in this notebook)
sample_seed = 42           # fixed seed so that a re-run draws the same rows
df_t0 = train_df[train_df.target == 0].sample(sample_rows_t0, random_state=sample_seed)
df_t1 = train_df[train_df.target == 1].sample(sample_rows_t1, random_state=sample_seed)

In [None]:
print (f'df_t0 length : {df_t0.shape[0]}')
print (f'df_t1 length : {df_t1.shape[0]}')

In [None]:
preprocess(df_t0, contraction_mapping, punct, punct_mapping)
df_t0.head()

In [None]:
preprocess(df_t1, contraction_mapping, punct, punct_mapping)
df_t1.head()

In [None]:
preprocess(test_df, contraction_mapping, punct, punct_mapping)
test_df.head()

## Find Vocabulary
Memory is tight: loading the whole pretrained embedding file easily exhausts it. To save memory, the code below collects only the vocabulary found in the train and test data.

In [None]:
def build_vocab(texts, vocab):
    """Add every element of `texts` to the set `vocab`.

    Args:
        texts: iterable of tokens (one processed question).
        vocab: set that is updated in place.

    Returns:
        None.
    """
    vocab.update(texts)

In [None]:
vocab = set()
df_t1.processed_text.apply(lambda x:build_vocab(x,vocab))
df_t0.processed_text.apply(lambda x:build_vocab(x,vocab))
test_df.processed_text.apply(lambda x:build_vocab(x,vocab))
print (len(vocab))

## Loading Embeddings

In [None]:
from zipfile import ZipFile
import codecs
file = ZipFile('../input/quora-insincere-questions-classification/embeddings.zip','r')
# printdir() writes the archive listing to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
file.printdir()

In [None]:
# choosing paragram
# NOTE(review): the archive member is selected by its position in namelist(),
# so this depends on the order of entries inside embeddings.zip; index 8 is
# presumably the Paragram vectors text file — confirm against the listing printed above.
paragram = file.open(file.namelist()[8])     # since we want to use paragram as pretrained embeddings, hence index 8

In [None]:
from tqdm import tqdm

In [None]:
# load embeddings
word2vec = {}

def get_coefs(word, *arr):
    """Return (word, vector), converting the remaining fields to a float32 array."""
    return word, np.asarray(arr, dtype='float32')

# Only rows whose word occurs in `vocab` are kept. The word is peeled off with a
# single split first, so the float conversion of the remaining fields is skipped
# for every row that would be discarded anyway.
for line in tqdm(codecs.iterdecode(paragram,'latin')):
    word = line.split(" ", 1)[0]
    if word in vocab:
        word2vec[word] = get_coefs(*line.split(" "))[1]

In [None]:
print ('Vocab length: ',len(vocab))
print ('Word2Vec length: ',len(word2vec))

### Question Word statistics
The number of words per question varies. To handle both long and short questions, we look for an appropriate number of words (the sequence length).

In [None]:
lens_t0 = list(map(len, df_t0.processed_text))
lens_t1 = list(map(len, df_t1.processed_text))
lens_test = list(map(len, test_df.processed_text))

print (f'For positive questions: Min words:{min(lens_t0)} vs Max words:{max(lens_t0)}')
print (f'For negative questions: Min words:{min(lens_t1)} vs Max words:{max(lens_t1)}')
print (f'For test questions: Min words:{min(lens_test)} vs Max words:{max(lens_test)}')

In [None]:
def freq_stats(tag, counts, key, topk, total):
    """Print the `topk` highest-ranked entries of `counts` and return the largest one.

    Args:
        tag: label printed at the start of the first line.
        counts: mapping of item -> count (here: question length -> number of
            questions with that length).
        key: ranking function applied to each item of `counts`; items are
            ranked in descending order of `key(item)`.
        topk: how many top-ranked items to keep.
        total: denominator used for the printed coverage percentage.

    Returns:
        The maximum of the selected items (not of their counts).
    """
    ranked_items = sorted(counts, key=key, reverse=True)
    top_items = ranked_items[:topk]
    top_counts = [counts[item] for item in top_items]

    print(f'{tag}: best {topk} frequent word count: {top_items}')
    print(f'freqs: {top_counts}')
    print(f'Covers: {sum(top_counts)/total*100:.2f}%')

    return max(top_items)

In [None]:
from collections import Counter

counts_t0 = Counter(lens_t0) # counts words freq. Ex. How many 13 words questions are there?
counts_t1 = Counter(lens_t1)
counts_test = Counter(lens_test)

topk = 20  # pick top 20 freq of words
max_t0 = freq_stats('pos',counts_t0, counts_t0.get, topk, sample_rows_t0)
max_t1 = freq_stats('neg',counts_t1, counts_t1.get, topk, sample_rows_t1)
max_test = freq_stats('test',counts_test, counts_test.get, topk, test_df_len)

print (max_t0, max_t1, max_test)

In [None]:
seq_length = max(max_t0, max_t1, max_test)
seq_length

In [None]:
word2vec['india'].shape[0]

## Build Word Matrix

In [None]:
def build_weights_matrix(word2vec, embedding_dim=None):
    """Build the word -> row-index table and the matching embedding matrix.

    Row 0 is reserved for padding and stays all-zero; real words are numbered
    from 1. Sequences in this notebook are left-padded with index 0, so
    assigning index 0 to an actual word would make every padded position look
    up that word's vector.

    Args:
        word2vec: dict mapping word -> 1-D vector.
        embedding_dim: width of the matrix. When None it is taken from the
            first vector in `word2vec` (300 if `word2vec` is empty).

    Returns:
        (word2idx, weights_matrix) where `word2idx` maps each word to its row
        (1..len(word2vec)) and `weights_matrix` is a float32 array of shape
        (len(word2vec) + 1, embedding_dim).
    """
    if embedding_dim is None:
        first_vector = next(iter(word2vec.values()), None)
        embedding_dim = 300 if first_vector is None else len(first_vector)

    # float32 matches the dtype the vectors are loaded with and halves the
    # memory of the default float64 matrix.
    weights_matrix = np.zeros((len(word2vec) + 1, embedding_dim), dtype='float32')
    word2idx = {}
    for row, (word, vector) in enumerate(word2vec.items(), start=1):
        word2idx[word] = row
        weights_matrix[row] = vector
    return word2idx, weights_matrix

In [None]:
word2idx, weights_matrix = build_weights_matrix(word2vec)

In [None]:
weights_matrix.shape

In [None]:
def encode_question(word2idx, text, seq_length):
    """Convert a token list into an array of vocabulary indices.

    Only the first `seq_length` tokens are considered; tokens missing from
    `word2idx` are skipped, so the result may be shorter than `seq_length`.

    Args:
        word2idx: dict mapping token -> integer index.
        text: sequence of tokens.
        seq_length: maximum number of leading tokens to look at.

    Returns:
        1-D numpy array (dtype 'int_') of the indices of the known tokens.
    """
    window = text[:seq_length]
    known_ids = [word2idx[token] for token in window if token in word2idx]
    return np.array(known_ids, dtype='int_')

In [None]:
def add_padding(np_arr, seq_length):
    """Left-pad a 1-D index array with zeros up to `seq_length`.

    Args:
        np_arr: 1-D numpy array of indices.
        seq_length: target length.

    Returns:
        A new array of length `seq_length` with zeros prepended when `np_arr`
        is shorter; otherwise `np_arr` itself, unchanged (longer arrays are
        not truncated).
    """
    missing = seq_length - np_arr.shape[0]
    if missing <= 0:
        return np_arr
    left_pad = np.zeros((missing, ), dtype = 'int_')
    return np.concatenate((left_pad, np_arr))

In [None]:
def create_dataset(texts, label, word2idx, seq_length):
    """Encode and pad every question and pair it with a constant label.

    Args:
        texts: sized iterable of token lists.
        label: the label assigned to every row.
        word2idx: dict mapping token -> integer index.
        seq_length: length each encoded question is truncated / padded to.

    Returns:
        (X, y): X is the array built from the per-question index arrays and
        y is a float array of len(texts) copies of `label`.
    """
    targets = np.array([label] * len(texts), dtype='float')
    features = [
        add_padding(encode_question(word2idx, tokens, seq_length), seq_length)
        for tokens in texts
    ]
    return np.array(features), targets

In [None]:
# split train data to train and validation
test_size = 0.1
split_seed = 42   # fixed seed: without it every run produces a different train/validation partition
train_texts_t0, val_texts_t0 = train_test_split(df_t0.processed_text, test_size = test_size, random_state = split_seed)
train_texts_t1, val_texts_t1 = train_test_split(df_t1.processed_text, test_size = test_size, random_state = split_seed)

In [None]:
train_X_t0, train_y_t0 = create_dataset(train_texts_t0, 0, word2idx, seq_length)
train_X_t1, train_y_t1 = create_dataset(train_texts_t1, 1, word2idx, seq_length)

train_X = np.concatenate((train_X_t0, train_X_t1))
train_y = np.concatenate((train_y_t0, train_y_t1))

print (f'Shapes: train_X {train_X.shape}, train_y {train_y.shape}')

In [None]:
val_X_t0, val_y_t0 = create_dataset(val_texts_t0, 0, word2idx, seq_length)
val_X_t1, val_y_t1 = create_dataset(val_texts_t1, 1, word2idx, seq_length)

val_X = np.concatenate((val_X_t0, val_X_t1))
val_y = np.concatenate((val_y_t0, val_y_t1))

print (f'Shapes: val_X {val_X.shape}, val_y {val_y.shape}')

## PyTorch

In [None]:
# importing libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable

In [None]:
# device config
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

In [None]:
# create tensor dataset
train_set = TensorDataset(torch.from_numpy(train_X), torch.from_numpy(train_y))
val_set = TensorDataset(torch.from_numpy(val_X), torch.from_numpy(val_y))

In [None]:
# create dataloader
batch_size = 200

train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_set, shuffle=True, batch_size=batch_size)

## Building Network Architecture


In [None]:
class RNN(nn.Module):
    """LSTM classifier on top of a frozen, pre-trained embedding table.

    The forward pass embeds the index sequence, runs the LSTM, applies dropout,
    a linear layer and a sigmoid to every timestep, and returns the value at
    the last position of each row.

    Args:
        weights: 2-D numpy array (num_embeddings, embedding_dim) copied into
            the embedding layer.
        output_size: number of outputs of the final linear layer.
        hidden_size: LSTM hidden size.
        n_layers: number of stacked recurrent layers.
        bidirectional: whether the recurrent layers are bidirectional.
        dropout: dropout between stacked recurrent layers.
        layer_dropout: dropout applied to the LSTM output before the linear layer.
    """

    def __init__(self, weights, output_size, hidden_size, n_layers, bidirectional=False, dropout=0.5,layer_dropout=0.3):
        super(RNN, self).__init__()

        self.output_size = output_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.direction = 2 if bidirectional else 1

        num_embeddings, embedding_dim = weights.shape

        # embedding layer, initialised from the pre-trained vectors
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(weights))
        # Freezing must target the weight Parameter: setting `requires_grad`
        # on the module itself only creates an unused attribute and leaves
        # the embeddings trainable.
        self.embedding.weight.requires_grad = False

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, n_layers, batch_first=True, dropout=dropout,bidirectional = bidirectional)

        # GRU layer (constructed but not used by forward(); kept so that the
        # module's attributes and state_dict keys stay the same)
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, batch_first=True, dropout=dropout,bidirectional = bidirectional)

        # dropout layer
        self.dropout = nn.Dropout(layer_dropout)

        # Fully Connected Layer
        self.fc = nn.Linear((hidden_size*self.direction), output_size)

        # Sigmoid activation layer
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of index sequences.

        Args:
            x: integer tensor of shape (batch, seq_len) (the LSTM is batch_first).
            hidden: (h0, c0) tuple as produced by `init_hidden`. If it is None
                or was built for a different batch size (e.g. the last,
                smaller batch of a loader) a fresh zero state is used instead.

        Returns:
            (sig_out, hidden): `sig_out` has shape (batch,); `hidden` is the
            initial state that was fed to the LSTM for this call, not the
            LSTM's final state.
        """
        batch_size = x.size(0)
        if hidden is None or hidden[0].size(1) != batch_size:
            hidden = self.init_hidden(batch_size)

        embeds = self.embedding(x)

        lstm_out, _ = self.lstm(embeds, hidden)
        # The LSTM emits hidden_size features per direction, so the flattened
        # width must include the direction factor to match self.fc.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_size * self.direction)

        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]   # keep the last value of each row (last timestep)

        return  sig_out, hidden

    def init_hidden(self, batch_size, bidirectional=False):
        """Create a zero (h0, c0) state for the LSTM.

        Args:
            batch_size: size of the batch the state is for.
            bidirectional: ignored; the direction count chosen at
                construction time is used. Kept for call compatibility.

        Returns:
            Tuple of two zero tensors of shape
            (n_layers * direction, batch_size, hidden_size), created with the
            dtype and device of the model's first parameter.
        """
        weight = next(self.parameters()).data
        state_shape = (self.n_layers*self.direction, batch_size, self.hidden_size)
        # for LSTM (initial_hidden_state, initial_cell_state)
        lstm_hidden = (
            weight.new_zeros(state_shape),
            weight.new_zeros(state_shape)
        )
        return lstm_hidden
        

In [None]:
# Instantiate the Network
# Hyperparams
output_size = 1
hidden_size = 256
n_layers = 2


net = RNN(weights_matrix, output_size, hidden_size, n_layers, bidirectional=False).to(device)
print(net)


In [None]:
# Training params
lr = 0.00001
epochs = 10
clip = 5  # gradient clipping

In [None]:
# loss and optimizer functions
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=lr)

In [None]:
import datetime

In [None]:
def train(net, criterion, optimizer, train_loader, clip, epoch, epochs, gru=False):
    """Run one training epoch.

    Args:
        net: model exposing `init_hidden(batch_size)` and `net(inputs, hidden)`
            returning `(outputs, hidden)`.
        criterion: loss function called as `criterion(outputs, labels)`.
        optimizer: optimizer stepping `net`'s parameters.
        train_loader: iterable of `(inputs, labels)` batches.
        clip: max gradient norm passed to `clip_grad_norm_`.
        epoch: zero-based index of the current epoch (used for logging only).
        epochs: total number of epochs (used for logging only).
        gru: unused; kept for call compatibility.

    Returns:
        Mean training loss over all batches of the epoch (nan if the loader
        yielded no batch).
    """
    print_every = 500

    # Batches are moved to wherever the model's parameters live.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = []
    net.train()
    for counter, (inputs, labels) in enumerate(train_loader, start=1):
        inputs, labels = inputs.to(run_device), labels.to(run_device)

        # A fresh zero state sized for this batch: the batches are unrelated
        # questions, and the last batch of a loader may be smaller than the rest.
        h = net.init_hidden(inputs.size(0))

        # zero accumulated gradients
        net.zero_grad()

        # forward pass
        outputs, _ = net(inputs, h)

        # calculate loss and perform backprop
        # reshape(-1) keeps a 1-D tensor even for a batch of one, where
        # squeeze() would collapse it to a scalar.
        loss = criterion(outputs.reshape(-1), labels.float())
        loss.backward()

        # clip_grad_norm helps prevent exploding gradient
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # Record every step so the reported mean covers the whole epoch so far,
        # not only the steps on which a line is printed.
        train_losses.append(loss.item())
        if counter % print_every == 0:
            print (f'Epoch: {epoch+1}/{epochs} \t Step: {counter} \t Train Loss: {np.mean(train_losses):.6f} \t Time: {datetime.datetime.now()}')

    return float(np.mean(train_losses)) if train_losses else float('nan')
            

In [None]:
len(train_loader)

In [None]:
# validation loss
def validate(net, criterion, optimizer, val_loader, epoch, epochs, gru=False):
    """Evaluate the model on the validation loader and print loss / accuracy.

    Args:
        net: model exposing `init_hidden(batch_size)` and `net(inputs, hidden)`
            returning `(outputs, hidden)` with outputs in [0, 1].
        criterion: loss function called as `criterion(outputs, labels)`.
        optimizer: unused; kept for call compatibility.
        val_loader: iterable of `(inputs, labels)` batches.
        epoch: zero-based index of the current epoch (used for logging only).
        epochs: total number of epochs (used for logging only).
        gru: unused; kept for call compatibility.

    Returns:
        (mean_loss, accuracy_pct): mean of the per-batch losses and the
        percentage of samples whose rounded output equals the label
        (both nan if the loader yielded no batch).
    """
    # Batches are moved to wherever the model's parameters live.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    val_losses = []
    n_correct = 0
    n_seen = 0
    net.eval()
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)

            # Fresh zero state sized for this batch (the last one may be smaller).
            h = net.init_hidden(inputs.size(0))

            # forward pass
            outputs, _ = net(inputs, h)
            outputs = outputs.reshape(-1)   # stays 1-D even for a batch of one
            targets = labels.float()

            val_losses.append(criterion(outputs, targets).item())

            n_correct += torch.eq(targets, torch.round(outputs)).sum().item()
            # Count the samples actually seen: len(loader) * batch_size
            # over-counts whenever the last batch is partial.
            n_seen += targets.numel()

    mean_loss = float(np.mean(val_losses)) if val_losses else float('nan')
    accuracy = n_correct / n_seen * 100 if n_seen else float('nan')
    print (f'Epoch: {epoch+1}/{epochs} \t Val Loss: {mean_loss:.6f} \t Acc: {accuracy:.2f}% \t Time: {datetime.datetime.now()}')
    return mean_loss, accuracy
        

In [None]:
def run_train(net, criterion, optimizer, epochs, train_loader, val_loader, clip, gru=False):
    """Alternate one training pass and one validation pass for `epochs` epochs.

    Every argument is forwarded unchanged to `train` and `validate`;
    nothing is returned.
    """
    for epoch_index in range(epochs):
        print (f'Running epoch {epoch_index + 1}...\n')
        train(net, criterion, optimizer, train_loader, clip, epoch_index, epochs, gru)
        validate(net, criterion, optimizer, val_loader, epoch_index, epochs, gru)

In [None]:
run_train(net, criterion, optimizer, epochs, train_loader, val_loader, clip,gru=False)

## Test

In [None]:
class QuoraTestDataset(Dataset):
    """Map-style dataset yielding (qid, encoded question) pairs.

    Args:
        df: DataFrame with `qid` and `processed_text` (list of tokens) columns.
        word2idx: dict mapping token -> integer index.
        seq_length: length every encoded question is truncated / padded to.
    """

    def __init__(self, df, word2idx, seq_length):
        self.word2idx = word2idx
        self.seq_length = seq_length
        self.data = df
        self.data_len = len(df)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.data_len

    def __getitem__(self, idx):
        """Return (qid, LongTensor of indices) for row `idx`.

        Indices at or beyond the dataset length wrap around (modulo length).
        """
        if idx >= self.data_len:
            idx %= self.data_len

        # Fetch the row once and reuse it for both the tokens and the qid.
        row = self.data.iloc[idx]

        # Encode with the table given to this dataset rather than whatever
        # module-level `word2idx` happens to exist.
        encoded = encode_question(self.word2idx, row.processed_text, self.seq_length)
        text_array = add_padding(encoded, self.seq_length)
        return row.qid, torch.from_numpy(text_array)

In [None]:
# create dataset
test_set = QuoraTestDataset(test_df, word2idx, seq_length)

In [None]:
len(test_df)

In [None]:
test_batch_size = 41

In [None]:
# create dataloader
test_loader = DataLoader(test_set, shuffle=False, batch_size=test_batch_size)

In [None]:
def test(net, test_loader, batch_size=test_batch_size):
    test_l_h = net.init_hidden(batch_size)
    ret_qid = []
    ret_pred = []
    test_len = len(test_loader)
    counter = 0
    with torch.no_grad():
        for qids, inputs in test_loader:
            counter += 1
            inputs = inputs.to(device)
            
            # for LSTM
            test_l_h = tuple([each.data for each in test_l_h])

            outputs, test_l_h = net(inputs, test_l_h)
            
            ret_qid.append(qids)
            ret_pred.append(torch.round(outputs.squeeze()).cpu().numpy().astype(int))
            
            if counter % 300 == 0:
                print('{}/{} done'.format(counter, test_len))

    return ret_qid, ret_pred

In [None]:
ret_qid, ret_pred = test(net, test_loader)

In [None]:
ret_qid = np.concatenate(ret_qid)
ret_pred = np.concatenate(ret_pred)
print (len(ret_qid))
print (len(ret_pred))

In [None]:
submit_df = pd.DataFrame({'qid': ret_qid, 'prediction': ret_pred})

In [None]:
submit_df.head()

In [None]:
submit_df.to_csv("submission.csv",index=False)