In [None]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import KeyedVectors
import tensorflow as tf
tqdm.pandas()

In [None]:
# Load the competition CSVs (training data, test data and the submission template).
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
submission_df = pd.read_csv("../input/sample_submission.csv")
# One shape per line, in the same order the frames were loaded.
print(train_df.shape, test_df.shape, submission_df.shape, sep="\n")

In [None]:
# Location of the pre-trained word2vec vectors in binary word2vec format.
news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# Loading the full vector table is slow and memory-hungry; it is done once and reused below.
embeddings = KeyedVectors.load_word2vec_format(fname=news_path, binary=True)

In [None]:
def build_vocab(sentences):
    """Count how often each token occurs across all sentences.

    Args:
        sentences: iterable of token sequences (each item is iterated to get words).

    Returns:
        dict mapping each token to its total number of occurrences, in
        first-seen order.
    """
    counts = {}
    for tokens in tqdm(sentences, desc="Building Vocabulary"):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def check_coverage(vocab, embeddings):
    """Report how much of a vocabulary is covered by an embedding table.

    Prints two ratios: the share of distinct words that have an embedding and
    the share of all word occurrences (frequency-weighted) that do.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings: any container supporting ``word in embeddings``.

    Returns:
        dict of the out-of-vocabulary words and their counts, in the order they
        appear in ``vocab``.
    """
    oov = {}
    found_words = 0      # number of distinct words that have an embedding
    found_count = 0      # occurrences of covered words
    missing_count = 0    # occurrences of uncovered words
    for word, count in vocab.items():
        if word in embeddings:
            found_words += 1
            found_count += count
        else:
            oov[word] = count
            missing_count += count

    # Guard both ratios: an empty vocabulary (or all-zero counts) would
    # otherwise raise ZeroDivisionError.
    total_count = found_count + missing_count
    vocab_ratio = found_words / len(vocab) if len(vocab) else 0.0
    text_ratio = found_count / total_count if total_count else 0.0
    print("Found embeddings for %.4f of vocab"%(vocab_ratio))
    print("Found embeddings for %.4f of total text"%(text_ratio))
    return oov

In [None]:
# Embedding coverage of the question text as it stands at this point in the notebook.
sentences = train_df["question_text"].progress_apply(lambda text: text.split()).values
vocab = build_vocab(sentences)
# Peek at the first ten vocabulary entries (insertion order).
print(dict(list(vocab.items())[:10]))
oov = check_coverage(vocab, embeddings)
print(list(oov)[:20])
# Release the large temporaries; they are rebuilt when coverage is checked again.
del sentences, vocab, oov

In [None]:
# Lookup table: English contraction -> expanded form. Keys are case-sensitive
# (both "I'd" and "i'd" are listed) and use the straight ASCII apostrophe.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
# Lookup table: misspelling / spelling variant / run-together words -> replacement text.
# Keys are case-sensitive; note that the 'youtu ' key deliberately carries a trailing space.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}
# One alternation regex per lookup table. Regex alternation takes the FIRST
# alternative that matches, so keys are ordered longest-first: otherwise a key
# that is a prefix of another ("I'd" vs "I'd've") would win and leave a
# half-expanded result such as "I would've". Keys are escaped so they are always
# matched literally. The single capture group is kept for compatibility.
contraction_re = re.compile("(%s)"%"|".join(
    re.escape(key) for key in sorted(contraction_dict, key=len, reverse=True)))
mispell_re = re.compile("(%s)"%"|".join(
    re.escape(key) for key in sorted(mispell_dict, key=len, reverse=True)))

def remove_contraction(text):
    """Expand every contraction matched by ``contraction_re``.

    The input is coerced to ``str`` first; each match is replaced by its
    entry in ``contraction_dict``.
    """
    def _expand(match):
        return contraction_dict[match.group(0)]

    return contraction_re.sub(_expand, str(text))

def remove_mispell(text):
    """Replace every misspelling matched by ``mispell_re``.

    The input is coerced to ``str`` first; each match is replaced by its
    entry in ``mispell_dict``.
    """
    def _correct(match):
        return mispell_dict[match.group(0)]

    return mispell_re.sub(_correct, str(text))

def clean_numbers(x):
    """Mask runs of digits with '#' characters.

    Runs of five or more digits become '#####'; runs of exactly four, three
    and two digits become the same number of '#'. Single digits are kept.
    The input is coerced to ``str`` first.
    """
    masked = str(x)
    # Order matters: the longest runs must be masked before the shorter patterns run.
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in digit_masks:
        masked = re.sub(pattern, mask, masked)
    return masked

def clean_text(x):
    """Strip punctuation from a text, keeping '&' as a separate token.

    Every listed punctuation character is replaced by a single space and
    '&' is padded with a space on each side. The input is coerced to ``str``.
    """
    to_space = "/-'" + '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
    # None of the replacements introduces a character that is itself replaced,
    # so a single translation pass gives the same result as successive replaces.
    table = {ord(char): ' ' for char in to_space}
    table[ord('&')] = ' & '
    return str(x).translate(table)

def remove_oovs(x):
    """Delete the standalone lowercase words 'a', 'to', 'of' and 'and'.

    Matching is case-sensitive and whole-word only. The surrounding whitespace
    is left in place, so the result may contain runs of spaces. The input is
    coerced to ``str`` first.
    """
    stripped = str(x)
    for pattern in (r'\ba\b', r'\bto\b', r'\bof\b', r'\band\b'):
        stripped = re.sub(pattern, '', stripped)
    return stripped

In [None]:
def process_data(df):
    """Run the text-cleaning pipeline over ``df["question_text"]``.

    The column is overwritten in place and the same frame is returned.

    Steps, in order: expand contractions, strip punctuation, fix known
    misspellings, mask digit runs, drop the stopwords handled by ``remove_oovs``.

    Misspelling correction runs BEFORE digit masking: ``mispell_dict`` contains
    keys with digit runs ('2k17', '2k18') that can never match once
    ``clean_numbers`` has rewritten them to '2k##'. Their replacements
    ('2017', '2018') are then masked like any other year.
    """
    pipeline = (
        remove_contraction,
        clean_text,
        remove_mispell,
        clean_numbers,
        remove_oovs,
    )
    for step in pipeline:
        df["question_text"] = df["question_text"].progress_apply(step)
    return df

In [None]:
# Clean the question text of both splits with the same pipeline (train first, then test).
train_df, test_df = (process_data(frame) for frame in (train_df, test_df))

In [None]:
# Re-check embedding coverage on the current (cleaned) question text.
sentences = train_df["question_text"].progress_apply(lambda text: text.split()).values
vocab = build_vocab(sentences)
# Peek at the first ten vocabulary entries (insertion order).
print(dict(list(vocab.items())[:10]))
oov = check_coverage(vocab, embeddings)
print(list(oov)[:20])
# Release the large temporaries once the report has been printed.
del sentences, vocab, oov

In [None]:
# Maximum number of tokens kept per question; longer questions are truncated.
max_sent_len = 72
# Width of one word vector.
embedding_size = 300

def tokenize_sentences(df, field, new_field="tokens", max_len=None):
    """Add a column of whitespace-split, length-capped token lists.

    Args:
        df: DataFrame to modify in place.
        field: name of the text column to tokenize.
        new_field: name of the column that receives the token lists.
        max_len: maximum tokens kept per row; defaults to the module-level
            ``max_sent_len``.

    Returns:
        The same ``df`` with ``new_field`` set.
    """
    if max_len is None:
        max_len = max_sent_len
    token_lists = [text.split()[:max_len] for text in df[field]]
    # Build the column against the frame's own index. The previous
    # ``df[new_field][i]`` lookup was label-based and chained, so it only worked
    # when the frame had a default 0..n-1 index.
    df[new_field] = pd.Series(token_lists, index=df.index, dtype=object)
    return df

# Add the "tokens" column to both splits (train first, then test).
train_df, test_df = (
    tokenize_sentences(frame, "question_text") for frame in (train_df, test_df)
)

In [None]:
# Preview the first rows to eyeball the cleaned text and the new token column.
train_df.head(n=5)

In [None]:
def generate_embeddings(words, word2vec, max_sent_len, embedding_size=300):
    """Build a fixed-size (max_sent_len, embedding_size) matrix for one sentence.

    Row ``i`` holds the vector of the i-th word. Words without a vector and
    positions past the end of the sentence stay all-zero. Words beyond
    ``max_sent_len`` are ignored instead of raising IndexError.

    Args:
        words: iterable of tokens.
        word2vec: mapping-like object where ``word2vec[word]`` returns the
            vector and raises KeyError for unknown words (gensim KeyedVectors
            or a plain dict).
        max_sent_len: number of rows in the output.
        embedding_size: number of columns in the output.

    Returns:
        float32 numpy array of shape (max_sent_len, embedding_size).
    """
    matrix = np.zeros((max_sent_len, embedding_size), dtype=np.float32)
    # zip with range caps the loop at max_sent_len rows for any iterable.
    for row, word in zip(range(max_sent_len), words):
        try:
            # Index access is the stable lookup across gensim versions.
            matrix[row] = word2vec[word]
        except KeyError:
            # Out-of-vocabulary word: keep the zero row.
            continue
    return matrix

def batch_generator(df, batch_size, word2vec, max_sent_len, token_field="tokens", lbl_field="target", embedding_size=300, drop_remainder=True):
    """Yield (inputs, labels) mini-batches built from a tokenized frame.

    Rows are read by position, so the frame does not need a default 0..n-1 index.

    Args:
        df: DataFrame with a token-list column and a label column.
        batch_size: rows per batch; must be positive.
        word2vec: embedding lookup passed through to ``generate_embeddings``.
        max_sent_len: rows per sentence matrix.
        token_field: name of the token-list column.
        lbl_field: name of the label column.
        embedding_size: width of one word vector.
        drop_remainder: when True (the default, matching the previous
            behaviour) a trailing partial batch is skipped; when False it is
            yielded as a smaller final batch.

    Yields:
        Tuple ``(inp, lbl)`` of float32 arrays with shapes
        (n, max_sent_len, embedding_size, 1) and (n, 1).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    token_rows = df[token_field].values
    labels = df[lbl_field].values
    n_rows = len(token_rows)
    stop = n_rows - (n_rows % batch_size) if drop_remainder else n_rows
    for start in range(0, stop, batch_size):
        size = min(batch_size, stop - start)
        inp = np.empty((size, max_sent_len, embedding_size), dtype=np.float32)
        lbl = np.empty((size, 1), dtype=np.float32)
        for j in range(size):
            inp[j] = generate_embeddings(token_rows[start + j], word2vec, max_sent_len, embedding_size)
            lbl[j] = labels[start + j]
        # Trailing channel axis of size 1 for the 2-D convolution input.
        yield np.expand_dims(inp, axis=3), lbl

def test_batch_generator(df, batch_size, word2vec, max_sent_len, token_field="tokens", embedding_size=300, drop_remainder=True):
    """Yield input-only mini-batches for inference.

    Rows are read by position, so the frame does not need a default 0..n-1 index.

    NOTE: with the default ``drop_remainder=True`` (kept for backward
    compatibility) the last ``len(df) % batch_size`` rows are never yielded.
    Inference code that needs a prediction for every row should pass
    ``drop_remainder=False`` and accept a smaller final batch.

    Args:
        df: DataFrame with a token-list column.
        batch_size: rows per batch; must be positive.
        word2vec: embedding lookup passed through to ``generate_embeddings``.
        max_sent_len: rows per sentence matrix.
        token_field: name of the token-list column.
        embedding_size: width of one word vector.
        drop_remainder: whether to skip a trailing partial batch.

    Yields:
        float32 array of shape (n, max_sent_len, embedding_size, 1).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    token_rows = df[token_field].values
    n_rows = len(token_rows)
    stop = n_rows - (n_rows % batch_size) if drop_remainder else n_rows
    for start in range(0, stop, batch_size):
        size = min(batch_size, stop - start)
        inp = np.empty((size, max_sent_len, embedding_size), dtype=np.float32)
        for j in range(size):
            inp[j] = generate_embeddings(token_rows[start + j], word2vec, max_sent_len, embedding_size)
        # Trailing channel axis of size 1 for the 2-D convolution input.
        yield np.expand_dims(inp, axis=3)

# Smoke test: pull one batch from each generator and check the array shapes.
gen = batch_generator(train_df, 128, embeddings, max_sent_len)
t_gen = test_batch_generator(test_df, 100, embeddings, max_sent_len)
inp, lbl = next(gen)
t_inp = next(t_gen)
print(inp.shape, lbl.shape, t_inp.shape, sep="\n")

In [None]:
def conv2d(x, output_channels, filter_size, embedding_dim=300):
    """Convolution over word windows followed by tanh.

    Creates variables "w" and "b" in the current variable scope, so each call
    must happen inside its own ``tf.variable_scope``.

    Args:
        x: input tensor with one channel; the filter spans ``embedding_dim``
            along axis 2 — assumes x is (batch, words, embedding_dim, 1),
            TODO confirm against callers.
        output_channels: number of filters.
        filter_size: number of consecutive words covered by one filter.
        embedding_dim: filter width along the embedding axis. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        tanh-activated result of a stride-1, VALID-padded convolution plus bias.
    """
    w_init = tf.contrib.layers.variance_scaling_initializer()
    b_init = tf.random_uniform_initializer(0.0, 0.01)
    # Filter layout: [height (words), width (embedding), in_channels, out_channels].
    filter_shape = [filter_size, embedding_dim, 1, output_channels]
    w = tf.get_variable("w", shape=filter_shape, dtype=tf.float32, initializer=w_init)
    b = tf.get_variable("b", shape=[output_channels], dtype=tf.float32, initializer=b_init)
    conv = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="VALID") + b
    return tf.nn.tanh(conv)

def score(y_pred, y_true):
    """Accuracy of rounded predictions against integer labels.

    Both inputs are flattened before comparison. Without that, a (n, 1)
    prediction array compared with a (n,) label array broadcasts to an n-by-n
    matrix and produces a meaningless value that can exceed 1.

    Args:
        y_pred: array-like of predicted probabilities.
        y_true: array-like of labels; must hold as many elements as ``y_pred``.

    Returns:
        Fraction of positions where ``round(y_pred)`` equals ``y_true``.
        Note that ``np.round`` rounds halves to even, so exactly 0.5 maps to 0.
    """
    pred_labels = np.round(np.asarray(y_pred)).astype(np.int32).ravel()
    true_labels = np.asarray(y_true).astype(np.int32).ravel()
    assert len(pred_labels) == len(true_labels)
    return np.sum(np.equal(pred_labels, true_labels))/len(true_labels)

In [None]:
# Build the TF1 graph: four parallel convolution branches over the embedded
# sentence, max-pooled, concatenated and fed to a single logistic output unit.

# Mini-batch size used when training.
batch_size = 128
# Start from an empty default graph (presumably so re-running this cell does not
# clash with variables created by a previous run — tf.get_variable is used below).
tf.reset_default_graph()
# Inputs: one channel of (max_sent_len x embedding_size) word vectors per example.
inp_plh = tf.placeholder(tf.float32, [None, max_sent_len, embedding_size, 1], "input_placeholder")
# Labels: one float per example.
lbl_plh = tf.placeholder(tf.float32, [None, 1], "label_placeholder")
# Training flag. No active op in this cell consumes it; only the commented-out
# dropout line below refers to it.
train_plh = tf.placeholder(tf.bool, None, "training_placeholder")

# Each branch uses 36 filters spanning k consecutive words (k = 1, 2, 3, 5).
# With VALID padding and stride 1 the convolution output has max_sent_len-k+1
# rows, and the pooling window below has exactly that height, so each branch is
# reduced to a single row (max over all word positions).
with tf.variable_scope("conv1"):
    conv1 = conv2d(inp_plh, 36, 1)
    conv1 = tf.nn.max_pool(conv1, ksize=[1, max_sent_len-1+1, 1, 1], strides=[1, 1, 1, 1], padding="VALID")
with tf.variable_scope("conv2"):
    conv2 = conv2d(inp_plh, 36, 2)
    conv2 = tf.nn.max_pool(conv2, ksize=[1, max_sent_len-2+1, 1, 1], strides=[1, 1, 1, 1], padding="VALID")
with tf.variable_scope("conv3"):
    conv3 = conv2d(inp_plh, 36, 3)
    conv3 = tf.nn.max_pool(conv3, ksize=[1, max_sent_len-3+1, 1, 1], strides=[1, 1, 1, 1], padding="VALID")
# Note: the scope is named "conv4" but the filter spans 5 words (there is no 4-word branch).
with tf.variable_scope("conv4"):
    conv4 = conv2d(inp_plh, 36, 5)
    conv4 = tf.nn.max_pool(conv4, ksize=[1, max_sent_len-5+1, 1, 1], strides=[1, 1, 1, 1], padding="VALID")
# Stack the four pooled branches along axis 1, then flatten everything but the batch axis.
conc = tf.concat([conv1, conv2, conv3, conv4], axis=1)
shape = conc.get_shape().as_list()
# Flattened feature size, taken from the static shape (all non-batch dims must be known).
dim = shape[1]*shape[2]*shape[3]
reshaped = tf.reshape(conc, shape=[-1, dim])
# NOTE(review): dropout is disabled. As written, the line below would apply
# dropout when train_plh is False and skip it when True — the branches look
# swapped; fix that before re-enabling.
# reshaped = tf.cond(train_plh, lambda: reshaped, lambda: tf.nn.dropout(reshaped, keep_prob=0.9))
# Output layer: a single linear unit; "w"/"b" live in the root variable scope.
w_init = tf.contrib.layers.variance_scaling_initializer()
b_init = tf.constant_initializer(0.01)
w = tf.get_variable("w", shape=[dim, 1], dtype=tf.float32, initializer=w_init)
b = tf.get_variable("b", shape=[1], dtype=tf.float32, initializer=b_init)
logits = tf.matmul(reshaped, w) + b
# Sigmoid of the logits, used for accuracy reporting and for predictions.
pred = tf.nn.sigmoid(logits)
# The loss is computed from the raw logits, not from `pred`.
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=lbl_plh))
# Adam with its default hyper-parameters; running this op performs one update step.
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [None]:
# Train the model, then predict every test row and write the submission file.
num_epochs = 9
# Rows per inference batch.
test_batch_size = 100
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print("Training Started...")
    for epoch in range(num_epochs):
        loss_list = []
        acc_list = []
        # A fresh generator per epoch; batches come in frame order (no shuffling).
        gen = batch_generator(train_df, batch_size, embeddings, max_sent_len)
        for inp, lbl in gen:
            feed_dict = {inp_plh: inp, lbl_plh: lbl, train_plh: True}
            _, loss_val, pred_val = sess.run([optimizer, loss, pred], feed_dict=feed_dict)
            loss_list.append(float(loss_val))
            acc_list.append(float(score(pred_val, lbl)))
        print("##########Epoch %d completed##############"%(epoch+1))
        print("Average Loss: ", float(np.mean(loss_list)))
        print("Average Accuracy: ", float(np.mean(acc_list)))
        print("##########################################")
    print("Generating submission data...")
    sub_df = pd.read_csv("../input/sample_submission.csv")
    # Batch over ALL test rows, including a smaller final batch. Iterating only
    # over full batches left the last len(test_df) % 100 rows with the
    # placeholder values from the sample submission.
    test_tokens = test_df["tokens"].tolist()
    test_preds = []
    for start in range(0, len(test_tokens), test_batch_size):
        chunk = test_tokens[start:start + test_batch_size]
        inp = np.stack([
            generate_embeddings(tokens, embeddings, max_sent_len, embedding_size)
            for tokens in chunk
        ])
        feed_dict = {inp_plh: np.expand_dims(inp, axis=3), train_plh: False}
        pred_val = sess.run(pred, feed_dict=feed_dict)
        # Round the probabilities to hard 0/1 labels.
        test_preds.append(np.round(pred_val).astype(np.int32).reshape(-1))
    all_preds = np.concatenate(test_preds) if test_preds else np.zeros(0, dtype=np.int32)
    if len(all_preds) != len(sub_df):
        raise ValueError(
            "Got %d predictions for %d submission rows" % (len(all_preds), len(sub_df)))
    # Assign the whole column at once; the chained
    # `sub_df.prediction.iloc[...] = ...` form may write to a temporary copy.
    # Rows are assumed to be in the same order as test_df, as before.
    sub_df["prediction"] = all_preds
    sub_df.to_csv("submission.csv", index=False)
    print("Submission file generated as submission.csv")