In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

import time # kernels have a 2 hour limit

from collections import Counter

In [None]:
# Ask TensorFlow for the default GPU device once; a falsy name is treated as "no GPU".
_gpu_name = tf.test.gpu_device_name()
if not _gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(_gpu_name))

# Preprocessing

Preprocessing ideas based on https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings. 


Embeddings_index code from https://www.kaggle.com/shujian/different-embeddings-with-attention-fork-fork/notebook.

In [None]:
# Load the training and test tables from CSV (paths are relative to the notebook).
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print(f"Train shape: {train.shape}")
print(f"Test shape:  {test.shape}")
# Show one random training row as a quick look at the columns.
train.sample()

In [None]:
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of a line of the embedding file).
        *arr: the remaining fields; each must be convertible to float32.

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Build a mapping from each token to its embedding vector.
# The file is opened in a `with` block so the handle is closed as soon as the dict is
# built, and the encoding is pinned to UTF-8 so non-ASCII tokens decode the same way
# regardless of the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

We want to preprocess our sentences manually to best fit the GloVe embeddings.

In [None]:
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by the pretrained embeddings.

    Prints the share of distinct words and the share of total word occurrences
    that have an embedding. An empty vocabulary is reported as 0% instead of
    raising ZeroDivisionError.

    Args:
        vocab: mapping of word -> number of occurrences.
        embeddings_index: mapping of word -> embedding vector.

    Returns:
        List of ``(word, count)`` pairs for the words without an embedding,
        sorted by count, most frequent first.
    """
    oov = {}
    covered_words = 0   # distinct words that have an embedding
    covered_count = 0   # occurrences of words that have an embedding
    oov_count = 0       # occurrences of words that do not

    for word, count in vocab.items():
        # Explicit membership test instead of a bare `except:`, which would also
        # have hidden unrelated errors (and even KeyboardInterrupt).
        if word in embeddings_index:
            covered_words += 1
            covered_count += count
        else:
            oov[word] = count
            oov_count += count

    total_count = covered_count + oov_count
    vocab_share = covered_words / len(vocab) if vocab else 0.0
    text_share = covered_count / total_count if total_count else 0.0

    print(f'Found embeddings for {vocab_share:.2%} of vocab')
    print(f'Found embeddings for  {text_share:.2%} of all text')

    return sorted(oov.items(), key=lambda item: item[1], reverse=True)

def get_vocab(question_series):
    """Count how often each whitespace-separated token occurs in a Series of strings.

    Args:
        question_series: pandas Series of question strings.

    Returns:
        Plain dict mapping token -> number of occurrences across all questions.
    """
    counts = Counter()
    for tokens in question_series.str.split().values:  # one token list per question
        for token in tokens:
            counts[token] += 1
    return dict(counts)

In [None]:
# Count every whitespace-separated token in the training questions, then measure
# how much of that vocabulary has a pretrained embedding.
vocab = get_vocab(train["question_text"])
out_of_vocab = check_coverage(vocab, embeddings_index)
# The ten most frequent words that have no embedding.
out_of_vocab[:10]

GloVe has embeddings for certain types of punctuation, so let's keep those in (space separated) and replace every other punctuation character with a single unknown-punctuation token.

In [None]:
punct = set('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~°√' + '“”’')
# Test each character against the index directly instead of materialising a set of
# every embedding key just to intersect it with a few dozen characters.
embed_punct = {p for p in punct if p in embeddings_index}

# Single-pass translation table. The previous implementation applied one str.replace
# per character while iterating a set: the inserted ' _punct_ ' token contains '_',
# which is itself in `punct`, so the token could be split depending on the (hash
# dependent) iteration order. With str.translate every input character is mapped
# exactly once and inserted text is never re-processed.
_PUNCT_TABLE = {ord(p): (f' {p} ' if p in embed_punct else ' _punct_ ') for p in punct}
_PUNCT_TABLE.update({ord(p): '' for p in "'`‘"})   # apostrophes / backticks are dropped
_PUNCT_TABLE.update({ord(p): ' ' for p in "/-"})   # slashes and hyphens separate words


def clean_punctuation(txt):
    """Normalise punctuation in one question string.

    '/' and '-' become a space, apostrophe-like characters (' ` ‘) are removed,
    punctuation that has an embedding is padded with spaces, and any other
    character in `punct` is replaced by the ' _punct_ ' token.

    Args:
        txt: the question text.

    Returns:
        The cleaned string.
    """
    return txt.translate(_PUNCT_TABLE)

In [None]:
def _clean_questions(questions):
    """Clean punctuation in a Series of questions and replace each run of digits with ' # '."""
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently leave every digit untouched. The raw
    # string avoids the invalid '\d' escape sequence.
    return questions.map(clean_punctuation).str.replace(r'\d+', ' # ', regex=True)


train["question_text"] = _clean_questions(train["question_text"])
test["question_text"] = _clean_questions(test["question_text"])

# Re-measure embedding coverage on the cleaned training text.
vocab = get_vocab(train["question_text"])
out_of_vocab = check_coverage(vocab, embeddings_index)

In [None]:
# The 100 most frequent words that still have no embedding.
out_of_vocab[:100]

In [None]:
# Number of whitespace-separated tokens per training question, and its summary statistics.
x = train["question_text"].str.split().map(len)
x.describe()

In [None]:
# Hold out 8% of the labelled rows for validation; the fixed random_state keeps the split repeatable.
train, validation = train_test_split(train, test_size=0.08, random_state=20181224)

embed_size = 300    # word vector size
vocab_size = 95000  # words in vocabulary
maxlen = 100        # max words to use per question

# Targets
train_y = train['target'].values
val_y = validation['target'].values

# Raw question text, with missing values filled by a placeholder token
train_X, val_X, test_X = (
    frame["question_text"].fillna("_##_").values
    for frame in (train, validation, test)
)

# The tokenizer is fitted on the training text only; it neither filters characters
# nor lower-cases (filters='', lower=False).
tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))

# Convert every question to a sequence of word indices, padded to `maxlen`
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

We will compute the mean and standard deviation of the existing embeddings and use them to draw random vectors for words that do not have an embedding.

In [None]:
# np.stack expects a real sequence (list/tuple); passing the dict view directly is
# deprecated and rejected by newer NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Scalar mean / std over every component of every vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

In [None]:
word_index = tokenizer.word_index
# Keras word indices start at 1 (index 0 is used for padding), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the +1 the largest
# index would fall outside the matrix whenever the vocabulary is smaller than vocab_size.
nb_words = min(vocab_size, len(word_index) + 1)

# Start from random vectors drawn from the embedding distribution...
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# ...then overwrite the rows of words that have a pretrained embedding.
num_missed = 0
for word, i in word_index.items():
    if i >= nb_words:
        continue  # outside the retained vocabulary
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        num_missed += 1  # in-vocabulary word that keeps its random vector
print(num_missed)

## Model

50 and 150 were also tried for the hidden layer size, and both performed worse than 100, so 100 seems to be the sweet spot.

In [None]:
hidden_layer_size = 100
BATCH_SIZE = 64

# Start from an empty default graph so all following operations are placed in it.
tf.reset_default_graph()

# Matrix of word indices into the embedding matrix: one row of `maxlen` indices per question.
# The number of rows is left open (None).
X = tf.placeholder(tf.int32, [None, maxlen], name='X')

# 1-D vector of targets, one value per question; length is left open (None).
Y = tf.placeholder(tf.float32, [None], name='Y')
# Batch size is a placeholder too, so each dataset initialisation can pick its own.
batch_size = tf.placeholder(tf.int64)

We will use tf.Dataset to load the data for speed and efficiency. We will feed the data using a Reinitializable Iterator so we can shuffle our training data but evaluate our test data without shuffling (more info here: https://towardsdatascience.com/how-to-use-dataset-in-tensorflow-c758ef9e4428). 

I do not know an easy way to use tf.Dataset in "inference mode", so we will feed it dummy y values for now at inference time.

In [None]:
# Training pipeline: slice (X, Y) into examples, shuffle with a 1000-element buffer,
# batch, and repeat indefinitely.
dataset = tf.data.Dataset.from_tensor_slices((X, Y)).shuffle(buffer_size=1000).batch(batch_size).repeat()
test_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).batch(batch_size) #this one does not shuffle

# One iterator defined only by element types/shapes, so it can be bound to either dataset.
iterator = tf.data.Iterator.from_structure(dataset.output_types,
                                           dataset.output_shapes) 

# To choose which dataset we use, we simply initialize the appropriate one using the init_ops below
train_init_op = iterator.make_initializer(dataset)
test_init_op = iterator.make_initializer(test_dataset)

# Tensors holding the next batch of word-index rows and the matching targets.
questions, labels = iterator.get_next()

Our embedding layer will not be trainable for now; perhaps in a future version we can make it trainable partway through training. If we make the embeddings trainable too early, they will be destroyed by the gradients coming from the still-random weights of the rest of the model.

In [None]:
# Frozen (trainable=False) embedding table initialised from the matrix built above.
embeddings = tf.get_variable(name="embeddings", shape=embedding_matrix.shape,
                             initializer=tf.constant_initializer(np.array(embedding_matrix)), 
                             trainable=False)
# Replace each word index in the batch with its embedding vector.
embed = tf.nn.embedding_lookup(embeddings, questions)

A single-layer LSTM with a single fully connected layer on top seems to work best. GRUs did not perform as well, possibly because the LSTM keeps a separate cell state in addition to the hidden state that we use for classification, whereas the GRU has only a single hidden state. Deeper GRUs and LSTMs also did not perform as well, likely due to the limit on training time. A deeper network would probably perform better with more training time.

In [None]:
lstm_cell= tf.nn.rnn_cell.LSTMCell(hidden_layer_size)

# define the operation that runs the LSTM, across time, on the data
# (the per-step outputs are discarded; only the final state is kept)
_, final_state = tf.nn.dynamic_rnn(lstm_cell, embed, dtype=tf.float32)

# One raw score (logit) per question, computed from the final hidden state `h`.
last_layer = tf.layers.dense(final_state.h, 1) #fully connected layer
prediction = tf.nn.sigmoid(last_layer) #activation function
prediction = tf.squeeze(prediction, [1]) # drop the size-1 axis so there is one value per question

A learning rate of 0.001 seemed to work best. Unexpectedly, the performance seemed to drop off faster when increasing the learning rate than when I decreased the learning rate. I would have expected a higher learning rate to work better since the training time is limited.

RMSPropOptimizer, SGD, and Adam were tested for optimizers, but Adam worked the best.

In [None]:
learning_rate = 0.001

# Sigmoid cross-entropy on the raw logits, averaged over the batch.
# Only the trailing size-1 axis is squeezed (the same way `prediction` is built), so a
# batch containing a single example keeps its batch dimension and stays aligned with `labels`.
logits = tf.squeeze(last_layer, axis=[1])
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))

# Training op: one Adam step that minimises the loss each time it is run.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

F1 is our target metric, so we will track it. On every minibatch you run f1_update, which accumulates the true positive/false positive/false negative counts. When you run F1, it computes the best F1 score over 200 thresholds from those counts. You run reset_op to reset the counters at the start of each reporting interval.

In [None]:
with tf.name_scope('metrics'):
    F1, f1_update = tf.contrib.metrics.f1_score(labels=labels, predictions=prediction, name='my_metric')

# Collect the metric's counter variables so they can be reset.
# tf.get_collection(..., scope=...) filters by name *prefix*; since the metric is created
# inside name_scope('metrics') its variables can be named 'metrics/my_metric/...', which
# scope="my_metric" does not match - leaving reset_op a silent no-op. Matching the metric
# name anywhere in the variable name works with or without the outer prefix.
running_vars = [v for v in tf.local_variables() if "my_metric" in v.name]
assert running_vars, "no F1 counter variables found - reset_op would not reset anything"
reset_op = tf.variables_initializer(var_list=running_vars)

## Training

In [None]:
num_epochs = 10
seed = 3 # we use a seed to have deterministic results

sess = tf.Session()

# Run the initialization
sess.run(tf.global_variables_initializer()) # initializes all of our variables
sess.run(tf.local_variables_initializer()) # need this for the f1 metric to work

# Histories of the reported cost and F1 values; the training loop appends to them.
costs, f1 = [], []

In [None]:
## Want to train for 6600 seconds to stay under the time limit of 7200 seconds (2 hours)
start = time.time()
end = 0
max_time = 6600

# initialize iterator with train data
sess.run(train_init_op, feed_dict={X: train_X, Y: train_y, batch_size: BATCH_SIZE})
num_iter = 1000  # report after every num_iter batches
num_batches = int(train_X.shape[0] / BATCH_SIZE)  # number of full batches per epoch

out_of_time = False

# Training Loop
for epoch in range(1, num_epochs + 1):
    # NOTE(review): kept from the original. The graph-level seed is normally read when an
    # op is created, so re-seeding here may not influence the already-built shuffle op - confirm.
    seed += seed
    tf.set_random_seed(seed)

    iter_cost = 0.     # loss summed since the last report
    batches_seen = 0   # number of batches contributing to iter_cost

    for i in range(num_batches):
        _, batch_loss, _ = sess.run([optimizer, loss, f1_update])
        iter_cost += batch_loss
        batches_seen += 1

        # Stop training once the time budget is used up.
        end = time.time()
        if end - start > max_time:
            out_of_time = True
            break

        if i % num_iter == 0 and i > 0:
            # Average over the batches actually accumulated (the first interval of an
            # epoch holds num_iter + 1 batches because i starts at 0).
            iter_cost /= batches_seen
            cur_f1 = sess.run(F1)
            sess.run(reset_op)  # reset counters for F1

            f1.append(cur_f1)
            costs.append(iter_cost)
            print(f"Epoch {epoch} Iteration {i:5} cost: {iter_cost:.10f}  f1: {cur_f1:.10f}  time: {end-start:4.4f}")

            # Start a fresh interval. The original reset an unused `batch_cost` here, so
            # every later report was inflated by the previous interval's average.
            iter_cost = 0.
            batches_seen = 0

    # `break` above only leaves the inner loop; leave the epoch loop as well.
    if out_of_time:
        break

## Evaluate Model with Validation Set

In [None]:
def easy_plot(yvals, ylabel=''):
    """Plot a series of values recorded during training and show the figure.

    Args:
        yvals: sequence of values to plot, in recording order.
        ylabel: y-axis label, also used as the first part of the title.
    """
    heading = f"{ylabel} by Iterations for Learning Rate = {learning_rate}"
    plt.plot(yvals)
    plt.title(heading)
    plt.xlabel('Iterations (per thousand)')
    plt.ylabel(ylabel)
    plt.show()
    
# Plot the cost and F1 histories recorded during training.
easy_plot(np.squeeze(costs), 'Cost')
easy_plot(np.squeeze(f1), 'F1 Score')

sz = 90
sess.run(test_init_op, feed_dict={X: val_X, Y: val_y, batch_size: sz})

# Clear the streaming F1 counters first; otherwise the training batches accumulated
# since the last report would be mixed into the validation score.
sess.run(tf.local_variables_initializer())

# Ceiling division: the unshuffled dataset also yields a final partial batch, which a
# floor count would skip.
num_batches = (val_X.shape[0] + sz - 1) // sz
tf.set_random_seed(2018)

for _ in range(num_batches):
    sess.run(f1_update)

print(f"Validation f1: {sess.run(F1)}")

In [None]:
sz = 90
tf.set_random_seed(2018)

# Re-initialise the iterator on the (unshuffled) validation data and collect one
# predicted probability per validation question.
sess.run(test_init_op, feed_dict={X: val_X, Y: val_y, batch_size: sz})
# Ceiling division so a final partial batch is not dropped.
num_val_batches = (val_X.shape[0] + sz - 1) // sz
val_pred = np.concatenate([sess.run(prediction) for _ in range(num_val_batches)])
assert val_pred.shape[0] == val_X.shape[0], "expected one prediction per validation row"

In [None]:
# Candidate decision thresholds: k/200 for k = 10 .. 119.
thresholds = [step / 200 for step in range(10, 120)]
# Validation F1 obtained when predictions above each threshold are labelled 1.
scores = [
    metrics.f1_score(val_y, (val_pred > cutoff).astype(np.int16))
    for cutoff in thresholds
]

plt.plot(thresholds, scores)
plt.title("F1 Score by thresholds for Validation Set")
plt.xlabel('Threshold')
plt.ylabel("F1 Score")
plt.show()

In [None]:
# Keep the threshold with the highest validation F1; it is used to binarise the test predictions.
thresh = thresholds[np.argmax(scores)]
print(f"Best Validation F1 Score is {max(scores):.4f} at threshold {thresh}")

## Predict on Test Set

In [None]:
sz = 30
n_test = test_X.shape[0]

# The iterator always yields (questions, labels), so dummy labels are fed at inference
# time. They are sized from the test set itself: slicing val_y comes up short whenever
# the test set is larger than the validation set.
temp_y = np.zeros(n_test, dtype=np.float32)

# Copy so that adding a column does not write into a slice of `test`.
sub = test[['qid']].copy()

sess.run(test_init_op, feed_dict={X: test_X, Y: temp_y, batch_size: sz})
# Ceiling division so a final partial batch is included and every row gets a prediction.
num_test_batches = (n_test + sz - 1) // sz
sub['prediction'] = np.concatenate([sess.run(prediction) for _ in range(num_test_batches)])

In [None]:
# Turn probabilities into 0/1 labels with the chosen threshold and write the submission file.
sub['prediction'] = (sub['prediction'] > thresh).astype(np.int16)
sub.to_csv("submission.csv", index=False)
# Show one random row of the submission as a sanity check.
sub.sample()