In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from datetime import datetime as dt
import time
import re

DRIVE_PATH = "/content/gdrive/My Drive/Colab Notebooks/"
TRAIN_FILE = "datasets/train.csv"
CORRECTED_TRAIN_FILE = "datasets/corrected_train.csv"
TEST_FILE = "datasets/test.csv"
GLOVE_FILE = "embeddings/glove.840B.300d/glove.840B.300d.txt"

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Toolbox

In [0]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

def load_data(path = DRIVE_PATH, file = CORRECTED_TRAIN_FILE):
    csv_path = os.path.join(path,file)
    return pd.read_csv(csv_path)

def tokenize(sentences_list):
    return [re.findall(r"[\w]+|[']|[.,!?;]", str(x)) for x in sentences_list] 
    
def get_vocab(sentences):
    vocab={}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] +=1
            except KeyError:
                vocab[word] = 1
    return vocab

def glove_embeddings(vocabulary_in_set, drive = DRIVE_PATH, gloveFile = GLOVE_FILE ,extract = -1):

    glove_file = os.path.join(drive,gloveFile)
  
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

    embeddings = []
    words_id = {}
    f = open(glove_file,'r', encoding="utf8")
    increment = 0
    words_id[""]=0
    first = True
    i = 1
    
    for line in f:
        word, vect = get_coefs(*line.split(" "))
        if first:
            embeddings.append(np.zeros_like(vect))
            first = False
        if word in vocabulary_in_set:
            embeddings.append(vect)
            words_id[word] = i
            i += 1
            if increment == extract - 1:
                break
            elif extract != -1:
                increment += 1
    f.close()   
    return np.array(embeddings), words_id

def df_to_data_target(df, data_col = "corrected_question_text", target_col = "target"):
    data = df[data_col]
    target = df[target_col]
    return np.c_[data, target]

def embed_and_pad(X, embeddings, n_dims, width):
    padded_X = np.zeros((len(X),width,n_dims))
    i = 0
    for sentence in X:
        j = 0
        for word in sentence:
            padded_X[i,j,:] = embeddings[word]
            j += 1
        i +=1
    return padded_X

In [0]:
class data_split_and_batch:
    def __init__(self, data_targets, dv=0, cv=0, batch_size=500, words_ids = None, seed=42):
        self.seed = seed
        np.random.seed(seed)
        self.batch_length = batch_size
        self.pointer = 0
        self.reshuffle_seed = 0
        self.__split__(data_targets=data_targets, dv=dv, cv=cv, words_ids=words_ids)
    
    def __split__(self, data_targets, dv, cv, words_ids):
        positive_data_targets = data_targets[data_targets[:,1]==1]
        negative_data_targets = data_targets[data_targets[:,1]==0]
        
        positive_length = len(positive_data_targets)
        negative_length = len(negative_data_targets)
  
        pos_data_targets = positive_data_targets[np.random.permutation(positive_length), :]
        neg_data_targets = negative_data_targets[np.random.permutation(negative_length), :]
        
        if dv != 0:
            dv_positive = int(dv * positive_length/(positive_length + negative_length))
            dv_negative = dv - dv_positive
            dv_data_targets = np.concatenate((positive_data_targets[0:dv_positive,:], negative_data_targets[0:dv_negative,:]))
            dv_data_targets = dv_data_targets[np.random.permutation(dv), :]
            self.dv_targets = dv_data_targets[:,1].astype(float)
            X_dev = self.__tokenize__(dv_data_targets[:,0])
            self.X_dv_lengths = np.array(list(map(len, X_dev)))
            if words_ids == None:
                self.dv_data = X_dev
            else:
                self.dv_data = self.__ids_and_pad__(X_dev,words_ids[0], words_ids[1])
        else:
            dv_positive = 0
            dv_negative = 0
            
        if cv != 0:
            cv_positive = int(cv * positive_length/(positive_length + negative_length))
            cv_negative = cv - cv_positive
            cv_data_targets = np.concatenate((positive_data_targets[dv_positive:dv_positive + cv_positive,:], negative_data_targets[dv_negative:dv_negative + cv_negative,:]))
            cv_data_targets = cv_data_targets[np.random.permutation(cv), :]
            self.cv_targets = cv_data_targets[:,1].astype(float)
            X_cross = self.__tokenize__(cv_data_targets[:,0])
            self.X_cv_lengths = np.array(list(map(len, X_cross)))
            if words_ids == None:
                self.cv_data = X_cross
            else:
                self.cv_data = self.__ids_and_pad__(X_cross,words_ids[0], words_ids[1])   
        else:
            cv_positive = 0
            cv_negative = 0
            
        train_data_targets = np.concatenate((positive_data_targets[dv_positive + cv_positive:,:], negative_data_targets[dv_negative + cv_negative:,:]))
        train_data_targets = train_data_targets[np.random.permutation(len(data_targets)-dv-cv), :]
        self.train_targets = train_data_targets[:,1].astype(float)
        X_train = self.__tokenize__(train_data_targets[:,0])
        self.X_train_lengths = np.array(list(map(len, X_train)))
        if words_ids == None:
            self.train_data = X_train
        else:
            self.train_data = self.__ids_and_pad__(X_train ,words_ids[0], words_ids[1])
        self.num_train_examples = len(self.train_targets)
        self.nr_batches = self.num_train_examples//self.batch_length
    
    def __tokenize__(self, X):
        return [re.findall(r"[\w]+|[']|[.,!?;]", str(x)) for x in X]
    
    def __ids_and_pad__(self, X, word_ids, width):
        padded_X = np.zeros((len(X),width))
        i = 0
        for sentence in X:
            j = 0
            for word in sentence:
                padded_X[i,j] = word_ids[word]
                j += 1
            i +=1
        return padded_X
    
    def cross_val_set(self):
        return self.X_cv_lengths, self.cv_data, self.cv_targets 
    
    def dev_set(self):
        return self.X_dv_lengths, self.dv_data, self.dv_targets
    
    def next_batch(self):
        self.pointer += 1
        X_batch = self.train_data[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        y_batch = self.train_targets[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        X_batch_lengths = self.X_train_lengths[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        
        if self.pointer == self.nr_batches:
            self.pointer = 0
            self.reshuffle_seed += 1
            np.random.seed(self.seed + self.reshuffle_seed)
            permutations = np.random.permutation(self.num_train_examples)
            self.train_data = self.train_data[permutations]
            self.train_targets = self.train_targets[permutations]
            self.X_train_lengths = self.X_train_lengths[permutations]
            
        return X_batch_lengths, X_batch, y_batch
    
    def reset(self):
        self.pointer = 0        

# Load and prepare data

In [6]:
train_data = load_data()
train_data.head()

Unnamed: 0,qid,target,corrected_question_text
0,00002165364db923c7e6,0,How did Quebec nationalists see their province...
1,000032939017120e6e44,0,"Do you have an adopted dog , how would you enc..."
2,0000412ca6e4628ce2cf,0,Why does velocity affect time ? Does velocity ...
3,000042bf85aa498cd78e,0,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,0,Can I convert montra helicon D to a mountain b...


In [0]:
data_targets = df_to_data_target(train_data)

In [0]:
tokenized_questions = tokenize(train_data["corrected_question_text"].values)
vocabulary_in_set = get_vocab(tokenized_questions)

In [0]:
embeddings, words_ids = glove_embeddings(vocabulary_in_set=vocabulary_in_set)

# Construction phase

In [0]:
reset_graph()

n_inputs = 300
n_neurons = 1000
n_fc1 = 1000
n_fc2 = 500
n_output = 1
learning_rate = 0.01
activation_fn = tf.nn.elu
k_initializer = tf.contrib.layers.variance_scaling_initializer

n_steps = 250
seq_len = tf.placeholder(tf.int32, [None], name="seq_len")

X = tf.placeholder(tf.int32, [None, n_steps], name="X")
y = tf.placeholder(tf.float32, [None], name="y")

with tf.name_scope("embedding_layer"):
    word_embeddings = tf.Variable(initial_value=embeddings, trainable=False)
    embedded_sentences = tf.nn.embedding_lookup(word_embeddings, X)

with tf.name_scope("shallow_LSTM_layer"):
    lstm_cell = tf.contrib.rnn.LSTMCell(num_units=n_neurons, name="LSTM")
    h_states, states = tf.nn.dynamic_rnn(lstm_cell, embedded_sentences, dtype = tf.float32, sequence_length=seq_len)

with tf.name_scope("fully_connected_layers"):
    fc1 = tf.contrib.layers.fully_connected(states[0], n_fc1, activation_fn=activation_fn)
    fc2 = tf.contrib.layers.fully_connected(fc1, n_fc2, activation_fn=activation_fn)

with tf.name_scope("logits_and_outputs"):
    logits = tf.reshape(tf.contrib.layers.fully_connected(fc2, n_output, activation_fn=None),shape=[-1])
    outputs = tf.sigmoid(logits)
    predictions = tf.round(outputs)

with tf.name_scope("loss"):
    xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy)

with tf.name_scope("evaluation"):
    TP = tf.count_nonzero(predictions * y)
    TN = tf.count_nonzero((predictions - 1) * (y - 1))
    FP = tf.count_nonzero(predictions * (y - 1))
    FN = tf.count_nonzero((predictions - 1) * y)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN)/(TP + FP + TN + FN)
    f1 = 2 * precision * recall / (precision + recall)

with tf.name_scope("training_op"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

f1_train_summary = tf.summary.scalar('Train_F1_score', f1)
f1_dev_summary = tf.summary.scalar("Dev_F1_score", f1)
loss_train_summary = tf.summary.scalar("Train_loss", loss)
loss_dev_summary = tf.summary.scalar("Dev_loss", loss)

# Training phase

In [0]:
batch_size = 500

QIQC = data_split_and_batch(data_targets, cv=1000, dv=1000, batch_size=batch_size, words_ids=(words_ids,n_steps))
X_dev_lengths, X_dev_set, y_dev_set = QIQC.dev_set()

In [12]:
QIQC.reset()
n_epoch = 5

now = dt.utcnow().strftime("%Y%m%d%H%M%S")
LOGS_FOLDER = "logs//run-{}//".format(now)
MODELS_FILE = "models/model1.2/QIQC_model1.2.ckpt"
MODELS_INDEX = "models/model1.2/QIQC_model1.2.ckpt.index"
logdir = os.path.join(DRIVE_PATH,LOGS_FOLDER)
modfil = os.path.join(DRIVE_PATH,MODELS_FILE)
modind = os.path.join(DRIVE_PATH,MODELS_INDEX)

file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

with tf.Session() as sess:
    #if os.path.isfile(modind):
    #    saver.restore(sess, modfil)
    #else:
    init.run()
    t_0 = time.time()
    for epoch in range(n_epoch):
        for iteration in range(QIQC.nr_batches):
            X_lengths, X_batch, y_batch = QIQC.next_batch()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, seq_len: X_lengths})

            if iteration % 30 == 0 :
                f1_train_str, loss_train_str = sess.run([f1_train_summary, loss_train_summary],feed_dict = {X : X_batch, y : y_batch, seq_len: X_lengths})
                f1_dev_str, loss_dev_str = sess.run([f1_dev_summary, loss_dev_summary], feed_dict = {X : X_dev_set, y : y_dev_set, seq_len: X_dev_lengths})
                step = epoch * QIQC.nr_batches + iteration
                file_writer.add_summary(f1_train_str, step)
                file_writer.add_summary(f1_dev_str, step)
                file_writer.add_summary(loss_train_str, step)
                file_writer.add_summary(loss_dev_str, step)

            if iteration % 1000 == 0:
                save_path = saver.save(sess,modfil)
            if iteration % 100 == 0:
                t_1 = time.time()
                t_delta = t_1 - t_0
                t_0 = t_1
                f1_train, loss_train = sess.run([f1, loss],feed_dict = {X : X_batch, y : y_batch, seq_len: X_lengths})
                f1_dev, loss_dev = sess.run([f1, loss], feed_dict = {X : X_dev_set, y : y_dev_set, seq_len: X_dev_lengths})
                print("Epoch {}/{}; iteration {}/{}; runtime {:2f} sec, f1_train {:4f}; f1_dev {:4f}; loss_train {:4f}; loss_dev {:4f}".format(epoch, n_epoch, iteration, QIQC.nr_batches, t_delta,f1_train,f1_dev,loss_train,loss_dev))
            

Epoch 0/5; iteration 0/2608; runtime 6.660501 sec, f1_train  nan; f1_dev  nan; loss_train 70.589867; loss_dev 71.454887
Epoch 0/5; iteration 100/2608; runtime 61.819350 sec, f1_train 0.117647; f1_dev 0.129870; loss_train 0.210219; loss_dev 0.228703
Epoch 0/5; iteration 200/2608; runtime 59.328079 sec, f1_train  nan; f1_dev  nan; loss_train 0.209844; loss_dev 0.212270
Epoch 0/5; iteration 300/2608; runtime 59.542238 sec, f1_train  nan; f1_dev  nan; loss_train 0.191791; loss_dev 0.206446
Epoch 0/5; iteration 400/2608; runtime 59.574251 sec, f1_train 0.181818; f1_dev 0.084507; loss_train 0.238146; loss_dev 0.204085
Epoch 0/5; iteration 500/2608; runtime 60.047210 sec, f1_train 0.148148; f1_dev 0.090909; loss_train 0.182543; loss_dev 0.185381
Epoch 0/5; iteration 600/2608; runtime 59.410520 sec, f1_train  nan; f1_dev  nan; loss_train 0.300713; loss_dev 0.197094
Epoch 0/5; iteration 700/2608; runtime 61.686570 sec, f1_train 0.153846; f1_dev 0.092308; loss_train 0.189124; loss_dev 0.172590
E