In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from datetime import datetime as dt
import time
import re
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

DRIVE_PATH = "/content/gdrive/My Drive/Colab Notebooks/"
TRAIN_FILE = "datasets/train.csv"
CORRECTED_TRAIN_FILE = "datasets/corrected_train.csv"
TEST_FILE = "datasets/test.csv"
GLOVE_FILE = "embeddings/glove.840B.300d/glove.840B.300d.txt"

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Toolbox

In [0]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

def load_data(path = DRIVE_PATH, file = CORRECTED_TRAIN_FILE):
    csv_path = os.path.join(path,file)
    return pd.read_csv(csv_path)

def tokenize(sentences_list):
    return [re.findall(r"[\w]+|[']|[.,!?;]", str(x)) for x in sentences_list] 
    
def get_vocab(sentences):
    vocab={}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] +=1
            except KeyError:
                vocab[word] = 1
    return vocab

def glove_embeddings(vocabulary_in_set, drive = DRIVE_PATH, gloveFile = GLOVE_FILE ,extract = -1):

    glove_file = os.path.join(drive,gloveFile)
  
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

    embeddings = []
    words_id = {}
    f = open(glove_file,'r', encoding="utf8")
    increment = 0
    words_id[""]=0
    first = True
    i = 1
    
    for line in f:
        word, vect = get_coefs(*line.split(" "))
        if first:
            embeddings.append(np.zeros_like(vect))
            first = False
        if word in vocabulary_in_set:
            embeddings.append(vect)
            words_id[word] = i
            i += 1
            if increment == extract - 1:
                break
            elif extract != -1:
                increment += 1
    f.close()   
    return np.array(embeddings), words_id

def df_to_data_target(df, data_col = "corrected_question_text", target_col = "target"):
    data = df[data_col]
    target = df[target_col]
    return np.c_[data, target]

def embed_and_pad(X, embeddings, n_dims, width):
    padded_X = np.zeros((len(X),width,n_dims))
    i = 0
    for sentence in X:
        j = 0
        for word in sentence:
            padded_X[i,j,:] = embeddings[word]
            j += 1
        i +=1
    return padded_X

In [0]:
class data_split_and_batch:
    def __init__(self, data_targets, dv=0, cv=0, batch_size=500, words_ids = None, seed=42):
        self.seed = seed
        np.random.seed(seed)
        self.batch_length = batch_size
        self.pointer = 0
        self.reshuffle_seed = 0
        self.__split__(data_targets=data_targets, dv=dv, cv=cv, words_ids=words_ids)
    
    def __split__(self, data_targets, dv, cv, words_ids):
        positive_data_targets = data_targets[data_targets[:,1]==1]
        negative_data_targets = data_targets[data_targets[:,1]==0]
        
        positive_length = len(positive_data_targets)
        negative_length = len(negative_data_targets)
  
        pos_data_targets = positive_data_targets[np.random.permutation(positive_length), :]
        neg_data_targets = negative_data_targets[np.random.permutation(negative_length), :]
        
        if dv != 0:
            dv_positive = int(dv * positive_length/(positive_length + negative_length))
            dv_negative = dv - dv_positive
            dv_data_targets = np.concatenate((positive_data_targets[0:dv_positive,:], negative_data_targets[0:dv_negative,:]))
            dv_data_targets = dv_data_targets[np.random.permutation(dv), :]
            self.dv_targets = dv_data_targets[:,1].astype(float)
            X_dev = self.__tokenize__(dv_data_targets[:,0])
            self.X_dv_lengths = np.array(list(map(len, X_dev)))
            if words_ids == None:
                self.dv_data = X_dev
            else:
                self.dv_data = self.__ids_and_pad__(X_dev,words_ids[0], words_ids[1])
        else:
            dv_positive = 0
            dv_negative = 0
            
        if cv != 0:
            cv_positive = int(cv * positive_length/(positive_length + negative_length))
            cv_negative = cv - cv_positive
            cv_data_targets = np.concatenate((positive_data_targets[dv_positive:dv_positive + cv_positive,:], negative_data_targets[dv_negative:dv_negative + cv_negative,:]))
            cv_data_targets = cv_data_targets[np.random.permutation(cv), :]
            self.cv_targets = cv_data_targets[:,1].astype(float)
            X_cross = self.__tokenize__(cv_data_targets[:,0])
            self.X_cv_lengths = np.array(list(map(len, X_cross)))
            if words_ids == None:
                self.cv_data = X_cross
            else:
                self.cv_data = self.__ids_and_pad__(X_cross,words_ids[0], words_ids[1])   
        else:
            cv_positive = 0
            cv_negative = 0
            
        train_data_targets = np.concatenate((positive_data_targets[dv_positive + cv_positive:,:], negative_data_targets[dv_negative + cv_negative:,:]))
        train_data_targets = train_data_targets[np.random.permutation(len(data_targets)-dv-cv), :]
        self.train_targets = train_data_targets[:,1].astype(float)
        X_train = self.__tokenize__(train_data_targets[:,0])
        self.X_train_lengths = np.array(list(map(len, X_train)))
        if words_ids == None:
            self.train_data = X_train
        else:
            self.train_data = self.__ids_and_pad__(X_train ,words_ids[0], words_ids[1])
        self.num_train_examples = len(self.train_targets)
        self.nr_batches = self.num_train_examples//self.batch_length
    
    def __tokenize__(self, X):
        return [re.findall(r"[\w]+|[']|[.,!?;]", str(x)) for x in X]
    
    def __ids_and_pad__(self, X, word_ids, width):
        padded_X = np.zeros((len(X),width))
        i = 0
        for sentence in X:
            j = 0
            for word in sentence:
                padded_X[i,j] = word_ids[word]
                j += 1
            i +=1
        return padded_X
    
    def cross_val_set(self):
        return self.X_cv_lengths, self.cv_data, self.cv_targets 
    
    def dev_set(self):
        return self.X_dv_lengths, self.dv_data, self.dv_targets
    
    def next_batch(self):
        self.pointer += 1
        X_batch = self.train_data[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        y_batch = self.train_targets[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        X_batch_lengths = self.X_train_lengths[(self.pointer - 1)*self.batch_length:self.pointer*self.batch_length]
        
        if self.pointer == self.nr_batches:
            self.pointer = 0
            self.reshuffle_seed += 1
            np.random.seed(self.seed + self.reshuffle_seed)
            permutations = np.random.permutation(self.num_train_examples)
            self.train_data = self.train_data[permutations]
            self.train_targets = self.train_targets[permutations]
            self.X_train_lengths = self.X_train_lengths[permutations]
            
        return X_batch_lengths, X_batch, y_batch
    
    def reset(self):
        self.pointer = 0        

# Load and prepare data

In [6]:
train_data = load_data()
train_data.head()

Unnamed: 0,qid,target,corrected_question_text
0,00002165364db923c7e6,0,How did Quebec nationalists see their province...
1,000032939017120e6e44,0,"Do you have an adopted dog , how would you enc..."
2,0000412ca6e4628ce2cf,0,Why does velocity affect time ? Does velocity ...
3,000042bf85aa498cd78e,0,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,0,Can I convert montra helicon D to a mountain b...


In [0]:
data_targets = df_to_data_target(train_data)

In [0]:
tokenized_questions = tokenize(train_data["corrected_question_text"].values)
vocabulary_in_set = get_vocab(tokenized_questions)

In [0]:
embeddings, words_ids = glove_embeddings(vocabulary_in_set=vocabulary_in_set)

# Construction phase

In [0]:
reset_graph()

n_neurons = [900, 600, 300, 200]
n_fc1 = 1500
n_fc2 = 600
n_fc3 = 200
n_output = 1
learning_rate = tf.placeholder(tf.float32)
activation_fn = tf.nn.elu

n_steps = 250
seq_len = tf.placeholder(tf.int32, [None], name="seq_len")

X = tf.placeholder(tf.int32, [None, n_steps], name="X")
y = tf.placeholder(tf.float32, [None], name="y")

with tf.name_scope("embedding_layer"):
    word_embeddings = tf.Variable(initial_value=embeddings, trainable=False)
    embedded_sentences = tf.nn.embedding_lookup(word_embeddings, X)

with tf.name_scope("shallow_LSTM_layer"):
    lstm_cell = [tf.contrib.rnn.LSTMCell(num_units=n, name="LSTM") for n in n_neurons]
    multi_lstm_cell = tf.contrib.rnn.MultiRNNCell(lstm_cell)
    h_states, out_states = tf.nn.dynamic_rnn(multi_lstm_cell, embedded_sentences, dtype = tf.float32, sequence_length=seq_len) #state = 3 * 2 * m * 500
    states = tf.concat([out_states[0][0],out_states[1][0],out_states[2][0], out_states[3][0]], axis = 1)
    
with tf.name_scope("fully_connected_layers"):
    fc1 = tf.contrib.layers.fully_connected(states, n_fc1, activation_fn=activation_fn)
    fc2 = tf.contrib.layers.fully_connected(fc1, n_fc2, activation_fn=activation_fn)
    fc3 = tf.contrib.layers.fully_connected(fc2, n_fc3, activation_fn=activation_fn)

with tf.name_scope("logits_and_outputs"):
    logits = tf.reshape(tf.contrib.layers.fully_connected(fc3, n_output, activation_fn=None),shape=[-1])
    outputs = tf.sigmoid(logits)
    predictions = tf.round(outputs)

with tf.name_scope("loss"):
    xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy)

with tf.name_scope("evaluation"):
    TP = tf.count_nonzero(predictions * y)
    TN = tf.count_nonzero((predictions - 1) * (y - 1))
    FP = tf.count_nonzero(predictions * (y - 1))
    FN = tf.count_nonzero((predictions - 1) * y)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN)/(TP + FP + TN + FN)
    f1 = 2 * precision * recall / (precision + recall)

with tf.name_scope("training_op"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

f1_train_summary = tf.summary.scalar('Train_F1_score', f1)
f1_dev_summary = tf.summary.scalar("Dev_F1_score", f1)
loss_train_summary = tf.summary.scalar("Train_loss", loss)
loss_dev_summary = tf.summary.scalar("Dev_loss", loss)

# Training phase

In [0]:
batch_size = 500

QIQC = data_split_and_batch(data_targets, cv=1000, dv=1000, batch_size=batch_size, words_ids=(words_ids,n_steps))
X_dev_lengths, X_dev_set, y_dev_set = QIQC.dev_set()

In [12]:
"""
QIQC.reset()
with tf.Session() as sess:
    init.run()
    loss_hist = []
    lr_hist = []
    iter_hist = []
    lr = 0
    t_0 = time.time()
    for iteration in range(QIQC.nr_batches):  
        lr = (iteration + 1)/QIQC.nr_batches * 10 # 1.008188^85 = 2
        X_lengths, X_batch, y_batch = QIQC.next_batch()
        _, run_loss = sess.run([training_op, loss], feed_dict={X: X_batch, y: y_batch, seq_len: X_lengths, learning_rate:lr})

        loss_hist.append(run_loss)
        iter_hist.append(iteration)
        lr_hist.append(lr)
        if iteration % 100 == 0:
            t_1 = time.time()
            t_delta = t_1 - t_0
            expected_runtime = t_delta * (QIQC.nr_batches - iteration)/100
            expected_runtime_h = int(expected_runtime//3600)
            expected_runtime_min = int((expected_runtime - 3600 * expected_runtime_h)//60)
            expected_runtime_sec = int((expected_runtime - 3600 * expected_runtime_h - 60 * expected_runtime_min))
            t_0 = t_1
            print("It. {}/{}; Runtime {:.2f} s; Remaining runtime {}h{}min{}s".format(iteration, QIQC.nr_batches, t_delta, expected_runtime_h,expected_runtime_min,expected_runtime_sec))
    
"""     

'\nQIQC.reset()\nwith tf.Session() as sess:\n    init.run()\n    loss_hist = []\n    lr_hist = []\n    iter_hist = []\n    lr = 0\n    t_0 = time.time()\n    for iteration in range(QIQC.nr_batches):  \n        lr = (iteration + 1)/QIQC.nr_batches * 10 # 1.008188^85 = 2\n        X_lengths, X_batch, y_batch = QIQC.next_batch()\n        _, run_loss = sess.run([training_op, loss], feed_dict={X: X_batch, y: y_batch, seq_len: X_lengths, learning_rate:lr})\n\n        loss_hist.append(run_loss)\n        iter_hist.append(iteration)\n        lr_hist.append(lr)\n        if iteration % 100 == 0:\n            t_1 = time.time()\n            t_delta = t_1 - t_0\n            expected_runtime = t_delta * (QIQC.nr_batches - iteration)/100\n            expected_runtime_h = int(expected_runtime//3600)\n            expected_runtime_min = int((expected_runtime - 3600 * expected_runtime_h)//60)\n            expected_runtime_sec = int((expected_runtime - 3600 * expected_runtime_h - 60 * expected_runtime_min))

In [0]:
#plt.plot(np.log(lr_hist), loss_hist)
#plt.ylim((0,30))

In [14]:
QIQC.reset()
n_epoch = 5

now = dt.utcnow().strftime("%Y%m%d%H%M%S")
LOGS_FOLDER = "logs/run-{}/".format(now)
MODELS_FILE = "models/model3.3/QIQC_model3.3.ckpt"
logdir = os.path.join(DRIVE_PATH,LOGS_FOLDER)
modfil = os.path.join(DRIVE_PATH,MODELS_FILE)

file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
total_deltas = 0
t_init = time.time()
min_lr = 0.0005
max_lr = 0.0015
with tf.Session() as sess:
    init.run()
    t_0 = time.time()
    stepsize = QIQC.nr_batches
    for epoch in range(n_epoch):
        for iteration in range(QIQC.nr_batches):
            step = epoch * QIQC.nr_batches + iteration
            cycle = np.floor(1 + step/(2 * stepsize))
            pattern = np.absolute(step/stepsize - 2*cycle + 1)
            cycle_lr = min_lr + (max_lr - min_lr) * pattern
            X_lengths, X_batch, y_batch = QIQC.next_batch()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, seq_len: X_lengths, learning_rate: cycle_lr})

            if iteration % 30 == 0 :
                f1_train_str, loss_train_str = sess.run([f1_train_summary, loss_train_summary],feed_dict = {X : X_batch, y : y_batch, seq_len: X_lengths})
                f1_dev_str, loss_dev_str = sess.run([f1_dev_summary, loss_dev_summary], feed_dict = {X : X_dev_set, y : y_dev_set, seq_len: X_dev_lengths})
                file_writer.add_summary(f1_train_str, step)
                file_writer.add_summary(f1_dev_str, step)
                file_writer.add_summary(loss_train_str, step)
                file_writer.add_summary(loss_dev_str, step)

            if iteration % 1000 == 0:
                save_path = saver.save(sess,modfil)
            if iteration % 100 == 0:
                f1_train, loss_train = sess.run([f1, loss],feed_dict = {X : X_batch, y : y_batch, seq_len: X_lengths})
                f1_dev, loss_dev = sess.run([f1, loss], feed_dict = {X : X_dev_set, y : y_dev_set, seq_len: X_dev_lengths})
                t_1 = time.time()
                t_delta = t_1 - t_0
                total_deltas += t_delta
                expected_runtime = t_delta * ((n_epoch - epoch - 1) * QIQC.nr_batches + (QIQC.nr_batches - iteration))/100
                expected_runtime_h = int(expected_runtime//3600)
                expected_runtime_min = int((expected_runtime - 3600 * expected_runtime_h)//60)
                expected_runtime_sec = int((expected_runtime - 3600 * expected_runtime_h - 60 * expected_runtime_min))
                t_0 = t_1
                print("Ep. {}/{}; It. {}/{}; Runtime {:.2f} s; Remaining runtime {}h{}min{}s; f1_train {:.4f}; f1_dev {:.4f}; loss_train {:.4f}; loss_dev {:.4f}".format(epoch + 1, n_epoch, iteration, QIQC.nr_batches, t_delta, expected_runtime_h,expected_runtime_min,expected_runtime_sec,f1_train,f1_dev,loss_train,loss_dev))
    save_path = saver.save(sess,modfil)
t_final = time.time()
actual_runtime_h = int((t_final - t_init)//3600)
actual_runtime_min = int((t_final - t_init - 3600 * actual_runtime_h)//60)
actual_runtime_sec = int((t_final - t_init - 3600 * actual_runtime_h - 60 * actual_runtime_min))

print("Total actual runtime: {} hours {} min {} sec".format(actual_runtime_h, actual_runtime_min, actual_runtime_sec))

Ep. 1/5; It. 0/2608; Runtime 9.53 s; Remaining runtime 0h20min42s; f1_train nan; f1_dev nan; loss_train 1.6409; loss_dev 1.4110
Ep. 1/5; It. 100/2608; Runtime 112.30 s; Remaining runtime 4h2min11s; f1_train 0.4314; f1_dev 0.5203; loss_train 0.1163; loss_dev 0.1386
Ep. 1/5; It. 200/2608; Runtime 109.74 s; Remaining runtime 3h54min51s; f1_train 0.4878; f1_dev 0.4222; loss_train 0.1297; loss_dev 0.1227
Ep. 1/5; It. 300/2608; Runtime 110.39 s; Remaining runtime 3h54min23s; f1_train 0.1379; f1_dev 0.1493; loss_train 0.1115; loss_dev 0.1426
Ep. 1/5; It. 400/2608; Runtime 110.53 s; Remaining runtime 3h52min50s; f1_train 0.6087; f1_dev 0.5946; loss_train 0.1548; loss_dev 0.1190
Ep. 1/5; It. 500/2608; Runtime 109.64 s; Remaining runtime 3h49min9s; f1_train 0.3871; f1_dev 0.3590; loss_train 0.1093; loss_dev 0.1219
Ep. 1/5; It. 600/2608; Runtime 109.87 s; Remaining runtime 3h47min47s; f1_train 0.5823; f1_dev 0.5962; loss_train 0.1653; loss_dev 0.1124
Ep. 1/5; It. 700/2608; Runtime 114.69 s; Remai

KeyboardInterrupt: ignored

In [0]:
MODELS_FILE = "models/model3.3/QIQC_model3.3.ckpt"
modfil = os.path.join(DRIVE_PATH,MODELS_FILE)
X_cv_lengths, X_cv_set, y_cv_set = QIQC.cross_val_set()

with tf.Session() as sess: 
    saver.restore(sess, modfil)
    f1_cv = f1.eval(feed_dict = {X : X_cv_set, y : y_cv_set, seq_len: X_cv_lengths})
    f1_dev = f1.eval(feed_dict = {X : X_dev_set, y : y_dev_set, seq_len: X_dev_lengths})

f1_cv_pc = f1_cv * 100
f1_dev_pc = f1_dev * 100
print("cross val f1 {:.2f}%, dev f1 {:.2f}%".format(f1_cv_pc, f1_dev_pc))