In [1]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
# The call is interactive: it asks for an authorization code (see the cell output).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
import pandas as pd
import gc
import numpy as np

import tensorflow as tf
import keras
from keras import layers, activations, models, optimizers, utils, regularizers
import keras.backend as backend
from keras.engine.input_layer import Input
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score


import os
from datetime import datetime as dt
import time
import re
from functools import partial

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Experiment tag; it is interpolated twice into MODEL_SAVE_PATH below.
version = "4.3"

# Root folder on the mounted Drive. The relative paths below are meant to be
# combined with it through os.path.join (see load_data / glove_embeddings).
DRIVE_PATH = "/content/gdrive/My Drive/Colab Notebooks/"
TRAIN_FILE = "datasets/train.csv"
# Default file read by load_data().
CORRECTED_TRAIN_FILE = "datasets/corrected_train.csv"
TEST_FILE = "datasets/test.csv"
# Default file read by glove_embeddings().
GLOVE_FILE = "embeddings/glove.840B.300d/glove.840B.300d.txt"
# Checkpoint location (relative to DRIVE_PATH); one folder per `version`.
MODEL_SAVE_PATH = "models/model{}/QIQC_model{}.h5".format(version,version)

Using TensorFlow backend.


In [3]:
# Fail fast when TensorFlow does not report a GPU as '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Toolbox

In [0]:
def reset_graph(seed=42):
    """Reset the default TensorFlow graph, then seed TensorFlow and NumPy.

    seed: value handed to both `tf.set_random_seed` and `np.random.seed`.
    """
    tf.reset_default_graph()
    # Same seed for both libraries, TensorFlow first.
    for seed_fn in (tf.set_random_seed, np.random.seed):
        seed_fn(seed)

def load_data(path = DRIVE_PATH, file = CORRECTED_TRAIN_FILE):
    """Read the CSV file `file` located under `path` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(path, file))

def tokenize(sentences_list):
    """Split every item of `sentences_list` into a list of tokens.

    Each item is converted with `str()` first. A token is either a run of word
    characters, a single apostrophe, or one of the punctuation marks `.,!?;`;
    everything else is dropped.
    """
    token_pattern = re.compile(r"[\w]+|[']|[.,!?;]")
    tokenized = []
    for sentence in sentences_list:
        tokenized.append(token_pattern.findall(str(sentence)))
    return tokenized
    
def get_vocab(sentences):
    """Count word occurrences over an iterable of tokenized sentences.

    Returns a dict mapping each word to the number of times it appears.
    """
    counts = {}
    for tokens in sentences:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def glove_embeddings(vocabulary_in_set, drive = DRIVE_PATH, gloveFile = GLOVE_FILE ,extract = -1):
    """Load the GloVe vectors of the words that occur in `vocabulary_in_set`.

    Parameters
    ----------
    vocabulary_in_set : container supporting `in`
        Words to keep (for instance the dict returned by `get_vocab`).
    drive, gloveFile : str
        Joined with `os.path.join` to locate the GloVe text file. Every line is
        read as `word v1 v2 ...`, split on single spaces.
    extract : int
        Stop reading once this many words have been kept. The default -1
        (like any value below 1) reads the whole file.

    Returns
    -------
    (embeddings, words_id)
        `embeddings` is an array holding an all-zero row at index 0 followed by
        one float32 row per kept word, in file order. `words_id` maps every
        kept word to its row index and the empty string "" to row 0.
    """
    glove_file = os.path.join(drive, gloveFile)

    embeddings = []
    words_id = {"": 0}

    # `with` releases the file handle even when a line fails to parse.
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            word, *coefs = line.split(" ")
            if not embeddings:
                # Row 0 is the zero vector paired with the "" entry of words_id;
                # its width is taken from the first line of the file.
                embeddings.append(np.zeros(len(coefs), dtype='float32'))
            if word not in vocabulary_in_set:
                continue
            # Only kept words are converted to float vectors; lines of
            # discarded words are never parsed.
            embeddings.append(np.asarray(coefs, dtype='float32'))
            words_id[word] = len(embeddings) - 1
            # len(embeddings) - 1 is the number of words kept so far.
            if extract > 0 and len(embeddings) - 1 == extract:
                break
    return np.array(embeddings), words_id

def df_to_data_target(df, data_col = "corrected_question_text", target_col = "target"):
    """Place two columns of `df` side by side in a single 2-D array.

    Column 0 of the result holds `df[data_col]`, column 1 holds `df[target_col]`;
    there is one row per DataFrame row.
    """
    columns = (df[data_col], df[target_col])
    return np.c_[columns]

def embed_and_pad(X, embeddings, n_dims, width):
    """Replace every word by its vector and zero-pad each sentence to `width`.

    X: sequence of sentences, each a sequence of words.
    embeddings: object indexed by word (`embeddings[word]`) giving a vector
        assignable to a slot of length `n_dims`.
    Returns an array of shape (len(X), width, n_dims); positions past the end
    of a sentence stay at zero.
    """
    padded_X = np.zeros((len(X), width, n_dims))
    for row, sentence in enumerate(X):
        for col, word in enumerate(sentence):
            padded_X[row, col, :] = embeddings[word]
    return padded_X

def numpy_ewma_vectorized(data, alpha):
    """Exponentially weighted moving average of `data`, computed with pandas.

    `data` is wrapped in a DataFrame and smoothed with `ewm(alpha=1 - alpha)`,
    so a larger `alpha` gives more weight to past values. The result is
    returned as the DataFrame's underlying array.
    """
    smoothed = pd.DataFrame(data).ewm(alpha=1 - alpha).mean()
    return smoothed.values
  
def split_data(data_targets, dv, cv):
    """Randomly split `data_targets` into train / dev / cross-validation sets.

    Each held-out set keeps approximately the positive/negative ratio of the
    full data. Randomness comes from NumPy's global RNG (`np.random`).

    Parameters
    ----------
    data_targets : 2-D array
        Column 0 holds the samples, column 1 the labels. Only rows whose label
        equals 1 or 0 are used.
    dv : int
        Number of rows of the dev set; 0 means no dev set.
    cv : int
        Number of rows of the cross-validation set; 0 means no such set.

    Returns
    -------
    tuple
        `(X_train, y_train)`, followed by `(X_dev, y_dev)` when `dv != 0`,
        followed by `(X_cross, y_cross)` when `cv != 0`. The `y_*` arrays are
        cast to float.
    """
    labels = data_targets[:, 1]
    positives = data_targets[labels == 1]
    negatives = data_targets[labels == 0]
    n_pos, n_neg = len(positives), len(negatives)

    # Shuffle each class BEFORE slicing, so the held-out sets are random
    # samples rather than the first rows in input order.
    positives = positives[np.random.permutation(n_pos), :]
    negatives = negatives[np.random.permutation(n_neg), :]

    def _class_counts(size):
        # Positives get their proportional share (rounded down); negatives fill the rest.
        n_p = int(size * n_pos / (n_pos + n_neg))
        return n_p, size - n_p

    def _shuffled_xy(pos_rows, neg_rows):
        # Merge both classes, mix them, and separate samples from labels.
        rows = np.concatenate((pos_rows, neg_rows))
        # Permute over the actual row count so a short class cannot cause an IndexError.
        rows = rows[np.random.permutation(len(rows)), :]
        return rows[:, 0], rows[:, 1].astype(float)

    dv_pos, dv_neg = _class_counts(dv) if dv != 0 else (0, 0)
    cv_pos, cv_neg = _class_counts(cv) if cv != 0 else (0, 0)

    # Training set: whatever is left after the dev and cross slices.
    outputs = list(_shuffled_xy(positives[dv_pos + cv_pos:, :],
                                negatives[dv_neg + cv_neg:, :]))
    if dv != 0:
        outputs.extend(_shuffled_xy(positives[:dv_pos, :],
                                    negatives[:dv_neg, :]))
    if cv != 0:
        outputs.extend(_shuffled_xy(positives[dv_pos:dv_pos + cv_pos, :],
                                    negatives[dv_neg:dv_neg + cv_neg, :]))
    return tuple(outputs)


In [0]:
class QIQCSequence(utils.Sequence):
    """Keras `Sequence` that serves batches of padded word-id rows and labels.

    The questions are tokenized and converted to fixed-width rows of word ids
    once, in the constructor. `__getitem__` then returns slices of that matrix
    following `self.indexes`, which is reshuffled at the end of every epoch.
    """

    def __init__(self, x_set, y_set, batch_size, words_ids, seed=42):
        """Tokenize and index `x_set`, and store the labels.

        x_set: iterable of questions (each one is passed through `str()`).
        y_set: labels aligned with `x_set`.
        batch_size: number of rows per batch.
        words_ids: pair `(word -> id mapping, row width)`.
        seed: base seed; NOTE that it is applied to NumPy's *global* RNG.
        """
        Xs = self.__tokenize(x_set)
        self.x = self.__ids_and_pad(Xs ,words_ids[0], words_ids[1])
        # Stored as an array so that batches can be picked by index.
        self.y = np.asarray(y_set)
        self.batch_size = batch_size
        self.seed = seed
        np.random.seed(seed)
        self.pointer = 0
        self.reshuffle_seed = 0
        # Input order until the first on_epoch_end().
        self.indexes = np.arange(len(self.x))

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))
    
    def __tokenize(self, X):
        """Split every item of X into word / apostrophe / punctuation tokens."""
        return [re.findall(r"[\w]+|[']|[.,!?;]", str(x)) for x in X]
      
    def __ids_and_pad(self, X, word_ids, width):
        """Turn token lists into a (len(X), width) matrix of ids, zero-padded.

        A word missing from `word_ids` propagates the lookup error, and a
        sentence longer than `width` raises IndexError.
        """
        padded_X = np.zeros((len(X),width))
        for i, sentence in enumerate(X):
            for j, word in enumerate(sentence):
                padded_X[i,j] = word_ids[word]
        return padded_X
    
    def on_epoch_end(self):
        """Draw a new sample order for the next epoch, then reseed NumPy."""
        self.pointer = 0
        self.reshuffle_seed += 1
        self.indexes = np.arange(len(self.y))
        np.random.shuffle(self.indexes)
        # The global RNG is reseeded after the shuffle, so the *next* shuffle
        # is the one that uses seed + reshuffle_seed.
        np.random.seed(self.seed + self.reshuffle_seed)

    def __getitem__(self, idx): 
        """Return batch number `idx` as `(ids, labels)` arrays."""
        # Go through self.indexes so that the shuffle done in on_epoch_end
        # actually changes which samples end up in each batch.
        batch_indexes = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x[batch_indexes]
        batch_y = self.y[batch_indexes]

        return np.array(batch_x), np.array(batch_y)

In [0]:
class Metrics(Callback):
    """Callback printing F1, precision and recall on the validation data after each epoch.

    The per-epoch values are also kept in `val_f1s`, `val_recalls` and
    `val_precisions`.
    """

    def on_train_begin(self, logs={}):
        """Start training with empty score histories."""
        self.val_f1s, self.val_recalls, self.val_precisions = [], [], []

    def on_epoch_end(self, epoch, logs={}):
        """Score the rounded model predictions against the validation targets."""
        val_predict = np.asarray(self.model.predict(self.validation_data[0])).round()
        val_targ = self.validation_data[1]

        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)

        # Append each score to its history list.
        for history, score in (
            (self.val_f1s, _val_f1),
            (self.val_recalls, _val_recall),
            (self.val_precisions, _val_precision),
        ):
            history.append(score)

        print(" — val_f1: {} — val_precision: {} — val_recall {}".format(_val_f1, _val_precision, _val_recall))

# Load and prepare data

In [7]:
# Read the CSV selected by load_data()'s defaults and preview its first rows.
train_data = load_data()
train_data.head()

Unnamed: 0,qid,target,corrected_question_text
0,00002165364db923c7e6,0,How did Quebec nationalists see their province...
1,000032939017120e6e44,0,"Do you have an adopted dog , how would you enc..."
2,0000412ca6e4628ce2cf,0,Why does velocity affect time ? Does velocity ...
3,000042bf85aa498cd78e,0,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,0,Can I convert montra helicon D to a mountain b...


In [0]:
# Pack the question text and the label into one 2-column array (input of split_data).
data_targets = df_to_data_target(train_data)

In [9]:
# Count the words of the training questions; the resulting dict is the filter
# later handed to glove_embeddings().
tokenized_questions = tokenize(train_data["corrected_question_text"].values)
vocabulary_in_set = get_vocab(tokenized_questions)
# Neither name is read by the later cells shown here; drop them to free memory.
del train_data, tokenized_questions
gc.collect()

33

In [0]:
# Load the GloVe rows of the words found in the training vocabulary, together
# with the word -> row-index mapping.
embeddings, words_ids = glove_embeddings(vocabulary_in_set=vocabulary_in_set)

# Construction phase

In [11]:
"""reset_graph()

n_steps = 250
window_width = [3, 4, 5]
out_channels = [n_steps-window_width[0]+1, n_steps-window_width[1]+1, n_steps-window_width[2]+1]
embedding_dimensions = 300
n_fc1 = 300
n_fc2 = 300
n_fc3 = 300
n_fc4 = 200
n_fc5 = 100
n_output = 1
threshold = 1.
lambda_ = 0.05

activation_fn = tf.nn.elu
he_init = tf.variance_scaling_initializer()

init_W_CL_3 = np.random.randn(window_width[0],embedding_dimensions,1,out_channels[0]).astype(np.float32)
init_b_CL_3 = np.random.randn(1,1,1,out_channels[0]).astype(np.float32)
init_W_CL_4 = np.random.randn(window_width[1],embedding_dimensions,1,out_channels[1]).astype(np.float32)
init_b_CL_4 = np.random.randn(1,1,1,out_channels[1]).astype(np.float32)
init_W_CL_5 = np.random.randn(window_width[2],embedding_dimensions,1,out_channels[2]).astype(np.float32)
init_b_CL_5 = np.random.randn(1,1,1,out_channels[2]).astype(np.float32)


X = tf.placeholder(tf.int32, [None, n_steps], name="X") #m * n_steps
y = tf.placeholder(tf.float32, [None], name="y")
learning_rate = tf.placeholder(tf.float32)
#training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("embedding_layer"):
    word_embeddings = tf.Variable(initial_value=embeddings, trainable=False)# n_words * embedding_dimensions
    embedded_sentences = tf.nn.embedding_lookup(word_embeddings, X) # m * n_steps * embedding_dimensions

with tf.name_scope("CNN_3"):
    W_CL_3 = tf.Variable(initial_value=init_W_CL_3)
    b_CL_3 = tf.Variable(initial_value=init_b_CL_3)
    CL_3 = tf.nn.conv2d(tf.reshape(embedded_sentences,[-1,n_steps,embedding_dimensions,1]), W_CL_3, strides=[1,1,1,1], padding="VALID")# m * n_steps - 3 + 1 *1 * out_channels 
    activated_CL_3 = activation_fn(tf.add(CL_3,b_CL_3)) # m * n_steps - 5 + 1 * 1 * out_channels
    MAXPOOL_3 = tf.reshape(tf.math.reduce_max(activated_CL_3,axis=1),[-1,out_channels[0]]) # m * 1 * out_channels  --> m * out_channels
    
with tf.name_scope("CNN_4"):
    W_CL_4 = tf.Variable(initial_value=init_W_CL_4)
    b_CL_4 = tf.Variable(initial_value=init_b_CL_4)
    CL_4 = tf.nn.conv2d(tf.reshape(embedded_sentences,[-1,n_steps,embedding_dimensions,1]), W_CL_4, strides=[1,1,1,1], padding="VALID")# m * n_steps - 4 + 1 *1 * out_channels 
    activated_CL_4 = activation_fn(tf.add(CL_4,b_CL_4)) # m * n_steps - 5 + 1 * 1 * out_channels
    MAXPOOL_4 = tf.reshape(tf.math.reduce_max(activated_CL_4,axis=1),[-1,out_channels[1]]) # m * 1 * out_channels  --> m * out_channels
    
with tf.name_scope("CNN_5"):
    W_CL_5 = tf.Variable(initial_value=init_W_CL_5)
    b_CL_5 = tf.Variable(initial_value=init_b_CL_5)
    CL_5 = tf.nn.conv2d(tf.reshape(embedded_sentences,[-1,n_steps,embedding_dimensions,1]), W_CL_5, strides=[1,1,1,1], padding="VALID")# m * n_steps - 5 + 1 *1 * out_channels 
    activated_CL_5 = activation_fn(tf.add(CL_5,b_CL_5)) # m * n_steps - 5 + 1 * 1 * out_channels
    MAXPOOL_5 = tf.reshape(tf.math.reduce_max(activated_CL_5,axis=1),[-1,out_channels[2]]) # m * 1 * out_channels  --> m * out_channels

with tf.name_scope("Max_pool"):
    MAXPOOL = tf.concat([MAXPOOL_3,MAXPOOL_4,MAXPOOL_5], axis=1)
    
with tf.name_scope("fully_connected_layers"):
    fc_layer = partial(tf.layers.dense, activation=activation_fn, kernel_regularizer=tf.contrib.layers.l2_regularizer(lambda_), kernel_initializer=he_init)
    fc1 = fc_layer(MAXPOOL, n_fc1)
    fc2 = fc_layer(fc1, n_fc2)
    fc3 = fc_layer(fc2, n_fc3)
    fc4 = fc_layer(fc3, n_fc4)
    fc5 = fc_layer(fc4, n_fc5)


with tf.name_scope("logits_and_outputs"):
    logits = tf.reshape(tf.contrib.layers.fully_connected(fc5, n_output, activation_fn=None),shape=[-1])
    outputs = tf.sigmoid(logits)
    predictions = tf.round(outputs)

with tf.name_scope("loss"):
    xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy)
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_loss] + reg_loss)

with tf.name_scope("evaluation"):
    TP = tf.count_nonzero(predictions * y)
    TN = tf.count_nonzero((predictions - 1) * (y - 1))
    FP = tf.count_nonzero(predictions * (y - 1))
    FN = tf.count_nonzero((predictions - 1) * y)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN)/(TP + FP + TN + FN)
    f1 = 2 * precision * recall / (precision + recall)

with tf.name_scope("training_op"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var) for grad, var in grads_and_vars]
    l2_grads = tf.norm(grads_and_vars[0])
    l2_capped_grads = tf.norm(capped_gvs[0])
    training_op = optimizer.apply_gradients(capped_gvs)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

f1_train_summary = tf.summary.scalar('Train_F1_score', f1)
f1_dev_summary = tf.summary.scalar("Dev_F1_score", f1)
loss_train_summary = tf.summary.scalar("Train_loss", loss)
loss_dev_summary = tf.summary.scalar("Dev_loss", loss)"""

'reset_graph()\n\nn_steps = 250\nwindow_width = [3, 4, 5]\nout_channels = [n_steps-window_width[0]+1, n_steps-window_width[1]+1, n_steps-window_width[2]+1]\nembedding_dimensions = 300\nn_fc1 = 300\nn_fc2 = 300\nn_fc3 = 300\nn_fc4 = 200\nn_fc5 = 100\nn_output = 1\nthreshold = 1.\nlambda_ = 0.05\n\nactivation_fn = tf.nn.elu\nhe_init = tf.variance_scaling_initializer()\n\ninit_W_CL_3 = np.random.randn(window_width[0],embedding_dimensions,1,out_channels[0]).astype(np.float32)\ninit_b_CL_3 = np.random.randn(1,1,1,out_channels[0]).astype(np.float32)\ninit_W_CL_4 = np.random.randn(window_width[1],embedding_dimensions,1,out_channels[1]).astype(np.float32)\ninit_b_CL_4 = np.random.randn(1,1,1,out_channels[1]).astype(np.float32)\ninit_W_CL_5 = np.random.randn(window_width[2],embedding_dimensions,1,out_channels[2]).astype(np.float32)\ninit_b_CL_5 = np.random.randn(1,1,1,out_channels[2]).astype(np.float32)\n\n\nX = tf.placeholder(tf.int32, [None, n_steps], name="X") #m * n_steps\ny = tf.placeholde

In [12]:
# --- Hyper-parameters -------------------------------------------------------
n_fc = [300, 300, 300, 200, 100, 1]  # hidden Dense sizes; the last entry is the output size
n_steps = 250                        # padded sentence length (in tokens)
n_dims = 300                         # embedding dimension
window_width = [3, 4, 5]             # convolution heights, in tokens
out_channels = [100,100,100]         # filters per width; previously n_steps - width + 1
activation = activations.elu

lambda_ = 0.01  # L2 factor applied to the hidden Dense kernels
lr = 0.004      # initial Adam learning rate


def _conv_branch(inputs, width, channels):
    """Conv over `width` tokens x full embedding, max over time, flatten to (channels,)."""
    branch = layers.Conv2D(channels, (width, n_dims), activation=activation)(inputs)  # m x (n_steps-width+1) x 1 x channels
    branch = layers.MaxPooling2D(pool_size=((n_steps - width + 1), 1))(branch)        # m x 1 x 1 x channels
    return layers.Reshape((channels,))(branch)


# --- Graph ------------------------------------------------------------------
X = Input(shape=(n_steps,))

# Frozen embedding initialised from the loaded GloVe matrix.
embedding = layers.Embedding(len(words_ids), n_dims, input_length=n_steps, weights=[embeddings], trainable=False)(X)# m x n_steps x 300
embedding = layers.Reshape((n_steps,n_dims,1))(embedding)

# One branch per window width; layers are created in the same order as before.
conv_3 = _conv_branch(embedding, window_width[0], out_channels[0])
conv_4 = _conv_branch(embedding, window_width[1], out_channels[1])
conv_5 = _conv_branch(embedding, window_width[2], out_channels[2])

merged = layers.concatenate([conv_3, conv_4, conv_5], axis=1)

# Stack of L2-regularised hidden layers (every entry of n_fc except the last).
fc = merged
for units in n_fc[:-1]:
    fc = layers.Dense(units, activation=activation, kernel_regularizer=regularizers.l2(lambda_))(fc)

out = layers.Dense(n_fc[-1], activation='sigmoid')(fc)

model = models.Model(inputs=X, outputs=out)
optimizer = optimizers.Adam(lr=lr)

# --- Callbacks --------------------------------------------------------------
metrics = Metrics()
checkpoint_path = os.path.join(DRIVE_PATH, MODEL_SAVE_PATH)
# The folder name depends on `version`; create it up front so the first
# checkpoint write cannot fail on a missing directory.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.00001, verbose=2)

model.compile(loss='binary_crossentropy',optimizer=optimizer)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 300)     53934300    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 250, 300, 1)  0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 248, 1, 100)  90100       reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

# Training phase

In [13]:
batch_size = 64

# Hold out 2000 rows as the dev set; cv=0 means no cross-validation set is returned.
X_train, y_train, X_dev, y_dev = split_data(data_targets, dv=2000, cv=0)
# Each sequence tokenizes its questions and turns them into id rows of width n_steps.
QIQC_train = QIQCSequence(X_train, y_train, batch_size=batch_size, words_ids=(words_ids,n_steps))
# batch_size equals the dev-set size, so batch 0 below is the whole dev set.
QIQC_dev = QIQCSequence(X_dev, y_dev, batch_size=2000, words_ids=(words_ids,n_steps))
X_dev, y_dev = QIQC_dev.__getitem__(0)
# NOTE: once these names are deleted, any cell that reads `embeddings` or
# `words_ids` needs the glove_embeddings cell to be run again first.
del embeddings, words_ids
gc.collect()

0

In [0]:
n_epochs = 8
# Train from the batch sequence and validate on the dev arrays. The callbacks print
# F1/precision/recall, save the model with the best val_loss and halve the
# learning rate when val_loss stops improving.
model.fit_generator(QIQC_train, epochs=n_epochs, verbose=1, validation_data=(X_dev, y_dev), callbacks=[metrics, checkpoint, reduce_lr])

Epoch 1/8
 — val_f1: 0.5555555555555556 — val_precision: 0.6451612903225806 — val_recall 0.4878048780487805

Epoch 00001: val_loss improved from inf to 0.13913, saving model to /content/gdrive/My Drive/Colab Notebooks/models/model4.3/QIQC_model4.3.h5
Epoch 2/8
 — val_f1: 0.34394904458598724 — val_precision: 0.7941176470588235 — val_recall 0.21951219512195122

Epoch 00002: val_loss did not improve from 0.13913

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0020000000949949026.
Epoch 3/8
 — val_f1: 0.6278026905829596 — val_precision: 0.7 — val_recall 0.5691056910569106

Epoch 00003: val_loss improved from 0.13913 to 0.12201, saving model to /content/gdrive/My Drive/Colab Notebooks/models/model4.3/QIQC_model4.3.h5
Epoch 4/8
 — val_f1: 0.5714285714285714 — val_precision: 0.7671232876712328 — val_recall 0.45528455284552843

Epoch 00004: val_loss did not improve from 0.12201

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.
Epoch 5/8
  524/20377 [.