In [1]:
import pandas as pd
import os
from model import NLUModel
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import random
import sys
import math

DATA_DIRECTORY = '../data/'
ROC_TRAIN_SET = 'train_stories.csv'
ROC_VAL_SET = 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv'
ROC_TEST_SET = 'test_for_report-stories-labels.csv'

TRAIN_SKIP_THOUGHTS_EMBEDDINGS = '/cluster/project/infk/courses/machine_perception_19/Sasglentamekaiedo/skip-thoughts-embbedings.npy'
VAL_SKIP_THOUGHTS_EMBEDDINGS = '/cluster/project/infk/courses/machine_perception_19/Sasglentamekaiedo/skip-thoughts-embbedings_validation.npy'
TEST_SKIP_THOUGHTS_EMBEDDINGS = '/cluster/project/infk/courses/machine_perception_19/Sasglentamekaiedo/skip-thoughts-embbedings_test.npy'

LOG_PATH = '../log_path'

# parameter
num_labels = 2

# create dataset
def create_dataset_from_embeddings(embeddings, df):
    v_embeddings = list()
    v_classes = list()
    correct_answers = df['AnswerRightEnding'].values

    for i, story_embedding in enumerate(embeddings):
        v_embeddings.append(np.append(story_embedding[:4], [story_embedding[4]], axis=0))
        v_embeddings.append(np.append(story_embedding[:4], [story_embedding[5]], axis=0))

        if correct_answers[i] == 1:
            v_classes.append(0)
            v_classes.append(1)
        else:
            v_classes.append(1)
            v_classes.append(0)

    return np.array(v_embeddings), np.array(v_classes)


def get_final_predictions(probabilities):
    # predictions based on probabilities!
    my_predictions = []

    probabilities_exp = np.exp(probabilities)

    i = 0
    while i < len(probabilities):
        p_first = probabilities_exp[i]
        p_second = probabilities_exp[i + 1]

        p1 = p_first[0] + p_second[1]
        p2 = p_first[1] + p_second[0]

        if p1 > p2:
            my_predictions.append(0)
        else:
            my_predictions.append(1)
        i += 2

    return np.array(my_predictions)


class Encoder(tf.keras.Model):
    def __init__(self, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units

        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, batch_size):
        return tf.zeros((batch_size, self.enc_units))


class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()

        self.W1 = tf.keras.layers.Dense(units, name='Dense_1')
        self.W2 = tf.keras.layers.Dense(units, name='Dense_2')
        self.V = tf.keras.layers.Dense(1, name='Dense_3')

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, hidden_size)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class DenseLayerWithSoftmax(tf.keras.Model):
    def __init__(self, layers, num_classes, dropout_keep_proba=0.9, activation=tf.nn.relu):
        super(DenseLayerWithSoftmax, self).__init__()

        self.dense_layers = []
        self.dropout_keep_proba = dropout_keep_proba
        self.num_classes = num_classes

        for i, layer_size in enumerate(layers):
            self.dense_layers.append(
                tf.keras.layers.Dense(layer_size, name='DenseLayer_' + str(i), use_bias=True, activation=tf.nn.relu))

        self.final_layer = tf.keras.layers.Dense(self.num_classes, name='DenseLayer_final', use_bias=True)

    def call(self, input_, input_labels):

        logits = input_

        for layer in self.dense_layers:
            logits = layer(logits)
            logits = tf.nn.dropout(logits, keep_prob=self.dropout_keep_proba)

        logits = self.final_layer(logits)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(input_labels, depth=self.num_classes, dtype=tf.float32)

        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))

        per_example_loss = - tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return predicted_labels, loss, log_probs


class FCSkip(tf.keras.Model):
    def __init__(self, units,num_classes=2, fc_layers=[], dropout_keep_prob=0.9, activation=tf.nn.relu):
        super(FCSkip, self).__init__()

        self.units = units

        self.encoder = Encoder(self.units)
        self.feed_forward = DenseLayerWithSoftmax(fc_layers, num_classes,
                                                 dropout_keep_proba=dropout_keep_prob, activation=activation)

    def call(self, input_embeddings, input_labels):

        # sample input
        sample_hidden = self.encoder.initialize_hidden_state(tf.shape(input_embeddings)[0])
        sample_output, sample_hidden = self.encoder(input_embeddings[:, :4, :], sample_hidden)

        concatenated_input = tf.concat([sample_output[:, -1, :], input_embeddings[:, 4, :]], axis=1)

        return self.feed_forward(concatenated_input, input_labels)


class LSSkip(tf.keras.Model):
    def __init__(self, units,num_classes=2, fc_layers=[], dropout_keep_prob=0.9, activation=tf.nn.relu):
        super(LSSkip, self).__init__()

        self.units = units

        self.feed_forward = DenseLayerWithSoftmax(fc_layers, num_classes,
                                                 dropout_keep_proba=dropout_keep_prob, activation=activation)

    def call(self, input_embeddings, input_labels):
        concatenated_input = tf.concat([input_embeddings[:, 3, :], input_embeddings[:, 4, :]], axis=1)
#         concatenated_input = input_embeddings[:, 3, :] + input_embeddings[:, 4, :]

        return self.feed_forward(concatenated_input, input_labels)


class SimpleAndEffectiveApproach(NLUModel):
    train_embeddings=None
    train_classes=None
    test_embeddings=None
    test_classes=None
    final_test_embeddings=None
    final_test_classes=None
    batch_size=None

    def __init__(self, units, fc_layers=None, num_classes=2, train_on_validation=False, mode='FC-skip', verbose=False, negative_sampling=3, learning_rate=1e-3):
        assert mode in ['FC-skip', 'LS-skip'], "mode specified not supported"

        super(SimpleAndEffectiveApproach, self).__init__('SimpleAndEffectiveApproach')
        self.train_on_validation = train_on_validation
        self.verbose = verbose
        self.negative_sampling = negative_sampling
        self.mode = mode
        self.fc_layers = fc_layers
        self.units = units
        self.learning_rate = learning_rate

    def _create_graph(self):
        train_x, train_y, test_x, test_y, final_test_x, final_test_y = self._prepare_embeddings()

        if self.mode == 'FC-skip':
            if self.fc_layers is None:
                print('Using default values for feed forward network [256, 64]')
                self.fc_layers = [256, 64]
            fc_skip = FCSkip(self.units, num_classes=2, fc_layers=self.fc_layers,
                                        dropout_keep_prob=0.9, activation=tf.nn.relu)

            self.predicted_labels_train, self.loss_train, self.log_probs_train = fc_skip(train_x, train_y)
            self.predicted_labels_test, self.loss_test, self.log_probs_test = fc_skip(test_x, test_y)
            self.predicted_labels_final_test, self.loss_final_test, self.log_probs_final_test = fc_skip(final_test_x, final_test_y)

        elif self.mode == 'LS-skip':
            if self.fc_layers is None:
                print('Using default values for feed forward network [2400, 1200, 600]')
                self.fc_layers = [2400, 1200, 600]

            ls_skip = LSSkip(self.units, num_classes=2, fc_layers=self.fc_layers, dropout_keep_prob=0.9, activation=tf.nn.relu)

            self.predicted_labels_train, self.loss_train, self.log_probs_train = ls_skip(train_x, train_y)
            self.predicted_labels_test, self.loss_test, self.log_probs_test = ls_skip(test_x, test_y)
            self.predicted_labels_final_test, self.loss_final_test, self.log_probs_final_test = ls_skip(final_test_x, final_test_y)

    def _prepare_embeddings(self):
        if not self.train_on_validation:
            data_train = pd.read_csv(os.path.join(DATA_DIRECTORY, ROC_TRAIN_SET), header='infer')
            # has a shape (88161, 5, 4800)
            train_embeddings = np.load(TRAIN_SKIP_THOUGHTS_EMBEDDINGS)
        
        data_test = pd.read_csv(os.path.join(DATA_DIRECTORY, ROC_TEST_SET), header='infer')
        # has a shape (1871, 6, 4800)
        test_embeddings = np.load(TEST_SKIP_THOUGHTS_EMBEDDINGS)

        data_val = pd.read_csv(os.path.join(DATA_DIRECTORY, ROC_VAL_SET), header='infer')
        # has a shape (1871, 6, 4800)
        validation_embeddings = np.load(VAL_SKIP_THOUGHTS_EMBEDDINGS)

        # create set for validation dataset
        val_embeddings, val_classes = create_dataset_from_embeddings(validation_embeddings, data_val)
        self.final_test_embeddings, self.final_test_classes = create_dataset_from_embeddings(test_embeddings, data_test)
        
        if self.train_on_validation:
            self.train_embeddings, self.train_classes = shuffle(val_embeddings, val_classes)

            self.test_embeddings, self.test_classes = self.final_test_embeddings, self.final_test_classes
        else:
            # noinspection PyUnboundLocalVariable
            self.train_embeddings, self.train_classes = shuffle(train_embeddings), np.zeros(len(train_embeddings))

            self.test_embeddings, self.test_classes = val_embeddings, val_classes

        self.num_samples_training = len(self.train_embeddings)
        self.num_samples_test = len(self.test_embeddings)
        self.num_samples_final_test = len(self.final_test_embeddings)

        if self.train_on_validation:
            if self.verbose:
                print('Loading without negative sampling.')
                
#             dataset_train = tf.data.Dataset.from_tensor_slices((self.train_embeddings, self.train_classes))
#             dataset_train = dataset_train.shuffle(buffer_size=len(self.train_embeddings))

#             dataset_train = dataset_train.batch(self.batch_size)
#             dataset_train = dataset_train.repeat()

#             iterator_train = dataset_train.make_one_shot_iterator()
#             train_x, train_y = iterator_train.get_next()

#             dataset_test = tf.data.Dataset.from_tensor_slices((self.test_embeddings, self.test_classes))

#             dataset_test = dataset_test.batch(self.batch_size)
#             dataset_test = dataset_test.repeat()

#             iterator_test = dataset_test.make_one_shot_iterator()
#             test_x, test_y = iterator_test.get_next()
            train_x, train_y = self.create_dataset(0, self.batch_size, 0)

            
        else:
            if self.verbose:
                print('Loading with negative sampling on the original training set.')
                
            train_x, train_y = self.create_dataset(0, self.batch_size, self.negative_sampling)

#             test_x, test_y = self.create_dataset(1, self.batch_size, 0)

        # create dataset for the final test
#         dataset_final_test = tf.data.Dataset.from_tensor_slices((self.final_test_embeddings, self.final_test_classes))

#         dataset_final_test = dataset_final_test.batch(self.batch_size)
#         dataset_final_test = dataset_final_test.repeat()

#         iterator_final_test = dataset_final_test.make_one_shot_iterator()
#         final_test_x, final_test_y = iterator_final_test.get_next()
        test_x, test_y = self.create_dataset(1, self.batch_size, 0)
        final_test_x, final_test_y = test_x, test_y = self.create_dataset(2, self.batch_size, 0)
        
        return train_x, train_y, test_x, test_y, final_test_x, final_test_y

    @staticmethod
    def sample_negatives(embeddings, embeddings_batch, classes_batch, negative_sampling):
        new_classes = []
        new_embeddings_batch = []
        for i, embedding in enumerate(embeddings_batch):

            new_embeddings_batch.append(embedding)
            new_classes.append(classes_batch[i])

            for _ in range(negative_sampling):
                new_embeddings_batch.append(
                    np.concatenate((embedding[:4], [random.choice(embeddings[:, 4, :])]), axis=0))
                # negative class always
                new_classes.append(1)
        return np.array(new_embeddings_batch, dtype=np.float32), np.array(new_classes, dtype=np.int32)

    def generator(self, mode, batch_size=64, negative_sampling=0):
        """
        negative_sampling: For each positive sample these many negatives
        """
        if mode == 0:
            embeddings = self.train_embeddings
            classes = self.train_classes
        elif mode == 1:
            embeddings = self.test_embeddings
            classes = self.test_classes
        else:
            embeddings = self.final_test_embeddings
            classes = self.final_test_classes

        if negative_sampling > 0:
            batch_size /= (negative_sampling + 1)
            if batch_size != int(batch_size):
                raise Exception('Batch size should be an integer. Please change negative sampling rate')

            batch_size = int(batch_size)

        # repeat
        while True:
            if mode == 0 or (mode == 1 and self.train_on_validation):
                embeddings, classes = shuffle(embeddings, classes)

            length = len(embeddings)
            for ndx in range(0, length, batch_size):
                embeddings_batch = embeddings[ndx: min(ndx + batch_size, length)]
                classes_batch = classes[ndx: min(ndx + batch_size, length)]

                if negative_sampling <= 0:
                    yield embeddings_batch, classes_batch
                else:
                    yield self.sample_negatives(embeddings, embeddings_batch, classes_batch, negative_sampling)

    def create_dataset(self, mode, batch_size, negative_sampling):
        dataset = tf.data.Dataset.from_generator(self.generator, (tf.float32, tf.int32),
                                                 output_shapes=(
                                                 tf.TensorShape([None, 5, 4800]), tf.TensorShape([None])),
                                                 args=([mode, batch_size, negative_sampling]))

        iterator = dataset.make_one_shot_iterator()
        return iterator.get_next()

    def _predict_wth_session(self, sess, final_test=False):
        predictions = []
        for _ in range(math.ceil(self.num_samples_test / self.batch_size)):
            
            if final_test:
                predictions_batch = sess.run(self.log_probs_final_test)
            else:
                predictions_batch = sess.run(self.log_probs_test)
                
            predictions.append(predictions_batch)

        return np.concatenate(predictions, axis=0).reshape(-1, 2)
    
    def predict(self, X):
        """
        X should always be the validation set?
        """

        with tf.Session() as sess:

            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(LOG_PATH))

            return self._predict_wth_session(sess, final_test=True)

    def fit(self, X, y, epochs=10, batch_size=64):
        """
        X, y are unused in this case
        """

        self.batch_size = batch_size
        self._create_graph()
        
        update_learning_rate = 20 # empirical
        if self.train_on_validation:
            update_lr_every = int((math.ceil(self.num_samples_training / batch_size) * epochs) / update_learning_rate)
        else:
            update_lr_every = int(
                (math.ceil(self.num_samples_training / (batch_size / (self.negative_sampling + 1))) * epochs) / update_learning_rate)

        global_step = tf.Variable(0, trainable=False, name="global_step")
        if self.verbose:
            print('Updating the learning rate every:', update_lr_every, 'steps.')

        # learning rate 1e-3 for most models
        learning_rate = tf.train.exponential_decay(
            self.learning_rate,  # Base learning rate.
            global_step,  # Current index into the dataset.
            update_lr_every,  # Decay step.
            0.96,  # Decay rate.
            staircase=True)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients, variables = zip(*optimizer.compute_gradients(self.loss_train))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # define model saver
            saver = tf.train.Saver(tf.global_variables())

            #     with trange(math.ceil(NUM_SAMPLES_TRAINING / (batch_size / (negative_sampling + 1))) * NUM_EPOCHS) as t:
            if self.train_on_validation:
                number_of_steps = math.ceil(self.num_samples_training / batch_size)
            else:
                number_of_steps = math.ceil(self.num_samples_training/ (batch_size / (self.negative_sampling + 1)))

            last_epoch = 0
            for i in range(number_of_steps * epochs):
                # display training status
                epoch_cur = i // number_of_steps

                _, _, lt = sess.run([train_op, global_step, self.loss_train])

                if epoch_cur > last_epoch and self.verbose:
                    # print score on validation set
                    last_epoch = epoch_cur
                    log_predictions_test = self._predict_wth_session(sess)
                    
                    score = accuracy_score(self.test_classes[::2], get_final_predictions(log_predictions_test))
                    print('At epoch %3d score on test set %.3f' % (last_epoch, score))

            saver.save(sess,os.path.join(LOG_PATH,"model"),
                       global_step=int(epochs * self.num_samples_training / batch_size))

  from ._conv import register_converters as _register_converters


In [2]:
simpleAndEffectiveApproach = SimpleAndEffectiveApproach(4800, train_on_validation=False, mode='LS-skip', verbose=True, learning_rate=1e-4)

In [3]:
simpleAndEffectiveApproach.fit(None, None, batch_size=64, epochs=10)

Loading with negative sampling on the original training set.
Using default values for feed forward network [2400, 1200, 600]
Updating the learning rate every: 2755 steps.
At epoch   1 score on test set 0.500
At epoch   2 score on test set 0.497
At epoch   3 score on test set 0.486


KeyboardInterrupt: 

In [None]:
log_predictions = simpleAndEffectiveApproach.predict(None)

score = accuracy_score(simpleAndEffectiveApproach.final_test_classes[::2], get_final_predictions(log_predictions))

print('Final score on test set based on last epoch model %.5f' % (score))

In [None]:
1e-3 for train on validation
1e-7 for train on training ?