In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# install tf 2.0
from __future__ import absolute_import, division, print_function, unicode_literals

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf

print(tf.__version__)

2.0.0-alpha0


In [0]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow.keras.backend as K

In [0]:
# create dataset

def create_generator_for_ffn(
        data_dir,
        file_list=[
            "ehealthforumQAs.csv",
            "icliniqQAs.csv",
            "questionDoctorQAs.csv",
            "webmdQAs.csv"],
        mode='train'):

    for file_name in file_list:
        full_file_path = os.path.join(data_dir, file_name)
        if not os.path.exists(full_file_path):
            raise FileNotFoundError("File %s not found" % full_file_path)
        df = pd.read_csv(full_file_path)

        # so train test split
        if mode == 'train':
            df, _ = train_test_split(df, test_size=0.2)
        else:
            _, df = train_test_split(df, test_size=0.2)

        for _, row in df.iterrows():
            q_vectors = np.fromstring(row.question_bert.replace(
                '[[', '').replace(']]', ''), sep=' ')
            a_vectors = np.fromstring(row.answer_bert.replace(
                '[[', '').replace(']]', ''), sep=' ')
            if mode == 'train':
                yield {
                    "q_vectors": q_vectors,
                    "a_vectors": a_vectors,
                    "labels": 1
                }
            else:
                yield {
                    "q_vectors": q_vectors,
                    "a_vectors": a_vectors,
                }

def create_dataset_for_ffn(
        data_dir,
        file_list=[
            "ehealthforumQAs.csv",
            "icliniqQAs.csv",
            "questionDoctorQAs.csv",
            "webmdQAs.csv",
            "healthtapQAs.csv"],
        mode='train',
        hidden_size=768,
        shuffle_buffer=10000,
        prefetch=128,
        batch_size=32):

    def gen(): return create_generator_for_ffn(
        data_dir=data_dir,
        file_list=file_list,
        mode=mode)

    output_types = {
        'q_vectors': tf.float32,
        'a_vectors': tf.float32
    }

    output_shapes = {
        'q_vectors': [hidden_size],
        'a_vectors': [hidden_size],
    }

    if mode == 'train':
        output_types.update({'labels': tf.int32})
        output_shapes.update({'labels': []})

    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=output_types,
        output_shapes=output_shapes
    )
    if mode == 'train':
        dataset = dataset.shuffle(shuffle_buffer)

    dataset = dataset.prefetch(prefetch)

    dataset = dataset.batch(batch_size)
    return dataset

In [5]:
# get bert embedded dataset
d = create_dataset_for_ffn(data_dir='/content/gdrive/My Drive/mqa-biobert', batch_size=64)

W0417 05:52:08.727642 139820326467456 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py:410: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


In [0]:
# QA pair ffn layer


class QAFFN(tf.keras.layers.Layer):
    def __init__(
            self,
            hidden_size=768,
            dropout=0.1,
            residual=True,
            activation=tf.keras.layers.ReLU(),
            name='QAFFN'):
        """Feed-forward layers for question and answer.
        The input to this layer should be a two-elements tuple (q_embeddnig, a_embedding).
        The elements of tuple should be None or a tensor. 

        In training, we should input both question embedding and answer embedding.

        In pre-inference, we should pass answer embedding only and save the embedding.

        In inference, we should pass the question embedding only and do a vector similarity search.

        Keyword Arguments:
            hidden_size {int} -- hidden size of feed-forward network (default: {768})
            dropout {float} -- dropout rate (default: {0.1})
            residual {bool} -- whether to use residual connection (default: {True})
            activation {[type]} -- activation function (default: {tf.keras.layers.ReLU()})
        """

        super(QAFFN, self).__init__(name=name)
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.residual = residual
        self.activation = activation
        self.q_ffn = tf.keras.layers.Dense(
            units=hidden_size,
            use_bias=True,
            activation=activation
        )

        self.a_ffn = tf.keras.layers.Dense(
            units=hidden_size,
            use_bias=True,
            activation=activation
        )
        self.q_ffn.build([1, self.hidden_size])
        self.a_ffn.build([1, self.hidden_size])

    @tf.function
    def _bert_to_ffn(self, bert_embedding, ffn_layer):
        if bert_embedding is not None:
            ffn_embedding = ffn_layer(bert_embedding)
            if self.dropout > 0:
                ffn_embedding = tf.keras.layers.Dropout(
                    self.dropout)(ffn_embedding)

            if self.residual:
                try:
                    ffn_embedding += bert_embedding
                except:
                    raise ValueError('Incompatible shape for res connection, got {0}, {1}'.format(
                        ffn_embedding.shape, bert_embedding.shape))
        else:
            ffn_embedding = None

        return ffn_embedding

    def call(self, inputs):
        q_bert_embedding, a_bert_embedding = inputs
        q_ffn_embedding = self._bert_to_ffn(q_bert_embedding, self.q_ffn)
        a_ffn_embedding = self._bert_to_ffn(a_bert_embedding, self.a_ffn)
        return q_ffn_embedding, a_ffn_embedding


@tf.function
def qa_pair_loss(q_embedding, a_embedding):
    if q_embedding is not None and a_embedding is not None:
        q_embedding = q_embedding / \
            tf.norm(q_embedding, axis=-1, keepdims=True)
        a_embedding = a_embedding / \
            tf.norm(a_embedding, axis=-1, keepdims=True)
        similarity_vector = tf.reshape(
            tf.matmul(q_embedding, a_embedding, transpose_b=True), [-1, ])
        target = tf.reshape(tf.eye(q_embedding.shape[0]), [-1, ])
        loss = tf.keras.losses.binary_crossentropy(target, similarity_vector)
        return loss
    else:
        return 0


class MedicalQAModel(tf.keras.Model):
    def __init__(self, name=''):
        super(MedicalQAModel, self).__init__(name=name)
        self.qa_ffn_layer = QAFFN()

    def call(self, inputs):
        q_bert_embedding = inputs.get('q_vectors')
        a_bert_embedding = inputs.get('a_vectors')

        self.add_loss(qa_pair_loss(q_bert_embedding, a_bert_embedding))

        return self.qa_ffn_layer((q_bert_embedding, a_bert_embedding))

def fake_loss(y_true, y_pred):
    return 0.0

In [0]:
medical_qa_model = MedicalQAModel()
optimizer = tf.keras.optimizers.Adam()

In [9]:
epochs=10
loss_metric = tf.keras.metrics.Mean()
K.set_learning_phase(1)

# Iterate over epochs.
for epoch in range(epochs):
  print('Start of epoch %d' % (epoch,))

  # Iterate over the batches of the dataset.
  for step, x_batch_train in enumerate(d):
    with tf.GradientTape() as tape:
      q_embedding, a_embedding = medical_qa_model(x_batch_train)
      # Compute reconstruction loss
      loss = qa_pair_loss(q_embedding, a_embedding)

    grads = tape.gradient(loss, medical_qa_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, medical_qa_model.trainable_variables))
    
    loss_metric(loss)


    if step % 100 == 0:
      print('step %s: mean loss = %s' % (step, loss_metric.result()))


Start of epoch 0


W0417 05:52:46.946611 139820326467456 tf_logging.py:161] Entity <bound method QAFFN._bert_to_ffn of <tensorflow.python.eager.function.TfMethodTarget object at 0x7f29f59fed68>> could not be transformed and will be staged without change. Error details can be found in the logs when running with the env variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the AutoGraph team. Cause: KeyError during conversion: LIVE_VARS_IN




W0417 05:52:47.323804 139820326467456 tf_logging.py:161] Entity <bound method QAFFN._bert_to_ffn of <tensorflow.python.eager.function.TfMethodTarget object at 0x7f29f59fed68>> could not be transformed and will be staged without change. Error details can be found in the logs when running with the env variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the AutoGraph team. Cause: KeyError during conversion: LIVE_VARS_IN


step 0: mean loss = tf.Tensor(1.1809258, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(0.15270387, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(0.11724205, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(0.10493506, shape=(), dtype=float32)
step 400: mean loss = tf.Tensor(0.09857195, shape=(), dtype=float32)
step 500: mean loss = tf.Tensor(0.09459527, shape=(), dtype=float32)
step 600: mean loss = tf.Tensor(0.09184392, shape=(), dtype=float32)
step 700: mean loss = tf.Tensor(0.08978085, shape=(), dtype=float32)
step 800: mean loss = tf.Tensor(0.08816196, shape=(), dtype=float32)
step 900: mean loss = tf.Tensor(0.0868286, shape=(), dtype=float32)
step 1000: mean loss = tf.Tensor(0.0857477, shape=(), dtype=float32)
step 1100: mean loss = tf.Tensor(0.08482037, shape=(), dtype=float32)
step 1200: mean loss = tf.Tensor(0.083998494, shape=(), dtype=float32)
step 1300: mean loss = tf.Tensor(0.08327168, shape=(), dtype=float32)
step 1400: mean loss = tf.Tensor(0

W0417 05:56:58.200734 139820326467456 tf_logging.py:161] Entity <bound method QAFFN._bert_to_ffn of <tensorflow.python.eager.function.TfMethodTarget object at 0x7f29f59fed68>> could not be transformed and will be staged without change. Error details can be found in the logs when running with the env variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the AutoGraph team. Cause: KeyError during conversion: LIVE_VARS_IN
W0417 05:56:58.393811 139820326467456 tf_logging.py:161] Entity <bound method QAFFN._bert_to_ffn of <tensorflow.python.eager.function.TfMethodTarget object at 0x7f29f59fed68>> could not be transformed and will be staged without change. Error details can be found in the logs when running with the env variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the AutoGraph team. Cause: KeyError during conversion: LIVE_VARS_IN


Start of epoch 1
step 0: mean loss = tf.Tensor(0.08096242, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(0.08075694, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(0.08046189, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(0.08019418, shape=(), dtype=float32)
step 400: mean loss = tf.Tensor(0.07988028, shape=(), dtype=float32)
step 500: mean loss = tf.Tensor(0.07955853, shape=(), dtype=float32)
step 600: mean loss = tf.Tensor(0.07924241, shape=(), dtype=float32)
step 700: mean loss = tf.Tensor(0.07894271, shape=(), dtype=float32)
step 800: mean loss = tf.Tensor(0.078649156, shape=(), dtype=float32)
step 900: mean loss = tf.Tensor(0.07837875, shape=(), dtype=float32)
step 1000: mean loss = tf.Tensor(0.07810703, shape=(), dtype=float32)
step 1100: mean loss = tf.Tensor(0.077862464, shape=(), dtype=float32)
step 1200: mean loss = tf.Tensor(0.07763252, shape=(), dtype=float32)
step 1300: mean loss = tf.Tensor(0.07741066, shape=(), dtype=float32)
step 1400: me

In [12]:
K.set_learning_phase(0)
q_embedding, a_embedding = medical_qa_model(next(iter(d)))

q_embedding = q_embedding / tf.norm(q_embedding, axis=-1, keepdims=True)
a_embedding = a_embedding / tf.norm(a_embedding, axis=-1, keepdims=True)

batch_score = tf.reduce_sum(q_embedding*a_embedding, axis=-1)
baseline_score = tf.reduce_mean(tf.matmul(q_embedding,tf.transpose(a_embedding)))

print('Training Batch Cos similarity')
print(tf.reduce_mean(batch_score))
print('Baseline: {0}'.format(baseline_score))

Training Batch Cos similarity
tf.Tensor(0.038819928, shape=(), dtype=float32)
Baseline: 0.017509445548057556


In [13]:
eval_d = create_dataset_for_ffn(data_dir='/content/gdrive/My Drive/mqa-biobert', mode='eval', batch_size=64)
q_embedding, a_embedding = medical_qa_model(next(iter(eval_d)))

q_embedding = q_embedding / tf.norm(q_embedding, axis=-1, keepdims=True)
a_embedding = a_embedding / tf.norm(a_embedding, axis=-1, keepdims=True)

batch_score = tf.reduce_sum(q_embedding*a_embedding, axis=-1)
baseline_score = tf.reduce_mean(tf.matmul(q_embedding,tf.transpose(a_embedding)))

print('Eval Batch Cos similarity')
print(tf.reduce_mean(batch_score))
print('Baseline: {0}'.format(baseline_score))

Eval Batch Cos similarity
tf.Tensor(0.032733116, shape=(), dtype=float32)
Baseline: 0.01878243125975132


[<tf.Tensor: id=2190117, shape=(), dtype=float32, numpy=2.8632064>]