In [1]:
import numpy as np, os
import tensorflow as tf
from tensorflow import keras as tfk
keras = tfk
import datetime as dt
import six
import h5py
import requests
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()
%matplotlib inline

# Data loading 

In [2]:
# download the data 
url = 'https://www.dropbox.com/s/ysrim2re8mh22z9/synthetic_code_dataset.h5?dl=0'
save_name = 'data.h5'
_=!wget {url} -O {save_name}

In [3]:
# load the data into x_train, y_train, .....
f = h5py.File(save_name, 'r')
suffixes = ['train', 'test', 'valid']
for suffix in suffixes:
    exec("x_%s=np.transpose(f.get(\"X_%s\")[:], (0, 2, 1))"%(suffix, suffix))
    exec("y_%s=f.get(\"Y_%s\")[:]"%(suffix, suffix))
f.close()

# Model definition function

In [4]:
def get_activation(activation = 'relu'):
    """
    Create an activation function. The activation argument should one of:
    1. A string representing the keras name of the activation. 
    2. A callable which may or may not be an instance of keras.layers.Layer. 
    """
    if isinstance(activation, str):
        actfn = tfk.layers.Activation(activation)
    else:
        if callable(activation) and not isinstance(activation, tfk.layers.Layer):
            actfn = tfk.layers.Activation(activation)
        else:
            actfn = activation
    return actfn

In [5]:
def conv_layer(x, num_filters, kernel_size, padding, activation, dropout=0.5, l2=1e-6, bn=True): 
    """
    A convolutional block comprising of a convolutional layer followed by
    batch normalization, an activation function, and dropout. 
    """
    y = tfk.layers.Conv1D(filters=num_filters, kernel_size=kernel_size, kernel_regularizer=tfk.regularizers.l2(l2), padding=padding)(x)
    if bn:
        y = tfk.layers.BatchNormalization()(y)
    actfn = get_activation(activation)
    y = actfn(y)
    if dropout:
        y = tfk.layers.Dropout(dropout)(y)
    return y

def dense_layer(x, num_units, activation, dropout=0.5, l2=None, bn=True):
    """
    A dense block comprising of a dense layer followed by batch normalization, 
    activation and dropout. 
    """
    y = tfk.layers.Dense(num_units, use_bias=False, kernel_regularizer=tfk.regularizers.l2(l2))(x)
    if bn:
        y = tfk.layers.BatchNormalization()(y)
    actfn = get_activation(activation)
    y = actfn(y)
    if dropout:
        y = tfk.layers.Dropout(dropout)(y)
    return y

def get_model(L, A, name="cnn_att"):
	## input layer
	x = tfk.layers.Input((L, A), name='Input')
	
	## 1st conv layer
	y = keras.layers.Conv1D(filters=32, kernel_size=19, kernel_regularizer=tfk.regularizers.l2(1e-6), padding='same', name='conv1', use_bias=True)(x)
	y = keras.layers.Activation('relu')(y)
	y = keras.layers.MaxPool1D(pool_size=4)(y)
	
	# multi head attention layer
	embedding = keras.layers.Dropout(0.1)(y)
	y, weights = keras.layers.MultiHeadAttention(num_heads=8, key_dim=64, value_dim=64)(embedding, embedding, return_attention_scores=True)
	y = keras.layers.Dropout(0.1)(y)
	y = keras.layers.LayerNormalization(epsilon=1e-6)(y)
	
	# everything else
	y = keras.layers.Flatten()(y)
	y = keras.layers.Dense(128, activation=None, use_bias=False)(y)
	y = keras.layers.BatchNormalization()(y)
	y = keras.layers.Activation('relu')(y)
	y = keras.layers.Dropout(0.5)(y)
	y = keras.layers.Dense(1, name='logits')(y)
	y = keras.layers.Activation('sigmoid', name='output')(y)
	model = tfk.Model(inputs=x, outputs=y, name=name)
	return model

# def get_model(L, A, activation='relu', name='cnn_dist'):
#     """
#     A function to assemble the full CNN distributed model. 
#     """
#     # input layer 
#     x = tfk.layers.Input((L, A), name='input')

#     # 1st convolutional block 
#     y = conv_layer(x,num_filters=24, kernel_size=19, padding='same', dropout=0.1,l2=1e-6, bn=True, activation=activation)
    
#     # 2nd conv. block + pooling 
#     y = conv_layer(y,num_filters=32, kernel_size=7, padding='same', activation=activation, dropout=0.2,l2=1e-6, bn=True)
#     y = tfk.layers.MaxPool1D(pool_size=4)(y)
    
#     # 3rd convolutional block + pooling 
#     y = conv_layer(y,num_filters=64, kernel_size=3, padding='same', activation=activation, dropout=0.4,l2=1e-6, bn=True)
#     y = tfk.layers.MaxPool1D(pool_size=3, strides=3, padding='same')(y)
    
#     # dense block and final output layer 
#     y = tfk.layers.Flatten()(y)
#     y = dense_layer(y, num_units=96, activation=activation, dropout=0.5, l2=1e-6, bn=True)
#     y = tfk.layers.Dense(1, use_bias=True, name = 'logits')(y)
#     y = tfk.layers.Activation('sigmoid')(y)

#     # assemble full model
#     model = tfk.Model(x, y, name=name)
#     return model

# Train a teacher model

In [6]:
# instantiate the teacher model 
activation = 'relu' 
#activation = lambda x : tf.math.sin(x) + tf.math.cos(x)
L, A = x_train.shape[1:]

teacher_model = get_model(L, A, name='teacher')
#teacher_model = get_model(L, A, activation, name='teacher')

# compile the teacher model 
lossfn = tfk.losses.BinaryCrossentropy(name='bce')
modelmetrics = [tfk.metrics.BinaryAccuracy(name='ACC'), tfk.metrics.AUC(curve='PR', name='AUPR'), tfk.metrics.AUC(curve='ROC', name='AUROC')]
optimizer = tfk.optimizers.Adam(learning_rate=1e-2)
teacher_model.compile(loss=lossfn, metrics=modelmetrics, optimizer=optimizer)

# fit the teacher model 
num_epochs = 100
callbacks = [tfk.callbacks.EarlyStopping(monitor='val_AUROC', patience=20), 
             tfk.callbacks.ModelCheckpoint("best_teacher_model.hdf5", monitor='val_AUROC', mode='max', save_best_only=True)]
teacher_model.fit(x_train, y_train, 
                    epochs=num_epochs, 
                    batch_size=128, 
                    callbacks=callbacks, 
                    shuffle=True, 
                    initial_epoch=0,
                    validation_data=(x_valid, y_valid))
teacher_model = tfk.models.load_model('best_teacher_model.hdf5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


In [7]:
teacher_model = tfk.models.load_model('best_teacher_model.hdf5')

# Knowledge distillation

## Define a `Distiller` class that takes in a trained teacher model, an untrained student model and distills the knowledge in the teacher model onto the student model. 

In [8]:
class Distiller(keras.Model):
    def get_config(self,):
        """
        Implement the config dictionary to enable serialization
        """
        config = {}
        config['student'] = self.student
        config['teacher'] = self.teacher
        return config
    
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {"student_loss": student_loss}
        results.update({m.name: m.result() for m in self.metrics})
        return results
    
    @property
    def metrics_names(self):
        return ['student_loss']+[m.name for m in self.metrics]

In [9]:
def get_student_model(L, A, activation='relu', name='deepbind'):
    """
    Defining the deepbind architecture in here. 
    """
    x = tfk.layers.Input((L, A), name='input')
    y = tfk.layers.Conv1D(filters=16, kernel_size=24, padding='valid', kernel_regularizer=tfk.regularizers.l2(1e-6))(x)
    actfn = get_activation(activation=activation)
    y = actfn(y)
    y = tfk.layers.Lambda(lambda x : tf.reduce_max(x, axis=1))(y)  # max pooling
    y = tfk.layers.Dropout(0.5)(y)  
    y = tfk.layers.Dense(32, activation='relu')(y)
    y = tfk.layers.Dense(1, name='logits')(y)
    y = tfk.layers.Activation('sigmoid', name='output')(y)

    model = tfk.Model(inputs=x, outputs=y, name=name)
    return model

In [10]:
# instantiate the student model and the distiller 
student_model = get_student_model(L, A)
distiller = Distiller(student_model, teacher_model)

# compile the distiller
alpha = 0.8
temperature = 1. 
distiller.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=modelmetrics,
    student_loss_fn=keras.losses.BinaryCrossentropy(name='bce'),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=alpha,
    temperature=temperature,
)

# perform distillation
num_epochs = 50
callbacks = [tfk.callbacks.EarlyStopping(monitor='val_AUROC', patience=20), 
             tfk.callbacks.ModelCheckpoint("best_distiller.hdf5", monitor='val_AUROC', mode='max',save_weights_only=True, save_best_only=True)]
distiller.fit(x_train, y_train, 
                epochs=num_epochs, 
                batch_size=128, 
                callbacks=callbacks, 
                shuffle=True, 
                validation_data=(x_valid, y_valid))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


<tensorflow.python.keras.callbacks.History at 0x7fc87b4b67b8>

In [11]:
def plot_f_and_grad(model):
    # pick a random sample 
    N, L, A = x_train.shape
    xsample = x_train[np.random.randint(0, N)][None, :, :]

    # define a keras model mapping an input sequence to the logits of the teacher model
    func = tfk.Model(inputs=model.input, outputs=model.get_layer('logits').output)

    # define a set of probe sequences by sampling points in the ith nucleotide, jth channel 
    # i and j are picked randomly
    n_probe = 100
    x_probe = np.linspace(0, 1, n_probe)
    n_samples = 50
    Is, Js, y_ijs, y_ij_grads = [], [], [], []
    for i in range(n_samples):  
        i, j = np.random.randint(0, L), np.random.randint(0, A)
        Is.append(i)
        Js.append(j)
        
        x_ij_probe = np.zeros((n_probe, L, A))
        x_ij_probe[:, i, j] = x_probe
        x_ij_probe = tf.convert_to_tensor(x_ij_probe)

        with tf.GradientTape() as tape:
            tape.watch(x_ij_probe)
            y_ij_pred = func(x_ij_probe)
        y_ij_grad = tape.gradient(y_ij_pred, x_ij_probe)
        
        #y_ij_pred = func(x_ij_probe)
        y_ijs.append(y_ij_pred.numpy())
        y_ij_grads.append(y_ij_grad.numpy()[:, i, j])

    # plot
    fig = plt.figure(figsize=(14, 10))
    for k in range(4):
        idx = np.random.randint(0, len(Is))
        i = Is[idx]
        j = Js[idx]
        ax = fig.add_subplot(2,2,k+1)
        ax1 = ax.twinx()
        title="i=%d, j=%d"%(i, j)
        figure_options = {'linewidth':2}

        c, c1 = 'blue', 'red'
        ax.plot(x_probe, y_ijs[idx], color=c, label='$f(x)$',**figure_options)
        ax.tick_params(axis='y', color=c, labelcolor=c)
        ax.legend(loc='upper right', fontsize=15)
        
        ax1.plot(x_probe, y_ij_grads[idx], color=c1, label="$\\nabla f_{ij}$", **figure_options)
        ax1.tick_params(axis='y',color=c1, labelcolor=c1)
        ax1.legend(loc='lower left', fontsize=15)

        ax.set_title(title, fontsize=15)
    fig.tight_layout()

In [12]:
#plot_f_and_grad(distiller.teacher)

In [13]:
#plot_f_and_grad(distiller.student)

## Train a simple student model from scratch without distillation

In [14]:
# train a deep bind model by itself 
deepbind_model = get_student_model(L, A)

# compile the model 
lossfn = tfk.losses.BinaryCrossentropy(name='bce')
modelmetrics = [tfk.metrics.BinaryAccuracy(name='ACC'), tfk.metrics.AUC(curve='PR', name='AUPR'), tfk.metrics.AUC(curve='ROC', name='AUROC')]
optimizer = tfk.optimizers.Adam(learning_rate=1e-2)
deepbind_model.compile(loss=lossfn, metrics=modelmetrics, optimizer=optimizer)

# fit the teacher model 
num_epochs = 100
callbacks = [tfk.callbacks.EarlyStopping(monitor='val_AUROC', patience=20), 
             tfk.callbacks.ModelCheckpoint("best_deepbind_model.hdf5", monitor='val_AUROC', mode='max', save_best_only=True)]
deepbind_model.fit(x_train, y_train, 
                    epochs=num_epochs, 
                    batch_size=128, 
                    callbacks=callbacks, 
                    shuffle=True, 
                    initial_epoch=0,
                    validation_data=(x_valid, y_valid))
deepbind_model = tfk.models.load_model('best_deepbind_model.hdf5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


## Compute metrics on all 3 models

In [15]:
distilled_student_metrics = distiller.evaluate(x_test, y_test, verbose=False)
teacher_metrics = distiller.teacher.evaluate(x_test, y_test, verbose=False)
deepbind_from_scratch_metrics = deepbind_model.evaluate(x_test, y_test, verbose=False)
names = deepbind_model.metrics_names
df = pd.DataFrame(data={'Name':names, 'Student (distilled)':distilled_student_metrics, 'Student (from scratch)':deepbind_from_scratch_metrics, 'Teacher ':teacher_metrics})
df

Unnamed: 0,Name,Student (distilled),Student (from scratch),Teacher
0,loss,0.342785,0.427017,0.20147
1,ACC,0.875,0.825,0.917
2,AUPR,0.949092,0.914146,0.979375
3,AUROC,0.948674,0.916265,0.979556


In [16]:
df.to_csv('modelmetrics-greaterrange-mutlihead')

##Run experiment on varying alpha and temperature values



In [18]:
alpha_values = [.1,.15,.2,.25,.3,.35,.4,.45,.5,.55,.6,.65,.7,.75,.8,.85,.9,.95]
temperature_values = [1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.,15.]

experiment_data=[]


for alpha_value in alpha_values:
  for temperature_value in temperature_values:
    print('Alpha: ',alpha_value, ', Temperature: ',temperature_value)
    # instantiate the student model and the distiller
    student_model = get_student_model(L, A)
    distiller = Distiller(student_model, teacher_model)
    # compile the distiller
    alpha = alpha_value
    temperature = temperature_value
    distiller.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        metrics=modelmetrics,
        student_loss_fn=keras.losses.BinaryCrossentropy(name='bce'),
        distillation_loss_fn=keras.losses.KLDivergence(),
        alpha=alpha,
        temperature=temperature,
    )

    # perform distillation
    num_epochs = 50
    callbacks = [tfk.callbacks.EarlyStopping(monitor='val_AUROC', patience=20), 
                tfk.callbacks.ModelCheckpoint("best_distiller.hdf5", monitor='val_AUROC', mode='max',save_weights_only=True, save_best_only=True)]
    distiller.fit(x_train, y_train, 
                    epochs=num_epochs, 
                    batch_size=128, 
                    callbacks=callbacks, 
                    shuffle=True, 
                    validation_data=(x_valid, y_valid),
                    verbose=0)
    
    #evaluate and save Distilled metrics
    experiment_dist_student_metrics = distiller.evaluate(x_test, y_test, verbose=False)
    hyperparameters = [alpha_value, temperature_value]
    all_values = hyperparameters+experiment_dist_student_metrics
    #add data to list
    experiment_data.append(all_values)



Alpha:  0.1 , Temperature:  1.0
Alpha:  0.1 , Temperature:  2.0
Alpha:  0.1 , Temperature:  3.0
Alpha:  0.1 , Temperature:  4.0
Alpha:  0.1 , Temperature:  5.0
Alpha:  0.1 , Temperature:  6.0
Alpha:  0.1 , Temperature:  7.0
Alpha:  0.1 , Temperature:  8.0
Alpha:  0.1 , Temperature:  9.0
Alpha:  0.1 , Temperature:  10.0
Alpha:  0.1 , Temperature:  11.0
Alpha:  0.1 , Temperature:  12.0
Alpha:  0.1 , Temperature:  13.0
Alpha:  0.1 , Temperature:  14.0
Alpha:  0.1 , Temperature:  15.0
Alpha:  0.15 , Temperature:  1.0
Alpha:  0.15 , Temperature:  2.0
Alpha:  0.15 , Temperature:  3.0
Alpha:  0.15 , Temperature:  4.0
Alpha:  0.15 , Temperature:  5.0
Alpha:  0.15 , Temperature:  6.0
Alpha:  0.15 , Temperature:  7.0
Alpha:  0.15 , Temperature:  8.0
Alpha:  0.15 , Temperature:  9.0
Alpha:  0.15 , Temperature:  10.0
Alpha:  0.15 , Temperature:  11.0
Alpha:  0.15 , Temperature:  12.0
Alpha:  0.15 , Temperature:  13.0
Alpha:  0.15 , Temperature:  14.0
Alpha:  0.15 , Temperature:  15.0
Alpha:  0.2 ,

In [19]:
#put results into a data table
df = pd.DataFrame(experiment_data)
columns = ['alpha', 'temperature', 'loss', 'ACC', 'AUPR', 'AUROC']
df.columns=columns
df


Unnamed: 0,alpha,temperature,loss,ACC,AUPR,AUROC
0,0.10,1.0,0.451685,0.85425,0.931255,0.931530
1,0.10,2.0,0.455778,0.85950,0.939064,0.937212
2,0.10,3.0,0.384773,0.85675,0.935255,0.933663
3,0.10,4.0,0.409642,0.87125,0.948614,0.948540
4,0.10,5.0,0.403470,0.83400,0.915742,0.916235
...,...,...,...,...,...,...
265,0.95,11.0,0.322380,0.88225,0.950412,0.950509
266,0.95,12.0,0.430213,0.87175,0.945858,0.945768
267,0.95,13.0,0.382494,0.87650,0.948147,0.948070
268,0.95,14.0,0.340258,0.88475,0.953421,0.952706


In [20]:
df.to_csv('performancemetrics-greaterrange-multihead')

In [21]:
#find best performing combination
max_metric = experiment_data[0]
for metrics in experiment_data:
  if metrics[3]>max_metric[3]:
    max_metric = metrics

mf = pd.DataFrame(data={'Name':columns, 'Best Performance':max_metric})
mf

Unnamed: 0,Name,Best Performance
0,alpha,0.95
1,temperature,9.0
2,loss,0.368708
3,ACC,0.89625
4,AUPR,0.961247
5,AUROC,0.962551
