In [1]:
import os
import tensorflow as tf
from util import constants
from util.config_util import get_model_params, get_task_params, get_train_params
from tf2_models.trainer import Trainer
from absl import app
from absl import flags
import numpy as np
from util.models import MODELS
from util.tasks import TASKS
from notebook_utils import *

%matplotlib inline
import pandas as pd
import seaborn as sns; sns.set()

from tqdm import tqdm

In [71]:
def get_reps(inputs, model, index=1, layer=None):
    """Pick one representation out of ``model.detailed_call(inputs)``.

    Args:
        inputs: Batch forwarded unchanged to ``model.detailed_call``.
        model: Object exposing ``detailed_call(inputs)`` with an indexable result.
        index: Position of the wanted entry in that result. Per the original
            notes, if the model is an LSTM:
                1: final_rnn_outputs
                2: hidden_activation (for all layers, including input embeddings)
        layer: Optional second-level index applied to the selected entry;
            ``None`` returns the entry untouched.

    Returns:
        ``outputs[index]``, or ``outputs[index][layer]`` when ``layer`` is given.
    """
    selected = model.detailed_call(inputs)[index]
    return selected if layer is None else selected[layer]

def normalized_pairwisedot_product_sim(reps1, reps2):
    """Cosine similarity between every vector of ``reps1`` and every vector of ``reps2``.

    Both inputs are L2-normalised along their last axis and then multiplied
    as ``reps1 @ reps2^T``.

    Args:
        reps1: Float tensor whose last axis holds the representation vectors.
        reps2: Float tensor with the same last-axis size as ``reps1``.

    Returns:
        float32 tensor of pairwise similarities; for 2-D inputs of shapes
        (n, d) and (m, d) the result is (n, m).
    """
    # divide_no_nan maps an all-zero vector to zeros instead of NaN; for any
    # non-zero vector it is the same division as before.
    reps1 = tf.math.divide_no_nan(reps1, tf.norm(reps1, axis=-1, keepdims=True))
    reps2 = tf.math.divide_no_nan(reps2, tf.norm(reps2, axis=-1, keepdims=True))

    return tf.cast(tf.matmul(reps1, reps2, transpose_b=True), dtype=tf.float32)


def normalized_dot_product_sim(reps1, reps2, padding_mask):
    """Masked cosine similarity between corresponding vectors of two tensors.

    Args:
        reps1: Float tensor whose last axis holds the vectors to compare.
        reps2: Float tensor of the same shape as ``reps1``.
        padding_mask: Per-vector weights. Either 1-D (one weight per vector)
            or 2-D, in which case only column 0 is used (as before).

    Returns:
        Tensor with the last axis reduced away: the dot product of the
        L2-normalised vectors, multiplied by the mask.
    """
    # divide_no_nan keeps all-zero vectors at zero instead of producing NaN.
    reps1 = tf.math.divide_no_nan(reps1, tf.norm(reps1, axis=-1, keepdims=True))
    reps2 = tf.math.divide_no_nan(reps2, tf.norm(reps2, axis=-1, keepdims=True))

    # Elementwise product summed over the last axis == dot product per pair.
    dot_product = tf.reduce_sum(tf.multiply(reps1, reps2), axis=-1)

    mask = tf.cast(tf.convert_to_tensor(padding_mask), dot_product.dtype)
    if mask.shape.rank != 1:
        mask = mask[:, 0]

    return tf.multiply(dot_product, mask)
    
    
def second_order_rep_sim(reps1, reps2, padding_mask):
    """Second-order similarity between two sets of representations.

    Each set is first turned into its own pairwise cosine-similarity matrix;
    row i of the first matrix is then compared (cosine) with row i of the
    second, and the per-row scores are averaged using ``padding_mask``.

    Args:
        reps1: 2-D float tensor, one representation per row.
        reps2: 2-D float tensor with the same number of rows as ``reps1``.
        padding_mask: Per-row weights with one entry per row of ``reps1``
            (shape (n,) or (n, 1)); ``None`` means every row counts.

    Returns:
        Tuple ``(mean_sim, so_sims)``: the mask-weighted mean and the masked
        per-row similarities.

    NOTE(review): masked rows still take part in the pairwise matrices; only
    the per-row scores and their average are masked.
    """
    sims1 = normalized_pairwisedot_product_sim(reps1, reps1)
    sims2 = normalized_pairwisedot_product_sim(reps2, reps2)

    # Honour the caller's mask (it used to be overwritten with all ones here).
    if padding_mask is None:
        mask = tf.ones((tf.shape(sims1)[0], 1), dtype=sims1.dtype)
    else:
        mask = tf.reshape(tf.cast(padding_mask, sims1.dtype), (-1, 1))

    # normalized_dot_product_sim already applies the mask to each row.
    so_sims = normalized_dot_product_sim(sims1, sims2, mask)

    # divide_no_nan: a fully masked batch yields 0 rather than NaN.
    mean_sim = tf.math.divide_no_nan(tf.reduce_sum(so_sims), tf.reduce_sum(mask))

    return mean_sim, so_sims

def compare_models(inputs, model1, model2, index1=1, index2=1,layer1=None, layer2=None, padding_symbol=None):
    """Second-order representation similarity of two models on the same batch.

    Args:
        inputs: Batch fed to both models' ``detailed_call``.
        model1: First model.
        model2: Second model.
        index1: ``get_reps`` index used for ``model1``.
        index2: ``get_reps`` index used for ``model2``.
        layer1: Optional ``get_reps`` layer used for ``model1``.
        layer2: Optional ``get_reps`` layer used for ``model2``.
        padding_symbol: If given, positions where ``inputs == padding_symbol``
            get mask value 0.

    Returns:
        The ``(mean_sim, so_sims)`` tuple from ``second_order_rep_sim``.
    """
    # Forward the per-model selectors (they used to be silently ignored).
    reps1 = get_reps(inputs, model1, index=index1, layer=layer1)
    reps2 = get_reps(inputs, model2, index=index2, layer=layer2)

    # Collapse all leading axes so each row is one representation vector.
    reps1 = tf.reshape(reps1, (-1, tf.shape(reps1)[-1]))
    reps2 = tf.reshape(reps2, (-1, tf.shape(reps2)[-1]))

    if padding_symbol is not None:
        # 1.0 for real tokens, 0.0 for padding.
        # NOTE(review): assumes one representation per input position so the
        # flattened mask lines up with the flattened reps - confirm.
        padding_mask = tf.cast(tf.not_equal(inputs, padding_symbol), dtype=tf.float32)
        padding_mask = tf.reshape(padding_mask, (-1, 1))
    else:
        padding_mask = tf.ones((tf.shape(reps1)[0], 1))

    return second_order_rep_sim(reps1, reps2, padding_mask=padding_mask)

def compare_reps(reps1, reps2, padding_symbol=None, inputs=None):
    """Second-order similarity between two already-computed representations.

    Args:
        reps1: Float tensor; all leading axes are flattened into rows.
        reps2: Float tensor flattened the same way; must yield as many rows
            as ``reps1``.
        padding_symbol: If given, positions where ``inputs == padding_symbol``
            get mask value 0.
        inputs: Batch the representations were computed from; required
            when ``padding_symbol`` is given.

    Returns:
        The ``(mean_sim, so_sims)`` tuple from ``second_order_rep_sim``.

    Raises:
        ValueError: If ``padding_symbol`` is given without ``inputs``.
    """
    reps1 = tf.reshape(reps1, (-1, tf.shape(reps1)[-1]))
    reps2 = tf.reshape(reps2, (-1, tf.shape(reps2)[-1]))

    if padding_symbol is not None:
        if inputs is None:
            raise ValueError("`inputs` is required when `padding_symbol` is given")
        # 1.0 for real tokens, 0.0 for padding.
        # NOTE(review): assumes one representation per input position so the
        # flattened mask lines up with the flattened reps - confirm.
        padding_mask = tf.cast(tf.not_equal(inputs, padding_symbol), dtype=tf.float32)
        padding_mask = tf.reshape(padding_mask, (-1, 1))
    else:
        padding_mask = tf.ones((tf.shape(reps1)[0], 1))

    return second_order_rep_sim(reps1, reps2, padding_mask)

In [65]:
# Task used for the first (LSTM vs LSTM) comparison.
task_name = 'word_sv_agreement_vp'
chkpt_dir = '../tf_ckpts'

task = TASKS[task_name](
    get_task_params(),
    data_dir='../data',
)

# Encoded `constants.bos`, passed to the model loaders below as `cl_token`.
cl_token = task.databuilder.sentence_encoder().encode(constants.bos)

Vocab len:  10032


In [66]:
# Experiment description used to locate the student / teacher checkpoints.
config = dict(
    student_exp_name='lisa_fd131',
    teacher_exp_name='0.001_samira_offlineteacher_v11',
    teacher_config='small_lstm_v4',
    task_name='word_sv_agreement_vp',
    student_model='cl_lstm',
    teacher_model='cl_lstm',
    student_config='small_lstm_v4',
    distill_config='pure_dstl_4_crs_slw',
    distill_mode='offline',
    chkpt_dir='../tf_ckpts',
)

# Switches turned on for both models' hparams before loading.
_DETAIL_FLAGS = ('output_attentions', 'output_embeddings', 'output_hidden_states')

# Student -> model1
std_hparams = get_model_params(task, config['student_model'], config['student_config'])
for _flag in _DETAIL_FLAGS:
    setattr(std_hparams, _flag, True)
model1, _ = get_student_model(config, task, std_hparams, cl_token)

# Teacher -> model2
tchr_hparams = get_model_params(task, config['teacher_model'], config['teacher_config'])
for _flag in _DETAIL_FLAGS:
    setattr(tchr_hparams, _flag, True)
model2, _ = get_teacher_model(config, task, tchr_hparams, cl_token)


model config: small_lstm_v4
{'hidden_dim': 256, 'embedding_dim': 256, 'depth': 2, 'hidden_dropout_rate': 0.8, 'input_dropout_rate': 0.2, 'initializer_range': 0.1}
model config: small_lstm_v4
{'hidden_dim': 256, 'embedding_dim': 256, 'depth': 2, 'hidden_dropout_rate': 0.8, 'input_dropout_rate': 0.2, 'initializer_range': 0.1}
student_checkpoint: ../tf_ckpts/word_sv_agreement_vp/offline_pure_dstl_4_crs_slw_teacher_cl_lstm_em-256_h-256_d-2_hdrop-0.8_indrop-0.2_small_lstm_v4_0.001_samira_offlineteacher_v11_student_cl_lstm_em-256_h-256_d-2_hdrop-0.8_indrop-0.2_small_lstm_v4_lisa_fd131
Restored student from ../tf_ckpts/word_sv_agreement_vp/offline_pure_dstl_4_crs_slw_teacher_cl_lstm_em-256_h-256_d-2_hdrop-0.8_indrop-0.2_small_lstm_v4_0.001_samira_offlineteacher_v11_student_cl_lstm_em-256_h-256_d-2_hdrop-0.8_indrop-0.2_small_lstm_v4_lisa_fd131/ckpt-60
model config: small_lstm_v4
{'hidden_dim': 256, 'embedding_dim': 256, 'depth': 2, 'hidden_dropout_rate': 0.8, 'input_dropout_rate': 0.2, 'initia

In [79]:
# Task used for the second (GPT-2 student vs LSTM teacher) comparison.
task_name = 'word_sv_agreement_lm'
chkpt_dir = '../tf_ckpts'

task = TASKS[task_name](
    get_task_params(),
    data_dir='../data',
)

# Encoded `constants.bos`, passed to the model loaders below as `cl_token`.
cl_token = task.databuilder.sentence_encoder().encode(constants.bos)

# Experiment description used to locate the student / teacher checkpoints.
config = dict(
    student_exp_name='lisa_fd432',
    teacher_exp_name='0.001_lisa_crs_fst_offlineteacher_v20',
    teacher_config='lstm_drop31_v2',
    task_name='word_sv_agreement_lm',
    student_model='lm_gpt2',
    teacher_model='lm_lstm_shared_emb',
    student_config='very_big_gpt_v10',
    distill_config='dstl_6_crs_slw',
    distill_mode='offline',
    chkpt_dir='../tf_ckpts',
)

# Switches turned on for both models' hparams before loading.
_DETAIL_FLAGS = ('output_attentions', 'output_embeddings', 'output_hidden_states')

# Student -> model1
std_hparams = get_model_params(task, config['student_model'], config['student_config'])
for _flag in _DETAIL_FLAGS:
    setattr(std_hparams, _flag, True)
model1, _ = get_student_model(config, task, std_hparams, cl_token)

# Teacher -> model2
tchr_hparams = get_model_params(task, config['teacher_model'], config['teacher_config'])
for _flag in _DETAIL_FLAGS:
    setattr(tchr_hparams, _flag, True)
model2, _ = get_teacher_model(config, task, tchr_hparams, cl_token)


Vocab len:  10032
model config: very_big_gpt_v10
{'embedding_dim': 512, 'resid_pdrop': 0.4, 'embd_pdrop': 0.2, 'attn_pdrop': 0.6, 'initializer_range': 0.05}
model config: lstm_drop31_v2
{'hidden_dim': 512, 'embedding_dim': 512, 'depth': 2, 'hidden_dropout_rate': 0.3, 'input_dropout_rate': 0.2}
student_checkpoint: ../tf_ckpts/word_sv_agreement_lm/offline_dstl_6_crs_slw_teacher_lm_lstm_shared_emb_em-512_h-512_d-2_hdrop-0.3_indrop-0.2_lstm_drop31_v2_0.001_lisa_crs_fst_offlineteacher_v20_student_lm_gpt2_h-512_d-6_rdrop-0.4_adrop-0.6_indrop-0.2_very_big_gpt_v10_lisa_fd432
Restored student from ../tf_ckpts/word_sv_agreement_lm/offline_dstl_6_crs_slw_teacher_lm_lstm_shared_emb_em-512_h-512_d-2_hdrop-0.3_indrop-0.2_lstm_drop31_v2_0.001_lisa_crs_fst_offlineteacher_v20_student_lm_gpt2_h-512_d-6_rdrop-0.4_adrop-0.6_indrop-0.2_very_big_gpt_v10_lisa_fd432/ckpt-60
model config: lstm_drop31_v2
{'hidden_dim': 512, 'embedding_dim': 512, 'depth': 2, 'hidden_dropout_rate': 0.3, 'input_dropout_rate': 0.2}

In [74]:
# Sanity check of the similarity measure on the first validation batch only
# (the loop breaks after one iteration).
for inputs, labels in task.valid_dataset:
    reps1 = get_reps(inputs, model1, index=1, layer=None)
    reps2 = get_reps(inputs, model2, index=1, layer=None)

    # The per-row scores go into `pair_sims` rather than `all`, so the builtin
    # all() is not shadowed for the rest of the kernel session.

    # A model compared with itself, through both entry points.
    mean, pair_sims = compare_reps(reps1, reps1)
    print(mean)
    mean, pair_sims = compare_models(inputs, model1, model1)
    print(mean)

    # Cross-model comparison in both argument orders.
    mean, pair_sims = compare_reps(reps1, reps2)
    print(mean)
    mean, pair_sims = compare_reps(reps2, reps1)
    print(mean)

    mean, pair_sims = compare_reps(reps2, reps2)
    print(mean)
    break


tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(0.9217837, shape=(), dtype=float32)
tf.Tensor(0.9217837, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)


In [None]:
# Full `detailed_call` result of model2 for the `inputs` batch left in scope by the loop cell.
outputs = model2.detailed_call(inputs)

In [None]:
# Entry 1 of the result: the one `get_reps` selects with its default `index=1`.
outputs[1]

In [None]:
# Number of entries `detailed_call` returned.
len(outputs)

In [None]:
# Same self-comparison as the sanity check, but with padding masked out.
# NOTE(review): assumes token id 0 is the padding symbol - confirm against the task vocabulary.
for inputs, labels in task.valid_dataset:
    reps1 = get_reps(inputs, model1, index=1, layer=None)
    reps2 = get_reps(inputs, model2, index=1, layer=None)

    # `pair_sims` rather than `all`, so the builtin all() is not shadowed.
    mean, pair_sims = compare_reps(reps1, reps1, padding_symbol=0, inputs=inputs)
    print(mean)

    # compare_models already takes `inputs` positionally; passing it again as
    # a keyword raised "got multiple values for argument 'inputs'".
    mean, pair_sims = compare_models(inputs, model1, model1, padding_symbol=0)
    print(mean)
    break
