In [1]:
!pip install -q tf-models-official==2.3.0

In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub

os.environ["TFHUB_CACHE_DIR"] = "gs://nts2020-tpu"

from official import nlp
from official.modeling import tf_utils
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

import json

In [35]:
# Resolve the named Cloud TPU, connect this runtime to it and initialise the
# TPU system. `tpu` (the resolver) is reused later to build the TPUStrategy.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='tpu-quickstart')
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
  # Raise a regular exception type (RuntimeError is still a BaseException, so
  # existing handlers keep working) and chain the original error so the
  # underlying cause is not lost.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err





INFO:tensorflow:Initializing the TPU system: tpu-quickstart


INFO:tensorflow:Initializing the TPU system: tpu-quickstart


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


Running on TPU  ['10.201.178.138:8470']


In [4]:
tf.__version__

'2.3.0'

In [5]:
def single_file_dataset(input_file, name_to_features, num_samples=None):
  """Builds a dataset of decoded examples from TFRecord input.

  Args:
    input_file: A TFRecord path, or a list of paths, handed to
      `tf.data.TFRecordDataset`.
    name_to_features: Feature spec forwarded to `decode_record`.
    num_samples: If truthy, only the first `num_samples` raw records are kept.

  Returns:
    A `tf.data.Dataset` yielding the dicts produced by `decode_record`.
  """
  records = tf.data.TFRecordDataset(input_file)
  if num_samples:
    records = records.take(num_samples)

  def _decode(serialized):
    return decode_record(serialized, name_to_features)

  examples = records.map(
      _decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # With a single path (given as a string or as a one-element list), switch
  # auto-sharding off so that the same input file is sent to all workers.
  single_input = isinstance(input_file, str) or len(input_file) == 1
  if not single_input:
    return examples

  shard_options = tf.data.Options()
  shard_options.experimental_distribute.auto_shard_policy = (
      tf.data.experimental.AutoShardPolicy.OFF)
  return examples.with_options(shard_options)

In [6]:
def decode_record(record, name_to_features):
  """Parses one serialized tf.Example and narrows int64 features to int32.

  Args:
    record: A serialized example, as read from a TFRecord file.
    name_to_features: Feature spec passed to `tf.io.parse_single_example`.

  Returns:
    The parsed feature dict, with every int64 tensor cast to int32.
  """
  parsed = tf.io.parse_single_example(record, name_to_features)

  # tf.Example only stores integers as int64, but the TPU only supports int32.
  for key, tensor in list(parsed.items()):
    if tensor.dtype == tf.int64:
      parsed[key] = tf.cast(tensor, tf.int32)

  return parsed

In [7]:
def create_classifier_dataset(file_path,
                              seq_length,
                              batch_size,
                              task_id,
                              is_training=True,
                              input_pipeline_context=None,
                              label_type=tf.int64,
                              lang_id = 0,
                              include_sample_weights=False,
                              num_samples=None):
  """Creates an (unbatched) two-task input dataset from TFRecord files.

  Args:
    file_path: TFRecord path(s), forwarded to `single_file_dataset`.
    seq_length: Length of the `input_ids`, `input_mask` and `segment_ids`
      features.
    batch_size: Accepted for interface compatibility but not used here; the
      returned dataset is unbatched and callers apply `.batch(...)` themselves.
    task_id: Which label slot receives the real label: 0 yields labels
      `(y, -1)`, 1 yields labels `(-1, y)`. Not consulted when
      `include_sample_weights` is true.
    is_training: If true, the dataset is shuffled (buffer of 100) and repeated.
    input_pipeline_context: Optional context; when it reports more than one
      input pipeline the dataset is sharded across them.
    label_type: dtype used to parse the `label_ids` feature.
    lang_id: Constant attached to every example under the `lang_id` key.
    include_sample_weights: If true, a float `weight` feature is also parsed
      and elements are `(features, label, weight)`.
    num_samples: Optional cap on the number of raw records read.

  Returns:
    A prefetched `tf.data.Dataset`.

  Raises:
    ValueError: If `task_id` is neither 0 nor 1 while sample weights are not
      requested. Previously the mapping function silently returned `None` in
      that case, which only failed later inside `Dataset.map`.
  """
  if not include_sample_weights and task_id not in (0, 1):
    raise ValueError(
        'task_id must be 0 or 1, got {!r}'.format(task_id))

  name_to_features = {
      'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
      'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
      'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
      'label_ids': tf.io.FixedLenFeature([], label_type),
  }
  if include_sample_weights:
    name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32)
  dataset = single_file_dataset(file_path, name_to_features,
                                num_samples=num_samples)

  # The dataset is always sharded by number of hosts.
  # num_input_pipelines is the number of hosts rather than number of cores.
  if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
    dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
                            input_pipeline_context.input_pipeline_id)

  def _select_data_from_record(record):
    """Maps a decoded record to `(features, labels[, weight])`."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids'],
        'lang_id': lang_id
    }
    y = record['label_ids']
    if include_sample_weights:
      return (x, y, record['weight'])
    # The label slot of the task this example does not belong to is filled
    # with a constant -1 placeholder.
    default = tf.constant(-1, dtype=tf.int32)
    if task_id == 0:
      return (x, (y, default))
    # task_id was validated above, so this is the task_id == 1 case.
    return (x, (default, y))

  if is_training:
    dataset = dataset.shuffle(100)
    dataset = dataset.repeat()

  dataset = dataset.map(
      _select_data_from_record,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Batching is intentionally not done here (see `batch_size` above).
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

In [8]:
# Hyperparameters handed to BertConfig.from_dict; the resulting `bert_config`
# is read later for the initializer range and dropout rate.
config_dict = {
    "attention_probs_dropout_prob": 0.1,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": 119547,
}

bert_config = bert.configs.BertConfig.from_dict(config_dict)

In [9]:
tf_records_filenames = ["gs://nts2020/xtereme/pawsx/train.en.tfrecords", "gs://nts2020/xtereme/xnli/train.en.tfrecords"]

# Count the records in each English training file, then turn the counts into
# proportions of the combined total.
sampling_factor = []
for fn in tf_records_filenames:
    # tf.data.TFRecordDataset replaces the deprecated
    # tf.compat.v1.python_io.tf_record_iterator; reduce() counts the records
    # without pulling each one into Python.
    c = int(tf.data.TFRecordDataset(fn).reduce(
        tf.constant(0, dtype=tf.int64), lambda count, _: count + 1))
    sampling_factor.append(c)
    print(c)
# `c` is left holding the total record count across the files above.
c = sum(sampling_factor)
sampling_factor = [count / c for count in sampling_factor]
sampling_factor

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


49401
392702


[0.11174092914999446, 0.8882590708500055]

In [10]:
# Translated training files, one per language.
tf_records_filenames = [
    "gs://nts2020/xtreme/translate_train/train.ar.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.bg.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.de.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.el.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.es.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.fr.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.hi.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.ru.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.sw.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.th.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.tr.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.ur.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.vi.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.zh.tfrecords",
]


In [11]:
tf_records_filenames = [
    "gs://nts2020/xtreme/translate_train/train.ar.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.bg.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.de.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.el.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.es.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.fr.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.hi.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.ru.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.sw.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.th.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.tr.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.ur.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.vi.tfrecords",
    "gs://nts2020/xtreme/translate_train/train.zh.tfrecords",
]

# Count the records in each translated training file, then turn the counts
# into proportions of the combined total.
other_langs_sampling_factor = []
for fn in tf_records_filenames:
    # tf.data.TFRecordDataset replaces the deprecated
    # tf.compat.v1.python_io.tf_record_iterator; reduce() counts the records
    # without pulling each one into Python.
    c = int(tf.data.TFRecordDataset(fn).reduce(
        tf.constant(0, dtype=tf.int64), lambda count, _: count + 1))
    other_langs_sampling_factor.append(c)
    print(c)
# `c` is left holding the total record count across the files above; the
# training cell later reads it as `train_data_size`.
c = sum(other_langs_sampling_factor)
other_langs_sampling_factor = [count / c for count in other_langs_sampling_factor]
other_langs_sampling_factor

392702
392702
392702
392702
392702
392702
392702
392702
392702
392702
392702
392702
392702
392702


[0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142]

In [12]:
# Blend the two groups of proportions: the translated-language files share
# `other_lang_aggregate_weight` of the sampling mass, the English files share
# the remainder.
other_lang_count = len(tf_records_filenames)
other_lang_aggregate_weight = 0.9
train_sampling_factor = [
    (share * (1 - other_lang_aggregate_weight)) / sum(sampling_factor)
    for share in sampling_factor
] + [
    share * other_lang_aggregate_weight
    for share in other_langs_sampling_factor
]

In [13]:
train_sampling_factor[0] = 0.03

In [13]:
# Hard-coded sampling weights: two leading entries followed by fourteen equal
# shares.
train_sampling_factor = (
    [0.03, 0.08882590708500053]
    + 14 * [0.06428571428571428]
)

In [14]:
# Fold whatever the weights are short of (or over) 1 into entry 1.
if not sum(train_sampling_factor) == 1:
  train_sampling_factor[1] = train_sampling_factor[1] + (1 - sum(train_sampling_factor))

In [36]:
def _loss_with_filter(y_true, y_pred):
  """Mean softmax cross-entropy between integer labels and logits.

  Args:
    y_true: Integer class labels; flattened to one dimension.
    y_pred: Logits whose last axis indexes the classes.

  Returns:
    A scalar: the mean over all flattened examples of the per-example loss.

  NOTE(review): per the tf.one_hot contract an out-of-range label (such as a
  -1 placeholder) should give an all-zero row, so such examples would add 0
  to the sum while still counting in the mean -- confirm this is intended.
  """
  num_classes = y_pred.get_shape().as_list()[-1]
  flat_log_probs = tf.reshape(
      tf.nn.log_softmax(y_pred, axis=-1), [-1, num_classes])
  flat_labels = tf.reshape(y_true, [-1])
  targets = tf.one_hot(flat_labels, depth=num_classes, dtype=tf.float32)
  example_losses = -tf.reduce_sum(targets * flat_log_probs, axis=-1)
  return tf.reduce_mean(example_losses)

In [37]:
import tensorflow.keras.backend as K
def accuracy_mod(y_true, y_pred):
  """Per-example correctness, restricted to examples with a label >= 0.

  Args:
    y_true: Ground-truth labels; negative entries are excluded from the result.
    y_pred: Logits whose last axis indexes the classes.

  Returns:
    A float tensor with one entry per retained example: 1.0 where the
    predicted class equals the label, 0.0 otherwise.
  """
  # When labels and predictions have the same rank, drop the labels' last axis
  # so the comparisons below operate directly on the label values.
  if K.ndim(y_true) == K.ndim(y_pred):
    y_true = K.squeeze(y_true, -1)

  # Predicted class = position of the largest softmax probability.
  predicted = K.argmax(K.softmax(y_pred, axis=-1), axis=-1)

  # Positions whose label is non-negative.
  zero = K.constant([0], dtype=tf.float32)
  keep = K.reshape(tf.where(K.greater_equal(y_true, zero)), [-1])

  # Restrict both tensors to those positions and compare as floats.
  kept_true = K.cast(K.gather(y_true, keep), K.floatx())
  kept_pred = K.cast(K.gather(predicted, keep), K.floatx())
  return K.cast(K.equal(kept_true, kept_pred), K.floatx())

In [38]:
# Shared training hyperparameters.
# `epochs` feeds the learning-rate schedule arithmetic in the training cell;
# the `model.fit` call there passes its own epoch count.
epochs = 3
# Passed to create_classifier_dataset and used for Dataset.batch(...).
batch_size = 64
# NOTE(review): not referenced by any other cell visible in this notebook.
eval_batch_size = 64
# Sequence length of the tokenised inputs (dataset features and model inputs).
max_seq_length = 128

In [39]:
# English PAWS-X and XNLI splits. `task_id` selects the label slot that holds
# the real label: 0 -> first slot, 1 -> second slot. The sequence length comes
# from `max_seq_length` rather than a repeated literal, matching the
# per-language eval datasets.
paws_training_dataset = create_classifier_dataset(
    "gs://nts2020/xtereme/pawsx/train.en.tfrecords",
    max_seq_length,
    batch_size,
    task_id=0,
    is_training=True)

xnli_training_dataset = create_classifier_dataset(
    "gs://nts2020/xtereme/xnli/train.en.tfrecords",
    max_seq_length,
    batch_size,
    task_id=1,
    is_training=True)

paws_eval_dataset = create_classifier_dataset(
    "gs://nts2020/xtereme/pawsx/eval.en.tfrecords",
    max_seq_length,
    batch_size,
    task_id=0,
    is_training=False)

xnli_eval_dataset = create_classifier_dataset(
    "gs://nts2020/xtereme/xnli/eval.en.tfrecords",
    max_seq_length,
    batch_size,
    task_id=1,
    is_training=False)

In [40]:
# Languages of the translated XNLI training files, in lang_id order
# (ar -> 1, bg -> 2, ..., zh -> 14).
translate_train_langs = ["ar", "bg", "de", "el", "es", "fr", "hi",
                         "ru", "sw", "th", "tr", "ur", "vi", "zh"]
tf_records_filenames = [
    "gs://nts2020/xtreme/translate_train/train.%s.tfrecords" % lang
    for lang in translate_train_langs
]

# One training dataset per language, built by a single comprehension instead
# of fourteen copy-pasted calls. All use the second label slot (task_id=1) and
# carry the language's 1-based position as `lang_id`.
(xnli_ar_training_dataset,
 xnli_bg_training_dataset,
 xnli_de_training_dataset,
 xnli_el_training_dataset,
 xnli_es_training_dataset,
 xnli_fr_training_dataset,
 xnli_hi_training_dataset,
 xnli_ru_training_dataset,
 xnli_sw_training_dataset,
 xnli_th_training_dataset,
 xnli_tr_training_dataset,
 xnli_ur_training_dataset,
 xnli_vi_training_dataset,
 xnli_zh_training_dataset) = [
    create_classifier_dataset(
        path,
        max_seq_length,
        batch_size,
        task_id=1,
        lang_id=lang_index,
        is_training=True)
    for lang_index, path in enumerate(tf_records_filenames, start=1)
]

In [41]:
# One XNLI dev-set dataset per language, built by a single comprehension
# instead of fourteen copy-pasted calls. All use the second label slot
# (task_id=1) and the default lang_id.
(xnli_ar_eval_dataset,
 xnli_bg_eval_dataset,
 xnli_de_eval_dataset,
 xnli_el_eval_dataset,
 xnli_es_eval_dataset,
 xnli_fr_eval_dataset,
 xnli_hi_eval_dataset,
 xnli_ru_eval_dataset,
 xnli_sw_eval_dataset,
 xnli_th_eval_dataset,
 xnli_tr_eval_dataset,
 xnli_ur_eval_dataset,
 xnli_vi_eval_dataset,
 xnli_zh_eval_dataset) = [
    create_classifier_dataset(
        "gs://nts2020/xtreme/xnli_w_dev/eval_%s.tfrecords" % lang,
        max_seq_length,
        batch_size,
        task_id=1,
        is_training=False)
    for lang in ("ar", "bg", "de", "el", "es", "fr", "hi",
                 "ru", "sw", "th", "tr", "ur", "vi", "zh")
]

In [None]:
# Scale every weight by (1 - delta), then give the freed `delta` mass to
# entry 13. The list is updated in place, so re-running this cell compounds
# the effect.
delta = 0.50

train_sampling_factor[:] = [
    weight * (1 - delta) for weight in train_sampling_factor
]
train_sampling_factor[13] += delta

In [30]:
# Fold whatever the weights are short of (or over) 1 into entry 1.
if not sum(train_sampling_factor) == 1:
  train_sampling_factor[1] = train_sampling_factor[1] + (1 - sum(train_sampling_factor))

In [329]:
# Sixteen sampling weights written out directly.
train_sampling_factor = [
    0.01,
    0.54957585, 0.00897124, 0.00887785, 0.00799495, 0.0105019,
    0.00910511, 0.00809789, 0.33139596, 0.00878845, 0.0100313,
    0.00971826, 0.00991482, 0.00882816, 0.00819859, 0.00999967,
]

# Lower entry 1 by 0.04, then fold whatever the weights are short of (or
# over) 1 back into that same entry.
train_sampling_factor[1] -= 0.04
if not sum(train_sampling_factor) == 1:
  train_sampling_factor[1] = train_sampling_factor[1] + (1 - sum(train_sampling_factor))

In [42]:
# Hand-tuned relative sampling weights, positionally matched to the list of
# datasets passed to sample_from_datasets. Entries 1 and 8 are boosted
# relative to the others.
train_sampling_factor = [0.03,
 0.30,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.50,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428,
 0.06428571428571428]

# The raw values total roughly 1.67. Folding the residual (1 - total) into
# entry 1 would push it below zero (0.30 - 0.67), which is not a valid
# sampling weight. Rescale every entry proportionally instead, so the weights
# stay non-negative and sum to 1.
_weight_total = sum(train_sampling_factor)
train_sampling_factor = [weight / _weight_total for weight in train_sampling_factor]

In [43]:
# Mix the sixteen training datasets; the i-th dataset is drawn with weight
# train_sampling_factor[i].
training_dataset = tf.data.experimental.sample_from_datasets(
    [
        paws_training_dataset,
        xnli_training_dataset,
        xnli_ar_training_dataset,
        xnli_bg_training_dataset,
        xnli_de_training_dataset,
        xnli_el_training_dataset,
        xnli_es_training_dataset,
        xnli_fr_training_dataset,
        xnli_hi_training_dataset,
        xnli_ru_training_dataset,
        xnli_sw_training_dataset,
        xnli_th_training_dataset,
        xnli_tr_training_dataset,
        xnli_ur_training_dataset,
        xnli_vi_training_dataset,
        xnli_zh_training_dataset,
    ],
    weights=tf.constant(
        [train_sampling_factor[position] for position in range(16)]))


In [44]:
# iterator = training_dataset.as_numpy_iterator()

In [45]:
# training_dataset = tf.data.experimental.sample_from_datasets(
#     [paws_training_dataset, xnli_sw_training_dataset], weights = tf.constant([0.5,0.5]))


In [46]:
evaluation_sampling_factor = [0.3 , 0.7]

In [47]:
# evaluation_dataset = tf.data.experimental.sample_from_datasets(
#     [paws_eval_dataset, xnli_eval_dataset, xnli_ar_eval_dataset, xnli_bg_eval_dataset, xnli_de_eval_dataset, xnli_el_eval_dataset, xnli_es_eval_dataset, 
#      xnli_fr_eval_dataset, xnli_hi_eval_dataset, xnli_ru_eval_dataset, xnli_sw_eval_dataset, xnli_th_eval_dataset,
#      xnli_tr_eval_dataset, xnli_ur_eval_dataset, xnli_vi_eval_dataset, xnli_zh_eval_dataset], weights=tf.constant([train_sampling_factor[0], train_sampling_factor[1],
#                                                                                                                                    train_sampling_factor[2], train_sampling_factor[3],
#                                                                                                                                    train_sampling_factor[4],train_sampling_factor[5],
#                                                                                                                                    train_sampling_factor[6],train_sampling_factor[7],
#                                                                                                                                    train_sampling_factor[8],train_sampling_factor[9],
#                                                                                                                                    train_sampling_factor[10],train_sampling_factor[11],
#                                                                                                                                    train_sampling_factor[12],train_sampling_factor[13],
#                                                                                                                                    train_sampling_factor[14],train_sampling_factor[15]]))


In [48]:
# Validation stream: English PAWS-X eval mixed with Hindi XNLI eval, drawn
# with the two weights in `evaluation_sampling_factor`.
evaluation_dataset = tf.data.experimental.sample_from_datasets(
    [paws_eval_dataset, xnli_hi_eval_dataset],
    weights=tf.constant(
        [evaluation_sampling_factor[position] for position in range(2)]))


In [49]:
# evaluation_dataset = tf.data.experimental.sample_from_datasets(
#     [paws_eval_dataset, xnli_eval_dataset], weights=tf.constant([sampling_factor[0], sampling_factor[1]])
# )

In [50]:
class myCallback(tf.keras.callbacks.Callback):
    """Stops training once `val_output1_accuracy_mod` exceeds a threshold.

    Args:
      acc_thresh: Accuracy (as a fraction) above which training is stopped.
        Defaults to 0.71, the value that was previously hard-coded in the
        comparison.
    """

    def __init__(self, acc_thresh=0.71):
        super().__init__()
        # Stored on the instance: the stop message previously referenced an
        # undefined global `acc_thresh`, raising NameError exactly when the
        # threshold was reached.
        self.acc_thresh = acc_thresh

    def on_epoch_end(self, epoch, logs=None):
        """Checks the validation metric and requests a stop when it is high enough."""
        # `logs` may be None, and the metric is missing when it was not
        # computed for this epoch; comparing None with a float would raise
        # TypeError.
        val_acc = (logs or {}).get('val_output1_accuracy_mod')
        if val_acc is None:
            return
        if val_acc > self.acc_thresh:
            print("\nWe have reached %2.2f%% accuracy, so we will stopping training." % (self.acc_thresh * 100))
            self.model.stop_training = True

In [51]:
callbacks = myCallback()

In [None]:
# Alternative way of connecting to the TPU, kept for reference:
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu='tpu-quickstart', project = 'moana-intern-fall-2020')
# tf.config.experimental_connect_to_cluster(resolver)
# tf.tpu.experimental.initialize_tpu_system(resolver)
# strategy = tf.distribute.TPUStrategy(resolver)

# Build the distribution strategy from the `tpu` resolver created in the
# TPU-connection cell.
strategy = tf.distribute.TPUStrategy(tpu)

# The model, optimizer, compile and fit calls all run inside the strategy scope.
with strategy.scope():
    max_seq_length = 128
    initializer = tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range)
    # NOTE(review): `bert_encoder` is not referenced again in this cell; the
    # model below is built on the TF-Hub layer instead.
    bert_encoder = bert.bert_models.get_transformer_encoder(
        bert_config, max_seq_length)

    # The three int32 inputs fed to the BERT layer below.
    input_word_ids = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

    # BERT layer loaded from TF-Hub with trainable weights.
    bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2",
                                trainable=True)
    #bert_model = hub.KerasLayer(hub_url_bert, trainable=True)
    pooled_output, seq_output = bert_model([input_word_ids, input_mask, input_type_ids])

    # Head 1: dropout + 2-way dense layer on the pooled output.
    output1 = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
      pooled_output)

    output1 = tf.keras.layers.Dense(
      2, kernel_initializer=initializer, name='output1')(
          output1)

    # Head 2: dropout + 3-way dense layer on the same pooled output.
    output2 = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
      pooled_output)

    output2 = tf.keras.layers.Dense(
      3, kernel_initializer=initializer, name='output2')(
          output2)

    # NOTE(review): the datasets built by create_classifier_dataset also carry
    # a 'lang_id' feature, which is not among the model inputs declared here.
    model = tf.keras.Model(
          inputs={
              'input_word_ids': input_word_ids,
              'input_mask': input_mask,
              'input_type_ids': input_type_ids
          },
          outputs=[output1, output2])

    # Set up epochs and steps

    # NOTE(review): `c` is whatever the most recently run counting cell left
    # behind, so this value depends on cell execution order -- confirm.
    train_data_size = c
    steps_per_epoch = int(train_data_size / batch_size)
    # NOTE(review): the schedule length is derived from `epochs` and
    # `train_data_size`, whereas the fit call below runs 35 epochs of 1000
    # steps each -- confirm the mismatch is intended.
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

    # creates an optimizer with learning rate schedule
    optimizer = nlp.optimization.create_optimizer(
        2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

  
    
    
    # NOTE(review): both datasets are re-bound to their batched versions, so
    # running this cell a second time would batch them again.
    training_dataset = training_dataset.batch(batch_size)
    evaluation_dataset = evaluation_dataset.batch(batch_size, drop_remainder=True)
    
 

    # The same loss is used for both heads; `accuracy_mod` is the metric.
    model.compile(optimizer = optimizer, loss = [_loss_with_filter, _loss_with_filter], metrics = [accuracy_mod])
    # `callbacks` is the early-stopping callback instance created in an earlier cell.
    history = model.fit(training_dataset, batch_size = batch_size, epochs= 35, steps_per_epoch = 1000, validation_data=evaluation_dataset, callbacks = [callbacks])

INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Epoch 1/35


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "






Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
 176/1000 [====>.........................] - ETA: 1:29 - loss: 0.7820 - output1_loss: 0.0159 - output2_loss: 0.7661 - output1_accuracy_mod: 0.5521 - output2_accuracy_mod: 0.6546

In [341]:
# Hindi XNLI test split: built unshuffled with the second label slot
# (task_id=1), then batched with incomplete final batches dropped.
xnli_hi_test_dataset = create_classifier_dataset(
    file_path="gs://nts2020/xtreme/xnli_w_dev/test_hi.tf_record",
    seq_length=max_seq_length,
    batch_size=batch_size,
    task_id=1,
    is_training=False,
).batch(batch_size, drop_remainder=True)

In [342]:
model.evaluate(xnli_hi_test_dataset)



[nan, nan, nan, 0.0, 0.7125748991966248]

In [126]:
iterat = xnli_hi_test_dataset.as_numpy_iterator()
iterat.next()

({'input_word_ids': array([[ 101,  100, 1010, ...,    0,    0,    0],
         [ 101,  100, 1010, ...,    0,    0,    0],
         [ 101,  100, 1010, ...,    0,    0,    0],
         ...,
         [ 101,  100, 2018, ...,    0,    0,    0],
         [ 101,  100, 2018, ...,    0,    0,    0],
         [ 101,  100, 2215, ...,    0,    0,    0]], dtype=int32),
  'input_mask': array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]], dtype=int32),
  'input_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int32),
  'lang_id': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [67]:
# Read the eval_sw TFRecord file (task_id=1, is_training=False) and group it into
# batches of `batch_size` (a final partial batch is kept).
sw_impulse = create_classifier_dataset(
    "gs://nts2020/xtreme/xnli_w_dev/eval_sw.tfrecords",
    max_seq_length,
    batch_size,
    task_id=1,
    is_training=False,
).batch(batch_size)

In [68]:
# Pull one batch from `sw_impulse` as NumPy arrays for visual inspection.
iterat = sw_impulse.as_numpy_iterator()
next(iterat)  # built-in next() rather than the Python-2-style .next() method

({'input_word_ids': array([[  101, 10685, 12871, ...,     0,     0,     0],
         [  101, 10685, 12871, ...,     0,     0,     0],
         [  101, 10685, 12871, ...,     0,     0,     0],
         ...,
         [  101,   138, 12964, ...,     0,     0,     0],
         [  101,   138, 12964, ...,     0,     0,     0],
         [  101, 48511, 25327, ...,     0,     0,     0]], dtype=int32),
  'input_mask': array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]], dtype=int32),
  'input_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int32),
  'lang_id': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 

In [None]:
# Distribution strategy built from the `tpu` cluster resolver.
strategy = tf.distribute.TPUStrategy(tpu)
# NOTE(review): `c` is not defined in this cell -- presumably the number of training
# examples; confirm where it is set before running.
train_data_size = c
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
# Warm-up length: 10% of (epochs * train_data_size / batch_size), truncated to an int.
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [None]:
# Export `model` in SavedModel format (save_format='tf') to the GCS path below.
with strategy.scope():
    model.save('gs://nts2020/XNLI_SW_uniform_10epochs_tf',save_format='tf')

In [None]:
# Reload the SavedModel from the same GCS path. compile=False restores the model
# without compiling it, so it must be compiled again before evaluate/fit.
# `accuracy_mod` is supplied through custom_objects.
with strategy.scope():
    loaded_model = tf.keras.models.load_model('gs://nts2020/XNLI_SW_uniform_10epochs_tf', 
                                             compile=False,custom_objects={"accuracy_mod": accuracy_mod})


In [None]:
delta = 0.20

# Shrink every weight by the factor (1 - delta) in place, then give the freed
# mass `delta` to entry 1.
# NOTE(review): not idempotent -- each run of this cell shifts the weights again.
for idx, weight in enumerate(train_sampling_factor):
    train_sampling_factor[idx] = weight * (1 - delta)
train_sampling_factor[1] += delta

In [None]:
# Whatever is missing from (or exceeds) a total of 1 is absorbed by entry 1.
residual = 1 - sum(train_sampling_factor)
if residual != 0:
    train_sampling_factor[1] += residual

In [None]:
# 16 sampling weights: one value for entry 0, one for entry 1, and the same
# value repeated for the remaining 14 entries.
train_sampling_factor = [0.03, 0.08882590708500053] + 14 * [0.06428571428571428]

In [1112]:
# Display the current contents of `train_sampling_factor`.
train_sampling_factor

[0.01,
 0.4681950400000001,
 0.01420429,
 0.01311955,
 0.01560811,
 0.01659967,
 0.01972801,
 0.01462891,
 0.01473086,
 0.01443204,
 0.0133442,
 0.32546586,
 0.01524783,
 0.01558882,
 0.01494559,
 0.01416122]

In [314]:
# Run the weight-update routine with reward 0.66, starting from every entry of
# `train_sampling_factor` except the first, wrapped in a tf.Variable.
# NOTE(review): get_new_static_weights is defined in a cell further down the
# notebook, so that cell has to be executed before this one.
new_weights = get_new_static_weights(0.66, tf.Variable(train_sampling_factor[1:]))

tf.Tensor(
[0.10267981 0.0625526  0.06249352 0.06242887 0.06256116 0.06243307
 0.06230485 0.08465985 0.06258101 0.06269672 0.06235731 0.06264312
 0.06255352 0.06243089 0.06262369], shape=(15,), dtype=float64)
tf.Tensor(
[0.22551834 0.05092506 0.05088173 0.0508343  0.05093134 0.05083738
 0.05074327 0.11284001 0.0509459  0.05103074 0.05078178 0.05099144
 0.05092574 0.05083578 0.0509772 ], shape=(15,), dtype=float64)
tf.Tensor(
[0.40431934 0.03526531 0.03523852 0.0352092  0.03526919 0.03521111
 0.03515291 0.13265491 0.03527819 0.0353306  0.03517673 0.03530633
 0.03767179 0.03521012 0.03770576], shape=(15,), dtype=float64)
tf.Tensor(
[0.4238397  0.02803608 0.02801637 0.02799479 0.02803893 0.02990628
 0.02795336 0.20248316 0.02804555 0.02808411 0.02797089 0.02998112
 0.02979741 0.02799546 0.0318568 ], shape=(15,), dtype=float64)
tf.Tensor(
[0.37729983 0.03143109 0.03020035 0.03036671 0.03070844 0.03227983
 0.0318654  0.20770386 0.03052425 0.03187087 0.03141393 0.03435792
 0.03338099 0.03105

In [212]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.15863377, 0.05279175, 0.06050166, 0.05709644, 0.05539879,
       0.0476955 , 0.05519443, 0.10845616, 0.06775053, 0.05859151,
       0.05222267, 0.05620628, 0.0563138 , 0.06068141, 0.05246529],
      dtype=float32)

In [233]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.26438723, 0.0416121 , 0.03809939, 0.04833189, 0.04568637,
       0.03971065, 0.04196758, 0.16714864, 0.05173302, 0.04493313,
       0.03946248, 0.0451804 , 0.04849535, 0.0472131 , 0.03603868])

In [253]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.38619243, 0.02560933, 0.02769258, 0.02933084, 0.02620112,
       0.02557785, 0.02884727, 0.22202005, 0.0365713 , 0.03394441,
       0.02719801, 0.03270463, 0.03757767, 0.0339267 , 0.02660581])

In [276]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.45440307, 0.01953065, 0.01855943, 0.0195817 , 0.0186784 ,
       0.0166871 , 0.01911363, 0.28467079, 0.02146498, 0.02363168,
       0.01883129, 0.02539632, 0.02050875, 0.01897668, 0.01996551])

In [299]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.51895109, 0.01334333, 0.01239848, 0.01136344, 0.01348014,
       0.01143068, 0.00937475, 0.31597696, 0.0137975 , 0.0156447 ,
       0.0102165 , 0.01478939, 0.01335806, 0.0113958 , 0.01447918])

In [315]:
# Display the current `new_weights` tensor as a NumPy array.
new_weights.numpy()

array([0.54957585, 0.00897124, 0.00887785, 0.00799495, 0.0105019 ,
       0.00910511, 0.00809789, 0.33139596, 0.00878845, 0.0100313 ,
       0.00971826, 0.00991482, 0.00882816, 0.00819859, 0.00999967])

In [316]:
# Sampling vector: a fixed 0.01 for the first dataset followed by every learned
# weight, so its length always follows `new_weights`.
train_sampling_factor = [0.01, *new_weights.numpy()]
# train_sampling_factor[1:] = [0.49758378, 0.01234279, 0.01297586, 0.01206473, 0.0130334 ,
#        0.01228484, 0.01269346, 0.0109312 , 0.01205256, 0.01229929,
#        0.34325856, 0.01188709, 0.01400499, 0.01077152, 0.01181594]
# train_sampling_factor[1] -= 0.04
# Push whatever is missing from a total of 1 into entry 1.
if sum(train_sampling_factor) != 1:
    train_sampling_factor[1] += 1 - sum(train_sampling_factor)

In [322]:
# Mix the 16 training datasets; the k-th dataset in the list is sampled with
# weight train_sampling_factor[k].
training_dataset = tf.data.experimental.sample_from_datasets(
    [
        paws_training_dataset,
        xnli_training_dataset,
        xnli_ar_training_dataset,
        xnli_bg_training_dataset,
        xnli_de_training_dataset,
        xnli_el_training_dataset,
        xnli_es_training_dataset,
        xnli_fr_training_dataset,
        xnli_hi_training_dataset,
        xnli_ru_training_dataset,
        xnli_sw_training_dataset,
        xnli_th_training_dataset,
        xnli_tr_training_dataset,
        xnli_ur_training_dataset,
        xnli_vi_training_dataset,
        xnli_zh_training_dataset,
    ],
    weights=tf.constant([train_sampling_factor[k] for k in range(16)]),
)


In [323]:
# Keep a handle on the sampled, not-yet-batched dataset; `training_dataset` is
# rebound to a batched version in the following cell.
untouched_dataset = training_dataset

In [324]:
# Group the sampled stream into batches of `batch_size`.
# NOTE(review): this rebinds the same name, so running the cell twice batches twice.
training_dataset = training_dataset.batch(batch_size)

In [325]:
# Mix the two eval datasets, weighted by the first two entries of
# `evaluation_sampling_factor`.
evaluation_dataset = tf.data.experimental.sample_from_datasets(
    [paws_eval_dataset, xnli_hi_eval_dataset],
    weights=tf.constant(
        [evaluation_sampling_factor[0], evaluation_sampling_factor[1]]
    ),
)


In [326]:
# Group the mixed eval stream into batches of `batch_size`; drop_remainder=True
# discards a final partial batch.
# NOTE(review): this rebinds the same name, so running the cell twice batches twice.
evaluation_dataset = evaluation_dataset.batch(batch_size, drop_remainder=True)

In [327]:
# `training_dataset` is already batched, so no batch_size is passed: Keras expects
# batch_size to be left unset when the input is a tf.data.Dataset.
model.fit(training_dataset, epochs=15, steps_per_epoch=1000, validation_data=evaluation_dataset)

Epoch 1/15




Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f0f1b9b0748>

In [None]:
# Compile the reloaded model (one `_loss_with_filter` entry per loss slot, plus the
# custom `accuracy_mod` metric) and evaluate it on the batched evaluation mix.
loaded_model.compile(optimizer = optimizer, loss = [_loss_with_filter, _loss_with_filter], metrics = [accuracy_mod])
loaded_model.evaluate(evaluation_dataset)

In [None]:
# `training_dataset` is already batched, so no batch_size is passed: Keras expects
# batch_size to be left unset when the input is a tf.data.Dataset.
loaded_model.fit(training_dataset, epochs=2, steps_per_epoch=1000, validation_data=evaluation_dataset)

In [None]:
# `training_dataset` is already batched, so no batch_size is passed: Keras expects
# batch_size to be left unset when the input is a tf.data.Dataset.
loaded_model.fit(training_dataset, epochs=8, steps_per_epoch=1000, validation_data=evaluation_dataset)

In [307]:
# Clear the shared iterator, then recreate it over 32-example batches of the
# unbatched sampled stream.
itera = None
rl_dataset = untouched_dataset.batch(32)
itera = rl_dataset.as_numpy_iterator()

In [308]:
from collections import defaultdict
def get_batch_lang(iterator, num_langs=15):
  """Count how often each language id occurs in the next batch of `iterator`.

  Args:
    iterator: iterator whose next element can be indexed as
      ``element[0]['lang_id']``, yielding an iterable of language ids.
    num_langs: length of the returned list; ids outside ``range(num_langs)``
      are counted internally but not reported.

  Returns:
    A list of `num_langs` ints whose entry ``k`` is the number of times id ``k``
    appeared in the batch. An empty batch gives a list of zeros.
  """
  appearances = defaultdict(int)
  for curr in next(iterator)[0]['lang_id']:
    appearances[curr] += 1
  # Built once, after counting, so the result is defined even for an empty batch.
  return [appearances[k] for k in range(num_langs)]

In [309]:
# # initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
# # values = initializer(shape=(1, 15))
# phi = tf.Variable(train_sampling_factor[1:])


In [310]:
# initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
# input2 = initializer(shape=(1, 15))

In [311]:
# tf.squeeze(tf.nn.softmax(input2, axis = -1))

In [312]:
# input1 = phi

In [313]:
import random
# Index that get_new_static_weights keeps (together with index 0) when it zeroes
# the other batch counts.
target_lang = 7
# Cross-entropy that trainstep applies between one-hot targets and the softmaxed weights.
cce = tf.keras.losses.CategoricalCrossentropy()
# Depth of the one-hot targets built in trainstep.
d = 15

In [291]:
# def train(opt, input1, batch_lang_count, R, loss):
#    loss = 0
#    with tf.GradientTape() as tape:
#         tape.watch(input1)
#         for i, val in enumerate(batch_lang_count):
#           loss += val * R * cce(tf.one_hot(i, depth =d), tf.squeeze(tf.nn.softmax(input1, axis = -1)))
#         gradients = tape.gradient(loss, input1)
#    opt.apply_gradients(zip([gradients], [input1]))
#    #print(loss)
#    print(tf.nn.softmax(input1, axis = -1))
#    return input1  

In [292]:
# def train(opt, input1, batch_lang_count, R, loss, priority, flag):
#    loss = 0
#    with tf.GradientTape() as tape:
#         #tape.watch(input1)
#         for i, val in enumerate(batch_lang_count):
#             loss += val * (100 - R) * cce(tf.one_hot(i, depth =d), tf.squeeze(tf.nn.softmax(input1, axis = -1))) 
#    gradients = tape.gradient(loss, input1)
#    #print(gradients)
#    opt.apply_gradients(zip([gradients], [input1]))
#    print(loss)
#    print(tf.nn.softmax(input1, axis = -1))
#    return input1

In [293]:
# Plain SGD optimizer with element-wise gradient clipping (clipvalue=0.001).
opt = tf.keras.optimizers.SGD(learning_rate = 0.001,clipvalue= 0.001)
# Placeholder passed as the `loss` argument of trainstep, which overwrites it locally.
loss = 0


In [294]:
def get_new_static_weights(reward, input1, num_steps=100, epsilon=0.05, epsilon2=0.50):
    """Run `num_steps` trainstep updates on `input1` and return its mean softmax.

    Each step draws one batch of per-language counts from the module-level
    iterator `itera`, optionally perturbs the counts, and passes them to
    `trainstep` (with the module-level `opt` and `loss`) using ``reward * 100``
    as the reward. Exactly `num_steps` batches are consumed from `itera`.

    Args:
        reward: scalar; multiplied by 100 and handed to `trainstep` as `R`.
        input1: value handed to `trainstep` and to ``tf.nn.softmax``; the caller
            in this notebook passes a ``tf.Variable``.
        num_steps: number of update steps.
        epsilon: when the uniform draw is <= epsilon, every count is replaced
            by a ``random.random()`` value.
        epsilon2: when the draw is >= epsilon2, every count except index 0 and
            index `target_lang` is set to 0. Draws strictly between the two
            thresholds leave the counts unchanged.

    Returns:
        The sum over all steps of ``tf.nn.softmax(input1, axis=-1)`` (evaluated
        after each update) divided by `num_steps`.
    """
    acc = 0
    kept_indices = (0, target_lang)
    for _ in range(num_steps):
        draw = random.uniform(0.0, 1.0)
        batch_data = get_batch_lang(itera)

        if draw <= epsilon:
            # Exploration: ignore the observed counts and use random values.
            batch_data = [random.random() for _ in batch_data]

        if draw >= epsilon2:
            # Only index 0 and the target language contribute to this step.
            batch_data = [
                count if idx in kept_indices else 0
                for idx, count in enumerate(batch_data)
            ]

        input1 = trainstep(opt, input1, batch_data, reward * 100, loss)
        acc += tf.nn.softmax(input1, axis=-1)
    return acc / num_steps

In [295]:
# initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
# test1 = tf.Variable(initializer(shape=(1, 15))[0])

In [296]:
# opt1 = tf.keras.optimizers.SGD(learning_rate = 0.001)

# loss = 0

In [297]:
def trainstep(opt, input1, batch_lang_count, R, loss, verbose=True):
    """Apply one optimizer step to `input1` and return it.

    The objective is ``sum_i batch_lang_count[i] * R * cce(one_hot(i, depth=d), p)``
    with ``p = squeeze(softmax(input1))``, using the module-level `cce` and `d`.

    Args:
        opt: optimizer exposing ``apply_gradients``.
        input1: tensor to update. No ``tape.watch`` is issued, so it must be
            something the tape tracks by itself (e.g. a ``tf.Variable``).
        batch_lang_count: iterable of per-index weights for the loss terms.
        R: scalar reward multiplying every loss term.
        loss: ignored (overwritten locally); kept so existing calls still work.
        verbose: when True, print the softmax of `input1` before the update.

    Returns:
        `input1`, after the optimizer step has been applied to it.
    """
    with tf.GradientTape() as tape:
        # The softmax does not depend on the loop index, so compute it once.
        probs = tf.squeeze(tf.nn.softmax(input1, axis=-1))
        loss = 0
        for i, val in enumerate(batch_lang_count):
            loss += val * R * cce(tf.one_hot(i, depth=d), probs)

    if verbose:
        # Printed outside the tape so the display op is not recorded.
        print(tf.nn.softmax(input1, axis=-1))

    gradients = tape.gradient(loss, input1)
    opt.apply_gradients(zip([gradients], [input1]))
    return input1

In [816]:
# train_sampling_factor

In [448]:
# for i in range(0, 100):
#     #if i%10 == 0:
#     batch_data = get_batch_lang(itera)
#     test1 = trainstep(opt1, test1, batch_data, 0.4 * 100, loss)


In [449]:
# batch_data

In [450]:
# def get_new_static_weights(reward):
#     refresh()
#     epsilon = 0.05
#     epsilon2 = 0.75
#     initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
#     input1 = tf.Variable(initializer(shape=(1, 15))[0])
#     flag = 0
#     batch_data = get_batch_lang()
#     for i in range(100):
#         draw = random.uniform(0.0, 1.0)
#         #batch_data = get_batch_lang()

#         if draw <= epsilon:
#             print("a")
#             initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
#             input1 = tf.Variable(initializer(shape=(1, 15))[0])
#             ans = [i for i in range(0, len(batch_data))]
#             rind = list(set(ans).difference(random.sample(range(0, 15), 2)))
#             for i in rind:
#                 batch_data[i] = 0
            
#         if draw > epsilon and draw < epsilon2:
#             print("b")
#             phi = tf.Variable(train_sampling_factor[1:])
#             input1 = phi
#             ans = [i for i in range(0, len(batch_data))]
#             find = list(set(ans).difference(random.sample(range(0, 15), 2)))
#             for i in find:
#                 batch_data[i] = 0
            
#         if draw >= epsilon2:
#             print("c")
#             ans = [i for i in range(0, len(batch_data))]
#             ind = list(set(ans).difference([0, target_lang]))
#             for i in ind:
#                 batch_data[i] = 0
#         input1 = trainstep(opt, input1, batch_data, reward * 100, loss)
            
#     return tf.nn.softmax(input1, axis = -1), max1, max2

In [None]:
# Batched views of the two eval datasets, stored under new names so the
# unbatched originals stay available.
paws_batched_eval_data, xnli_batched_eval_data = (
    paws_eval_dataset.batch(batch_size),
    xnli_eval_dataset.batch(batch_size),
)

In [None]:
# Report `model`'s loss/metric values on `paws_batched_eval_data`.
model.evaluate(paws_batched_eval_data)

In [None]:
# Report `model`'s loss/metric values on `xnli_batched_eval_data`.
model.evaluate(xnli_batched_eval_data)

In [None]:
# Rebind each eval dataset to its batched form.
# NOTE(review): not idempotent -- re-running this cell batches the already
# batched datasets a second time.
(xnli_ar_eval_dataset,
 xnli_bg_eval_dataset,
 xnli_de_eval_dataset,
 xnli_el_eval_dataset) = (
    dataset.batch(batch_size)
    for dataset in (
        xnli_ar_eval_dataset,
        xnli_bg_eval_dataset,
        xnli_de_eval_dataset,
        xnli_el_eval_dataset,
    )
)

In [None]:
# Rebind each eval dataset to its batched form.
# NOTE(review): not idempotent -- re-running this cell batches the already
# batched datasets a second time.
(xnli_es_eval_dataset,
 xnli_fr_eval_dataset,
 xnli_hi_eval_dataset,
 xnli_ru_eval_dataset,
 xnli_sw_eval_dataset,
 xnli_th_eval_dataset,
 xnli_tr_eval_dataset,
 xnli_ur_eval_dataset,
 xnli_vi_eval_dataset,
 xnli_zh_eval_dataset) = (
    dataset.batch(batch_size)
    for dataset in (
        xnli_es_eval_dataset,
        xnli_fr_eval_dataset,
        xnli_hi_eval_dataset,
        xnli_ru_eval_dataset,
        xnli_sw_eval_dataset,
        xnli_th_eval_dataset,
        xnli_tr_eval_dataset,
        xnli_ur_eval_dataset,
        xnli_vi_eval_dataset,
        xnli_zh_eval_dataset,
    )
)

In [None]:
# Report `model`'s loss/metric values on `xnli_ar_eval_dataset` (expects the batched version).
model.evaluate(xnli_ar_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_bg_eval_dataset` (expects the batched version).
model.evaluate(xnli_bg_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_de_eval_dataset` (expects the batched version).
model.evaluate(xnli_de_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_el_eval_dataset` (expects the batched version).
model.evaluate(xnli_el_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_es_eval_dataset` (expects the batched version).
model.evaluate(xnli_es_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_fr_eval_dataset` (expects the batched version).
model.evaluate(xnli_fr_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_hi_eval_dataset` (expects the batched version).
model.evaluate(xnli_hi_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_ru_eval_dataset` (expects the batched version).
model.evaluate(xnli_ru_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_sw_eval_dataset` (expects the batched version).
model.evaluate(xnli_sw_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_th_eval_dataset` (expects the batched version).
model.evaluate(xnli_th_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_tr_eval_dataset` (expects the batched version).
model.evaluate(xnli_tr_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_ur_eval_dataset` (expects the batched version).
model.evaluate(xnli_ur_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_vi_eval_dataset` (expects the batched version).
model.evaluate(xnli_vi_eval_dataset)

In [None]:
# Report `model`'s loss/metric values on `xnli_zh_eval_dataset` (expects the batched version).
model.evaluate(xnli_zh_eval_dataset)

In [None]:
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu='tpu-quickstart', project = 'moana-intern-fall-2020')
# tf.config.experimental_connect_to_cluster(resolver)
# tf.tpu.experimental.initialize_tpu_system(resolver)
# strategy = tf.distribute.TPUStrategy(resolver)
# with strategy.scope():
#     max_seq_length = 128
#     initializer = tf.keras.initializers.TruncatedNormal(
#             stddev=bert_config.initializer_range)
#     bert_encoder = bert.bert_models.get_transformer_encoder(
#         bert_config, max_seq_length)

#     input_word_ids = tf.keras.layers.Input(
#       shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
#     input_mask = tf.keras.layers.Input(
#       shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
#     input_type_ids = tf.keras.layers.Input(
#       shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

#     bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2",
#                                 trainable=True)
#     #bert_model = hub.KerasLayer(hub_url_bert, trainable=True)
#     pooled_output, seq_output = bert_model([input_word_ids, input_mask, input_type_ids])
#     output1 = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
#       pooled_output)

#     output1 = tf.keras.layers.Dense(
#       2, kernel_initializer=initializer, name='output1')(
#           output1)

#     output2 = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
#       pooled_output)

#     output2 = tf.keras.layers.Dense(
#       3, kernel_initializer=initializer, name='output2')(
#           output2)

#     model = tf.keras.Model(
#           inputs={
#               'input_word_ids': input_word_ids,
#               'input_mask': input_mask,
#               'input_type_ids': input_type_ids
#           },
#           outputs=[output1, output2])

#     # Set up epochs and steps
#     epochs = 3
#     batch_size = 64
#     eval_batch_size = 64

#     # get train_data_size from metadata
#     train_data_size = c
#     steps_per_epoch = int(train_data_size / batch_size)
#     num_train_steps = steps_per_epoch * epochs
#     warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

#     # creates an optimizer with learning rate schedule
#     optimizer = nlp.optimization.create_optimizer(
#         2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

#         paws_training_dataset = create_classifier_dataset(
#     "gs://nts2020/xtereme/pawsx/train.en.tfrecords",
#     128,
#     batch_size,
#     task_id = 0,
#     is_training=True)

    
#     xnli_training_dataset = create_classifier_dataset(
#     "gs://nts2020/xtereme/xnli/train.en.tfrecords",
#     128,
#     batch_size,
#     task_id =1,
#     is_training=True)

#     paws_eval_dataset = create_classifier_dataset(
#     "gs://nts2020/xtereme/pawsx/eval.en.tfrecords",
#     128,
#     batch_size,
#     task_id = 0,
#     is_training=False)
    
#     xnli_eval_dataset = create_classifier_dataset(
#     "gs://nts2020/xtereme/xnli/eval.en.tfrecords",
#     128,
#     batch_size,
#     task_id = 1,
#     is_training=False)
    
#     training_dataset = tf.data.experimental.sample_from_datasets(
#     [paws_training_dataset, xnli_training_dataset], weights=tf.constant([sampling_factor[0], sampling_factor[1]]))
    
#     evaluation_dataset = tf.data.experimental.sample_from_datasets(
#     [paws_eval_dataset, xnli_eval_dataset], weights=tf.constant([sampling_factor[0], sampling_factor[1]]))
    
#     training_dataset = training_dataset.batch(batch_size)
#     evaluation_dataset = evaluation_dataset.batch(batch_size)
    
 

#     model.compile(optimizer = optimizer, loss = [_loss_with_filter, _loss_with_filter], metrics = [accuracy_mod])
#     model.fit(training_dataset, batch_size = batch_size, epochs= 13, steps_per_epoch = 1000, validation_data=evaluation_dataset)