In [1]:
import os
import sys
import json
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import absl
import datetime
from tensorflow.keras.optimizers import Adam

In [2]:
# Root directory that the data / module directories defined in the next cell are joined onto.
# NOTE(review): IS_KAGGLE and INPUT_DIR are both reassigned further down the notebook
# (INPUT_DIR becomes "/home/jupyter/input/"), so these are only the initial values.
IS_KAGGLE = True
INPUT_DIR = "/home/jupyter/"

In [3]:
# Directory of the original BERT Joint Baseline data, located under INPUT_DIR.
BERT_JOINT_BASE_DIR = os.path.join(INPUT_DIR ,"bert-joint-baseline")

# Directory containing all the files shared for public use, located under INPUT_DIR.
NQ_DIR = os.path.join(INPUT_DIR ,"nq-competition")

In [4]:
# To use your own .tfrecord files or newly trained checkpoints, put them under your own
# NQ directory and point MY_OWN_NQ_DIR at it. It defaults to NQ_DIR.
MY_OWN_NQ_DIR = NQ_DIR

In [5]:
# Make the packages / modules stored in NQ_DIR importable, together with its
# "transformers" sub-directory.
sys.path.extend([NQ_DIR, os.path.join(NQ_DIR, "transformers")])

In [6]:
from adamw_optimizer import AdamW
from tensorflow.python.lib.io.file_io import recursive_create_dir

# Show the TensorFlow version this notebook is running against.
print(tf.__version__)

2.0.0


In [7]:
from nq_flags import DEFAULT_FLAGS as FLAGS
from nq_flags import del_all_flags
from nq_dataset_utils import *

In [8]:
import sacremoses as sm
import transformers
from adamw_optimizer import AdamW

In [9]:
# ----------------------------------------------------------------------------------------
# TPU, training-data and checkpoint settings.
# Ref: https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/custom_training.ipynb#scrollTo=jwJtsCQhHK-E


# gRPC address (internal IP and port) of your TPU node.
TPU_WORKER = 'grpc://10.240.1.2:8470'

# Your TPU zone, for example 'europe-west4-a'.
ZONE = 'europe-west4-a'

# Your project name, for example 'kaggle-nq-123456'.
PROJECT = 'project-x-262017'

# Your training TFRecord file. The original instructions expect a Google Storage path,
# for example gs://kaggle-my-nq-competition/nq_train.tfrecord.
# NOTE(review): the value below is a local filesystem path, not a gs:// URI — confirm
# that it is readable from wherever the dataset is actually consumed.
TRAIN_TF_RECORD = '/home/jupyter/bert-joint-baseline/nq-train.tfrecords-00000-of-00001'

# Your checkpoint directory. The original instructions expect a Google Storage path,
# for example "gs://kaggle-my-nq-competition/checkpoints/".
# NOTE(review): the value below is a local filesystem path as well — confirm.
CHECKPOINT_DIR = '/home/jupyter/checkpoints/distilbert-base-uncased-distilled-squad/'

In [10]:
# Clear the Keras backend session (tf.keras global state) before continuing.
tf.keras.backend.clear_session()

In [11]:
# Re-assigns the IS_KAGGLE / INPUT_DIR values set near the top of the notebook.
# INPUT_DIR is the directory that the following cells check for and list.
IS_KAGGLE = True
INPUT_DIR = "/home/jupyter/input/"

In [12]:
# Directory of the original BERT Joint Baseline data (relative path).
BERT_JOINT_BASE_DIR = "bert-joint-baseline"

# Directory containing all the files shared for public use (relative path).
NQ_DIR = "nq-competition"

In [13]:
# Directory for your own .tfrecord files / checkpoints; defaults to NQ_DIR.
MY_OWN_NQ_DIR = NQ_DIR

In [14]:
# Local usage: when INPUT_DIR is not an existing directory, switch off the Kaggle
# flag and point the input and NQ directories at the current folder.
# NOTE(review): BERT_JOINT_BASE_DIR is left unchanged by this fallback — confirm
# that is intended.
if not os.path.isdir(INPUT_DIR):
    IS_KAGGLE = False
    INPUT_DIR = NQ_DIR = MY_OWN_NQ_DIR = "./"

In [15]:
# Print the full path of every file found anywhere under INPUT_DIR.
for dirname, _, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/home/jupyter/input/sample_submission.csv
/home/jupyter/input/simplified-nq-test.jsonl
/home/jupyter/input/simplified-nq-train.jsonl
/home/jupyter/input/submission_11Jan2019.csv


In [16]:
# Show the resolved configuration, one value per line.
print(IS_KAGGLE, INPUT_DIR, NQ_DIR, MY_OWN_NQ_DIR, sep="\n")

True
/home/jupyter/input/
nq-competition
nq-competition


In [17]:
# Make the packages / modules stored in NQ_DIR importable, together with its
# "transformers" sub-directory.
sys.path.extend([NQ_DIR, os.path.join(NQ_DIR, "transformers")])

In [18]:
from nq_flags import DEFAULT_FLAGS as FLAGS
from nq_flags import del_all_flags
from nq_dataset_utils import *

In [19]:
import sacremoses as sm
import transformers
from adamw_optimizer import AdamW


In [20]:
from transformers import TFBertModel
from transformers import TFBertMainLayer, TFBertPreTrainedModel
from transformers.modeling_tf_utils import get_initializer

In [21]:
from transformers import BertTokenizer
from transformers import TFBertModel, TFDistilBertModel
from transformers import TFBertMainLayer, TFDistilBertMainLayer, TFBertPreTrainedModel, TFDistilBertPreTrainedModel
from transformers.modeling_tf_utils import get_initializer

In [22]:
# Pretrained model names, grouped under the keys "BERT" and "DISTILBERT".
PRETRAINED_MODELS = dict(
    BERT=[
        "bert-base-uncased",
        "bert-large-uncased-whole-word-masking-finetuned-squad",
    ],
    DISTILBERT=["distilbert-base-uncased-distilled-squad"],
)


In [23]:
# Alias the absl flags module, then hand its global FLAGS object to the project's
# del_all_flags helper (presumably to drop previously registered flags so the
# DEFINE_* cells below can be re-run — TODO confirm against nq_flags).
flags = absl.flags
del_all_flags(flags.FLAGS)

In [24]:
# Boolean flag `do_lower_case`, enabled by default.
flags.DEFINE_bool(
    name="do_lower_case",
    default=True,
    help=(
        "Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models."
    ),
)

In [25]:
# Path of the vocabulary file inside NQ_DIR; printed for a quick visual check.
vocab_file = os.path.join(NQ_DIR, "vocab-nq.txt")
print(vocab_file)

nq-competition/vocab-nq.txt


In [26]:
# String flag `vocab_file`, defaulting to the path computed in the previous cell.
flags.DEFINE_string(
    name="vocab_file",
    default=vocab_file,
    help="The vocabulary file that the BERT model was trained on.",
)


In [27]:
# Integer flag `max_seq_length_for_training` (default 512).
flags.DEFINE_integer(
    name="max_seq_length_for_training",
    default=512,
    help=(
        "The maximum total input sequence length after WordPiece tokenization for training examples. "
        "Sequences longer than this will be truncated, and sequences shorter "
        "than this will be padded."
    ),
)

In [28]:
# Integer flag `max_seq_length` (default 512).
flags.DEFINE_integer(
    name="max_seq_length",
    default=512,
    help=(
        "The maximum total input sequence length after WordPiece tokenization. "
        "Sequences longer than this will be truncated, and sequences shorter "
        "than this will be padded."
    ),
)

In [29]:
# Integer flag `doc_stride` (default 128).
flags.DEFINE_integer(
    name="doc_stride",
    default=128,
    help=(
        "When splitting up a long document into chunks, how much stride to "
        "take between chunks."
    ),
)

In [30]:
# Float flag `include_unknowns_for_training` (default 0.02).
flags.DEFINE_float(
    name="include_unknowns_for_training",
    default=0.02,
    help="If positive, for converting training dataset, probability of including answers of type `UNKNOWN`.",
)

In [31]:
# Float flag `include_unknowns`; the default of -1.0 is not positive.
flags.DEFINE_float(
    name="include_unknowns",
    default=-1.0,
    help="If positive, probability of including answers of type `UNKNOWN`.",
)


In [32]:
# Boolean flag `skip_nested_contexts`, enabled by default.
flags.DEFINE_boolean(
    name="skip_nested_contexts",
    default=True,
    help="Completely ignore context that are not top level nodes in the page.",
)

In [33]:
# Integer flag `max_contexts` (default 48).
flags.DEFINE_integer(
    name="max_contexts",
    default=48,
    help="Maximum number of contexts to output for an example.",
)

In [34]:
# Integer flag `max_position` (default 50).
flags.DEFINE_integer(
    name="max_position",
    default=50,
    help="Maximum context position for which to generate special tokens.",
)


In [35]:
# Integer flag `max_query_length` (default 64).
flags.DEFINE_integer(
    name="max_query_length",
    default=64,
    help=(
        "The maximum number of tokens for the question. Questions longer than "
        "this will be truncated to this length."
    ),
)


In [36]:
# String flag `train_tf_record`, defaulting to the TRAIN_TF_RECORD constant.
flags.DEFINE_string(
    name="train_tf_record",
    default=TRAIN_TF_RECORD,
    help="Precomputed tf records for training dataset.",
)

In [37]:
# Boolean flag `do_train`, disabled by default.
flags.DEFINE_bool(
    name="do_train",
    default=False,
    help="Whether to run training dataset.",
)

In [38]:
# Checkpoint / model location flags.
flags.DEFINE_string(
    name="input_checkpoint_dir",
    default=CHECKPOINT_DIR,
    help="The root directory that contains checkpoints to be loaded of all trained models.",
)

flags.DEFINE_string(
    name="model_dir",
    default=NQ_DIR,
    help="Root dir of all Hugging Face's models",
)

flags.DEFINE_string(
    name="model_name",
    default="distilbert-base-uncased-distilled-squad",
    help="Name of Hugging Face's model to use.",
)

In [39]:
# Training hyper-parameter flags.
flags.DEFINE_integer(
    name="epochs",
    default=0,
    help="Total epochs for training.",
)

flags.DEFINE_integer(
    name="train_batch_size",
    default=64 * 8,
    help="Batch size for training.",
)

flags.DEFINE_integer(
    name="shuffle_buffer_size",
    default=100000,
    help="Shuffle buffer size for training.",
)

flags.DEFINE_float(
    name="init_learning_rate",
    default=5e-5,
    help="The initial learning rate for AdamW optimizer.",
)

flags.DEFINE_bool(
    name="cyclic_learning_rate",
    default=True,
    help="If to use cyclic learning rate.",
)

flags.DEFINE_float(
    name="init_weight_decay_rate",
    default=0.01,
    help="The initial weight decay rate for AdamW optimizer.",
)

flags.DEFINE_integer(
    name="num_warmup_steps",
    default=0,
    help="Number of training steps to perform linear learning rate warmup.",
)

# NOTE(review): the flag name says "examples" while its help text says "steps" —
# confirm which quantity consumers of this flag expect.
flags.DEFINE_integer(
    name="num_train_examples",
    default=None,
    help="Number of precomputed training steps in 1 epoch.",
)


In [40]:
# Mark the default flags as parsed so their values can be read without
# command-line parsing.
# NOTE(review): FLAGS here is nq_flags.DEFAULT_FLAGS, while the DEFINE_* cells above
# register on absl's default flag holder — confirm both refer to the same object.
FLAGS.mark_as_parsed()

# Number of short-answer types (presumably the class count of the answer-type
# prediction — TODO confirm where it is consumed).
NB_SHORT_ANSWER_TYPES = 5

In [41]:
def get_dataset(tf_record_file, seq_length, batch_size=1, shuffle_buffer_size=0, is_training=False):
    """Build a batched `tf.data.Dataset` from a TFRecord file.

    Args:
        tf_record_file: Passed as-is to `tf.data.TFRecordDataset`.
        seq_length: Length of the fixed-size `input_ids`, `input_mask` and
            `segment_ids` features.
        batch_size: Number of examples per batch. An incomplete final batch
            is dropped.
        shuffle_buffer_size: When greater than 0, examples are shuffled with
            a buffer of this size before batching.
        is_training: When True, the label features `start_positions`,
            `end_positions` and `answer_types` are parsed too.

    Returns:
        A dataset of dicts keyed by `unique_ids`, `input_ids`, `input_mask`
        and `segment_ids`. When `is_training` is True, each element is instead
        a tuple of that dict and a label dict keyed by
        `short_start_positions`, `short_end_positions` and
        `short_answer_types`.
    """
    input_keys = ("unique_ids", "input_ids", "input_mask", "segment_ids")
    # Output label name -> name of the feature stored in the record.
    label_sources = {
        "short_start_positions": "start_positions",
        "short_end_positions": "end_positions",
        "short_answer_types": "answer_types",
    }

    def scalar_int64():
        return tf.io.FixedLenFeature([], tf.int64)

    def sequence_int64():
        return tf.io.FixedLenFeature([seq_length], tf.int64)

    feature_spec = {
        "unique_ids": scalar_int64(),
        "input_ids": sequence_int64(),
        "input_mask": sequence_int64(),
        "segment_ids": sequence_int64(),
    }
    if is_training:
        for label_feature in ("start_positions", "end_positions", "answer_types"):
            feature_spec[label_feature] = scalar_int64()

    def as_int32(tensor):
        # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
        # so every int64 tensor is cast down; other dtypes pass through.
        if tensor.dtype == tf.int64:
            return tf.cast(tensor, tf.int32)
        return tensor

    def parse_example(serialized):
        """Parse one serialized record into a dict of tensors."""
        parsed = tf.io.parse_single_example(serialized, feature_spec)
        return {key: as_int32(value) for key, value in parsed.items()}

    def split_inputs_and_labels(parsed):
        """Return the model inputs, plus the labels when training."""
        inputs = {key: parsed[key] for key in input_keys}
        if not is_training:
            return inputs
        labels = {target: parsed[source] for target, source in label_sources.items()}
        return (inputs, labels)

    examples = tf.data.TFRecordDataset(tf_record_file)
    examples = examples.map(parse_example)
    examples = examples.map(split_inputs_and_labels)

    if shuffle_buffer_size > 0:
        examples = examples.shuffle(shuffle_buffer_size)

    return examples.batch(batch_size, drop_remainder=True)



In [42]:
# Build a TPU cluster resolver from the worker address, zone and project configured above.
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER, zone=ZONE, project=PROJECT)

In [43]:
# Connect this runtime to the cluster described by the resolver.
tf.config.experimental_connect_to_cluster(cluster_resolver)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0


In [44]:
# Initialize the TPU system for the resolved cluster.
# NOTE(review): the saved output of this cell ends in an UnimplementedError
# (gRPC status 12). The cause is not visible in this notebook — possibly a
# client / TPU software version mismatch; confirm before relying on later cells.
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)

INFO:tensorflow:Initializing the TPU system: 10.240.1.2:8470


INFO:tensorflow:Initializing the TPU system: 10.240.1.2:8470


UnimplementedError: 
Additional GRPC error information:
{"created":"@1579168229.175729726","description":"Error received from peer","file":"external/grpc/src/core/lib/surface/call.cc","file_line":1039,"grpc_message":"","grpc_status":12}

In [45]:
# Create the TPU distribution strategy from the resolver; the saved log for this
# cell reports 8 TPU cores on 1 worker.
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)

INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:GPU:0, GPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:GPU:0, GPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [None]:
##### 