# Further Pre-training MobileBERT MLM with Centralized Training (Shakespeare)

In [1]:
# Copyright 2020, The TensorFlow Federated Authors.
# Copyright 2020, Ronald Seoh
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Google Colab settings

In [2]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # Mount Google Drive root directory
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/Colab Notebooks/BERTerated'
    
    # List the directory contents
    !ls

In [3]:
# IPython reloading magic
%load_ext autoreload
%autoreload 2

In [4]:
# Install required packages
!pip install -r requirements.txt

Collecting tensorflow-federated==0.17.0
  Downloading tensorflow_federated-0.17.0-py2.py3-none-any.whl (517 kB)
[K     |████████████████████████████████| 517 kB 15.7 MB/s eta 0:00:01
[?25hCollecting tensorflow-text==2.3.0
  Downloading tensorflow_text-2.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 22.7 MB/s eta 0:00:01
[?25hCollecting transformers==3.4.0
  Downloading transformers-3.4.0-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 22.3 MB/s eta 0:00:01
[?25hCollecting semantic-version~=2.8.5
  Downloading semantic_version-2.8.5-py2.py3-none-any.whl (15 kB)
Collecting retrying~=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting tensorflow~=2.3.0
  Downloading tensorflow-2.3.1-cp36-cp36m-manylinux2010_x86_64.whl (320.4 MB)
[K     |███████████████████████████▋    | 276.1 MB 70.0 MB/s eta 0:00:01

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 320.4 MB 16 kB/s 
[?25hCollecting tensorflow-addons~=0.11.1
  Downloading tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 80.1 MB/s eta 0:00:01
[?25hCollecting portpicker~=1.3.1
  Downloading portpicker-1.3.1.tar.gz (18 kB)
Collecting tensorflow-privacy~=0.5.0
  Downloading tensorflow_privacy-0.5.1-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 24.5 MB/s eta 0:00:01
[?25hCollecting grpcio~=1.29.0
  Downloading grpcio-1.29.0-cp36-cp36m-manylinux2010_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 49.4 MB/s eta 0:00:01
[?25hCollecting cachetools~=3.1.1
  Downloading cachetools-3.1.1-py2.py3-none-any.whl (11 kB)
Collecting attrs~=19.3.0
  Downloading attrs-19.3.0-py2.py3-none-any.whl (39 kB)
Collecting dm-tree~=0.1.1
  Downloading dm_tree-0.1.5-cp36-cp36m-manylinux1_x86_64.whl (294 kB)
[K     |███████████████████████████

Building wheels for collected packages: retrying, portpicker, absl-py, sacremoses, mpmath
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=9530 sha256=87ec170746dd6007ce9ab00559bb7f00681ac9f7556b21b66633f776ea141ebd
  Stored in directory: /root/.cache/pip/wheels/ac/cb/8a/b27bf6323e2f4c462dcbf77d70b7c5e7868a7fbe12871770cf
  Building wheel for portpicker (setup.py) ... [?25ldone
[?25h  Created wheel for portpicker: filename=portpicker-1.3.1-py3-none-any.whl size=10937 sha256=61b219b8d6a2731d929688cc154d9cc3fa1ffa33e982fc92b99dee7f0068f3c8
  Stored in directory: /root/.cache/pip/wheels/53/f1/26/e6fccc50aa37b340f1b9cc508827ef70b1a2767885f37539ca
  Building wheel for absl-py (setup.py) ... [?25ldone
[?25h  Created wheel for absl-py: filename=absl_py-0.9.0-py3-none-any.whl size=119396 sha256=df1cb430af47447b559de5cf1de830dad0bd155cbf155e84834e584c9f98446b
  Stored in directory: /root/.cache/pip/wheel

## Import packages

In [5]:
import os
import sys
import random
import datetime
import json
import pathlib

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import tensorflow_text as tf_text
import transformers

import nest_asyncio
nest_asyncio.apply()

import fedavg
import fedavg_client
import datasets
import utils

# Random seed settings
random_seed = 692
random.seed(random_seed) # Python
np.random.seed(random_seed) # NumPy
tf.random.set_seed(random_seed) # TensorFlow

# Tensorflow GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Test if TFF is working
tff.federated_computation(lambda: 'Hello, World!')()

Num GPUs Available:  1


b'Hello, World!'

In [6]:
# Print version information
print("Python version: " + sys.version)
print("NumPy version: " + np.__version__)
print("TensorFlow version: " + tf.__version__)
print("TensorFlow Federated version: " + tff.__version__)
print("Transformers version: " + transformers.__version__)

Python version: 3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
NumPy version: 1.18.5
TensorFlow version: 2.3.1
TensorFlow Federated version: 0.17.0
Transformers version: 3.4.0


In [7]:
!nvidia-smi

Sat Nov 21 15:57:51 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 108...  On   | 00000000:04:00.0 Off |                  N/A |
|  0%   35C    P2    56W / 250W |  10525MiB / 11178MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [8]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

## Experiment Settings

In [12]:
EXPERIMENT_CONFIG = {}

EXPERIMENT_CONFIG['TOTAL_ROUNDS'] = 1 # Number of total training rounds
EXPERIMENT_CONFIG['ROUNDS_PER_EVAL'] = 1 # How often to evaluate

EXPERIMENT_CONFIG['TRAIN_CLIENTS_PER_ROUND'] = 5 # How many clients to sample per round.
EXPERIMENT_CONFIG['CLIENT_EPOCHS_PER_ROUND'] = 3

# Epochs to train in centralized setting
EXPERIMENT_CONFIG['CENTRALIZED_EPOCHS'] = EXPERIMENT_CONFIG['CLIENT_EPOCHS_PER_ROUND'] * EXPERIMENT_CONFIG['TOTAL_ROUNDS']

EXPERIMENT_CONFIG['BATCH_SIZE'] = 16 # Batch size used on the client.
EXPERIMENT_CONFIG['TEST_BATCH_SIZE'] = 32 # Minibatch size of test data.

# Maximum length of input token sequence for BERT.
EXPERIMENT_CONFIG['BERT_MAX_SEQ_LENGTH'] = 128

# Optimizer configuration
EXPERIMENT_CONFIG['SERVER_LEARNING_RATE'] = 1.0 # Server learning rate.
EXPERIMENT_CONFIG['CLIENT_LEARNING_RATE'] = 3e-5 # Client learning rate

EXPERIMENT_CONFIG['CENTRALIZED_LEARNING_RATE'] = EXPERIMENT_CONFIG['SERVER_LEARNING_RATE'] * EXPERIMENT_CONFIG['CLIENT_LEARNING_RATE']

# Client dataset setting
EXPERIMENT_CONFIG['TRAIN_NUM_CLIENT_LIMIT'] = 100
EXPERIMENT_CONFIG['TEST_NUM_CLIENT_LIMIT'] = 100

# Path to save trained weights and logs
EXPERIMENT_CONFIG['RESULTS_DIRECTORY'] = os.path.join(
    '.', 'results',
    'mobilebert_mlm_shakespeare_centralized',
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
)

EXPERIMENT_CONFIG['RESULTS_LOG'] = os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "logs")
EXPERIMENT_CONFIG['RESULTS_MODEL'] = os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "model")
EXPERIMENT_CONFIG['RESULTS_CONFIG'] = os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "config")

In [None]:
# Dump all the configuration into a json file
pathlib.Path(EXPERIMENT_CONFIG['RESULTS_CONFIG']).mkdir(parents=True, exist_ok=True)

with open(os.path.join(EXPERIMENT_CONFIG['RESULTS_CONFIG'], "config.json"), 'w') as config_file:
    json.dump(EXPERIMENT_CONFIG, config_file, indent=6)

## Dataset

### Load the Shakespeare dataset

In [13]:
train_client_data, test_client_data = tff.simulation.datasets.shakespeare.load_data(cache_dir='./tff_cache')

### Tokenizer

In [14]:
mobilebert_tokenizer = transformers.MobileBertTokenizer.from_pretrained(
    'google/mobilebert-uncased', cache_dir='./transformers_cache')

In [15]:
# Imitate transformers tokenizer with TF.Text Tokenizer
tokenizer_tf_text, vocab_lookup_table, special_ids_mask_table = datasets.preprocessing_for_bert.convert_huggingface_tokenizer(mobilebert_tokenizer)

### Preprocessing

In [17]:
def check_empty_snippet(x):
    return tf.strings.length(x['snippets']) > 0

def tokenizer_and_mask_wrapped(x):

    masked, labels = datasets.preprocessing_for_bert.tokenize_and_mask(tf.reshape(x['snippets'], shape=[1]),
                                                                       max_seq_length=EXPERIMENT_CONFIG['BERT_MAX_SEQ_LENGTH'],
                                                                       bert_tokenizer_tf_text=tokenizer_tf_text,
                                                                       vocab_lookup_table=vocab_lookup_table,
                                                                       special_ids_mask_table=special_ids_mask_table,
                                                                       cls_token_id=mobilebert_tokenizer.cls_token_id,
                                                                       sep_token_id=mobilebert_tokenizer.sep_token_id,
                                                                       pad_token_id=mobilebert_tokenizer.pad_token_id,
                                                                       mask_token_id=mobilebert_tokenizer.mask_token_id)

    return (masked, labels)

def preprocess_for_train(train_dataset):
    return (
        train_dataset
        # Tokenize each samples using MobileBERT tokenizer
        .map(tokenizer_and_mask_wrapped, num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # Shuffle
        .shuffle(100000)
        # Form minibatches
        # Use drop_remainder=True to force the batch size to be exactly BATCH_SIZE
        # and make the shape **exactly** (BATCH_SIZE, SEQ_LENGTH)
        .batch(EXPERIMENT_CONFIG['BATCH_SIZE'], drop_remainder=True)
        # Repeat to make each client train multiple epochs
        .repeat(count=EXPERIMENT_CONFIG['CENTRALIZED_EPOCHS'])
    )
    
def preprocess_for_test(test_dataset):
    return (
        test_dataset
        # Tokenize each samples using MobileBERT tokenizer
        .map(tokenizer_and_mask_wrapped, num_parallel_calls=tf.data.experimental.AUTOTUNE, deterministic=False)
        # Shuffle
        .shuffle(100000)
        # Form minibatches
        # Use drop_remainder=True to force the batch size to be exactly TEST_BATCH_SIZE
        # and make the shape **exactly** (TEST_BATCH_SIZE, SEQ_LENGTH)
        .batch(EXPERIMENT_CONFIG['TEST_BATCH_SIZE'], drop_remainder=True)
    )

### Training set

In [None]:
# Since the stackoverflow dataset is pretty large, we randomly select TRAIN_NUM_CLIENT_LIMIT number of clients.
all_train_client_ids = train_client_data.client_ids

random.shuffle(all_train_client_ids)

if EXPERIMENT_CONFIG['TRAIN_NUM_CLIENT_LIMIT'] > 0:
    selected_train_client_ids = all_train_client_ids[0:EXPERIMENT_CONFIG['TRAIN_NUM_CLIENT_LIMIT']]
else:
    selected_train_client_ids = all_train_client_ids

In [18]:
train_client_data_all_merged = train_client_data.create_tf_dataset_for_client(
    selected_train_client_ids[0]).filter(check_empty_snippet)

if len(selected_train_client_ids) > 1:
    for i in range(1, len(selected_train_client_ids)):
        train_client_data_all_merged = train_client_data_all_merged.concatenate(
            train_client_data.create_tf_dataset_for_client(selected_train_client_ids[i]).filter(check_empty_snippet))

In [19]:
train_client_data_all_merged = preprocess_for_train(train_client_data_all_merged)

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [20]:
print(train_client_data_all_merged.element_spec)

(TensorSpec(shape=(16, 128), dtype=tf.int32, name=None),
 TensorSpec(shape=(16, 128), dtype=tf.int32, name=None))

### Test set

In [None]:
# Since the stackoverflow dataset is pretty large, we randomly select TEST_NUM_CLIENT_LIMIT number of clients.
all_test_client_ids = test_client_data.client_ids

random.shuffle(all_test_client_ids)

if EXPERIMENT_CONFIG['TEST_NUM_CLIENT_LIMIT'] > 0:
    selected_test_client_ids = all_test_client_ids[0:EXPERIMENT_CONFIG['TEST_NUM_CLIENT_LIMIT']]
else:
    selected_test_client_ids = all_test_client_ids

In [21]:
test_client_data_all_merged = test_client_data.create_tf_dataset_for_client(
    selected_test_client_ids[0]).filter(check_empty_snippet)

if len(selected_test_client_ids) > 1:
    for i in range(1, len(selected_test_client_ids)):
        test_client_data_all_merged = test_client_data_all_merged.concatenate(
            test_client_data.create_tf_dataset_for_client(selected_test_client_ids[i]).filter(check_empty_snippet))

In [22]:
test_client_data_all_merged = preprocess_for_test(test_client_data_all_merged)

In [None]:
print(test_client_data_all_merged.element_spec)

## Model

In [25]:
mobilebert_model = transformers.TFMobileBertForPreTraining.from_pretrained(
    'google/mobilebert-uncased', cache_dir='./transformers_cache')

All model checkpoint layers were used when initializing TFMobileBertForPreTraining.

All the layers of TFMobileBertForPreTraining were initialized from the model checkpoint at google/mobilebert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMobileBertForPreTraining for predictions without further training.


In [26]:
print(mobilebert_model.config)

MobileBertConfig {
  "_name_or_path": "google/mobilebert-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 30522
}



In [27]:
# Due to the limitations with Keras subclasses, we can only use the main layer part from pretrained models
# and add output heads by ourselves
mobilebert_keras_converted = utils.convert_huggingface_mlm_to_keras(
    huggingface_model=mobilebert_model,
    max_seq_length=EXPERIMENT_CONFIG['BERT_MAX_SEQ_LENGTH'],
)

In [28]:
mobilebert_keras_converted.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
mobilebert (TFMobileBertMain ((None, 128, 512), (None, 24581888  
_________________________________________________________________
standalone_tf_mobile_bert_ml (None, 128, 30522)        15921466  
Total params: 40,503,354
Trainable params: 40,503,354
Non-trainable params: 0
_________________________________________________________________


## Training

### TensorBoard

In [29]:
%tensorboard --logdir {EXPERIMENT_CONFIG['RESULTS_LOG']}

### Training setups

In [30]:
mobilebert_keras_converted.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=EXPERIMENT_CONFIG['CENTRALIZED_LEARNING_RATE']),
    loss=utils.MaskedLMCrossEntropy(),
)

In [31]:
# TensorBoard Callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=EXPERIMENT_CONFIG['RESULTS_LOG'])

### Training loop

In [32]:
mobilebert_keras_converted.fit(
    train_client_data_all_merged,
    epochs=EXPERIMENT_CONFIG['CENTRALIZED_EPOCHS'],
    callbacks=[tensorboard_callback]
)

Epoch 1/3
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5f0c5f1278>

### Save the trained model

In [33]:
mobilebert_keras_converted.save(EXPERIMENT_CONFIG['RESULTS_MODEL'])

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: ./results/mobilebert_mlm_shakespeare_centralized/20201121-155929/model/assets


INFO:tensorflow:Assets written to: ./results/mobilebert_mlm_shakespeare_centralized/20201121-155929/model/assets


## Evaluation

In [35]:
# If we need to evaluate with the saved model
# mobilebert_keras_converted_load = tf.keras.models.load_model(
#    os.path.join(RESULTS_DIRECTORY, 'model'),
#    custom_objects={'MaskedLMCrossEntropy': utils.MaskedLMCrossEntropy}
#)

In [37]:
mobilebert_keras_converted.evaluate(
    test_client_data_all_merged,
)



3.5697546005249023