# Further Pre-training MobileBERT MLM with Centralized Training

In [None]:
# Copyright 2020, The TensorFlow Federated Authors.
# Copyright 2020, Ronald Seoh
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Google Colab settings

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # Mount Google Drive root directory
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/Colab Notebooks/BERTerated'
    
    # List the directory contents
    !ls

# IPython reloading magic
%load_ext autoreload
%autoreload 2

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/BERTerated
datasets
fedavg_client.py
fedavg.py
huggingface_keras_layers.py
LICENSE
mobilebert_mlm_shakespeare_centralized.ipynb
mobilebert_mlm_shakespeare_fedavg.ipynb
mobilebert_mlm_stackoverflow_fedavg.ipynb
__pycache__
README.md
requirements.txt
tff_cache
transformers_cache
utils.py


In [None]:
# Install required packages
!pip install -r requirements.txt

Collecting tensorflow-federated==0.17.0
[?25l  Downloading https://files.pythonhosted.org/packages/5c/54/900d99d3cff21b6a570281b51f4878a745c0eece7732bb7fc26eee61ef57/tensorflow_federated-0.17.0-py2.py3-none-any.whl (517kB)
[K     |▋                               | 10kB 25.6MB/s eta 0:00:01[K     |█▎                              | 20kB 33.2MB/s eta 0:00:01[K     |██                              | 30kB 38.6MB/s eta 0:00:01[K     |██▌                             | 40kB 33.4MB/s eta 0:00:01[K     |███▏                            | 51kB 35.1MB/s eta 0:00:01[K     |███▉                            | 61kB 37.9MB/s eta 0:00:01[K     |████▍                           | 71kB 27.1MB/s eta 0:00:01[K     |█████                           | 81kB 22.1MB/s eta 0:00:01[K     |█████▊                          | 92kB 23.5MB/s eta 0:00:01[K     |██████▍                         | 102kB 22.7MB/s eta 0:00:01[K     |███████                         | 112kB 22.7MB/s eta 0:00:01[K     |█████

## Import packages

In [None]:
import os
import sys
import random

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import tensorflow_text as tf_text
import transformers

import nest_asyncio
nest_asyncio.apply()

import fedavg
import fedavg_client
import datasets
import utils

# Seed every random number generator used in this notebook
random_seed = 692
random.seed(random_seed)         # Python
np.random.seed(random_seed)      # NumPy
tf.random.set_seed(random_seed)  # TensorFlow

# Report how many GPUs TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Smoke test: check that TFF is working by running a trivial computation
tff.federated_computation(lambda: 'Hello, World!')()

Num GPUs Available:  1


b'Hello, World!'

In [None]:
# Record the interpreter and library versions this run was executed with
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"TensorFlow version: {tf.__version__}")
print(f"TensorFlow Federated version: {tff.__version__}")
print(f"Transformers version: {transformers.__version__}")

Python version: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
NumPy version: 1.18.5
TensorFlow version: 2.3.0
TensorFlow Federated version: 0.17.0
Transformers version: 3.4.0


## Experiment Settings

In [None]:
# Round-based settings. In this notebook they are only used to derive
# CENTRALIZED_EPOCHS below (presumably mirroring the federated notebooks in
# this project -- confirm).
TOTAL_ROUNDS = 2 # Number of total training rounds
ROUNDS_PER_EVAL = 1 # How often to evaluate, in rounds

# Number of epochs each client trains per round.
CLIENT_EPOCHS_PER_ROUND = 3

# Number of epochs to train in the centralized setting:
# epochs per round times number of rounds (3 * 2 = 6).
CENTRALIZED_EPOCHS = CLIENT_EPOCHS_PER_ROUND * TOTAL_ROUNDS

# NOTE(review): no active code in this notebook appears to read the two batch
# sizes below -- confirm before relying on them.
BATCH_SIZE = 8 # Batch size used on the client.
TEST_BATCH_SIZE = 8 # Minibatch size of test data.

# Maximum length of input token sequence for BERT.
BERT_MAX_SEQ_LENGTH = 128

# Optimizer configuration
LEARNING_RATE = 2e-5

## Dataset

### Load the Shakespeare dataset

In [None]:
# Load the TFF Shakespeare simulation dataset as a (train, test) pair of
# per-client data objects, using ./tff_cache as the cache directory.
train_client_data, test_client_data = tff.simulation.datasets.shakespeare.load_data(cache_dir='./tff_cache')

### Tokenizer

In [None]:
# Load the pretrained tokenizer of the 'google/mobilebert-uncased' checkpoint,
# using ./transformers_cache as the cache directory.
mobilebert_tokenizer = transformers.MobileBertTokenizer.from_pretrained(
    'google/mobilebert-uncased', cache_dir='./transformers_cache')

In [None]:
# Imitate the transformers tokenizer with a TF.Text tokenizer.
# The conversion helper also returns a vocabulary lookup table and a
# special-token-id mask table, which the preprocessing functions below pass on.
tokenizer_tf_text, vocab_lookup_table, special_ids_mask_table = datasets.preprocessing_for_bert.convert_huggingface_tokenizer(mobilebert_tokenizer)

In [None]:
# Sanity-check the TF Text tokenizer on a short sentence
sample_tokens = tokenizer_tf_text.tokenize("This is a test.")
sample_tokens_dense = sample_tokens.to_tensor()
print("TF Text tokenizer output shape:", tf.shape(sample_tokens_dense))
print(tf.squeeze(sample_tokens_dense, axis=-1))

# Decode the ids back with the Hugging Face tokenizer; this should recover
# the (lower-cased) input sentence
first_sentence_ids = tf.squeeze(sample_tokens, axis=-1).to_list()[0]
mobilebert_tokenizer.decode(first_sentence_ids)

TF Text tokenizer output shape: tf.Tensor([1 5 1], shape=(3,), dtype=int32)
tf.Tensor([[2023 2003 1037 3231 1012]], shape=(1, 5), dtype=int32)


'this is a test.'

### Preprocessing

In [None]:
def check_empty_snippet(x):
    """Dataset filter predicate that keeps only examples with a non-empty snippet.

    Args:
        x: A dataset element with a string entry under the 'snippets' key.

    Returns:
        True-valued when the snippet has length greater than zero, i.e. the
        example should be KEPT (despite the function's name).
    """
    snippet_length = tf.strings.length(x['snippets'])
    return snippet_length > 0

def tokenizer_and_mask_wrapped(x):
    """Tokenize one snippet and apply MLM masking, for use with Dataset.map.

    Binds the notebook-level tokenizer, lookup tables and MobileBERT special
    token ids to datasets.preprocessing_for_bert.tokenize_and_mask.

    Args:
        x: A dataset element with a string entry under the 'snippets' key.

    Returns:
        A (masked, labels, sample_weights) tuple as produced by
        tokenize_and_mask for a single-snippet input.
    """
    # tokenize_and_mask is handed a 1-element string tensor, so every output
    # keeps a leading dimension of 1.
    single_snippet = tf.reshape(x['snippets'], shape=[1])

    masked, labels, sample_weights = datasets.preprocessing_for_bert.tokenize_and_mask(
        single_snippet,
        max_seq_length=BERT_MAX_SEQ_LENGTH,
        bert_tokenizer_tf_text=tokenizer_tf_text,
        vocab_lookup_table=vocab_lookup_table,
        special_ids_mask_table=special_ids_mask_table,
        cls_token_id=mobilebert_tokenizer.cls_token_id,
        sep_token_id=mobilebert_tokenizer.sep_token_id,
        pad_token_id=mobilebert_tokenizer.pad_token_id,
        mask_token_id=mobilebert_tokenizer.mask_token_id,
    )

    return masked, labels, sample_weights

def preprocess_for_train(train_dataset):
    """Turn raw snippet examples into shuffled MLM training examples.

    Args:
        train_dataset: A dataset whose elements carry a string under the
            'snippets' key. Empty snippets are expected to have been removed
            already (see check_empty_snippet); no filtering happens here.

    Returns:
        A dataset of (masked, labels, sample_weights) tuples, shuffled with a
        100000-element buffer. It makes a single pass over the data.
    """
    return (
        train_dataset
        # Tokenize and mask each sample using the MobileBERT tokenizer
        .map(tokenizer_and_mask_wrapped, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # Shuffle
        .shuffle(100000)
        # No .batch(): every element already has a leading dimension of 1
        # (tokenizer_and_mask_wrapped reshapes the snippet to shape [1]).
        #
        # No .repeat(): Keras fit(epochs=CENTRALIZED_EPOCHS) already iterates
        # the dataset once per epoch, so repeating it CENTRALIZED_EPOCHS times
        # here as well would train for CENTRALIZED_EPOCHS ** 2 passes.
    )
    
def preprocess_for_test(test_dataset):
    """Turn raw snippet examples into shuffled MLM evaluation examples.

    Args:
        test_dataset: A dataset whose elements carry a string under the
            'snippets' key. No empty-snippet filtering happens here.

    Returns:
        A dataset of (masked, labels, sample_weights) tuples, shuffled with a
        100000-element buffer. Elements are not batched any further.
    """
    # Tokenize and mask each sample using the MobileBERT tokenizer
    tokenized = test_dataset.map(
        tokenizer_and_mask_wrapped,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle
    return tokenized.shuffle(100000)

In [None]:
# Merge every training client's non-empty snippets into a single dataset by
# chaining the per-client datasets in client_ids order.
train_client_ids = train_client_data.client_ids

train_client_data_all_merged = (
    train_client_data
    .create_tf_dataset_for_client(train_client_ids[0])
    .filter(check_empty_snippet))

# The slice is empty when there is only one client, so no extra guard is needed.
for train_client_id in train_client_ids[1:]:
    train_client_dataset = train_client_data.create_tf_dataset_for_client(train_client_id)
    train_client_data_all_merged = train_client_data_all_merged.concatenate(
        train_client_dataset.filter(check_empty_snippet))

In [None]:
# Tokenize, mask and shuffle the merged training data.
# NOTE: this rebinds train_client_data_all_merged to its preprocessed form, so
# re-running this cell without first re-running the merge cell would feed
# already-preprocessed tuples back into preprocess_for_train.
train_client_data_all_merged = preprocess_for_train(train_client_data_all_merged)

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [None]:
# Inspect the structure (shapes and dtypes) of one preprocessed training element.
train_client_data_all_merged.element_spec

(TensorSpec(shape=(1, 128), dtype=tf.int32, name=None),
 TensorSpec(shape=(1, 128), dtype=tf.int32, name=None),
 TensorSpec(shape=(1, 128), dtype=tf.int32, name=None))

In [None]:
# Merge every test client's non-empty snippets into a single dataset by
# chaining the per-client datasets in client_ids order.
test_client_ids = test_client_data.client_ids

test_client_data_all_merged = (
    test_client_data
    .create_tf_dataset_for_client(test_client_ids[0])
    .filter(check_empty_snippet))

# The slice is empty when there is only one client, so no extra guard is needed.
for test_client_id in test_client_ids[1:]:
    test_client_dataset = test_client_data.create_tf_dataset_for_client(test_client_id)
    test_client_data_all_merged = test_client_data_all_merged.concatenate(
        test_client_dataset.filter(check_empty_snippet))

In [None]:
# Tokenize, mask and shuffle the merged test data.
# NOTE: this rebinds test_client_data_all_merged to its preprocessed form, so
# re-running this cell without first re-running the merge cell would feed
# already-preprocessed tuples back into preprocess_for_test.
test_client_data_all_merged = preprocess_for_test(test_client_data_all_merged)

In [None]:
# Pull a single preprocessed test element into NumPy arrays for inspection.
test_input = list(test_client_data_all_merged.take(1).as_numpy_iterator())

In [None]:
# Show the sampled element: a (masked, labels, sample_weights) tuple of arrays.
print(test_input)

[(array([[  101,  1996, 13410,   103,   103,  2004,  1045,   103,  1010,
         2003,  6783,  2000,  6713,  1010,  1999,  1996,  3033,  2073,
         2002, 11113,  1012,  1031,   103,  4237,  1033,   103,  2204,
         1010,  2026, 17766,   103,  2000,  3531,  2017,  2007,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
    

## Model

In [None]:
# Load the pretrained MobileBERT pre-training model from the
# 'google/mobilebert-uncased' checkpoint, using ./transformers_cache as the
# cache directory.
mobilebert_model = transformers.TFMobileBertForPreTraining.from_pretrained(
    'google/mobilebert-uncased', cache_dir='./transformers_cache')

All model checkpoint layers were used when initializing TFMobileBertForPreTraining.

All the layers of TFMobileBertForPreTraining were initialized from the model checkpoint at google/mobilebert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMobileBertForPreTraining for predictions without further training.


In [None]:
# Show the configuration the pretrained model was loaded with.
print(mobilebert_model.config)

MobileBertConfig {
  "_name_or_path": "google/mobilebert-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 30522
}



In [None]:
# Due to the limitations of Keras subclassed models, we can only reuse the main
# layer of the pretrained model and have to add the output head ourselves.
# batch_size=1 matches the leading dimension of 1 that every preprocessed
# example carries.
mobilebert_keras_converted = utils.convert_huggingface_mlm_to_keras(
    huggingface_model=mobilebert_model,
    max_seq_length=BERT_MAX_SEQ_LENGTH,
    batch_size=1)

In [None]:
# Print the layer-by-layer summary of the converted Keras model.
mobilebert_keras_converted.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(1, 128)]                0         
_________________________________________________________________
mobilebert (TFMobileBertMain ((1, 128, 512), (1, 512)) 24581888  
_________________________________________________________________
standalone_tf_mobile_bert_ml (1, 128, 30522)           15921466  
Total params: 40,503,354
Trainable params: 40,503,354
Non-trainable params: 0
_________________________________________________________________


## Training

### Training setups

In [None]:
mobilebert_keras_converted.compile(
    # Take the learning rate from the Experiment Settings cell rather than a
    # separate hard-coded literal, so the configured value is the one applied.
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    # Project-specific loss defined in utils.py
    loss=utils.CustomCrossEntropyError(),
)

In [None]:
# Train on the merged, preprocessed training data for CENTRALIZED_EPOCHS
# Keras epochs. Each epoch iterates the whole dataset it is given.
mobilebert_keras_converted.fit(
    train_client_data_all_merged,
    epochs=CENTRALIZED_EPOCHS,
)

Epoch 1/6
TensorShape([1, 128])
TensorShape([1, 128, 30522])
TensorShape([1, None])
TensorShape([1, None, 30522])
      1/Unknown - 0s 516us/step - loss: 0.0790 - accuracy: 0.0000e+00TensorShape([1, 128])
TensorShape([1, 128, 30522])
TensorShape([1, None])
TensorShape([1, None, 30522])


InvalidArgumentError: ignored

In [None]:
# Evaluate the trained model on the merged, preprocessed test data.
mobilebert_keras_converted.evaluate(
    test_client_data_all_merged,
)