# Fine-Tuning MobileBERT with Federated Averaging

In [1]:
# Copyright 2020, The TensorFlow Federated Authors.
# Copyright 2020, Ronald Seoh
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Google Colab settings

In [2]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there's a package I need to install separately, do it here
    !pip install tensorflow-federated==0.17.0 tensorflow-text==2.3.0 transformers==3.4.0

    # Mount Google Drive root directory
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd 'drive/My Drive/Colab Notebooks/BERTerated'
    
    # List the directory contents
    !ls

# IPython reloading magic
%load_ext autoreload
%autoreload 2

Collecting tensorflow-federated==0.17.0
[?25l  Downloading https://files.pythonhosted.org/packages/5c/54/900d99d3cff21b6a570281b51f4878a745c0eece7732bb7fc26eee61ef57/tensorflow_federated-0.17.0-py2.py3-none-any.whl (517kB)
[K     |████████████████████████████████| 522kB 10.3MB/s 
[?25hCollecting tensorflow-text==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/28/b2/2dbd90b93913afd07e6101b8b84327c401c394e60141c1e98590038060b3/tensorflow_text-2.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 21.8MB/s 
[?25hCollecting transformers==3.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 45.0MB/s 
Collecting tensorflow-privacy~=0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/41/ae/7db0dcf76a746314a174578a7b99ff098b40b908c4c693a955a2bbc012

## Import packages

In [3]:
import os
import sys
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import tensorflow_text as tftext
import transformers

import nest_asyncio
nest_asyncio.apply()

import simple_fedavg_tf
import simple_fedavg_tff
import utils

# Seed both NumPy's and TensorFlow's global RNGs with the same value
random_seed = 692
tf.random.set_seed(random_seed)
np.random.seed(random_seed)

# Report how many GPUs TensorFlow can see
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Sanity check that TFF can build and run a trivial federated computation
tff.federated_computation(lambda: 'Hello, World!')()

Num GPUs Available:  1


b'Hello, World!'

In [4]:
# Record the interpreter and library versions used for this run
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"TensorFlow version: {tf.__version__}")
print(f"TensorFlow Federated version: {tff.__version__}")
print(f"Transformers version: {transformers.__version__}")

Python version: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
NumPy version: 1.18.5
TensorFlow version: 2.3.0
TensorFlow Federated version: 0.17.0
Transformers version: 3.4.0


## Experiment Settings

In [5]:
# --- Round schedule ---
# Number of total training rounds
TOTAL_ROUNDS = 2
# How often to evaluate
ROUNDS_PER_EVAL = 1

# --- Client sampling ---
# How many clients to sample per round.
TRAIN_CLIENTS_PER_ROUND = 1
# How many clients to sample per round for test data
TEST_CLIENTS_PER_ROUND = 1

# --- Client-side data pipeline ---
# Number of epochs in the client to take per round.
CLIENT_EPOCHS_PER_ROUND = 1
# Batch size used on the client.
BATCH_SIZE = 3
# For dataset shuffling
BUFFER_SIZE = 5
# Minibatch size of test data.
TEST_BATCH_SIZE = 5
# Maximum length of input token sequence for BERT.
SEQ_LENGTH = 64

# --- Optimizer configuration ---
# Server learning rate.
SERVER_LEARNING_RATE = 1.0
# Client learning rate
CLIENT_LEARNING_RATE = 0.1

## Dataset

In [6]:
# Load the federated Shakespeare train/test splits, caching under ./tff_cache
train_client_data, test_client_data = (
    tff.simulation.datasets.shakespeare.load_data(cache_dir='./tff_cache'))

In [7]:
# Hugging Face tokenizer for MobileBERT; the vocabulary and special-token ids
# used by the cells below are read from this object.
mobilebert_tokenizer = transformers.MobileBertTokenizer.from_pretrained(
    'google/mobilebert-uncased',
    cache_dir='./transformers_cache',
)

In [8]:
# Imitate transformers tokenizer with TF.Text Tokenizer.
# The lookup table maps each wordpiece string of the Hugging Face vocabulary
# to its integer id; strings that are not in the table map to id 0.
mobilebert_vocab_initializer = tf.lookup.KeyValueTensorInitializer(
    keys=list(mobilebert_tokenizer.vocab.keys()),
    values=tf.constant(list(mobilebert_tokenizer.vocab.values()), dtype=tf.int64))

mobilebert_vocab_lookup_table = tf.lookup.StaticHashTable(
    mobilebert_vocab_initializer, default_value=0)

mobilebert_tokenizer_tf_text = tftext.BertTokenizer(
    vocab_lookup_table=mobilebert_vocab_lookup_table,
    lower_case=True,
    split_unknown_characters=True)

In [9]:
# Smoke-test the TF.Text tokenizer on a short sentence
ttt = mobilebert_tokenizer_tf_text.tokenize("This is a test.")
ttt_dense = ttt.to_tensor()
print(tf.shape(ttt_dense))
print(tf.squeeze(ttt_dense, axis=-1))
# Round-trip the ids through the Hugging Face tokenizer to compare the text
mobilebert_tokenizer.decode(tf.squeeze(ttt, axis=-1).to_list()[0])

tf.Tensor([1 5 1], shape=(3,), dtype=int32)
tf.Tensor([[2023 2003 1037 3231 1012]], shape=(1, 5), dtype=int64)


'this is a test.'

In [10]:
# Lookup table that maps every special-token id to 1 and any other id to 0
special_ids = mobilebert_tokenizer.all_special_ids

mobilebert_special_ids_mask_initializer = tf.lookup.KeyValueTensorInitializer(
    keys=tf.constant(special_ids, dtype=tf.int64),
    values=tf.constant(1, dtype=tf.int64, shape=len(special_ids)),
    key_dtype=tf.int64,
    value_dtype=tf.int64)

mobilebert_special_ids_mask_table = tf.lookup.StaticHashTable(
    mobilebert_special_ids_mask_initializer,
    default_value=tf.constant(0, dtype=tf.int64))

### Preprocessing

In [11]:
# Based on the answers from
# https://stackoverflow.com/questions/42334646/tensorflow-pad-unknown-size-tensor-to-a-specific-size/51936821#51936821
def dynamic_padding(inp, min_size, constant_values):
    """Right-pad the second axis of `inp` so it is at least `min_size` long.

    Args:
        inp: Tensor to pad. It is indexed on axis 1 and padded with a
            two-entry paddings list, so it must be rank 2.
        min_size: Minimum length of axis 1 after padding.
        constant_values: Fill value passed through to `tf.pad`.

    Returns:
        `inp` with `max(min_size - tf.shape(inp)[1], 0)` trailing entries added
        along axis 1; unchanged if that axis is already `min_size` or longer.
    """
    # Clamp at zero: tf.pad rejects negative paddings, and an input that is
    # already long enough simply needs no padding.
    pad_size = tf.maximum(min_size - tf.shape(inp)[1], 0)
    paddings = [[0, 0], [0, pad_size]] # assign here, during graph execution

    return tf.pad(inp, paddings, constant_values=constant_values)

# New preprocessing steps based on TF.text tokenizer
def tokenize_and_mask(x):
    """Turn one dataset element into a fixed-length masked-LM (input, label) pair.

    Args:
        x: Dataset element; only `x['snippets']` is read. It is reshaped into
            a one-element string batch before tokenization.

    Returns:
        A `(masked, labels)` tuple: the two outputs of
        `utils.get_masked_input_and_labels`, squeezed and given the static
        shape `(SEQ_LENGTH,)`.
    """
    # TF.text tokenizer returns a RaggedTensor of shape
    # [batch, (words), (wordpieces)]. Flatten the word and wordpiece axes
    # together so that every wordpiece of a multi-piece word is kept, then
    # convert to a regular [batch, tokens] tensor.
    tokenized = mobilebert_tokenizer_tf_text.tokenize(
        tf.reshape(x['snippets'], shape=[1])).merge_dims(1, 2).to_tensor()

    # The reshape above fixes the batch dimension at 1, so each special token
    # is a [1, 1] column (independent of how many keys `x` happens to have).
    cls_column = tf.constant(mobilebert_tokenizer.cls_token_id, shape=[1, 1], dtype=tf.int64)
    sep_column = tf.constant(mobilebert_tokenizer.sep_token_id, shape=[1, 1], dtype=tf.int64)

    # Add special tokens: [CLS]
    tokenized_with_special_tokens = tf.concat([cls_column, tokenized], axis=1)

    # Truncate to SEQ_LENGTH - 1 tokens so that [SEP] always fits.
    # Slicing is a no-op for sequences that are already short enough.
    tokenized_with_special_tokens = tokenized_with_special_tokens[:, :SEQ_LENGTH - 1]

    # Add special tokens: [SEP]
    tokenized_with_special_tokens = tf.concat([tokenized_with_special_tokens, sep_column], axis=1)

    # Padding with [PAD]
    # Final sequence should have the length of SEQ_LENGTH. After the
    # truncation above the sequence is at most SEQ_LENGTH long, so the pad
    # size is never negative and no conditional is needed.
    tokenized_with_special_tokens = dynamic_padding(
        tokenized_with_special_tokens, SEQ_LENGTH, mobilebert_tokenizer.pad_token_id)

    # Random masking for the BERT MLM
    masked, labels = utils.get_masked_input_and_labels(
        tokenized_with_special_tokens,
        mobilebert_vocab_lookup_table,
        mobilebert_special_ids_mask_table,
        tf.constant(mobilebert_tokenizer.mask_token_id, dtype=tf.int64))

    # Squeeze out the first dimension
    masked = tf.squeeze(masked)
    labels = tf.squeeze(labels)

    # Manually setting the shape here so that TensorFlow graph
    # could know the sizes in advance
    masked.set_shape(SEQ_LENGTH)
    labels.set_shape(SEQ_LENGTH)

    return masked, labels

def preprocess_for_train(train_dataset):
    """Build the per-client training pipeline.

    Empty snippets are dropped, the rest are tokenized and masked, shuffled,
    repeated for CLIENT_EPOCHS_PER_ROUND epochs and grouped into batches of
    exactly BATCH_SIZE examples.
    """
    # Filter out empty strings
    non_empty = train_dataset.filter(
        lambda example: tf.strings.length(example['snippets']) > 0)

    # Tokenize each sample using MobileBERT tokenizer
    tokenized = non_empty.map(tokenize_and_mask)

    # Shuffle, then repeat to make each client train multiple epochs
    shuffled = tokenized.shuffle(BUFFER_SIZE)
    repeated = shuffled.repeat(count=CLIENT_EPOCHS_PER_ROUND)

    # drop_remainder=True forces every batch to be exactly
    # (BATCH_SIZE, SEQ_LENGTH)
    return repeated.batch(BATCH_SIZE, drop_remainder=True)
    
def preprocess_for_test(test_dataset):
    """Build the per-client evaluation pipeline.

    Empty snippets are dropped, the rest are tokenized and masked, shuffled
    and grouped into batches of exactly TEST_BATCH_SIZE examples.
    """
    # Filter out empty strings
    non_empty = test_dataset.filter(
        lambda example: tf.strings.length(example['snippets']) > 0)

    # Tokenize each sample using MobileBERT tokenizer
    tokenized = non_empty.map(tokenize_and_mask)

    # Shuffle
    shuffled = tokenized.shuffle(BUFFER_SIZE)

    # drop_remainder=True forces every batch to be exactly
    # (TEST_BATCH_SIZE, SEQ_LENGTH)
    return shuffled.batch(TEST_BATCH_SIZE, drop_remainder=True)

In [12]:
# Attach the preprocessing pipelines to the client datasets.
# NOTE(review): both names are rebound to their preprocessed versions, so
# re-running this cell without re-running the dataset loading cell would
# stack the preprocessing twice.
train_client_data = train_client_data.preprocess(
    preprocess_fn=preprocess_for_train)
test_client_data = test_client_data.preprocess(
    preprocess_fn=preprocess_for_test)





Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.






In [13]:
# Materialize one client's training dataset; its element_spec is reused
# later when wrapping the Keras model.
example_client_id = 'THE_TRAGEDY_OF_KING_LEAR_KING'
example_dataset = train_client_data.create_tf_dataset_for_client(example_client_id)
print(example_dataset.element_spec)





(TensorSpec(shape=(3, 64), dtype=tf.int64, name=None), TensorSpec(shape=(3, 64), dtype=tf.int64, name=None))


In [14]:
# Did the random masking go well?
# Print a few batches of raw ids next to their decoded text.
for example_input, example_labels in example_dataset.take(10):
    print(example_input)
    print(example_labels)

    decoded_batch = mobilebert_tokenizer.batch_decode(tf.squeeze(example_input).numpy())
    print(decoded_batch)

tf.Tensor(
[[  101  3521   999   102     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  101  2129 27092  2115  9995  1029   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  101  2007  1037 15514 21916  1997   103  1010  1037  2158  1999  2035
   1996  2088  1005   103  2047  4827   103  1010  2008  6045  1037 12927
   1997 15672  1999  2010  4167  1025  2028  20

## Model

In [15]:
def tff_model_fn():
    """Build a pretrained MobileBERT masked-LM wrapped for federated averaging.

    Returns:
        A `simple_fedavg_tf.KerasModelWrapper` around the Keras model, built
        with the element spec of `example_dataset` and a sparse categorical
        cross-entropy loss computed from logits.
    """
    pretrained_mlm = transformers.TFMobileBertForMaskedLM.from_pretrained(
        'google/mobilebert-uncased',
        cache_dir='./transformers_cache')

    mlm_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    return simple_fedavg_tf.KerasModelWrapper(
        pretrained_mlm, example_dataset.element_spec, mlm_loss)

# Local copy of the model; the training loop loads server weights into it
# before evaluation.
model = tff_model_fn()

Some layers from the model checkpoint at google/mobilebert-uncased were not used when initializing TFMobileBertForMaskedLM: ['seq_relationship___cls', 'predictions___cls']
- This IS expected if you are initializing TFMobileBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFMobileBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFMobileBertForMaskedLM were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['mlm___cls']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

### Training setups

In [16]:
def server_optimizer_fn():
    """Return a new SGD optimizer that uses SERVER_LEARNING_RATE."""
    server_optimizer = tf.keras.optimizers.SGD(learning_rate=SERVER_LEARNING_RATE)
    return server_optimizer

In [17]:
def client_optimizer_fn():
    """Return a new SGD optimizer that uses CLIENT_LEARNING_RATE."""
    client_optimizer = tf.keras.optimizers.SGD(learning_rate=CLIENT_LEARNING_RATE)
    return client_optimizer

In [18]:
# Build the federated averaging process from the model and optimizer factories
iterative_process = simple_fedavg_tff.build_federated_averaging_process(
    tff_model_fn,
    server_optimizer_fn,
    client_optimizer_fn,
)

# Initial server state for the first round
server_state = iterative_process.initialize()

# Evaluation metric handed to keras_evaluate in the training loop.
# NOTE(review): this object is a cross-entropy loss even though it is named
# 'test_accuracy' -- confirm whether the name is read anywhere before renaming.
metric = tf.keras.losses.SparseCategoricalCrossentropy(
    name='test_accuracy', from_logits=True)

Some layers from the model checkpoint at google/mobilebert-uncased were not used when initializing TFMobileBertForMaskedLM: ['seq_relationship___cls', 'predictions___cls']
- This IS expected if you are initializing TFMobileBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFMobileBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFMobileBertForMaskedLM were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['mlm___cls']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some layers from the model checkpoint at google/mobilebert-uncased were not used

Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.


Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.


In [19]:
# Federated training loop: each round samples training clients, runs one
# step of the iterative process and, every ROUNDS_PER_EVAL rounds, evaluates
# the current server weights on freshly sampled test clients.
# NOTE(review): the saved output of this cell shows a nan training loss for
# round 0 and ends in a NameError; neither cause is visible in this cell.
for round_num in range(TOTAL_ROUNDS):

    print("Choosing clients to use for training...")

    # Sample this round's participants without replacement
    sampled_clients = np.random.choice(
        train_client_data.client_ids,
        size=TRAIN_CLIENTS_PER_ROUND,
        replace=False)

    sampled_train_data = [
        train_client_data.create_tf_dataset_for_client(client)
        for client in sampled_clients
    ]

    print("Choosing training clients complete.")

    print(f'Round {round_num} start!')

    server_state, train_metrics = iterative_process.next(server_state, sampled_train_data)

    print(f'Round {round_num} training loss: {train_metrics}')

    if round_num % ROUNDS_PER_EVAL == 0:
        # Copy the current server weights into the local evaluation model
        model.from_weights(server_state.model_weights)

        # Test dataset generation for this round
        print("Sampling clients to use for testing...")

        sampled_test_clients = np.random.choice(
            test_client_data.client_ids,
            size=TEST_CLIENTS_PER_ROUND,
            replace=False)

        sampled_test_data = [
            test_client_data.create_tf_dataset_for_client(client)
            for client in sampled_test_clients
        ]

        # Merge all sampled clients into one dataset. Dataset.concatenate
        # returns a new dataset instead of modifying in place, so the result
        # has to be re-assigned, and every remaining client (not just index 1)
        # has to be appended.
        sampled_test_data_merged = sampled_test_data[0]

        for extra_test_data in sampled_test_data[1:]:
            sampled_test_data_merged = sampled_test_data_merged.concatenate(extra_test_data)

        print("Test clients selected.")

        perplexity_validation = simple_fedavg_tf.keras_evaluate(model.keras_model, sampled_test_data_merged, metric)

        print(f'Round {round_num} validation perplexity: {perplexity_validation}')

Choosing clients to use for training...




Choosing training clients complete.
Round 0 start!
Round 0 training loss: nan


NameError: ignored