# Further Pre-training MobileBERT MLM with Centralized Training (Stackoverflow)

In [None]:
# Copyright 2020, The TensorFlow Federated Authors.
# Copyright 2020, Ronald Seoh
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Google Colab settings

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # Mount Google Drive root directory
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/Colab Notebooks/BERTerated'
    
    # List the directory contents
    !ls

## CUDA Multi GPU

In [None]:
import os

# Order CUDA devices by PCI bus id and make only device "1" visible to this process
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
# IPython reloading magic
%load_ext autoreload
%autoreload 2

In [None]:
# Install required packages
#!pip install -r requirements.txt

## Import packages

In [None]:
import os
import sys
import random
import datetime
import json
import pathlib
import itertools
import math

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import tensorflow_text as tf_text
import transformers
import tqdm

import nest_asyncio
nest_asyncio.apply()

import fedavg
import fedavg_client
import datasets
import utils

# Seed every random number generator used in this notebook with the same value
random_seed = 692
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):  # Python, NumPy, TensorFlow
    seed_rng(random_seed)

# Report how many GPUs TensorFlow can see
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

# Smoke test: build and immediately invoke a trivial TFF computation
tff.federated_computation(lambda: 'Hello, World!')()

In [None]:
# Report the interpreter and library versions this run is using
for version_label, version_string in (
    ("Python version: ", sys.version),
    ("NumPy version: ", np.__version__),
    ("TensorFlow version: ", tf.__version__),
    ("TensorFlow Federated version: ", tff.__version__),
    ("Transformers version: ", transformers.__version__),
):
    print(version_label + version_string)

In [None]:
!nvidia-smi

## Experiment Settings

In [None]:
# All tunable settings of this experiment live in one dict.
# It is dumped to JSON later on, so keep the values JSON-serializable.
EXPERIMENT_CONFIG = {
    # Pretrained checkpoint and where to cache its files
    'HUGGINGFACE_MODEL_NAME': 'google/mobilebert-uncased',
    'HUGGINGFACE_CACHE_DIR': os.path.join('.', 'transformers_cache'),

    # Number of passes over the training set, and how often (in epochs) to validate
    'CENTRALIZED_EPOCHS': 9,
    'VALIDATION_FREQUENCY': 3,

    # Minibatch sizes
    'BATCH_SIZE': 32,
    'TEST_BATCH_SIZE': 64,

    'BERT_MAX_SEQ_LENGTH': 128,

    # One full training run is performed per learning rate
    'CENTRALIZED_LEARNING_RATE': [5e-5, 3e-5, 2e-5],

    # Number of clients to sample from each split (values <= 0 mean "use all clients")
    'TRAIN_NUM_CLIENT_LIMIT': 2000,
    'TEST_NUM_CLIENT_LIMIT': 1000,

    # Timestamped root directory for everything this run writes
    'RESULTS_DIRECTORY': os.path.join(
        '.', 'results',
        'mobilebert_mlm_stackoverflow_centralized',
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    ),
}

# Sub-directories of the results root
EXPERIMENT_CONFIG.update({
    'RESULTS_LOG': os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "logs"),
    'RESULTS_MODEL': os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "model"),
    'RESULTS_CONFIG': os.path.join(EXPERIMENT_CONFIG['RESULTS_DIRECTORY'], "config"),
})

In [None]:
# Persist the full experiment configuration as JSON next to the other results
config_dir = pathlib.Path(EXPERIMENT_CONFIG['RESULTS_CONFIG'])
config_dir.mkdir(parents=True, exist_ok=True)

with (config_dir / "config.json").open('w') as config_file:
    json.dump(EXPERIMENT_CONFIG, config_file, indent=6)

## Dataset

### Dataset loader

In [None]:
# Directory holding the locally cached Stack Overflow HDF5 files, one per split
stackoverflow_h5_dir = os.path.join('.', 'tff_cache', 'datasets')
HDF5ClientData = tff.python.simulation.hdf5_client_data.HDF5ClientData

train_client_data = HDF5ClientData(os.path.join(stackoverflow_h5_dir, 'stackoverflow_train.h5'))
held_out_client_data = HDF5ClientData(os.path.join(stackoverflow_h5_dir, 'stackoverflow_held_out.h5'))
test_client_data = HDF5ClientData(os.path.join(stackoverflow_h5_dir, 'stackoverflow_test.h5'))

### Tokenizer

In [None]:
# Load the Hugging Face tokenizer for the configured checkpoint (HUGGINGFACE_MODEL_NAME),
# passing HUGGINGFACE_CACHE_DIR as the cache directory.
bert_tokenizer = transformers.AutoTokenizer.from_pretrained(
    EXPERIMENT_CONFIG['HUGGINGFACE_MODEL_NAME'], cache_dir=EXPERIMENT_CONFIG['HUGGINGFACE_CACHE_DIR'])

In [None]:
# Build a TF.Text tokenizer, together with its vocabulary and special-id lookup tables,
# that imitates the transformers tokenizer loaded above
(
    tokenizer_tf_text,
    vocab_lookup_table,
    special_ids_mask_table,
) = datasets.preprocessing_for_bert.convert_huggingface_tokenizer(bert_tokenizer)

### Preprocessing

In [None]:
def check_empty_snippet(x):
    """Filter predicate: True when `x['tokens']` is a non-empty string.

    Despite the name, the result is True for snippets that should be *kept*,
    which is what `tf.data.Dataset.filter` expects.
    """
    snippet_length = tf.strings.length(x['tokens'])
    return snippet_length > 0

def tokenizer_and_mask_wrapped(x):
    """Tokenize and mask a single example, returning a `(masked, labels)` pair.

    `x['tokens']` is reshaped to shape `[1]` and handed to
    `datasets.preprocessing_for_bert.tokenize_and_mask` together with the
    notebook-level TF.Text tokenizer, its lookup tables, the configured
    BERT_MAX_SEQ_LENGTH and the special token ids of `bert_tokenizer`.
    """
    snippet_batch = tf.reshape(x['tokens'], shape=[1])

    # Special token ids come from the Hugging Face tokenizer
    special_token_ids = dict(
        cls_token_id=bert_tokenizer.cls_token_id,
        sep_token_id=bert_tokenizer.sep_token_id,
        pad_token_id=bert_tokenizer.pad_token_id,
        mask_token_id=bert_tokenizer.mask_token_id,
    )

    masked, labels = datasets.preprocessing_for_bert.tokenize_and_mask(
        snippet_batch,
        max_seq_length=EXPERIMENT_CONFIG['BERT_MAX_SEQ_LENGTH'],
        bert_tokenizer_tf_text=tokenizer_tf_text,
        vocab_lookup_table=vocab_lookup_table,
        special_ids_mask_table=special_ids_mask_table,
        **special_token_ids,
    )

    return masked, labels

def preprocess_for_train(train_dataset):
    """Tokenize/mask, shuffle and batch a dataset of raw training examples.

    Args:
        train_dataset: dataset whose elements are accepted by
            `tokenizer_and_mask_wrapped`.

    Returns:
        The dataset mapped through `tokenizer_and_mask_wrapped`, shuffled and
        grouped into batches of EXPERIMENT_CONFIG['BATCH_SIZE'] elements.
    """
    # Tokenize and mask each sample using the MobileBERT tokenizer.
    # Output order is allowed to be non-deterministic; we shuffle right after anyway.
    tokenized = train_dataset.map(
        tokenizer_and_mask_wrapped, num_parallel_calls=20, deterministic=False)

    # Shuffle
    shuffled = tokenized.shuffle(100000)

    # Form minibatches.
    # drop_remainder is not set here, so the final batch may hold fewer than BATCH_SIZE elements.
    # NOTE: there is deliberately no .repeat() for centralized training;
    # the number of epochs is passed to fit() instead.
    return shuffled.batch(EXPERIMENT_CONFIG['BATCH_SIZE'])
    
def preprocess_for_test(test_dataset):
    """Tokenize/mask, shuffle and batch a dataset of raw test examples.

    Args:
        test_dataset: dataset whose elements are accepted by
            `tokenizer_and_mask_wrapped`.

    Returns:
        The dataset mapped through `tokenizer_and_mask_wrapped`, shuffled and
        grouped into batches of EXPERIMENT_CONFIG['TEST_BATCH_SIZE'] elements.
    """
    # Tokenize and mask each sample using the MobileBERT tokenizer
    tokenized = test_dataset.map(
        tokenizer_and_mask_wrapped, num_parallel_calls=20, deterministic=False)

    # Shuffle
    shuffled = tokenized.shuffle(100000)

    # Form minibatches.
    # drop_remainder is not set here, so the final batch may hold fewer than TEST_BATCH_SIZE elements.
    return shuffled.batch(EXPERIMENT_CONFIG['TEST_BATCH_SIZE'])

### Training set

In [None]:
# Since the dataset is pretty large, we randomly select TRAIN_NUM_CLIENT_LIMIT number of clients.
# Shuffle a *copy* of the id list: random.shuffle works in place, so shuffling
# `train_client_data.client_ids` directly could reorder the list held by the
# client data object itself (if the property returns it without copying) and
# would make re-running this cell start from an already shuffled list.
all_train_client_ids = list(train_client_data.client_ids)

random.shuffle(all_train_client_ids)

# A limit <= 0 means "use every client"
if EXPERIMENT_CONFIG['TRAIN_NUM_CLIENT_LIMIT'] > 0:
    selected_train_client_ids = all_train_client_ids[0:EXPERIMENT_CONFIG['TRAIN_NUM_CLIENT_LIMIT']]
else:
    selected_train_client_ids = all_train_client_ids

In [None]:
# Merge the selected clients' datasets into one dataset, counting the data points on the way
train_client_data_all_merged = None
train_client_data_all_merged_length = 0

for train_client_id in tqdm.notebook.tqdm(selected_train_client_ids):
    # This client's examples, with empty snippets filtered out
    current_client_data = (
        train_client_data
        .create_tf_dataset_for_client(train_client_id)
        .filter(check_empty_snippet)
    )

    # The size of the filtered dataset is only found by walking through it
    train_client_data_all_merged_length += sum(
        1 for _ in tqdm.notebook.tqdm(current_client_data, leave=False))

    # The first client starts the merged dataset; later clients are appended to it
    train_client_data_all_merged = (
        current_client_data
        if train_client_data_all_merged is None
        else train_client_data_all_merged.concatenate(current_client_data)
    )

print(len(selected_train_client_ids), "train clients processed.")
print(train_client_data_all_merged_length, "train data points available.")

In [None]:
# How many training steps (minibatches) are there in one pass over the training set?
# The last batch may be partial, hence the ceiling.
# (`math` is already imported in the import cell at the top of the notebook.)
num_training_steps = math.ceil(train_client_data_all_merged_length / EXPERIMENT_CONFIG['BATCH_SIZE'])
print("There will be", num_training_steps, "training steps.")

In [None]:
# Tokenize/mask, shuffle and batch the merged training set.
# NOTE: this rebinds the same name, so the cell is not idempotent -- re-running it
# would feed already preprocessed batches back into preprocess_for_train.
train_client_data_all_merged = preprocess_for_train(train_client_data_all_merged)

In [None]:
# Inspect the structure of the elements produced by the preprocessed training set
print(train_client_data_all_merged.element_spec)

### Test set

In [None]:
# Since the stackoverflow dataset is pretty large, we randomly select TEST_NUM_CLIENT_LIMIT number of clients.
# Shuffle a *copy* of the id list: random.shuffle works in place, so shuffling
# `test_client_data.client_ids` directly could reorder the list held by the
# client data object itself (if the property returns it without copying) and
# would make re-running this cell start from an already shuffled list.
all_test_client_ids = list(test_client_data.client_ids)

random.shuffle(all_test_client_ids)

# A limit <= 0 means "use every client"
if EXPERIMENT_CONFIG['TEST_NUM_CLIENT_LIMIT'] > 0:
    selected_test_client_ids = all_test_client_ids[0:EXPERIMENT_CONFIG['TEST_NUM_CLIENT_LIMIT']]
else:
    selected_test_client_ids = all_test_client_ids

In [None]:
# Merge the selected test clients' datasets into one dataset, counting the data points on the way
test_client_data_all_merged = None
test_client_data_all_merged_length = 0

for test_client_id in tqdm.notebook.tqdm(selected_test_client_ids):
    # Get the current client dataset while filtering out empty data points
    current_client_data = test_client_data.create_tf_dataset_for_client(test_client_id).filter(check_empty_snippet)

    # How many data points in this client's dataset?
    # The number of elements left after .filter() is not known up front,
    # so we count them by iterating through the dataset.
    for _ in tqdm.notebook.tqdm(current_client_data, leave=False):
        test_client_data_all_merged_length = test_client_data_all_merged_length + 1

    # The first client starts the merged dataset; later clients are appended to it
    if test_client_data_all_merged is None:
        test_client_data_all_merged = current_client_data
    else:
        test_client_data_all_merged = test_client_data_all_merged.concatenate(current_client_data)

# These messages used to say "train" (copy-paste from the training cell)
print(len(selected_test_client_ids), "test clients processed.")
print(test_client_data_all_merged_length, "test data points available.")

In [None]:
# Tokenize/mask, shuffle and batch the merged test set.
# NOTE: this rebinds the same name, so the cell is not idempotent -- re-running it
# would feed already preprocessed batches back into preprocess_for_test.
test_client_data_all_merged = preprocess_for_test(test_client_data_all_merged)

In [None]:
# Inspect the structure of the elements produced by the preprocessed test set
print(test_client_data_all_merged.element_spec)

## Model

In [None]:
# Load the pretrained Hugging Face TF model for the configured checkpoint (HUGGINGFACE_MODEL_NAME),
# passing HUGGINGFACE_CACHE_DIR as the cache directory.
bert_model = transformers.TFAutoModelForPreTraining.from_pretrained(
    EXPERIMENT_CONFIG['HUGGINGFACE_MODEL_NAME'], cache_dir=EXPERIMENT_CONFIG['HUGGINGFACE_CACHE_DIR'])

In [None]:
# Show the configuration object attached to the loaded model
print(bert_model.config)

In [None]:
# Generate a new Keras model that starts from the pretrained weights.
# Due to the limitations with Keras subclasses,
# we can only use the main layer part from pretrained models
# and add the output heads ourselves.
# The conversion itself lives in the project-local `utils` module and is given
# the loaded Hugging Face model plus the configured BERT_MAX_SEQ_LENGTH.
bert_keras_converted = utils.convert_huggingface_mlm_to_keras(
    huggingface_model=bert_model,
    max_seq_length=EXPERIMENT_CONFIG['BERT_MAX_SEQ_LENGTH'],
)

In [None]:
# Print the layer-by-layer summary of the converted Keras model
bert_keras_converted.summary()

## Training / Testing

In [None]:
# Length of the learning rate schedule, in optimizer steps.
# `num_training_steps` counts the minibatches of a *single* pass over the training set,
# while fit() below runs CENTRALIZED_EPOCHS passes. transformers.create_optimizer expects
# the total number of training steps; handing it one epoch's worth would finish the
# decay after the first epoch and leave the remaining epochs at the final learning rate.
total_training_steps = num_training_steps * EXPERIMENT_CONFIG['CENTRALIZED_EPOCHS']

# Warm up over the first 2/5 of the schedule (same ratio as before, now of the full run)
total_warmup_steps = int(total_training_steps * (2/5))

# Run one complete train/evaluate/save cycle per learning rate
for lr in EXPERIMENT_CONFIG['CENTRALIZED_LEARNING_RATE']:
    # The formatted learning rate names both the TensorBoard log dir and the saved model dir
    config_name = "%.7f" % (lr)
    logdir = os.path.join(EXPERIMENT_CONFIG['RESULTS_LOG'], config_name)

    print("----")
    print("Running", config_name)

    # A fresh copy of the model for the current config
    bert_keras_converted_cloned = tf.keras.models.clone_model(bert_keras_converted)

    # Always start with the pretrained weights
    bert_keras_converted_cloned.set_weights(bert_keras_converted.get_weights())

    # Optimizer (a new one per run, so no optimizer state leaks between learning rates)
    adamw_optimizer, lr_schedule = transformers.create_optimizer(
        init_lr=lr,
        num_train_steps=total_training_steps,
        num_warmup_steps=total_warmup_steps,
        weight_decay_rate=0.01,
    )

    bert_keras_converted_cloned.compile(
        optimizer=adamw_optimizer,
        loss=utils.MaskedLMCrossEntropy(),
    )

    # NOTE: the former `worker=20` argument was dropped: fit() has no `worker`
    # keyword (it raises a TypeError), and input parallelism is already handled
    # by the tf.data pipeline.
    bert_keras_converted_cloned.fit(
        train_client_data_all_merged,
        epochs=EXPERIMENT_CONFIG['CENTRALIZED_EPOCHS'],
        validation_data=test_client_data_all_merged,
        validation_freq=EXPERIMENT_CONFIG['VALIDATION_FREQUENCY'],
        callbacks=[
            tf.keras.callbacks.TensorBoard(logdir),
        ],
    )

    # Final evaluation on the test set after the last epoch
    bert_keras_converted_cloned.evaluate(test_client_data_all_merged)

    # Save the trained model for the current configuration
    bert_keras_converted_cloned.save(
        os.path.join(EXPERIMENT_CONFIG['RESULTS_MODEL'], config_name))

    print()