# Comparing TensorFlow (original) and PyTorch models

You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.

To run this notebook, follow these instructions:
- make sure that your Python environment has both TensorFlow and PyTorch installed,
- download the original TensorFlow implementation,
- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,
- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.

If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code.

In [1]:
import os
os.chdir('../')

## 1/ TensorFlow code

In [28]:
original_tf_inplem_dir = "./tensorflow_code/"
model_dir = "../google_models/uncased_L-12_H-768_A-12/"

vocab_file = model_dir + "vocab.txt"
bert_config_file = model_dir + "bert_config.json"
init_checkpoint = model_dir + "bert_model.ckpt"

input_file = "./samples/input.txt"
max_seq_length = 128
max_predictions_per_seq = 20

masked_lm_positions = [6]

In [7]:
import importlib.util
import sys
# import tensorflow as tf
import pytorch_pretrained_bert as ppb

# def del_all_flags(FLAGS):
#     flags_dict = FLAGS._flags()    
#     keys_list = [keys for keys in flags_dict]    
#     for keys in keys_list:
#         FLAGS.__delattr__(keys)

# del_all_flags(tf.flags.FLAGS)
# import tensorflow_code.extract_features as ef
# del_all_flags(tf.flags.FLAGS)
# import tensorflow_code.modeling as tfm
# del_all_flags(tf.flags.FLAGS)
# import tensorflow_code.tokenization as tft
# del_all_flags(tf.flags.FLAGS)
# import tensorflow_code.run_pretraining as rp
# del_all_flags(tf.flags.FLAGS)
# import tensorflow_code.create_pretraining_data as cpp

In [30]:
import re
class InputExample(object):
    """A single instance example."""

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels
        self.is_random_next = is_random_next
    def __repr__(self):
        return '\n'.join(k + ":" + str(v) for k, v in self.__dict__.items())


def read_examples(input_file, tokenizer, masked_lm_positions):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = reader.readline()
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            tokens_a = tokenizer.tokenize(text_a)
            tokens_b = None
            if text_b:
                tokens_b = tokenizer.tokenize(text_b)
            tokens = tokens_a + tokens_b
            masked_lm_labels = []
            for m_pos in masked_lm_positions:
                masked_lm_labels.append(tokens[m_pos])
                tokens[m_pos] = '[MASK]'
            examples.append(
                InputExample(
                    tokens = tokens,
                    segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b),
                    masked_lm_positions = masked_lm_positions,
                    masked_lm_labels = masked_lm_labels,
                    is_random_next = False))
            unique_id += 1
    return examples

In [29]:
# bert_config = tfm.BertConfig.from_json_file(bert_config_file)
tokenizer = ppb.BertTokenizer(
    vocab_file=vocab_file, do_lower_case=True)
examples = read_examples(input_file, tokenizer, masked_lm_positions=masked_lm_positions)

print(examples[0])

ValueError: Can't find a vocabulary file at path '../google_models/uncased_L-12_H-768_A-12/vocab.txt'. To load the vocabulary from a Google pretrained model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`

In [13]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, masked_lm_positions,
                 masked_lm_ids, masked_lm_weights, next_sentence_label):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_ids = masked_lm_ids
        self.masked_lm_weights = masked_lm_weights
        self.next_sentence_labels = next_sentence_label

    def __repr__(self):
        return '\n'.join(k + ":" + str(v) for k, v in self.__dict__.items())

def pretraining_convert_examples_to_features(instances, tokenizer, max_seq_length,
                                 max_predictions_per_seq):
    """Create TF example files from `TrainingInstance`s."""
    features = []
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features.append(
            InputFeatures(input_ids, input_mask, segment_ids,
                          masked_lm_positions, masked_lm_ids,
                          masked_lm_weights, next_sentence_label))

        if inst_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [str(x) for x in instance.tokens]))
            tf.logging.info("features: %s" % str(features[-1]))
    return features

In [26]:
features = pretraining_convert_examples_to_features(
    instances=examples, max_seq_length=max_seq_length, 
    max_predictions_per_seq=max_predictions_per_seq, tokenizer=tokenizer)

NameError: name 'examples' is not defined

In [19]:
# def input_fn_builder(features, seq_length, max_predictions_per_seq, tokenizer):
#     """Creates an `input_fn` closure to be passed to TPUEstimator."""

#     all_input_ids = []
#     all_input_mask = []
#     all_segment_ids = []
#     all_masked_lm_positions = []
#     all_masked_lm_ids = []
#     all_masked_lm_weights = []
#     all_next_sentence_labels = []

#     for feature in features:
#         all_input_ids.append(feature.input_ids)
#         all_input_mask.append(feature.input_mask)
#         all_segment_ids.append(feature.segment_ids)
#         all_masked_lm_positions.append(feature.masked_lm_positions)
#         all_masked_lm_ids.append(feature.masked_lm_ids)
#         all_masked_lm_weights.append(feature.masked_lm_weights)
#         all_next_sentence_labels.append(feature.next_sentence_labels)

#     def input_fn(params):
#         """The actual input function."""
#         batch_size = params["batch_size"]

#         num_examples = len(features)

#         # This is for demo purposes and does NOT scale to large data sets. We do
#         # not use Dataset.from_generator() because that uses tf.py_func which is
#         # not TPU compatible. The right way to load data is with TFRecordReader.
#         d = tf.data.Dataset.from_tensor_slices({
#             "input_ids":
#                 tf.constant(
#                     all_input_ids, shape=[num_examples, seq_length],
#                     dtype=tf.int32),
#             "input_mask":
#                 tf.constant(
#                     all_input_mask,
#                     shape=[num_examples, seq_length],
#                     dtype=tf.int32),
#             "segment_ids":
#                 tf.constant(
#                     all_segment_ids,
#                     shape=[num_examples, seq_length],
#                     dtype=tf.int32),
#             "masked_lm_positions":
#                 tf.constant(
#                     all_masked_lm_positions,
#                     shape=[num_examples, max_predictions_per_seq],
#                     dtype=tf.int32),
#         "masked_lm_ids":
#                 tf.constant(
#                     all_masked_lm_ids,
#                     shape=[num_examples, max_predictions_per_seq],
#                     dtype=tf.int32),
#         "masked_lm_weights":
#                 tf.constant(
#                     all_masked_lm_weights,
#                     shape=[num_examples, max_predictions_per_seq],
#                     dtype=tf.float32),
#         "next_sentence_labels":
#                 tf.constant(
#                     all_next_sentence_labels,
#                     shape=[num_examples, 1],
#                     dtype=tf.int32),
#         })

#         d = d.batch(batch_size=batch_size, drop_remainder=False)
#         return d

#     return input_fn


In [18]:
# def model_fn_builder(bert_config, init_checkpoint, learning_rate,
#                      num_train_steps, num_warmup_steps, use_tpu,
#                      use_one_hot_embeddings):
#     """Returns `model_fn` closure for TPUEstimator."""

#     def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
#         """The `model_fn` for TPUEstimator."""

#         tf.logging.info("*** Features ***")
#         for name in sorted(features.keys()):
#             tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

#         input_ids = features["input_ids"]
#         input_mask = features["input_mask"]
#         segment_ids = features["segment_ids"]
#         masked_lm_positions = features["masked_lm_positions"]
#         masked_lm_ids = features["masked_lm_ids"]
#         masked_lm_weights = features["masked_lm_weights"]
#         next_sentence_labels = features["next_sentence_labels"]

#         is_training = (mode == tf.estimator.ModeKeys.TRAIN)

#         model = tfm.BertModel(
#             config=bert_config,
#             is_training=is_training,
#             input_ids=input_ids,
#             input_mask=input_mask,
#             token_type_ids=segment_ids,
#             use_one_hot_embeddings=use_one_hot_embeddings)

#         (masked_lm_loss,
#          masked_lm_example_loss, masked_lm_log_probs) = rp.get_masked_lm_output(
#             bert_config, model.get_sequence_output(), model.get_embedding_table(),
#             masked_lm_positions, masked_lm_ids, masked_lm_weights)

#         (next_sentence_loss, next_sentence_example_loss,
#          next_sentence_log_probs) = rp.get_next_sentence_output(
#             bert_config, model.get_pooled_output(), next_sentence_labels)

#         total_loss = masked_lm_loss + next_sentence_loss

#         tvars = tf.trainable_variables()

#         initialized_variable_names = {}
#         scaffold_fn = None
#         if init_checkpoint:
#             (assignment_map,
#              initialized_variable_names) = tfm.get_assigment_map_from_checkpoint(
#                 tvars, init_checkpoint)
#             if use_tpu:

#                 def tpu_scaffold():
#                     tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
#                     return tf.train.Scaffold()

#                 scaffold_fn = tpu_scaffold
#             else:
#                 tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

#         tf.logging.info("**** Trainable Variables ****")
#         for var in tvars:
#             init_string = ""
#             if var.name in initialized_variable_names:
#                 init_string = ", *INIT_FROM_CKPT*"
#             tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
#                             init_string)

#         output_spec = None
#         if mode == tf.estimator.ModeKeys.TRAIN:
#             masked_lm_positions = features["masked_lm_positions"]
#             masked_lm_ids = features["masked_lm_ids"]
#             masked_lm_weights = features["masked_lm_weights"]
#             next_sentence_labels = features["next_sentence_labels"]
#             train_op = optimization.create_optimizer(
#                 total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

#             output_spec = tf.contrib.tpu.TPUEstimatorSpec(
#                 mode=mode,
#                 loss=total_loss,
#                 train_op=train_op,
#                 scaffold_fn=scaffold_fn)
#         elif mode == tf.estimator.ModeKeys.EVAL:
#             masked_lm_positions = features["masked_lm_positions"]
#             masked_lm_ids = features["masked_lm_ids"]
#             masked_lm_weights = features["masked_lm_weights"]
#             next_sentence_labels = features["next_sentence_labels"]

#             def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
#                           masked_lm_weights, next_sentence_example_loss,
#                           next_sentence_log_probs, next_sentence_labels):
#                 """Computes the loss and accuracy of the model."""
#                 masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
#                                                  [-1, masked_lm_log_probs.shape[-1]])
#                 masked_lm_predictions = tf.argmax(
#                     masked_lm_log_probs, axis=-1, output_type=tf.int32)
#                 masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
#                 masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
#                 masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
#                 masked_lm_accuracy = tf.metrics.accuracy(
#                     labels=masked_lm_ids,
#                     predictions=masked_lm_predictions,
#                     weights=masked_lm_weights)
#                 masked_lm_mean_loss = tf.metrics.mean(
#                     values=masked_lm_example_loss, weights=masked_lm_weights)

#                 next_sentence_log_probs = tf.reshape(
#                     next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
#                 next_sentence_predictions = tf.argmax(
#                     next_sentence_log_probs, axis=-1, output_type=tf.int32)
#                 next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
#                 next_sentence_accuracy = tf.metrics.accuracy(
#                     labels=next_sentence_labels, predictions=next_sentence_predictions)
#                 next_sentence_mean_loss = tf.metrics.mean(
#                     values=next_sentence_example_loss)

#                 return {
#                     "masked_lm_accuracy": masked_lm_accuracy,
#                     "masked_lm_loss": masked_lm_mean_loss,
#                     "next_sentence_accuracy": next_sentence_accuracy,
#                     "next_sentence_loss": next_sentence_mean_loss,
#                 }

#             eval_metrics = (metric_fn, [
#                 masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
#                 masked_lm_weights, next_sentence_example_loss,
#                 next_sentence_log_probs, next_sentence_labels
#             ])
#             output_spec = tf.contrib.tpu.TPUEstimatorSpec(
#                 mode=mode,
#                 loss=total_loss,
#                 eval_metrics=eval_metrics,
#                 scaffold_fn=scaffold_fn)
#         elif mode == tf.estimator.ModeKeys.PREDICT:
#             masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
#                                                 [-1, masked_lm_log_probs.shape[-1]])
#             masked_lm_predictions = tf.argmax(
#                 masked_lm_log_probs, axis=-1, output_type=tf.int32)

#             next_sentence_log_probs = tf.reshape(
#                 next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
#             next_sentence_predictions = tf.argmax(
#                 next_sentence_log_probs, axis=-1, output_type=tf.int32)

#             masked_lm_predictions = tf.reshape(masked_lm_predictions,
#                                                 [1, masked_lm_positions.shape[-1]])
#             next_sentence_predictions = tf.reshape(next_sentence_predictions,
#                                                 [1, 1])

#             predictions = {
#                 "masked_lm_predictions": masked_lm_predictions,
#                 "next_sentence_predictions": next_sentence_predictions
#             }

#             output_spec = tf.contrib.tpu.TPUEstimatorSpec(
#                 mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
#             return output_spec
#         else:
#             raise ValueError("Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode))

#         return output_spec

#     return model_fn

In [15]:
# is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
# run_config = tf.contrib.tpu.RunConfig(
#     master=None,
#     tpu_config=tf.contrib.tpu.TPUConfig(
#         num_shards=1,
#         per_host_input_for_training=is_per_host))

# model_fn = model_fn_builder(
#     bert_config=bert_config,
#     init_checkpoint=init_checkpoint,
#     learning_rate=0,
#     num_train_steps=1,
#     num_warmup_steps=1,
#     use_tpu=False,
#     use_one_hot_embeddings=False)

# # If TPU is not available, this will fall back to normal Estimator on CPU
# # or GPU.
# estimator = tf.contrib.tpu.TPUEstimator(
#     use_tpu=False,
#     model_fn=model_fn,
#     config=run_config,
#     predict_batch_size=1)

# input_fn = input_fn_builder(
#     features=features, seq_length=max_seq_length, max_predictions_per_seq=max_predictions_per_seq,
# tokenizer=tokenizer)

In [14]:
# tensorflow_all_out = []
# for result in estimator.predict(input_fn, yield_single_examples=True):
#     tensorflow_all_out.append(result)

In [16]:
# print(len(tensorflow_all_out))
# print(len(tensorflow_all_out[0]))
# print(tensorflow_all_out[0].keys())
# print("masked_lm_predictions", tensorflow_all_out[0]['masked_lm_predictions'])
# print("predicted token", tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions']))

In [17]:
# tensorflow_outputs = tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions'])[:len(masked_lm_positions)]
# print("tensorflow_output:", tensorflow_outputs)

## 2/ PyTorch code

In [2]:
from examples import extract_features
from examples.extract_features import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
init_checkpoint_pt = "../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin"

In [9]:
device = torch.device("cuda")
model = ppb.BertForPreTraining.from_pretrained('bert-base-uncased')
model.to(device)

01/09/2019 01:01:12 - INFO - pytorch_pretrained_bert.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz not found in cache, downloading to /tmp/tmp5yzsxut4
100%|██████████| 407873900/407873900 [00:09<00:00, 44944052.42B/s]
01/09/2019 01:01:21 - INFO - pytorch_pretrained_bert.file_utils -   copying /tmp/tmp5yzsxut4 to cache at /home/raghu/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/09/2019 01:01:22 - INFO - pytorch_pretrained_bert.file_utils -   creating metadata file for /home/raghu/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/09/2019 01:01:22 - INFO - pytorch_pretrained_bert.file_utils -   removing temp file /tmp/tmp5yzsxut4
01/09/2019 01:01:22 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
    

In [21]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_positions)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

model.eval()

NameError: name 'features' is not defined

In [25]:
import numpy as np
pytorch_all_out = []
for input_ids, input_mask, segment_ids, tensor_masked_lm_positions in eval_dataloader:
    print(input_ids)
    print(input_mask)
    print(segment_ids)
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    prediction_scores, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
    prediction_scores = prediction_scores[0, tensor_masked_lm_positions].detach().cpu().numpy()
    print(prediction_scores.shape)
    masked_lm_predictions = np.argmax(prediction_scores, axis=-1).squeeze().tolist()
    print(masked_lm_predictions)
    pytorch_all_out.append(masked_lm_predictions)

NameError: name 'eval_dataloader' is not defined

In [24]:
pytorch_outputs = tokenizer.convert_ids_to_tokens(pytorch_all_out[0])[:len(masked_lm_positions)]
print("pytorch_output:", pytorch_outputs)
# print("tensorflow_output:", tensorflow_outputs)

NameError: name 'tokenizer' is not defined