This script takes a pickle input file of parallel uncompressed and compressed sentences and:

 - tokenizes the raw text using BERT's wordpiece tokenizer
 - adds the relevant input id for each wordpiece
 - derives the compression labels for ingestion by the model
 - outputs a single tensorflow records file with each compression pair wrapped in a tensorflow features object.
 
 
 Set input, output and vocab file paths below. The vocab file is the vocab used by BERT for wordpiece to ID lookup, which can be downloaded as part of the BERT base model: <https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip>.
 
The tokenization module can be downloaded from: <https://github.com/google-research/bert>

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import csv
import os
import pickle
import tokenization
import tensorflow as tf

# Defaults for the BERT wordpiece tokenizer. Both names are assigned again in
# the "Set write parameters" cell further down, and those later values are
# the ones in effect when the tokenizer is constructed.
VOCAB_FILE = "" # Path to vocab file
DO_LOWER_CASE = True

In [None]:
# Function to read the inputs

def _read_leg_compressions(filename):
    """Load parallel full and compressed sentences from a pickle file.

    The unpickled object is iterated for its keys, and every ``data[key]``
    must provide ``'full_text'`` and ``'compressed_text'`` entries.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    files from a trusted source.

    Args:
      filename: path to the pickle file.

    Returns:
      A ``(full_sents, compressed)`` tuple of two lists in iteration order.
    """
    with open(filename, 'rb') as handle:
        records = pickle.load(handle)

    full_sents, compressed = [], []
    for key in records:
        entry = records[key]
        full_sents.append(entry['full_text'])
        compressed.append(entry['compressed_text'])
    return full_sents, compressed

In [None]:
# Get labels. 2 => token is retained. 3 => token is deleted.

def get_labels(token_sentences, token_compressions):
    """Derive per-token keep/delete labels for each sentence.

    Each sentence is scanned left to right while a cursor walks through its
    compression: a token equal to the compression token under the cursor is
    labelled 2 (retained) and advances the cursor; every other token is
    labelled 3 (deleted).

    Args:
      token_sentences: list of token lists for the full sentences.
      token_compressions: list of token lists for the compressions, indexed
        in parallel with `token_sentences`.

    Returns:
      A list with one entry per sentence: the list of labels, or
      ``["bad_parse"]`` (after printing "bad parse") when no token of the
      sentence was retained.
    """
    labels_data = []

    for idx, sentence_tokens in enumerate(token_sentences):
        target = token_compressions[idx]
        matched = 0
        labels = []
        for tok in sentence_tokens:
            if matched < len(target) and tok == target[matched]:
                matched += 1
                labels.append(2)
            else:
                labels.append(3)

        # One label is emitted per token, so the only failure that can be
        # detected here is a sentence with no retained token at all.
        if 2 in labels:
            labels_data.append(labels)
        else:
            print("bad parse")
            labels_data.append(["bad_parse"])
    return labels_data

In [None]:
# Adapted from run_classifier.py released with BERT (https://github.com/google-research/bert)
# Function to convert a single input example into a features object

def convert_single_example(ex_index, example, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`.

    Label ids written by this function: 1 for [CLS], 4 for [SEP], the
    example's own labels for the tokens of `text_a`, and 0 for padding.

    Args:
      ex_index: position of the example in its list; the first five examples
        are logged in full.
      example: an `InputExample`, or a `PaddingInputExample` placeholder.
      max_seq_length: fixed length of every returned id/mask/label list.
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
      An `InputFeatures` whose `input_ids`, `input_mask`, `segment_ids` and
      `label_ids` all have length `max_seq_length`, and whose `r_id` is a
      one-element list.
    """

    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_ids=[0] * max_seq_length,
            # The TFRecord writer in this file serialises `r_id` through
            # `list(...)`, so leaving it as `None` would raise there. Use a
            # zero placeholder like the other padding fields.
            r_id=[0],
            is_real_example=False)

    labels_a = example.label_ids
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    r_id = example.r_id
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        # Keep the labels aligned with the (possibly shortened) tokens_a.
        labels_a = labels_a[0:len(tokens_a)]
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
            labels_a = labels_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    label_ids = [1] + list(labels_a) + [4]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens.extend(tokens_b)
        tokens.append("[SEP]")
        segment_ids.extend([1] * (len(tokens_b) + 1))
        # Previously nothing was appended to `label_ids` for the second
        # segment, so it ended up shorter than `input_ids` and the length
        # assertion below always failed for sequence pairs.
        # NOTE(review): segment-B tokens carry no compression label, so they
        # get 0 (the value used for padding positions) and the closing [SEP]
        # gets 4 like the first one -- confirm this matches the model.
        label_ids.extend([0] * len(tokens_b))
        label_ids.append(4)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length (a non-positive count adds nothing).
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    label_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        r_id=[r_id],
        is_real_example=True)
    return feature

In [None]:
# Adapted from run_classifier.py released with BERT (https://github.com/google-research/bert)
# InputExample object.

class InputExample(object):
    """A single raw example: untokenized text plus its labels."""

    def __init__(self, guid, text_a, text_b=None, label_ids=None, r_id=None):
        """Constructs an InputExample; all arguments are stored unchanged.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For
            single sequence tasks, only this sequence must be specified.
          text_b: (Optional) string. The untokenized text of the second
            sequence. Only must be specified for sequence pair tasks.
          label_ids: (Optional) labels for the example; elsewhere in this
            file a list with one label per wordpiece of `text_a` is passed.
          r_id: (Optional) record id; elsewhere in this file the index of
            the example in the input data is passed.
        """
        # Identification.
        self.guid = guid
        self.r_id = r_id
        # Text and labels.
        self.text_a = text_a
        self.text_b = text_b
        self.label_ids = label_ids

In [None]:
# Adapted from run_classifier.py released with BERT (https://github.com/google-research/bert)
# InputFeatures object.

class InputFeatures(object):
    """Model-ready features for one example; a plain attribute container."""

    def __init__(self,
                 input_ids,
                 input_mask,
                 segment_ids,
                 label_ids,
                 r_id=None,
                 is_real_example=True):
        """Stores every argument unchanged on the instance.

        Args:
          input_ids: vocabulary ids of the tokens.
          input_mask: mask aligned with `input_ids`.
          segment_ids: segment ids aligned with `input_ids`.
          label_ids: label ids aligned with `input_ids`.
          r_id: (Optional) record id of the source example.
          is_real_example: False for padding placeholders, True otherwise.
        """
        # Per-position sequences.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_ids = segment_ids, label_ids
        # Per-example metadata.
        self.r_id = r_id
        self.is_real_example = is_real_example

In [None]:
# Adapted from run_classifier.py released with BERT (https://github.com/google-research/bert)

class PaddingInputExample(object):
    """Placeholder example used to pad the input to a whole number of batches.

    Eval/predict on a TPU needs a fixed batch size, so the number of examples
    has to be a multiple of it. Dropping the last partial batch instead would
    mean the output is not generated for every input.

    A dedicated class is used rather than `None` because treating `None` as
    a padding batch could cause silent errors.
    """

In [None]:
# Adapted from run_classifier.py released with BERT (https://github.com/google-research/bert)

def file_based_convert_examples_to_features(
    examples, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file.

    Each example is converted with `convert_single_example` and written as
    one serialized `tf.train.Example` holding int64 features named
    `input_ids`, `input_mask`, `segment_ids`, `label_ids`, `r_id` and
    `is_real_example`.

    Args:
      examples: sized sequence of examples to convert.
      max_seq_length: sequence length passed on to `convert_single_example`.
      tokenizer: tokenizer passed on to `convert_single_example`.
      output_file: path of the TFRecord file to write.
    """

    def create_int_feature(values):
        # Defined once here rather than being rebuilt on every iteration.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    writer = tf.python_io.TFRecordWriter(output_file)
    try:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

            feature = convert_single_example(ex_index, example,
                                             max_seq_length, tokenizer)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            features["label_ids"] = create_int_feature(feature.label_ids)
            features["r_id"] = create_int_feature(feature.r_id)
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])

            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
    finally:
        # Close the writer even if a conversion or write raises, so the file
        # handle is not leaked and already-written records are flushed.
        writer.close()

In [None]:
# Get examples list for TFR writing

def get_examples_list(input_file, tokenizer):
    """Build the list of `InputExample`s to be written as TF records.

    Reads the sentence/compression pairs from `input_file`, tokenizes both
    sides, derives the labels with `get_labels`, and creates one example per
    sentence whose labels are not the ``["bad_parse"]`` marker.

    Args:
      input_file: path to the pickle file of sentence/compression pairs.
      tokenizer: tokenizer providing `tokenize`.

    Returns:
      A list of `InputExample`s with guid ``"train-<index>"`` and `r_id` set
      to the index of the sentence in the input.
    """
    sentences, compressions = _read_leg_compressions(input_file)

    token_sentences, token_compressions = [], []
    for idx, sentence in enumerate(sentences):
        token_sentences.append(tokenizer.tokenize(sentence))
        token_compressions.append(tokenizer.tokenize(compressions[idx]))

    labels_data = get_labels(token_sentences, token_compressions)

    examples_list = []
    for idx, sentence in enumerate(sentences):
        label_ids = labels_data[idx]
        # Sentences that could not be labelled are left out of the output.
        if label_ids == ["bad_parse"]:
            continue
        examples_list.append(InputExample(
            guid="train-%d" % (idx),
            text_a=tokenization.convert_to_unicode(sentence),
            label_ids=label_ids,
            r_id=idx))

    return examples_list

In [None]:
# Set write parameters
# All paths are empty placeholders and must be filled in before the cells
# below are run. VOCAB_FILE and DO_LOWER_CASE replace the values assigned
# near the top of the file.

input_file = "" # path to input file
max_seq_length = 128 # maximum sequence length
VOCAB_FILE = "" # BERT vocab file
DO_LOWER_CASE = True # Whether to lowercase or not
output_file = "" # path to output file
# Wordpiece tokenizer shared by example creation and TFRecord conversion.
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

In [None]:
# Read the input file and build the list of labelled examples.
examples_list = get_examples_list(input_file, tokenizer)

In [None]:
# Use the file_based_convert_examples_to_features function to convert the
# examples to TF records and write them to output_file.

file_based_convert_examples_to_features(examples_list, 
                                        max_seq_length, 
                                        tokenizer, 
                                        output_file)