# Setup

In [1]:
import numpy as np
import pandas as pd
import collections
import os


import tensorflow as tf
from datetime import datetime

from bert import tokenization
from bert import modeling
from bert import optimization

In [2]:
print(tf.__version__)

2.3.1


In [4]:
bert_vocab = './BERT Files/vocab.txt'
bert_init_checkpnt = './BERT Files/bert_model.ckpt'
bert_config_file = './BERT Files/bert_config.json'

In [5]:
train_data_path = '../data/train[1].csv'
train_data = pd.read_csv(train_data_path)

In [6]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
label_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

## Setup Tokenizer

In [8]:
tokenization.validate_case_matches_checkpoint(True, bert_init_checkpnt)
tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab, do_lower_case=True)

## Input Example and Feature Classes

In [9]:
class InputExample(object):
    """
    Single training/test example
    """
    
    def __init__(self, guid, text_a, text_b=None, labels=None):
        """
        Constructs Object of InputExample

        Args:
            guid (string): Unique id
            text_a (string): Untokenized text of the first sequence
            text_b (string, optional): Untokenized text of the second sequence
            labels (string, optional)
        """
        
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.labels = labels
        

class PaddingInputExample(object):
    """
    Fake Padding example used to make sure that the number of examples
    match the batch sizes. 
    """
        
        
class InputFeatures(object):
    """
    Single Set of Features of Data
    """
    
    def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True):
        """
        Constructs Object of InputFeatures
        
        Args:
            input_ids (List(string)): Ids of Tokens
            input_mask (List(int)): Mask to determine whether it is a real token or padding
            segment_ids (List(int)): Ids of Segments. 0 is tokens_a, 1 is tokens_b (sequence pair)
            label_ids (List(int)): Ids of labels
            is_real_example (boolean): Is real example or padding
        """
        
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.is_real_example = is_real_example

## Feature Builders 

In [10]:
def convert_single_example(example,  max_seq_length, tokenizer):
    """
    Converts a 'InputExample' into 'InputFeatures'
    """
    
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """
        Truncates a sequence pair so that it'll have max_length
        """
        
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
    
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)
    
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]
    
    # Sandwich token_a with [CLS] and [SEP] and append to tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]']
    segment_ids = [0] * len(tokens)

    # If token_b, then add to token  and append [SEP]
    if tokens_b:
        tokens += tokens_b + ['[SEP]']
        segment_ids += [1] * (len(tokens_b) + 1)
    
    # Get Input Ids based on Token
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 for Real tokens, 0 for Padding
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    
    # Add Label Ids
    labels_ids = []
    for label in example.labels:
        labels_ids.append(int(label))

    feature = InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_ids=labels_ids)
    
    return feature

In [11]:
def file_based_convert_examples_to_features(examples, max_seq_length, tokenizer, output_file):
    """
    Convert a list of `InputExample`s to a TFRecord file.
    """
    
    def create_int_feature(values):
        """
        Converts feature into Int64List
        """
        f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        return f
    
    writer = tf.io.TFRecordWriter(output_file)

    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.compat.v1.logging.info(f"Writing example {ex_index} of {len(examples)}")

        feature = convert_single_example(example, max_seq_length, tokenizer)
        
        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(feature.input_ids)
        features['input_mask'] = create_int_feature(feature.input_mask)
        features['segment_ids'] = create_int_feature(feature.segment_ids)
        features['label_ids'] = create_int_feature(feature.label_ids)
        features['is_real_example'] = create_int_feature([int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
        
    writer.close()

In [12]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """
    Convert a list of `InputExample`s to a list of `InputFeatures`.
    """

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.compat.v1.logging.info(f"Writing example {ex_index} of {len(examples)}")

        feature = convert_single_example(example, max_seq_length, tokenizer)
        features.append(feature)
        
    return features

## Input_fn Builders

In [13]:
def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
    """
    Creates an `input_fn` closure to be passed to TPUEstimator using a TFRecord file.
    """

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([6], tf.int64),
        "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """
        Decodes a record to a TensorFlow example.
        """
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t

        return example

    def input_fn(params):
        """
        The actual input function.
        """
        batch_size = params["batch_size"]

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d

    return input_fn

In [14]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
    """
    Creates an `input_fn` closure to be passed to TPUEstimator using a list of features.
    """

    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_ids)

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        num_examples = len(features)

        d = tf.data.Dataset.from_tensor_slices({
            "input_ids":
                tf.constant(
                    all_input_ids, shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "input_mask":
                tf.constant(
                    all_input_mask,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "segment_ids":
                tf.constant(
                    all_segment_ids,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "label_ids":
                tf.constant(all_label_ids, shape=[num_examples, len(LABEL_COLUMNS)], dtype=tf.int32),
        })

        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
        return d

    return input_fn

## Model_fn Builders

In [15]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 
                 labels, num_labels, use_one_hot_embeddings):
    """
    Creates the multi-label clsasification model.
    """
    
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value
    
    # Creates a variable "output_weights" in the tensorflow graph
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    
    # Creates a variable "output_bias" in the tensorflow graph
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())
    
    # Create Variable 'loss'
    with tf.variable_scope("loss"):
        if is_training:
            # 0.1 dropout Layer
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        
        # Multiplies output layer by output weights
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        
        # Adds bias
        logits = tf.nn.bias_add(logits, output_bias)
        
        # Sigmoid used since multilabel. Softmax will make it so then
        # all probabilities of each label add to 1
        probabilities = tf.nn.sigmoid(logits)
        
        labels = tf.cast(labels, tf.float32)
        tf.compat.v1.logging.info(f"num_labels:{num_labels};logits:{logits};labels:{labels}")
        
        # Sigmoid cross entropy used for multilabel
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        return loss, per_example_loss, logits, probabilities
        

In [16]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """
    Returns `model_fn` closure for TPUEstimator.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """
        The `model_fn` for TPUEstimator.
        """
        
        tf.compat.v1.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.compat.v1.logging.info(f"  name = {name}, shape = {features[name].shape}")
        
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_real_example = None
        if "is_real_example" in features:
             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        
        # Create Model
        total_loss, per_example_loss, logits, probabilities = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        # If loading from checkpoint
        if init_checkpoint:
            assignment_map, initialized_variable_names = \
            modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            
            if use_tpu:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.compat.v1.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.compat.v1.logging.info(f"  name = {var.name}, shape = {var.shape}{init_string}")
        
        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, probabilities, is_real_example):
                """
                Metric Function - AUC of Every Class
                """
                logits_split = tf.split(probabilities, num_labels, axis=-1)
                label_ids_split = tf.split(label_ids, num_labels, axis=-1)
                eval_dict = {}
                for j, logits in enumerate(logits_split):
                    label_id_ = tf.cast(label_ids_split[j], dtype=tf.int32)
                    current_auc, update_op_auc = tf.metrics.auc(label_id_, logits)
                    eval_dict["eval_AUC_" + str(j)] = (current_auc, update_op_auc)
                eval_dict['eval_loss'] = tf.metrics.mean(values=per_example_loss)
                return eval_dict

            eval_metrics = metric_fn(per_example_loss, label_ids, probabilities, is_real_example)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics,
                scaffold=scaffold_fn)
        else:
            print("mode:", mode,"probabilities:", probabilities)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold=scaffold_fn)
        return output_spec

    return model_fn

# Data Preprocessing

## Process Training Data

## Data Configurations

In [17]:
MAX_SEQ_LENGTH = 128

# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_SUMMARY_STEPS = 500
SAVE_CHECKPOINTS_STEPS = 1000

### Create Examples

In [19]:
def create_examples(df, labels_available=True):
    """
    Create examples for the training and test sets
    """
    examples = []
    for i, row in enumerate(df.values):
        guid = row[0]
        text_a = row[1]
        if labels_available:
            labels=row[2:]
        else:
            labels = [0] * 6
        
        examples.append(InputExample(guid=guid, text_a=text_a, labels=labels))
    
    return examples
    

In [20]:
train_examples = create_examples(train_data)

In [21]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_examples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

### Convert to TFRecord

In [22]:
train_file = os.path.join('./BERT Files/working', 'train.tf_record')
if not os.path.exists(train_file):
    open(train_file, 'w').close()

In [23]:
file_based_convert_examples_to_features(
            train_examples, MAX_SEQ_LENGTH, tokenizer, train_file)
tf.compat.v1.logging.info("***** Running training *****")
tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
tf.compat.v1.logging.info("  Batch size = %d", BATCH_SIZE)
tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)

INFO:tensorflow:Writing example 0 of 159571


KeyboardInterrupt: 

### Convert to Input_fn

In [24]:
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

# Train Model

## Setup

In [25]:
output_dir = './BERT Files/working/output'

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=output_dir,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    keep_checkpoint_max=1,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

## Train Model

In [26]:
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= len(label_columns),
  init_checkpoint=bert_init_checkpnt,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': './BERT Files/working/output', '_tf_random_seed': None, '_save_summary_steps': 500, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [27]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


AttributeError: module 'tensorflow' has no attribute 'contrib'

## Evaluate Model

In [None]:
eval_file = os.path.join('./BERT Files/working', "eval.tf_record")
if not os.path.exists(eval_file):
    open(eval_file, 'w').close()

eval_examples = create_examples(x_val)
file_based_convert_examples_to_features(
    eval_examples, MAX_SEQ_LENGTH, tokenizer, eval_file)

In [None]:
# This tells the estimator to run through the entire validation set.
eval_steps = None

eval_drop_remainder = False
eval_input_fn = file_based_input_fn_builder(
    input_file=eval_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

In [None]:
output_eval_file = os.path.join("./Bert Files/working", "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
    tf.compat.v1.logging.info("***** Eval results *****")
    for key in sorted(result.keys()):
        tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

## Make Predictions

In [None]:
# x_test = 
# test_examples = create_examples(x_test, False)

In [None]:
# test_features = convert_examples_to_features(predict_examples, MAX_SEQ_LENGTH, tokenizer)

In [None]:
# print('Beginning Predictions!')
# current_time = datetime.now()

# predict_input_fn = input_fn_builder(features=test_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
# predictions = estimator.predict(predict_input_fn)
# print("Prediction took time ", datetime.now() - current_time)

In [None]:
# def create_output(predictions):
#     probabilities = []
#     for (i, prediction) in enumerate(predictions):
#         preds = prediction["probabilities"]
#         probabilities.append(preds)
#     df = pd.DataFrame(probabilities)
#     df.columns = label_columns
    
#     return df

In [None]:
# output_df = create_output(predictions)
# merged_df =  pd.concat([x_test, output_df], axis=1)