Keras Bert

A simple Keras wrapper around TF-Hub BERT models, used here to fine-tune BERT for multi-class classification of text abstracts.

In [1]:
# Download the reference BERT helper modules (modeling, optimization,
# run_classifier, tokenization) into the working directory so the next cell
# can import them as plain Python modules.
# NOTE(review): the URLs point at the `master` branch rather than a pinned
# commit, so the downloaded code may differ between runs.
!wget -q https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/optimization.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/tokenization.py 

In [2]:
import os
import numpy as np
import pandas as pd
import datetime
import sys
import zipfile
import modeling
import optimization
import run_classifier
import tokenization

from tokenization import FullTokenizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split

import tensorflow_hub as hub
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

W0418 04:23:57.447848 140140242679168 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
# TF1-style session. It is passed to `initialize_vars` further down, which
# runs the initializers in it and registers it as the Keras backend session.
sess = tf.Session()

# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Used twice below: as the per-text word cap during text preparation and as
# the fixed token length of every model input.
max_seq_length = 128

# Load Data

In [None]:
# NOTE(review): this cell has no execution count, and `train_df`, `val_df`
# and `test_df` are not referenced anywhere else in the visible notebook; the
# data actually used is loaded from abstract_clusters.csv in the next cell.
train_df = pd.read_csv('../input/train.csv', index_col='id')
val_df = pd.read_csv('../input/valid.csv', index_col='id')
test_df = pd.read_csv('../input/test.csv', index_col='id')

In [4]:
# Load the labelled abstracts; later cells read the 'Abstract' (text) and
# 'Label' (target) columns.
# NOTE(review): error_bad_lines=False presumably makes pandas skip malformed
# rows instead of raising - confirm that silently dropping rows is acceptable.
df = pd.read_csv('../input/abstract-clusters/abstract_clusters.csv',encoding='utf-8',error_bad_lines=False,engine='python')

In [5]:
from sklearn.utils import shuffle
# Seed the shuffle: the stratified splits below use random_state=0, but they
# are only reproducible if the row order they start from is reproducible too.
df = shuffle(df, random_state=0)

In [6]:
# Fit the encoder on the raw labels. `label_encoder.classes_` is used later
# to size the output layer and to decode predicted class indices.
label_encoder = LabelEncoder().fit(df['Label'])

In [7]:
# Abstract texts pulled out of the frame; this is the model's input text.
X =df['Abstract'].values

In [8]:
# Integer class ids for every row, produced by the encoder that was already
# fitted on this same column.
y = label_encoder.transform(df['Label'])

In [None]:
# NOTE(review): this cell has no execution count. `y_train` / `y_val` are only
# created by the train/validation split cells further down, so running it at
# this position would raise a NameError. The model below is compiled with
# `sparse_categorical_crossentropy` and trained on the integer ids produced by
# the LabelEncoder, so one-hot labels do not appear to be needed - confirm
# before re-enabling.
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

In [9]:
# Keep 80% for training and hold out 20% (split into validation and test in
# the next cell), stratified on the encoded label.
X_train, X_val_test, y_train, y_val_test = train_test_split(
        X,y, test_size=0.2, random_state=0, stratify = y
        )

In [10]:
# Split the hold-out set in half into validation and test parts, again
# stratified on the encoded label.
X_val, X_test, y_val, y_test = train_test_split(
        X_val_test,y_val_test, test_size=0.5, random_state=0, stratify = y_val_test
        )

In [11]:
def _truncate_to_words(texts, max_words):
    """Keep only the first `max_words` whitespace-separated words of each text.

    Args:
        texts: iterable of strings.
        max_words: maximum number of words kept per text.

    Returns:
        An object-dtype NumPy array of shape (len(texts), 1) holding one
        truncated, single-space-joined string per row.
    """
    truncated = [' '.join(t.split()[0:max_words]) for t in texts]
    return np.array(truncated, dtype=object)[:, np.newaxis]


# This word-level cut is only a coarse pre-truncation: the tokenizer step
# later truncates again, on WordPiece tokens, to fit max_seq_length.
train_text = _truncate_to_words(X_train, max_seq_length)
train_label = y_train

val_text = _truncate_to_words(X_val, max_seq_length)
val_label = y_val

test_text = _truncate_to_words(X_test, max_seq_length)

# Bert tf-hub


In [12]:
import tensorflow as tf
import tensorflow_hub as hub
import os
import re
import numpy as np
from tqdm import tqdm_notebook
#from tensorflow.keras import backend as K
from keras import backend as K
from keras.layers import Layer


class BertLayer(Layer):
    '''Keras layer wrapping a TF-Hub BERT module.

    The layer is called on a list of three tensors,
    ``[input_ids, input_mask, segment_ids]``, and returns one of the
    following representations, selected with ``output_representation``:

    pooled_output: the module's "pooled_output" tensor, shape [batch_size, 768].
    sequence_output: the per-token output, shape [batch_size, max_length, 768].
    mean_pooling: mask-weighted mean of the per-token output over the
        sequence axis, shape [batch_size, 768].

    Args:
        n_fine_tune_layers: how many entries are taken from the END of the
            module's variable list and marked trainable. Note that this
            counts variables (kernels, biases, ...), not transformer blocks.
            Values <= 0 leave every BERT variable frozen.
        tf_hub: TF-Hub handle (URL or path) of the BERT module.
        output_representation: one of 'pooled_output', 'sequence_output',
            'mean_pooling'.
        trainable: whether any BERT variables are fine-tuned at all.
        **kwargs: forwarded to the Keras ``Layer`` constructor.

    Raises:
        ValueError: if ``output_representation`` is not a supported value.

    To see which parameters will be trained, inspect
    ``model.trainable_weights`` after building the model.
    '''

    _OUTPUT_REPRESENTATIONS = ('pooled_output', 'sequence_output', 'mean_pooling')

    def __init__(self, n_fine_tune_layers=10, tf_hub = None, output_representation = 'pooled_output', trainable = False, **kwargs):
        # Fail early and clearly; otherwise an unknown value would only show
        # up inside `call` as an UnboundLocalError.
        if output_representation not in self._OUTPUT_REPRESENTATIONS:
            raise ValueError(
                "output_representation must be one of {}, got {!r}".format(
                    self._OUTPUT_REPRESENTATIONS, output_representation
                )
            )

        super(BertLayer, self).__init__(**kwargs)

        self.n_fine_tune_layers = n_fine_tune_layers
        # Attribute name (including its spelling) is kept as-is for
        # backward compatibility.
        self.is_trainble = trainable
        # Hidden size of the module's outputs; matches the H-768 BERT module.
        self.output_size = 768
        self.tf_hub = tf_hub
        self.output_representation = output_representation
        # Assigned after the base initializer, which presumably sets its own
        # default for this flag - confirm against the installed Keras version.
        self.supports_masking = True

    def build(self, input_shape):
        """Load the hub module and register its variables with Keras."""
        self.bert = hub.Module(
            self.tf_hub,
            trainable=self.is_trainble,
            name="{}_module".format(self.name)
        )

        variables = list(self.bert.variable_map.values())
        if self.is_trainble:
            # Drop variables that are not part of the computed output.
            trainable_vars = [var for var in variables if "/cls/" not in var.name]

            if self.output_representation in ("sequence_output", "mean_pooling"):
                # The pooler is not used by the token-level outputs.
                trainable_vars = [var for var in trainable_vars if "/pooler/" not in var.name]

            # Select how many variables (from the end of the list) to fine-tune.
            # `lst[-0:]` would return the WHOLE list, so zero is handled
            # explicitly to mean "fine-tune nothing".
            if self.n_fine_tune_layers > 0:
                trainable_vars = trainable_vars[-self.n_fine_tune_layers:]
            else:
                trainable_vars = []

            # Add to trainable weights
            for var in trainable_vars:
                self._trainable_weights.append(var)

            # Everything else in the module stays frozen.
            for var in self.bert.variables:
                if var not in self._trainable_weights:
                    self._non_trainable_weights.append(var)

        else:
            for var in variables:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]``."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)

        if self.output_representation == "pooled_output":
            return result["pooled_output"]

        if self.output_representation == "sequence_output":
            return result["sequence_output"]

        if self.output_representation == "mean_pooling":
            token_output = result["sequence_output"]
            mask = tf.cast(input_mask, tf.float32)
            # Zero out padded positions, then average over the real tokens;
            # the epsilon guards against division by zero for an empty mask.
            masked_sum = tf.reduce_sum(token_output * tf.expand_dims(mask, axis=-1), axis=1)
            token_count = tf.reduce_sum(mask, axis=1, keepdims=True)
            return masked_sum / (token_count + 1e-10)

        # Only reachable if the attribute was modified after construction.
        raise ValueError(
            "Unsupported output_representation: {!r}".format(self.output_representation)
        )

    def compute_mask(self, inputs, mask=None):
        """Propagate the input mask only when a per-token output is returned."""
        if self.output_representation == 'sequence_output':
            # inputs[1] is the input mask tensor.
            return K.cast(inputs[1], dtype="bool")
        return None

    def compute_output_shape(self, input_shape):
        """Return the output shape for the configured representation."""
        if self.output_representation == "sequence_output":
            return (input_shape[0][0], input_shape[0][1], self.output_size)
        else:
            return (input_shape[0][0], self.output_size)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(BertLayer, self).get_config()
        config.update({
            'n_fine_tune_layers': self.n_fine_tune_layers,
            'tf_hub': self.tf_hub,
            'output_representation': self.output_representation,
            # Report the value that was passed to this class's `trainable`
            # argument, which is what `__init__` expects back.
            'trainable': self.is_trainble,
        })
        return config

Using TensorFlow backend.


In [13]:
import keras

In [14]:
def build_model(max_seq_length, tf_hub, n_classes, n_fine_tune, dropout_rate=0.3, learning_rate=0.0005):
    """Build and compile the BERT + dense-head classifier.

    The network is: three integer inputs -> ``BertLayer`` (mean pooling,
    trainable) -> Dropout -> Dense(256, sigmoid) -> Dropout ->
    Dense(64, sigmoid) -> Dense(n_classes, softmax).

    Args:
        max_seq_length: length of each of the three BERT input sequences.
        tf_hub: TF-Hub handle passed to ``BertLayer``.
        n_classes: number of units in the softmax output layer.
        n_fine_tune: forwarded to ``BertLayer(n_fine_tune_layers=...)``.
        dropout_rate: rate used by both Dropout layers. Defaults to 0.3,
            the value that used to be hard-coded.
        learning_rate: learning rate of the Adam optimizer. Defaults to
            0.0005, the value that used to be hard-coded.

    Returns:
        The compiled Keras model. It is compiled with
        ``sparse_categorical_crossentropy``, so targets are integer class
        ids. As a side effect the model summary is printed.
    """
    in_id = keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(
        n_fine_tune_layers=n_fine_tune,
        tf_hub=tf_hub,
        output_representation='mean_pooling',
        trainable=True,
    )(bert_inputs)

    # Classification head on top of the pooled BERT representation.
    hidden = keras.layers.Dropout(dropout_rate)(bert_output)
    hidden = keras.layers.Dense(256, activation='sigmoid')(hidden)
    hidden = keras.layers.Dropout(dropout_rate)(hidden)
    hidden = keras.layers.Dense(64, activation='sigmoid')(hidden)
    pred = keras.layers.Dense(n_classes, activation='softmax')(hidden)

    model = keras.models.Model(inputs=bert_inputs, outputs=pred)
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )
    model.summary()

    return model

def initialize_vars(sess):
    """Run the local, global and table initializers in `sess`, in that
    order, and then register `sess` as the Keras backend session."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)

In [15]:
# One output unit per distinct label seen by the encoder.
n_classes = len(label_encoder.classes_)
# NOTE: BertLayer uses this value to slice the END of the module's variable
# list, so 48 is a number of variables (kernels, biases, ...), not a number
# of transformer blocks.
n_fine_tune_layers = 48
model = build_model(max_seq_length, bert_path, n_classes, n_fine_tune_layers)

# Instantiate variables
initialize_vars(sess)

Instructions for updating:
Colocations handled automatically by placer.


W0418 04:25:36.984525 140140242679168 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0418 04:25:39.493424 140140242679168 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0418 04:25:39.652037 140140242679168 deprecation.py:506] From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 128)          0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
bert_layer_1 (BertLayer)        (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]                
          

In [16]:
# Inspect which variables will be updated during training.
model.trainable_weights

[<tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/query/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/query/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/key/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/key/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/value/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/self/value/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/output/dense/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attention/output/dense/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert_layer_1_module/bert/encoder/layer_9/attentio

# Tokenization 

In [17]:
class PaddingInputExample(object):
    """Placeholder example used to pad a dataset to a multiple of the batch size.

    Eval/predict on a TPU needs a fixed batch size, so the number of examples
    has to be padded up to a multiple of it. Dropping the last, incomplete
    batch instead would mean that part of the output is never generated.

    A dedicated class is used rather than `None`, because treating `None` as
    a padding batch could cause silent errors.
    """

class InputExample(object):
    """One training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text_a: The untokenized text of the first sequence. For
            single-sequence tasks this is the only text that is required.
          text_b: (Optional) The untokenized text of the second sequence;
            only needed for sequence-pair tasks.
          label: (Optional) The label of the example. Expected for train and
            dev examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module(tf_hub):
    """Build a `FullTokenizer` from the vocab/casing info of a Hub module.

    The module is instantiated in a temporary graph with its own short-lived
    session. This keeps a second copy of the BERT module out of the default
    graph that holds the Keras model, and removes the dependency on a
    module-level `sess` variable.

    Args:
        tf_hub: TF-Hub handle (URL or path) of the BERT module.

    Returns:
        A `FullTokenizer` configured with the module's vocabulary file and
        its `do_lower_case` setting.
    """
    with tf.Graph().as_default():
        bert_module = hub.Module(tf_hub)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as tokenizer_sess:
            vocab_file, do_lower_case = tokenizer_sess.run(
                [
                    tokenization_info["vocab_file"],
                    tokenization_info["do_lower_case"],
                ]
            )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one `InputExample` into fixed-length BERT inputs.

    Returns a tuple `(input_ids, input_mask, segment_ids, label)` in which
    the three lists all have length `max_seq_length`. A
    `PaddingInputExample` yields all-zero lists and label 0.
    """
    if isinstance(example, PaddingInputExample):
        return [0] * max_seq_length, [0] * max_seq_length, [0] * max_seq_length, 0

    # Two positions are reserved for the [CLS] and [SEP] markers.
    room_for_text = max_seq_length - 2
    word_pieces = tokenizer.tokenize(example.text_a)
    if len(word_pieces) > room_for_text:
        word_pieces = word_pieces[0:room_for_text]

    tokens = ["[CLS]"]
    tokens.extend(word_pieces)
    tokens.append("[SEP]")
    # Single-sequence input: every token belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    missing = max(0, max_seq_length - len(input_ids))
    input_ids.extend([0] * missing)
    input_mask.extend([0] * missing)
    segment_ids.extend([0] * missing)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a collection of examples into four stacked NumPy arrays.

    Every example goes through `convert_single_example`. The result is the
    tuple `(input_ids, input_masks, segment_ids, labels)`, with `labels`
    reshaped into a single column.
    """
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm_notebook(examples, desc="Converting examples to features")
    ]

    all_input_ids = np.array([item[0] for item in converted])
    all_input_masks = np.array([item[1] for item in converted])
    all_segment_ids = np.array([item[2] for item in converted])
    all_labels = np.array([item[3] for item in converted]).reshape(-1, 1)

    return all_input_ids, all_input_masks, all_segment_ids, all_labels

def convert_text_to_examples(texts, labels):
    """Pair each text with its label and wrap the pair in an `InputExample`.

    Each element of `texts` is joined with single spaces before being stored
    as `text_a`, so it is expected to be a sequence of strings (for example
    a one-element row), not a bare string.
    """
    return [
        InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]


In [18]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module(bert_path)

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
val_examples = convert_text_to_examples(val_text, val_label)

# Convert to features: each call returns (input_ids, input_masks,
# segment_ids, labels) arrays with one row per example, where every sequence
# is truncated/zero-padded to exactly max_seq_length tokens.
(train_input_ids, train_input_masks, train_segment_ids, train_labels 
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
(val_input_ids, val_input_masks, val_segment_ids, val_labels
) = convert_examples_to_features(tokenizer, val_examples, max_seq_length=max_seq_length)



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0418 04:25:45.851674 140140242679168 saver.py:1483] Saver not created because there are no variables in the graph to restore


HBox(children=(IntProgress(value=0, description='Converting examples to features', max=31238, style=ProgressSt…




HBox(children=(IntProgress(value=0, description='Converting examples to features', max=3905, style=ProgressSty…




# Train model

In [19]:
from keras.callbacks import EarlyStopping

BATCH_SIZE = 256
# Metric watched by early stopping; it is the validation counterpart of the
# metric the model is compiled with.
MONITOR = 'val_sparse_categorical_accuracy'
print('BATCH_SIZE is {}'.format(BATCH_SIZE))
# Stop once the monitored metric has not improved (mode='max') for 3 epochs
# and roll the model back to the weights of the best epoch.
e_stopping = EarlyStopping(monitor=MONITOR, patience=3, verbose=1, mode='max', restore_best_weights=True)
callbacks =  [e_stopping]

# Inputs are passed in the same order as the model's input layers:
# ids, masks, segment ids.
history = model.fit(
   [train_input_ids, train_input_masks, train_segment_ids], 
    train_labels,
    validation_data = ([val_input_ids, val_input_masks, val_segment_ids], val_labels),
    epochs = 10,
    verbose = 1,
    batch_size = BATCH_SIZE,
    callbacks= callbacks
)

BATCH_SIZE is 256
Instructions for updating:
Use tf.cast instead.


W0418 04:28:48.591721 140140242679168 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Train on 31238 samples, validate on 3905 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Restoring model weights from the end of the best epoch
Epoch 00007: early stopping


# Test

In [20]:
# The test texts are converted with all-zero placeholder labels; the real
# test labels stay in `y_test` and are only used for evaluation.
test_examples = convert_text_to_examples(test_text, np.zeros(len(test_text)))

In [21]:
# Tokenize the test examples. `test_labels` only carries the labels stored
# in `test_examples`; do not use it for scoring.
(test_input_ids, test_input_masks, test_segment_ids, test_labels
) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

HBox(children=(IntProgress(value=0, description='Converting examples to features', max=3905, style=ProgressSty…




In [22]:
# Softmax outputs of the model for the test inputs: one row per example,
# one column per class.
prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids], verbose = 1)



In [23]:
# Take the highest-scoring class index per row and map it back to the
# original label value through the encoder's class list.
preds = label_encoder.classes_[np.argmax(prediction, axis =1)]

In [25]:
from sklearn.metrics import classification_report
# `preds` holds ORIGINAL label values (decoded via `label_encoder.classes_`),
# whereas `y_test` holds ENCODED class ids. Decode `y_test` as well so both
# sides are compared in the same label space; without this the report is only
# correct when the original labels happen to equal their encoded ids.
print(classification_report(label_encoder.inverse_transform(y_test), preds))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       583
           1       0.80      0.67      0.73       449
           2       1.00      1.00      1.00        34
           3       0.79      0.81      0.80       466
           4       0.64      0.74      0.69       349
           5       0.76      0.79      0.77       482
           6       0.71      0.72      0.72       429
           7       0.91      0.64      0.75       321
           8       0.83      0.79      0.81       405
           9       0.70      0.84      0.76       387

   micro avg       0.78      0.78      0.78      3905
   macro avg       0.80      0.79      0.79      3905
weighted avg       0.78      0.78      0.77      3905



In [28]:
from sklearn.metrics import accuracy_score
# `y_test` contains encoded class ids while `preds` contains original label
# values; decode `y_test` so the comparison is made in one label space.
accuracy_score(label_encoder.inverse_transform(y_test), preds)

0.7751600512163892

In [None]:
# Write the decoded predictions to CSV as a single 'label' column; the frame's
# index is written as well, under the column name 'id'.
# NOTE(review): this cell has no execution count in the saved notebook.
pd.DataFrame(preds, columns=['label']).to_csv('bert_keras_submission.csv',
                                                  index_label='id')