In [1]:
import os
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf




In [2]:
# Placeholder value (not a real credential), set only to silence a warning.
os.environ.update({"WANDB_API_KEY": "0"})


In [3]:
# Choose a distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU could be set up: fall back to the default strategy.
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This used to
# sit inside the `except` branch, so nothing was printed when a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)


Number of replicas: 1


In [4]:
# Load the train and test splits; the relative paths are resolved against the
# notebook's current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


In [5]:
df_train.shape # (rows, columns) of the training frame


(12120, 6)

In [6]:
df_test.shape # (rows, columns) of the test frame


(5195, 5)

In [7]:
# Name of the pretrained checkpoint; the same name is reused by create_model()
# so that tokenizer and encoder come from the same checkpoint.
model_name = 'bert-base-multilingual-cased'
# Tokenizer loaded from that checkpoint (used by encode_sentence / bert_encode).
tokenizer = BertTokenizer.from_pretrained(model_name)


In [8]:
len(tokenizer.vocab) # number of entries in the tokenizer's vocabulary


119547

In [9]:
def encode_sentence(s):
    """Turn one sentence into a list of token ids terminated by ``[SEP]``.

    The module-level ``tokenizer`` splits ``s`` into tokens, a ``[SEP]``
    marker is appended, and the token strings are mapped to vocabulary ids.

    Args:
        s: Text to encode.

    Returns:
        The token ids of ``s`` followed by the id of ``[SEP]``.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)


In [10]:
# Sanity check on a single sentence; the last id in the result belongs to [SEP].
encode_sentence("you know they can't really defend themselves")


[13028, 21852, 10689, 10944, 112, 188, 30181, 60041, 20182, 102]

In [11]:
def bert_encode(hypotheses, premises, tokenizer, max_len=300):
    """Build fixed-width BERT inputs for a batch of sentence pairs.

    Each example is laid out as ``[CLS] first [SEP] second [SEP]`` and then
    converted to a dense row of ``max_len`` columns: shorter rows are padded
    with zeros, and (per ``tf.RaggedTensor.to_tensor``) longer rows are cut
    off at ``max_len``.

    NOTE: the parameter names are historical and do not constrain the content.
    ``hypotheses`` is simply the *first* sentence of every pair (segment 0)
    and ``premises`` the *second* (segment 1). This notebook passes the
    premise column as the first argument.

    Args:
        hypotheses: Sequence of strings, the first sentence of each pair.
        premises: Sequence of strings, the second sentence of each pair;
            expected to have the same length as ``hypotheses``.
        tokenizer: Tokenizer used here to look up the ``[CLS]`` id. The
            sentences themselves are encoded by ``encode_sentence``, which
            reads the module-level ``tokenizer``.
        max_len: Number of columns in every returned tensor. Defaults to 300,
            the width this notebook uses so that the train and test inputs
            share one shape.

    Returns:
        dict with the keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'``; each value is a dense tensor of shape
        ``(num_examples, max_len)``.
    """
    num_examples = len(hypotheses)
    print("num_examples = ", num_examples)
    sentence1 = tf.ragged.constant([encode_sentence(s) for s in hypotheses])
    print("sentence1.shape = ", sentence1.shape)
    sentence2 = tf.ragged.constant([encode_sentence(s) for s in premises])
    print("sentence2.shape = ", sentence2.shape)

    # One single-element row holding the [CLS] id per example.
    cls_ = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls_, sentence1, sentence2], axis=-1)
    print("input_word_ids.shape = ", input_word_ids.shape)

    # The ragged rows have different lengths per example (and per dataset),
    # but the model needs one fixed second dimension, so every output is
    # densified to the same (num_examples, max_len) shape.
    dense_shape = (input_word_ids.shape[0], max_len)

    # 1 for real tokens, 0 for padding.
    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=dense_shape)
    print("input_mask.shape = ", input_mask.shape)

    # Segment ids: 0 for [CLS] + first sentence, 1 for the second sentence.
    type_cls = tf.zeros_like(cls_)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor(shape=dense_shape)

    inputs = {'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
              'input_mask': input_mask,
              'input_type_ids': input_type_ids}
    print()

    return inputs
    


In [12]:
# encode data
# NOTE: bert_encode's first two parameters are named (hypotheses, premises),
# but the premise column is passed first here, so every row is encoded as
# [CLS] premise [SEP] hypothesis [SEP].
train_input = bert_encode(df_train["premise"].values, df_train["hypothesis"].values, tokenizer)
test_input = bert_encode(df_test["premise"].values, df_test["hypothesis"].values, tokenizer)


num_examples =  12120
sentence1.shape =  (12120, None)
sentence2.shape =  (12120, None)
input_word_ids.shape =  (12120, None)
input_mask.shape =  (12120, 300)

num_examples =  5195
sentence1.shape =  (5195, None)
sentence2.shape =  (5195, None)
input_word_ids.shape =  (5195, None)
input_mask.shape =  (5195, 300)



In [13]:
train_input # display the dict of encoded training tensors (ids, mask, type ids)


{'input_word_ids': <tf.Tensor: shape=(12120, 300), dtype=int32, numpy=
 array([[  101, 10111, 11762, ...,     0,     0,     0],
        [  101, 13252, 10301, ...,     0,     0,     0],
        [  101, 13810, 32181, ...,     0,     0,     0],
        ...,
        [  101, 10117, 12452, ...,     0,     0,     0],
        [  101, 11699, 10105, ...,     0,     0,     0],
        [  101, 11399, 14764, ...,     0,     0,     0]], dtype=int32)>,
 'input_mask': <tf.Tensor: shape=(12120, 300), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape=(12120, 300), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 

In [14]:
test_input # display the dict of encoded test tensors (ids, mask, type ids)


{'input_word_ids': <tf.Tensor: shape=(5195, 300), dtype=int32, numpy=
 array([[  101,   764, 28744, ...,     0,     0,     0],
        [  101, 13498, 11917, ...,     0,     0,     0],
        [  101, 10131, 24552, ...,     0,     0,     0],
        ...,
        [  101,  3239,  5755, ...,     0,     0,     0],
        [  101, 98370,   112, ...,     0,     0,     0],
        [  101, 10167, 15078, ...,     0,     0,     0]], dtype=int32)>,
 'input_mask': <tf.Tensor: shape=(5195, 300), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape=(5195, 300), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, 

In [15]:
# Sequence length the model inputs are declared with: the second dimension of
# the encoded training ids produced by bert_encode.
max_len = train_input["input_word_ids"].shape[1]

def create_model():
    """Build and compile the BERT-based 3-class classifier.

    Reads two module-level names: ``model_name`` (the pretrained checkpoint
    to load) and ``max_len`` (the length of every input sequence).

    Returns:
        A compiled ``tf.keras.Model`` with the inputs ``input_word_ids``,
        ``input_mask`` and ``input_type_ids`` (each ``(batch, max_len)``
        int32) and a 3-way softmax output.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # Index [0] selects the encoder's first output (one vector per position).
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from position 0 of every sequence, i.e. the [CLS] slot that
    # bert_encode puts first.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` alias, which triggers a
    # warning on older Keras and is rejected by newer releases.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [16]:
# Build and compile the model under the distribution strategy selected
# earlier, then print its layer summary.
with strategy.scope():
    model = create_model()
    model.summary()


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 300)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 300)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 300)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['input_word_ids[0][0]',         
                                thPoolingAndCrossAt               'input_mask[0][0]',         

  super(Adam, self).__init__(name, **kwargs)


In [17]:
# Fine-tune for 3 epochs in batches of 128, holding out 20% of the training
# examples for validation; the returned history feeds plot_NN_history.
model_history = model.fit(
    x=train_input,
    y=df_train["label"].values,
    batch_size=128,
    epochs=3,
    verbose=1,
    validation_split=0.2,
)


Epoch 1/3
 3/76 [>.............................] - ETA: 3:38:07 - loss: 1.1205 - accuracy: 0.3594

KeyboardInterrupt: 

In [None]:
def plot_NN_history(model_history, suptitle):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        model_history: Object whose ``history`` mapping holds the per-epoch
            series ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and
            ``'val_loss'`` (e.g. the value returned by ``model.fit``).
        suptitle: Title drawn above both panels.

    Raises:
        KeyError: If one of the four series is missing from the history.
    """
    history = model_history.history
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    fig.suptitle(suptitle, fontsize=18)

    # (history key, panel title, y-axis label) for the left and right panel.
    panels = (
        ("accuracy", "Model Accuracy", "Accuracy"),
        ("loss", "Model Loss", "Loss"),
    )
    for ax, (metric, title, ylabel) in zip(axes, panels):
        # Lines drawn through ax.plot already belong to `ax`, so no explicit
        # `axes=` keyword is needed.
        ax.plot(history[metric], label=f'train {metric}', color='g')
        ax.plot(history[f'val_{metric}'], label=f'val {metric}', color='r')
        ax.set_title(title, fontsize=16)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(ylabel)
        ax.legend(loc='upper left')

    plt.show()


In [None]:
# Plot the learning curves from the history object returned by model.fit.
plot_NN_history(model_history, "BERT")


In [None]:
def calculate_results(y_true, y_pred):
    """Compute accuracy and support-weighted precision, recall and F1.

    Implemented with numpy only: the scikit-learn helpers the previous
    version called were never imported in this notebook. The semantics
    follow scikit-learn's ``average="weighted"``: metrics are computed per
    class over every label that occurs in ``y_true`` or ``y_pred``, then
    averaged with each class weighted by its number of true instances. A
    per-class score whose denominator is zero counts as 0.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, same length as ``y_true``.

    Returns:
        dict with ``"accuracy"`` (a percentage, 0-100) and ``"precision"``,
        ``"recall"``, ``"f1"`` (each in 0-1).

    Raises:
        ValueError: If the inputs are empty or differ in shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    # Calculate model accuracy
    model_accuracy = float(np.mean(y_true == y_pred)) * 100

    # One-vs-rest membership matrices of shape (n_samples, n_labels).
    labels = np.union1d(y_true, y_pred)
    is_true = y_true[:, None] == labels[None, :]
    is_pred = y_pred[:, None] == labels[None, :]

    true_positive = (is_true & is_pred).sum(axis=0).astype(float)
    support = is_true.sum(axis=0).astype(float)      # true instances per class
    predicted = is_pred.sum(axis=0).astype(float)    # predictions per class

    # Per-class scores; undefined ratios (zero denominator) are set to 0.
    zeros = np.zeros_like(true_positive)
    precision = np.divide(true_positive, predicted, out=zeros.copy(), where=predicted > 0)
    recall = np.divide(true_positive, support, out=zeros.copy(), where=support > 0)
    pr_sum = precision + recall
    f1 = np.divide(2 * precision * recall, pr_sum, out=zeros.copy(), where=pr_sum > 0)

    # Weighted average: each class contributes in proportion to its support.
    weights = support / support.sum()
    model_results = {"accuracy": model_accuracy,
                     "precision": float(np.dot(precision, weights)),
                     "recall": float(np.dot(recall, weights)),
                     "f1": float(np.dot(f1, weights))}
    return model_results


In [None]:
# get the probabilities
# Per-class probabilities for every test example.
y_prob = model.predict(test_input)
# Predicted label = index of the highest probability along the last axis.
y_hat = np.argmax(y_prob, axis=-1)


In [None]:
# submission = df_test.id.copy().to_frame()
# submission['prediction'] = y_hat
# submission.head() # check submission
# submission.to_csv("submission.csv", index = False) # save file
