This notebook is recommended to be run on Kaggle, where a TPU accelerator is available.

In [1]:
# Scikit learn library functionalities

from sklearn.metrics import accuracy_score, classification_report                    
from sklearn.model_selection import train_test_split           

# Tensorflow libraries

import tensorflow as tf
import tensorflow.keras.backend as K

# Deep learning models

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel, BertTokenizer
import transformers

# For other data manipulations

import gc   # Garbage collector
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [36]:
class Configs():
    """Training configuration plus the tf.distribute strategy to train under.

    Most of the parameters that appear throughout the training process are
    kept here with some predefined values.

    Parameters
    ----------
    model_name : str
        Checkpoint name handed to ``AutoTokenizer.from_pretrained`` and stored
        as ``MODEL_NAME``.
    max_length : int
        Stored as ``MAX_LENGTH``.
    batch_size : int
        Samples per batch, stored as ``BATCH_SIZE``.
    epochs : int
        Times to go through the data, stored as ``EPOCHS``.
    accelerator : str
        ``"TPU"`` requests a TPU (preferred since we work on Kaggle); any other
        value uses the default strategy for CPU / single GPU.

    Attributes
    ----------
    ACCELERATOR : str
        The accelerator actually in use; rewritten to ``"GPU"`` when a TPU was
        requested but could not be connected or initialized.
    TOKENIZER
        Tokenizer loaded for ``MODEL_NAME``.
    tpu
        The TPU cluster resolver when a TPU was initialized, otherwise ``None``.
    strategy
        The ``tf.distribute`` strategy models should be built under.
    AUTO
        ``tf.data.experimental.AUTOTUNE``.
    REPLICAS : int
        ``strategy.num_replicas_in_sync``.
    """

    def __init__(
        self,
        model_name = "jplu/tf-xlm-roberta-large",
        max_length = 64,
        batch_size = 16,
        epochs = 20,
        accelerator = "TPU"
    ):

        self.ACCELERATOR = accelerator
        self.MODEL_NAME = model_name
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.MAX_LENGTH = max_length
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epochs

        # Defined up front so the attributes exist whichever accelerator
        # branch ends up being taken.
        self.tpu = None
        self.strategy = None

        self.initialize_accelerator()

    def initialize_accelerator(self):
        """Pick the distribution strategy for the requested accelerator.

        Tries to connect to and initialize a TPU when ``ACCELERATOR`` is
        ``"TPU"``. If the TPU cannot be found or its initialization fails,
        ``ACCELERATOR`` is switched to ``"GPU"`` and the default strategy
        (CPU / single GPU) is used instead. Always sets ``strategy``, ``tpu``,
        ``AUTO`` and ``REPLICAS``.
        """
        # Forget any resolver from a previous call; it is only re-published
        # after a successful initialization below.
        self.tpu = None

        if self.ACCELERATOR == "TPU":
            try:
                tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
                print(f"Running on TPU {tpu.master()}")
            except ValueError:
                print("Could not connect to TPU")
                tpu = None

            if tpu:
                try:
                    print("Initializing TPU")
                    tf.config.experimental_connect_to_cluster(tpu)
                    tf.tpu.experimental.initialize_tpu_system(tpu)

                    # Prefer the stable TPUStrategy; fall back to the
                    # experimental alias on TensorFlow versions without it.
                    tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
                    if tpu_strategy_cls is None:
                        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy

                    self.strategy = tpu_strategy_cls(tpu)
                    self.tpu = tpu
                    print("TPU initialized")
                except Exception as error:
                    # Report the failure and fall back to the default strategy
                    # instead of leaving the object without a strategy.
                    print("Failed to initialize TPU")
                    print(f"Reason: {error!r}")
                    self.ACCELERATOR = "GPU"
            else:
                print("Unable to initialize TPU")
                self.ACCELERATOR = "GPU"

        # In case TPU is not available it checks for GPU, if none are available
        # it continues with CPU
        if self.ACCELERATOR != "TPU":
            print("Using default strategy for CPU and single GPU")
            self.strategy = tf.distribute.get_strategy()

        if self.ACCELERATOR == "GPU":
            print(f"GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

        self.AUTO = tf.data.experimental.AUTOTUNE
        self.REPLICAS = self.strategy.num_replicas_in_sync
        print(f"REPLICAS: {self.REPLICAS}")

In [37]:
# Function to encode the sentences 

def encode(data, tokenizer, max_length):
    """Tokenize sentence pairs into fixed-length encodings.

    Parameters
    ----------
    data
        Object exposing ``.values.tolist()`` (e.g. a pandas DataFrame); it
        receives the two columns with the premise and hypothesis, so each row
        becomes one ``[premise, hypothesis]`` pair.
    tokenizer
        The model's own tokenizer (AutoTokenizer or BertTokenizer); must
        provide ``batch_encode_plus``.
    max_length : int
        Length every encoded sample is padded or truncated to.

    Returns
    -------
    The value returned by ``tokenizer.batch_encode_plus``.
    """
    sentence_pairs = data.values.tolist()

    # `pad_to_max_length=True` is deprecated in transformers (>= 3.0). The
    # explicit arguments below request the same result: pad every sample up to
    # `max_length` and truncate anything longer, so all samples have the same
    # length.
    encoded = tokenizer.batch_encode_plus(
        sentence_pairs,
        padding = "max_length",
        truncation = True,
        max_length = max_length,
    )
    return encoded

In [38]:
def to_dataset(X, y, auto, label = True, repeat = False, shuffle = False, batch_size = 128):
    """Wrap encoded inputs (and optionally labels) in a batched tf.data pipeline.

    Parameters
    ----------
    X
        Mapping indexed with ``"input_ids"``; only that entry is used.
    y
        Labels paired with the input ids; ignored unless ``label`` equals True.
    auto
        Buffer size forwarded to ``Dataset.prefetch``.
    label : bool
        When equal to ``True`` the dataset yields ``(input_ids, y)`` pairs,
        otherwise it yields the input ids alone.
    repeat : bool
        Repeat the dataset indefinitely.
    shuffle : bool
        Shuffle with a buffer of 2048 elements (applied after ``repeat``).
    batch_size : int
        Number of elements per batch.

    Returns
    -------
    The resulting ``tf.data.Dataset``.
    """
    with_labels = label == True
    token_ids = X["input_ids"]
    tensors = (token_ids, y) if with_labels else token_ids

    pipeline = tf.data.Dataset.from_tensor_slices(tensors)

    if repeat:
        pipeline = pipeline.repeat()
    if shuffle:
        pipeline = pipeline.shuffle(2048)

    return pipeline.batch(batch_size).prefetch(auto)

In [39]:
# Plots the train history for the keras model

def show_train_history(train_history,train,validation):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    train_history
        Object with a ``history`` mapping (as returned by ``model.fit``).
    train : str
        Key of the training curve; also used as the y-axis label.
    validation : str
        Key of the validation curve.
    """
    curves = train_history.history

    # Training curve first, validation second, matching the legend order
    for metric_key in (train, validation):
        plt.plot(curves[metric_key])

    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

In [40]:
# Build the model

def build_model(model_name, max_length):
    """Build and compile a 3-class classifier on top of a pretrained transformer.

    Parameters
    ----------
    model_name : str
        Checkpoint name passed to ``TFAutoModel.from_pretrained``.
    max_length : int
        Length of the ``input_ids`` sequences the model accepts.

    Returns
    -------
    A compiled Keras ``Model`` mapping ``input_ids`` to softmax probabilities
    over 3 classes.
    """

    # Define encoded inputs
    input_ids = Input(shape = (max_length,), dtype = tf.int32, name = "input_ids")

    # Define transformer model embeddings: element [0] of the transformer output
    # (presumably the per-token hidden states - confirm against the model docs)
    transformer_model = TFAutoModel.from_pretrained(model_name)
    transformer_embeddings = transformer_model(input_ids)[0]

    # Define the output layer, as a dense layer with softmax activation function,
    # fed with the embedding at the first sequence position
    output_values = Dense(3, activation = "softmax")(transformer_embeddings[:, 0, :])

    # We define the model with the input and output values we defined previously
    # The model is pretrained, except for the output layer
    model = Model(inputs = input_ids, outputs = output_values)

    # The output layer already applies softmax, so the loss receives
    # probabilities, not logits. `from_logits` must therefore be False;
    # otherwise the loss would treat the probabilities as raw scores.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False)
    model.compile(optimizer = Adam(learning_rate = 1e-5), loss = loss, metrics = ['accuracy'])

    return model

In [44]:
# Run the model

def run_model(train, config):

    """Train the classifier on a split of ``train`` and score the held-out part.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``"premise"``, ``"hypothesis"`` and ``"label"``.
        25% of the rows are held out for validation (``random_state=42``).
    config : Configs
        The class with the configurations (model name, tokenizer, batch size,
        epochs, distribution strategy).

    Returns
    -------
    tuple
        ``(preds_test, y_test)``: the ``model.predict`` output for the held-out
        rows (using the weights of the best ``val_accuracy`` epoch, saved to
        ``model.h5``) and the matching true labels.
    """

    # First we re-initialize the accelerator in the config class when a TPU is
    # attached. getattr guards against config objects without a `tpu` attribute.
    if config.ACCELERATOR == "TPU" and getattr(config, "tpu", None):
        config.initialize_accelerator()

    # Then we build the model under TPU, the model needs to only be built under TPU, when
    # we train it it knows with which accelerator it should work
    K.clear_session()
    with config.strategy.scope():
        model = build_model(config.MODEL_NAME, config.MAX_LENGTH)
        # summary() prints by itself and returns None, so it is not wrapped in print()
        model.summary()

    # Splitting data into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(
        train[["premise","hypothesis"]],
        train["label"],
        test_size = 0.25,
        random_state = 42
    )

    print("\nTokenizing")

    # Encoding text data using tokenizer
    tokenizer = config.TOKENIZER
    X_train_encoded = encode(X_train, tokenizer, config.MAX_LENGTH)
    X_test_encoded = encode(X_test, tokenizer, config.MAX_LENGTH)

    # Creating TF Dataset, because we can't work with the encodings format outputed from
    # the encoding function
    # The batch size we pass is defined as the predefined batch size times the num of tpu
    # replicas found. This practice is recommended by Kaggle
    global_batch_size = config.BATCH_SIZE * config.REPLICAS
    eval_batch_size = global_batch_size * 4

    train_data = to_dataset(X_train_encoded, y_train, config.AUTO, repeat = True, shuffle = True, batch_size = global_batch_size)
    test_data = to_dataset(X_test_encoded, y_test, config.AUTO, batch_size = eval_batch_size)

    n_train = X_train.shape[0]

    # Keras expects an integer number of steps. Integer division avoids the
    # float produced by `/`, and the floor of 1 keeps training running when the
    # training split is smaller than one global batch.
    steps_per_epoch = max(1, n_train // global_batch_size)

    # Saving model at best accuracy epoch
    sv = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor = "val_accuracy",
        verbose = 0,
        save_best_only = True,
        save_weights_only = True,
        mode = "max",
        save_freq = "epoch"
    )

    print("\nTraining")

    # Training model
    model_history = model.fit(
        train_data,
        epochs = config.EPOCHS,
        callbacks = [sv],
        steps_per_epoch = steps_per_epoch,
        validation_data = test_data,
        verbose = 0
    )

    # Visualize the loss and accuracy after each epoch
    show_train_history(model_history,'accuracy','val_accuracy')
    show_train_history(model_history,'loss','val_loss')

    print("\nValidating")

    # Scoring validation data with the best checkpoint; the dataset is rebuilt
    # without labels because predict() only needs the inputs
    model.load_weights("model.h5")
    test_data = to_dataset(X_test_encoded, -1, config.AUTO, label = False, batch_size = eval_batch_size)

    preds_test = model.predict(test_data, verbose = 0)
    acc = accuracy_score(y_test, np.argmax(preds_test, axis = 1))

    print(f"\n Accuracy: {round(acc, 4)}\n")

    gc.collect()

    return preds_test, y_test

In [45]:
# Load the training CSV into a DataFrame
train = pd.read_csv(filepath_or_buffer = "../input/nli-sentences/translated_train.csv")

In [None]:
# Configure the run (sequences of 64 tokens, batches of 64 samples), then
# train and validate; keeps the validation predictions and true labels
config = Configs(batch_size = 64, max_length = 64)
preds_test, y_test = run_model(train = train, config = config)

Running on TPU grpc://10.0.0.2:8470
Initializing TPU


In [None]:
# classification_report takes the ground-truth labels first and the predicted
# labels second; passing them the other way round swaps precision and recall.
report = classification_report(y_test, np.argmax(preds_test, axis = 1))

In [None]:
# Show the classification report text assembled in the previous cell
print(report)