Reference notebook: https://www.kaggle.com/rhtsingh/utilizing-transformer-representations-efficiently

In [None]:
#!pip install transformers
#!pip install sentencepiece
#!pip install wrapt --upgrade --ignore-installed
#!pip install tensorflow
#!pip install pydot
#!pip install pydotplus
#!sudo apt-get install graphviz
#!pip install keras
#!pip install focal_loss

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#import zipfile
#with zipfile.ZipFile("./drive/MyDrive/HSE DS data/model2.zip", "r") as zip_ref:
#    zip_ref.extractall("./data")

In [None]:
import tensorflow as tf
# Bare expression: rendered as the cell output so the TensorFlow version in use is recorded.
tf.__version__

In [None]:
import os
import gc
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm # progress bar

from tensorflow.keras.layers import Dense, Input, Average, SpatialDropout1D, Dropout, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.utils.vis_utils import model_to_dot
from tensorflow.keras import backend

import transformers
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

from IPython.display import SVG, FileLink
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc, classification_report

#### Seed

In [None]:
def seed_everything(seed = 0):
    '''
    Seed the random number generators used by this notebook.

    Seeds Python's `random`, NumPy and TensorFlow, and exports the
    PYTHONHASHSEED / TF_DETERMINISTIC_OPS environment variables.

    Args:
        seed: integer seed shared by all generators (default 0).
    '''
    # Kept local so the function does not depend on a top-level `import random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
    # is inherited by child processes but does not change hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Single seed value passed to seed_everything() so repeated runs start from the same RNG state.
SEED = 0
seed_everything(SEED)

#### TPU configuration

In [None]:
# Probe for a TPU and choose the matching tf.distribute strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

#### Configuration

In [None]:
# Hyper-parameters and input-pipeline settings read by the cells below.
# BATCH_SIZE is the global batch: 16 examples per replica of the active strategy.
config = dict(
    MAX_LEN=224,
    BATCH_SIZE=16 * strategy.num_replicas_in_sync,
    EPOCHS=3,
    LEARNING_RATE=1e-5,
    MODEL='jplu/tf-xlm-roberta-large',
    SHUFFLE=2048,
    PREFETCH=tf.data.experimental.AUTOTUNE,
)

config

#### Load data

In [None]:
# Load the pre-tokenized validation and test inputs plus the validation labels.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use with trusted files.
# The trailing [()] indexes the loaded array with an empty tuple; presumably each x_* file stores a
# 0-d object array wrapping a dict of model inputs — TODO confirm against the preprocessing step.
x_validation = np.load("../input/balancedtrain2/x_validation_comment_text_224.npy", allow_pickle = True)[()]
y_validation = np.load("../input/balancedtrain2/y_validation.npy", allow_pickle = True)

x_test = np.load("../input/balancedtrain2/x_test_comment_text_224.npy", allow_pickle = True)[()]

In [None]:
# Sample submission frame; ModelHelper.make_submission fills its 'toxic' column and writes it back out.
submission = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv")

### Model

#### Model from Hugging Face

In [None]:
%%time
# Instantiate the pretrained transformer inside the distribution strategy's scope.
with strategy.scope():
    conf = AutoConfig.from_pretrained(config["MODEL"])
    # Per-layer hidden states are not requested; build_model only reads the first output of the transformer.
    conf.output_hidden_states = False
    transformer_layer = TFAutoModel.from_pretrained(config["MODEL"], config = conf)

#### Datasets for TensorFlow

In [None]:
# Validation pipeline: (inputs, labels) pairs, batched, cached and prefetched.
validation_ds = tf.data.Dataset.from_tensor_slices((x_validation, y_validation))
validation_ds = validation_ds.batch(config["BATCH_SIZE"])
validation_ds = validation_ds.cache()
validation_ds = validation_ds.prefetch(config["PREFETCH"])

# Test pipeline: inputs only, batched in file order.
test_ds = tf.data.Dataset.from_tensor_slices(x_test)
test_ds = test_ds.batch(config["BATCH_SIZE"])

#### Learning rate schedule (Exponential decay with warmup)

In [None]:
def exponential_schedule_with_warmup(epoch, warmup_epochs=3, hold_max_epochs=0,
                                     lr_start=1e-6, lr_max=None, lr_min=1e-7, decay=0.8):
    '''
    Create a schedule with a learning rate that decreases exponentially after linearly increasing during a warmup period.

    Args:
        epoch: zero-based epoch index.
        warmup_epochs: number of epochs over which the rate rises linearly from lr_start towards lr_max.
        hold_max_epochs: number of epochs to stay at lr_max once warmup is over.
        lr_start: learning rate at epoch 0.
        lr_max: peak learning rate; when None, config['LEARNING_RATE'] is used.
        lr_min: floor applied during the decay phase; None disables the floor.
        decay: multiplicative factor applied per epoch during the decay phase.

    Returns:
        The learning rate for `epoch` as a plain Python float in every phase.
    '''
    if lr_max is None:
        # Looked up at call time so later changes to the config are picked up.
        lr_max = config['LEARNING_RATE']

    if epoch < warmup_epochs:
        # Linear ramp; never reached when warmup_epochs is 0, so no division by zero.
        lr = (lr_max - lr_start) / warmup_epochs * epoch + lr_start
    elif epoch < warmup_epochs + hold_max_epochs:
        lr = lr_max
    else:
        lr = lr_max * (decay ** (epoch - warmup_epochs - hold_max_epochs))
        if lr_min is not None:
            # Builtin max keeps the result a float; tf.math.maximum would return a
            # tensor in this branch only, making the return type inconsistent.
            lr = max(lr_min, lr)

    return lr

# Preview the schedule over the configured number of epochs.
rng = list(range(config['EPOCHS']))
y = list(map(exponential_schedule_with_warmup, rng))

sns.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(rng, y)

print(f'Learning rate schedule: {y[0]:.3g} to { max(y):.3g} to { y[-1]:.3g}')

#### Callbacks

In [None]:
# File that receives the best weights seen during training.
model_path = "xlm-roberta.h5"

# Every callback below watches validation AUC, where larger is better.
checkpoint = ModelCheckpoint(
    model_path,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# NOTE(review): patience here (5) exceeds the EPOCHS value in `config` (3) — confirm this is intended.
es = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=5,
    restore_best_weights=False,
    verbose=1,
)

rp = ReduceLROnPlateau(
    monitor='val_auc',
    mode='max',
    factor=0.8,
    patience=3,
    verbose=1,
)

#lr = LearningRateScheduler(exponential_schedule_with_warmup, verbose=0)

callbacks = [checkpoint, es, rp]

#### Prepare model

In [None]:
class MetricsHelper:
    '''
    Plotting and reporting helpers for a binary classifier's training history and predictions.
    '''

    def __init__(self):
        sns.set(style="whitegrid")

    def plot_metrics(self, history, metric_list):
        '''
        Plot the training and validation curve of each metric, one subplot per metric.

        Args:
            history: mapping from metric name to per-epoch values; it must also contain
                a 'val_<metric>' entry for every metric that is plotted.
            metric_list: names of the metrics to plot.
        '''
        # squeeze=False always returns a 2-D array of axes; without it a single metric
        # yields a bare Axes object that has no .flatten().
        fig, axes = plt.subplots(len(metric_list), 1, sharex='col', figsize=(20, 18), squeeze=False)
        axes = axes.flatten()

        for index, metric in enumerate(metric_list):
            axes[index].plot(history[metric], label='Train %s' % metric)
            axes[index].plot(history['val_%s' % metric], label='Validation %s' % metric)
            axes[index].legend(loc='best', fontsize=16)
            axes[index].set_title(metric)

        plt.xlabel('Epochs', fontsize=16)
        sns.despine()
        plt.show()

    def get_metrics_report(self, y_valid, valid_pred):
        '''
        Print the ROC AUC and a classification report.

        Args:
            y_valid: ground-truth labels.
            valid_pred: predicted scores; rounded with np.round before the classification report.
        '''
        print('ROC AUC %.4f' % roc_auc_score(y_valid, valid_pred))
        print(classification_report(y_valid,  np.round(valid_pred)))

    def plot_aur_curve(self, y_valid, valid_pred):
        '''
        Plot the ROC curve of the predictions with its AUC in the legend.

        Args:
            y_valid: ground-truth labels.
            valid_pred: predicted scores.
        '''
        fpr_valid, tpr_valid, _ = roc_curve(y_valid, valid_pred)
        roc_auc_valid = auc(fpr_valid, tpr_valid)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr_valid, tpr_valid, color='purple', label='ValidationAUC = %0.2f' % roc_auc_valid)
        plt.legend(loc = 'lower right')
        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

    def plot_confusion_matrix(self, y_valid, valid_pred, labels=[0, 1]):
        '''
        Plot the row-normalised confusion matrix as a heatmap.

        Args:
            y_valid: ground-truth labels.
            valid_pred: predicted labels. NOTE(review): these are passed to confusion_matrix
                unchanged, so they presumably need to be hard labels rather than scores — confirm with callers.
            labels: tick labels for the rows and columns; only read, never modified.
        '''
        # A single panel: the previous two-panel layout left its first axes empty.
        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
        validation_cnf_matrix = confusion_matrix(y_valid, valid_pred)

        # Divide every row by its total so each row sums to 1.
        validation_cnf_matrix_norm = validation_cnf_matrix.astype('float') / validation_cnf_matrix.sum(axis=1)[:, np.newaxis]

        validation_df_cm = pd.DataFrame(validation_cnf_matrix_norm, index=labels, columns=labels)

        sns.heatmap(validation_df_cm, annot=True, fmt='.2f', cmap=sns.cubehelix_palette(8), ax=ax).set_title('Validation')
        plt.show()

In [None]:
# Shared helper instance used for the training curves and validation reports further down.
metricsHelper = MetricsHelper()

In [None]:
class ModelHelper:
    '''
    Helpers to build, describe, train and run inference with the classifier.

    Relies on the notebook-level `strategy` and `submission` objects.
    '''

    def create_model(self, transformer, learning_rate, max_len):
        '''
        Clear the Keras session and build the compiled model inside the strategy scope.

        Args:
            transformer: pretrained transformer layer used as the encoder.
            learning_rate: learning rate for the Adam optimizer.
            max_len: length of the token sequences fed to the model.

        Returns:
            The compiled Keras model.
        '''
        tf.keras.backend.clear_session()

        with strategy.scope():
            model = self.build_model(transformer, learning_rate, max_len)

        return model

    def print_model_description(self, model):
        '''Print the model summary and render its layer graph as an SVG.'''
        model.summary()
        display(SVG(model_to_dot(model, dpi=70).create(prog='dot', format='svg')))

    def make_submission(self, model, ds):
        '''
        Predict on `ds`, store the scores in the notebook-level `submission` frame and write it to CSV.

        Args:
            model: trained Keras model.
            ds: dataset of test inputs.
        '''
        # The model ends in single-unit heads, so predictions presumably come back as a
        # column vector; flatten them to one value per row before assigning the column.
        submission['toxic'] = model.predict(ds).reshape(-1)
        submission.to_csv('submission.csv', index=False)
        display(FileLink('submission.csv'))

    def train(self, model, epochs, callbacks, train_ds, validation_ds, n_steps):
        '''
        Fit the model and return the Keras History object.

        Args:
            model: compiled Keras model.
            epochs: number of epochs to run.
            callbacks: list of Keras callbacks.
            train_ds: training dataset.
            validation_ds: validation dataset evaluated after each epoch.
            n_steps: number of training batches that make up one epoch.
        '''
        return model.fit(train_ds, steps_per_epoch = n_steps, validation_data = validation_ds, epochs = epochs, callbacks = callbacks)

    def build_model(self, transformer, learning_rate, max_len):
        '''
        Assemble and compile the classifier on top of the transformer.

        The representation at the first token position is passed through 8 independent
        dropout samples. Each sample goes through one shared Dense(128) layer and its own
        sigmoid unit, and the 8 outputs are averaged.

        Args:
            transformer: pretrained transformer layer used as the encoder.
            learning_rate: learning rate for the Adam optimizer.
            max_len: length of the token sequences fed to the model.

        Returns:
            The compiled Keras model.
        '''
        input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
        attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

        # First element of the transformer output: one vector per token position.
        sequence_output = transformer({"input_ids": input_word_ids, "attention_mask": attention_mask})[0]

        # Keep only the vector at the first token position.
        cls_token = sequence_output[:, 0, :]

        samples = []
        # One Dense layer instance, reused (weights shared) by all dropout samples.
        sample_mask = Dense(128, activation='relu')
        for n in range(8):
            sample = Dropout(.5)(cls_token)
            sample = sample_mask(sample)
            sample = Dense(1, activation='sigmoid', name=f'sample_{n}')(sample)
            samples.append(sample)

        out = Average(name='output')(samples)

        # build and compile the model
        model = Model(inputs = {
                     "input_ids": input_word_ids,
                     "attention_mask": attention_mask
                    },  outputs = out)
        # `learning_rate` is the supported keyword; the `lr` alias is deprecated and
        # rejected by newer Keras releases. The AUC metric is named explicitly so that the
        # 'auc' / 'val_auc' history keys do not depend on Keras' automatic name numbering.
        model.compile(Adam(learning_rate = learning_rate), loss = 'binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

        return model

In [None]:
# Shared helper instance used to build, train and run the model in the cells below.
modelHelper = ModelHelper()

In [None]:
%%time
# Build the compiled classifier around the pretrained transformer, then print its summary and layer graph.
model = modelHelper.create_model(transformer_layer, config["LEARNING_RATE"], config["MAX_LEN"])
modelHelper.print_model_description(model)

In [None]:
# Load the pre-tokenized training inputs and labels.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use with trusted files.
# The trailing [()] indexes the loaded array with an empty tuple; presumably the file stores a
# 0-d object array wrapping a dict of model inputs — TODO confirm against the preprocessing step.
x_train = np.load("../input/balancedtrain2/x_en_train_shuffled_comment_text_224.npy", allow_pickle = True)[()]
y_train = np.load("../input/balancedtrain2/y_en_train_shuffled.npy", allow_pickle = True)

In [None]:
# Endless training stream: repeat, shuffle within a buffer, cut into full batches, prefetch.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.repeat()
train_ds = train_ds.shuffle(config["SHUFFLE"])
# drop_remainder=True keeps every batch at exactly BATCH_SIZE examples.
train_ds = train_ds.batch(config["BATCH_SIZE"], drop_remainder=True)
train_ds = train_ds.prefetch(config["PREFETCH"])

In [None]:
# Steps per epoch: the extra factor of 4 means one epoch covers about a quarter of the training examples.
N_STEPS = len(y_train) // (config["BATCH_SIZE"] * 4)

In [None]:
# Drop the raw arrays now that the datasets have been built, then force a collection pass.
del x_train, y_train, x_test, x_validation
gc.collect()

In [None]:
# Train for the configured number of epochs, then plot the train/validation curves
# for each of the listed metrics from the returned history.
model_history = modelHelper.train(model, config["EPOCHS"], callbacks, train_ds, validation_ds, N_STEPS)
metricsHelper.plot_metrics(model_history.history, metric_list = ['loss', 'accuracy', 'auc'])

In [None]:
# The training dataset is no longer needed after fitting; release it and collect garbage.
del train_ds
gc.collect()

In [None]:
# Reload the weights stored at model_path (the file the ModelCheckpoint callback writes to),
# since EarlyStopping is configured with restore_best_weights=False.
with strategy.scope():
    model.load_weights(model_path)

#### Validation

In [None]:
# Score the validation set, print ROC AUC plus the classification report, and plot the ROC curve.
validation_pred = model.predict(validation_ds)
metricsHelper.get_metrics_report(y_validation, validation_pred)
metricsHelper.plot_aur_curve(y_validation, validation_pred)

#### Submission

In [None]:
# Predict on the test set and write submission.csv.
modelHelper.make_submission(model, test_ds)

In [None]:
# display(FileLink(model_path))