I cannot get the model to train properly.

### Reference
> @ragnar123 [bert-baseline](https://www.kaggle.com/ragnar123/bert-baseline)

Thank you for sharing.

In [None]:
import os
import math
import random
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

from transformers import AutoTokenizer, TFAutoModel 
warnings.simplefilter('ignore')

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before the
    # strategy object is created.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Global training configuration.
SEED = 42  # seeds the RNGs and the train/validation split
EPOCHS = 20
# Global batch size: 32 examples for each replica of the strategy.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
TFM_PATH = 'bert-base-uncased'
TOKENIZER_PATH = 'bert-base-uncased'
# Learning rate for Adam. The whole pretrained encoder is fine-tuned, so this
# must stay in the small range recommended for BERT fine-tuning (2e-5 .. 5e-5);
# the previous value of 1e-3 is large enough to wreck the pretrained weights
# and keeps the loss from decreasing.
LR = 3e-5

AUTO = tf.data.experimental.AUTOTUNE

In [None]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy
    and TensorFlow with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


seed_everything(SEED)

In [None]:
# Read the competition training table at module level so the number of
# classes is known before the model is constructed.
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
# Number of distinct label groups; used as the class count of the ArcFace head.
N_CLASSES = train_df['label_group'].nunique()

In [None]:
# Load the pretrained tokenizer named by TOKENIZER_PATH; `tokenize` below
# reads this module-level object.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
# Write the tokenizer files to ./tokenizer (presumably so a later inference
# notebook can load them without network access — confirm).
tokenizer.save_pretrained('tokenizer')

In [None]:
def load_df():
    """Load the training table and split it into train and validation frames.

    ``label_group`` is re-encoded with ``LabelEncoder`` so the class ids are
    consecutive integers starting at 0. The rows are then shuffled and split,
    holding out 33% for validation, stratified on the encoded label and
    seeded with ``SEED``.

    Returns:
        tuple: ``(train_x, valid_x)`` DataFrames, each holding only the
        ``title`` and ``label_group`` columns.
    """
    train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
    train_df['label_group'] = LabelEncoder().fit_transform(train_df['label_group'])
    # The class count is provided by the module-level N_CLASSES; it is not
    # recomputed here because a local of that name would only shadow it.
    train_x, valid_x = train_test_split(
        train_df[['title', 'label_group']],
        shuffle=True,
        stratify=train_df['label_group'],
        random_state=SEED,
        test_size=0.33,
    )
    return train_x, valid_x

In [None]:
def tokenize(df):
    """Tokenize the ``title`` column of *df* with the module-level tokenizer.

    Every title is truncated or padded to exactly 64 tokens.

    Args:
        df: DataFrame with a ``title`` column.

    Returns:
        tuple: ``(input_ids, attention_mask)`` as NumPy arrays.
    """
    titles = df.title.tolist()
    encoded = tokenizer(
        titles,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors='tf',
    )
    input_ids = encoded['input_ids'].numpy()
    attention_mask = encoded['attention_mask'].numpy()
    return input_ids, attention_mask

In [None]:
def load_ds(tokens, masks, labels, mode='train'):
    """Build a batched, prefetched ``tf.data`` pipeline.

    Each element is ``((tokens, masks, labels), labels)``: the labels appear
    in the inputs because the model's forward pass consumes them, and again
    as the training target.

    Args:
        tokens: Token-id array, one row per example.
        masks: Attention-mask array aligned with ``tokens``.
        labels: Integer class ids aligned with ``tokens``.
        mode: ``'train'`` shuffles and repeats the data indefinitely; any
            other value yields a single ordered pass.

    Returns:
        tf.data.Dataset: Batches of size ``BATCH_SIZE``.
    """
    features = tf.data.Dataset.from_tensor_slices((tokens, masks, labels))
    targets = tf.data.Dataset.from_tensor_slices(labels)
    ds = tf.data.Dataset.zip((features, targets))
    if mode == 'train':
        # Shuffle BEFORE repeating: each pass over the data is then a full
        # permutation. Repeating first would let the shuffle buffer mix
        # examples from different epochs, so some examples could be seen
        # several times before others are seen once.
        ds = ds.shuffle(len(tokens))
        ds = ds.repeat()
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(AUTO)
    return ds

In [None]:
def load():
    """Prepare the training and validation datasets.

    Returns:
        tuple: ``(train_ds, valid_ds, steps_per_epoch)`` where
        ``steps_per_epoch`` is the number of batches needed to cover the
        training rows once, counting a trailing partial batch as one step.
    """
    train_df, valid_df = load_df()

    # Ceiling division of the training row count by the batch size.
    full_batches, leftover = divmod(train_df.shape[0], BATCH_SIZE)
    steps_per_epoch = full_batches + (1 if leftover != 0 else 0)

    train_tokens, train_masks = tokenize(train_df)
    valid_tokens, valid_masks = tokenize(valid_df)

    train_ds = load_ds(train_tokens, train_masks, train_df.label_group.values)
    valid_ds = load_ds(
        valid_tokens,
        valid_masks,
        valid_df.label_group.values,
        mode='valid',
    )
    return train_ds, valid_ds, steps_per_epoch

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    """Implements large margin arc distance (ArcFace head).

    The layer takes ``(features, labels)`` and returns scaled cosine logits in
    which the logit of each example's true class has the angular margin ``m``
    added to its angle.

    Args:
        n_classes: Number of classes (columns of the weight matrix).
        s: Scale applied to the output logits.
        m: Angular margin in radians.
        easy_margin: Selects which fallback is used when the margin cannot be
            applied directly (see ``call``).
        ls_eps: Label-smoothing factor; ``0`` disables smoothing.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    """

    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Cosine threshold and penalty used by the non-easy-margin fallback.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the layer configuration, including the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class-weight matrix.

        ``input_shape`` is the pair of shapes for ``(features, labels)``; only
        the feature shape is used.
        """
        super(ArcMarginProduct, self).build(input_shape[0])

        # One column per class; rows match the feature dimension.
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the margin-adjusted, scaled cosine logits.

        Args:
            inputs: Pair ``(X, y)``. ``X`` is the feature matrix. ``y`` holds
                the class ids and must be rank 2 (e.g. ``(batch, 1)``), since
                the one-hot encoding below is indexed with ``[:, 0, :]``.

        Returns:
            Tensor of shape ``(batch, n_classes)`` with logits scaled by ``s``.
        """
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity between unit-length features and class weights.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Clamp before the square root: rounding can push |cosine| slightly
        # above 1 (negative argument -> NaN), and the gradient of sqrt at
        # exactly 0 is infinite. The small positive floor keeps both the
        # value and its gradient finite.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 1e-7, 1.0)
        )
        # cos(theta + m) via the angle-addition identity.
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Past the threshold, fall back to a plain penalty on the cosine.
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        # One-hot of a rank-2 label tensor is rank 3; drop the middle axis.
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )[:, 0, :]
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the true class, plain cosine for the others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
class RobertaArcFace(tf.keras.Model):
    """Pretrained transformer encoder followed by an ArcFace head and softmax.

    The encoder is whatever checkpoint ``TFM_PATH`` names, regardless of the
    class and attribute names. The model expects ``(tokens, masks, labels)``
    as input because the ArcFace head needs the labels in the forward pass.
    """

    def __init__(self):
        super().__init__()
        self.roberta = TFAutoModel.from_pretrained(TFM_PATH)
        self.arc_margin = ArcMarginProduct(
            n_classes=N_CLASSES,
            s=30,
            m=0.5,
            name='head/arc_margin',
            dtype='float32',
        )
        self.softmax = tf.keras.layers.Softmax(dtype='float32')

    def call(self, inputs):
        """Return class probabilities for a batch of ``(tokens, masks, labels)``."""
        input_ids, attention_mask, labels = inputs
        encoded = self.roberta(input_ids, attention_mask)
        # Hidden state at sequence position 0 serves as the sentence embedding.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        margin_logits = self.arc_margin((first_token_state, labels))
        return self.softmax(margin_logits)

In [None]:
def build_model():
    """Instantiate and compile the ArcFace model.

    The model is compiled with Adam at learning rate ``LR``, sparse
    categorical cross-entropy and sparse categorical accuracy. The loss keeps
    its default of expecting probabilities because the model already ends in
    a softmax layer.

    Returns:
        RobertaArcFace: The compiled model.
    """
    model = RobertaArcFace()
    model.compile(
        # `learning_rate` is the supported argument name; the old `lr` alias
        # is deprecated and rejected by newer Keras releases.
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss=[tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    return model

In [None]:
def main():
    """Run the training pipeline end to end.

    Loads the datasets, builds the model inside the distribution strategy's
    scope and fits it for ``EPOCHS`` epochs. The weights with the lowest
    validation loss are written to ``bert-arcface.h5``.
    """
    train_ds, valid_ds, steps_per_epoch = load()

    callbacks = [
        # Keep only the weights from the epoch with the best validation loss.
        ModelCheckpoint(
            'bert-arcface.h5',
            monitor='val_loss',
            mode='min',
            save_best_only=True,
            save_weights_only=True,
        ),
        ReduceLROnPlateau(),
    ]

    # Model variables must be created under the strategy scope.
    with strategy.scope():
        model = build_model()

    model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=EPOCHS,
        steps_per_epoch=steps_per_epoch,
        callbacks=callbacks,
    )

In [None]:
# Start the training run: loads the data, builds the model and fits it.
main()