Hi! This notebook illustrates a simple way to make a multilingual version of any transformer model: take the XLM-R embedding layer, feed its output into the architecture you want (e.g. GPT2 in this notebook, which gives XLM-GPT2), and then fine-tune the combined model.

This notebook was written several months ago and uses somewhat dated versions of TF and Transformers, so if you use the latest versions, you may need to modify the code a bit. Please see Version 12 for the actual run :)

In [None]:
!pip install -q tensorflow==2.2

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import gc
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Global switch for XLA JIT compilation plus bfloat16 mixed precision (off by default).
MIX = False

if MIX:
    # Enable XLA JIT and install 'mixed_bfloat16' as the global Keras dtype policy.
    tf.config.optimizer.set_jit(True)
    policy = mixed_precision.Policy('mixed_bfloat16')
    mixed_precision.set_policy(policy)
#     tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

In [None]:
# Record the library versions in use; the notebook notes that its code targets dated releases.
print(tf.__version__)
print(transformers.__version__)

## Helper Functions

In [None]:
def fix_fast(ids):
    """Re-map raw fast-tokenizer ids so they can be compared with the reference tokenizer.

    Every id is shifted up by one, then id 0 is prepended and id 2 is appended.
    Presumably 0 and 2 are the XLM-R ``<s>`` / ``</s>`` ids -- confirm against
    the vocabulary.

    Returns a new list; ``ids`` itself is not modified.
    """
    shifted = list(map(lambda token_id: token_id + 1, ids))
    return [0, *shifted, 2]

def fast_encode(texts, tokenizer, chunk_size=256, maxlen=384):
    """Encode ``texts`` chunk by chunk with a fast tokenizer.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: sliceable collection whose slices expose ``.tolist()``
            (e.g. a pandas Series or numpy array of strings).
        tokenizer: object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``.
        chunk_size: number of texts passed to ``encode_batch`` per call.
        maxlen: ``max_length`` handed to both truncation and padding.

    Returns:
        ``np.array`` built from one ``fix_fast``-corrected id list per text.
    """
    # NOTE(review): fix_fast adds two ids after truncation/padding has already
    # happened, so each row appears to be maxlen + 2 long with the trailing id
    # placed after any padding -- confirm this is intended.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    encoded_rows = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch_texts = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch_texts):
            encoded_rows.append(fix_fast(encoding.ids))

    return np.array(encoded_rows)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` in a single batch and return the input ids as an array.

    Args:
        texts: batch of strings passed straight to ``tokenizer.batch_encode_plus``.
        tokenizer: object exposing ``batch_encode_plus``.
        maxlen: forwarded as ``max_length``; padding to the maximum length is
            requested as well.

    Returns:
        ``np.array`` of the ``'input_ids'`` entry of the encoding.
    """
    # NOTE(review): the plural spelling `return_attention_masks` is used here,
    # while another cell of this notebook passes `return_attention_mask` --
    # check which one the installed transformers version expects.
    encode_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
def build_xlmr(transformer, max_len=512):
    """Wrap a transformer encoder in a compiled binary classifier.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: callable Keras-compatible model; element 0 of its output
            is indexed as ``[:, 0, :]``, so it is assumed to be a
            (batch, sequence, hidden) tensor -- confirm for other encoders.
        max_len: fixed length of the int32 ``input_word_ids`` input.

    Returns:
        A compiled ``Model`` with one sigmoid output, binary cross-entropy loss
        and AUC / accuracy metrics.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the hidden state at position 0 as the summary of the sequence.
    cls_token = sequence_output[:, 0, :]
    # The prediction head is pinned to float32 whatever the global dtype policy is.
    out = Dense(1, activation='sigmoid', dtype='float32')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the canonical argument name; `lr` is only a legacy alias.
    opt = Adam(learning_rate=1e-5)
    if MIX:
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
    model.compile(opt, loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(), 'accuracy'])

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: treat that as "no TPU" and fall back below.
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used later to scale the global batch size.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 3
LR = 3e-4
# Global batch size: 32 examples per replica.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

# NOTE(review): this branch assigns the same value as above, so MIX currently
# has no effect on the batch size -- presumably a hook for a different value.
if MIX:
    BATCH_SIZE = 32 * strategy.num_replicas_in_sync

print(BATCH_SIZE)
    
# NOTE(review): MAX_LEN is not referenced in the visible cells; 192 matches the
# second dimension noted next to the pre-tokenized .npz arrays.
MAX_LEN = 192
# MODEL = '../input/mlm-epoch3-ppl505'
# Local checkpoint directory used for both the reference tokenizer and the XLM-R model.
MODEL = '../input/mlm-epoch2-ppl469' #'jplu/tf-xlm-roberta-large'

## Create fast tokenizer

In [None]:
# First load the real tokenizer 
# Reference tokenizer loaded from the MODEL checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL,use_fast=True)

# Standalone `tokenizers` SentencePiece-BPE tokenizer built from exported
# vocab/merges files.
# NOTE(review): these files come from the mlm-epoch1 dataset while MODEL points
# at mlm-epoch2 -- presumably the vocabulary is identical; confirm.
from tokenizers import SentencePieceBPETokenizer
fast_tokenizer = SentencePieceBPETokenizer('../input/mlm-epoch1-ppl583/xlmr_vocab.json', '../input/mlm-epoch1-ppl583/xlmr_merges.txt')

# Bare expression so the notebook displays the tokenizer's repr.
fast_tokenizer

In [None]:
# Sanity check on a mixed English / Russian / Portuguese sentence: print the ids
# from the reference tokenizer, the raw fast-tokenizer ids, and the fast ids
# after fix_fast(), followed by the word count and both token counts.
text = "Hello my name is Jung Прежде всего, это было хорошее Você é especialista? Você não pode"
print(tokenizer.encode(text))
print(fast_tokenizer.encode(text).ids) # fast tokenizer cannot be used directly
ids2 = fix_fast(fast_tokenizer.encode(text).ids)
print(ids2)
print(len(text.split()),
      len(tokenizer.encode(text)), 
      len(ids2))

In [None]:
# Decode both id sequences with the reference tokenizer to compare the round-trips.
print(tokenizer.decode(tokenizer.encode(text)))
print(tokenizer.decode(ids2))

## Load text data into memory

In [None]:
# Raw competition CSVs: two training sets, the validation set, the test set
# and the sample submission.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")

# train2.toxic is rounded to the nearest integer to obtain a 0/1 label.
# option1
train2.toxic = train2.toxic.round().astype(int) 
# option2
# train2.loc[train2.toxic >= 0.5,'toxic'] = 1
# train2.loc[train2.toxic < .5,'toxic'] = 0
# train2.toxic = train2.toxic.astype(int)

valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
# `sub` is the frame that is filled with predictions and written out at the end.
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

In [None]:
# Shuffle the validation rows with a fixed seed.
valid = valid.sample(frac=1, random_state=0)

# Training frame: all of train1, every toxic row of train2, and a fixed-seed
# sample of 100000 non-toxic rows of train2.
# NOTE(review): `train` and `valid` are not referenced again in the visible
# cells; the model is fed from the pre-tokenized .npz arrays instead.
train = pd.concat([
    train1[['comment_text', 'toxic']], #.sample(n=50000, random_state=0)
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
])
print(train.shape)

In [None]:
%%time
# x_train = np.load('/kaggle/input/jigsaw20-private-data/train_en_full2019.npz')['x'] # (1902194, 192) # from 2019 only
# Pre-tokenized id arrays are stored as .npz archives. Each archive is opened
# once inside a `with` block so that its file handle is closed as soon as the
# needed arrays have been read into memory.
with np.load('/kaggle/input/jigsaw20-tpu-xlm-roberta/train_en.npz') as train_npz:
    x_train = train_npz['x'] # (1000000, 192) # from 2019 only
    y_train = train_npz['y']
with np.load('/kaggle/input/jigsaw20-tpu-xlm-roberta/valid_en.npz') as valid_npz:
    x_valid = valid_npz['x'] # (8000, 192)
    y_valid = valid_npz['y']
with np.load('/kaggle/input/jigsaw20-tpu-xlm-roberta/test_en.npz') as test_npz:
    x_test = test_npz['x'] # (63812, 192)

print(x_train.shape, x_valid.shape, x_test.shape)

In [None]:
# Binarise the training labels in place at a 0.5 threshold, then cast to int32.
print(y_train[:5])
y_train[y_train < 0.5] = 0
y_train[y_train >= 0.5] = 1
y_train = y_train.astype(np.int32)
# Show the first labels, the total count, and the number of positive labels.
print(y_train[:5], len(y_train), len(y_train[y_train == 1]))

In [None]:
def create_xtest_new(x_test, batch=BATCH_SIZE):
    '''
    Pad ``x_test`` along axis 0 so that its length is a multiple of ``batch``.

    Args:
        x_test: array of token ids; only axis 0 is extended.
        batch: batch size the padded length must be divisible by.

    Returns:
        ``(x_test_new, orig_len)``: an int32 array whose first ``orig_len``
        rows are ``x_test`` and whose remaining rows are filled with 1
        (presumably the XLM-R ``<pad>`` id -- confirm), plus the original
        row count so callers can slice predictions back with ``[:orig_len]``.
    '''
    orig_len = len(x_test)
    # Ceiling division: round up to the next multiple of `batch` without
    # adding a whole extra batch when the length already divides evenly.
    new_len = -(-orig_len // batch) * batch
    new_shape = (new_len,) + tuple(x_test.shape[1:])

    # Allocate directly as int32 rather than going through a float64 buffer.
    x_test_new = np.ones(new_shape, dtype=np.int32)
    x_test_new[:orig_len] = x_test
    return x_test_new, orig_len

In [None]:
# Pad the test ids to a multiple of BATCH_SIZE; orig_len is kept so the padding
# rows can be dropped from the predictions later.
x_test_new, orig_len = create_xtest_new(x_test, batch=BATCH_SIZE)
print(x_test_new.shape, orig_len)
# print(x_test_new[-1], x_test_new[0])

In [None]:
# Training pipeline: an endless stream of shuffled batches.
# There is deliberately no .cache() here: after .repeat() the stream is
# infinite, so a cache placed behind it can never be completed and only keeps
# accumulating batches in memory. The source arrays are already in memory.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: finite, fixed order, incomplete final batch dropped;
# caching is safe here because the dataset ends.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE, drop_remainder=True)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: built from the padded ids, so every batch is full.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test_new)
    .batch(BATCH_SIZE)
)

## Build datasets objects

## Load model into the TPU

In [None]:
# Path to an .h5 weights file; in the visible cells it is only printed, never loaded.
wpath = '../input/xlmr-test/xlmr_large_fixed_embed.h5'

In [None]:
%%time
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, TFGPT2Model
# Load the GPT-2 tokenizer with a single space as pad token and save its files
# to the working directory.
tokenizerg = GPT2Tokenizer.from_pretrained('gpt2',pad_token=' ')
tokenizerg.save_pretrained('.')

# Both models are created inside the distribution-strategy scope.
with strategy.scope():
    # from_pt=True: the checkpoint at MODEL is loaded from PyTorch weights.
    xlmr_layer = TFAutoModel.from_pretrained(MODEL, from_pt=True) #TFAutoModel.from_pretrained(MODEL)
    # Freeze the XLM-R embeddings so training does not update them.
    xlmr_layer.layers[0].embeddings.trainable = False # add one line
    
    gpt2_layer = TFGPT2Model.from_pretrained('gpt2') #TFGPT2LMHeadModel.from_pretrained('gpt2-medium')


In [None]:
# Echo the checkpoint directory and weights path for the run log.
print(MODEL, wpath)

## Now combine everything to make XLM-GPT2 TF Model!!!

In [None]:
from tensorflow.keras.layers import *
from tensorflow.keras import Model

class XLMTransformers(tf.keras.Model):
    """Binary classifier that feeds XLM-R embeddings into another transformer body.

    Pipeline: ``emb_layer`` -> optional linear ``connect_layer`` ->
    ``body_layer`` -> global max pooling -> dropout -> single sigmoid unit.

    Args:
        emb_layer: embedding layer exposing ``create_position_ids_from_input_ids``
            and callable on ``[input_ids, position_ids, token_type_ids, inputs_embeds]``.
        body_layer: transformer callable on a dict with ``input_ids`` and
            ``inputs_embeds``; element 0 of its output is pooled over the
            sequence axis.
        connect_dim: width of the linear projection between embeddings and
            body. ``None`` reads ``n_embd`` from the body's config (works for
            GPT2 only); a value <= 0 disables the projection.
        dropout_rate: dropout rate applied to the pooled features.
        batch_size: kept for backward compatibility and stored on the
            instance; ``call`` does not depend on it.
    """

    def __init__(self, emb_layer, body_layer, connect_dim=None, dropout_rate=0.2, batch_size=BATCH_SIZE//strategy.num_replicas_in_sync):
        super().__init__()
        self.emb_layer = emb_layer
        self.body_layer = body_layer
        self.connect_dim = connect_dim

        if self.connect_dim is None:
            conf = self.body_layer.layers[-1].get_config()
            # TODO : config are different for each arch, below works only for GPT2 but not Albert
            self.connect_dim = conf['transformers_config']['n_embd']

        if self.connect_dim > 0:
            # Linear map from the embedding width to the body's expected width.
            self.connect_layer = Dense(self.connect_dim, activation='linear')  # TODO : identity initializer or ?

        self.pooling_layer = GlobalMaxPooling1D()
        self.drop_layer = Dropout(dropout_rate)
        self.pred_layer = Dense(1, activation='sigmoid')

        # Stored only so existing callers passing `batch_size` keep working.
        self.batch_size = batch_size

    def call(self, input_ids):
        """Return one sigmoid score per row of ``input_ids``."""
        pos_ids = self.emb_layer.create_position_ids_from_input_ids(input_ids)
        # Derive the all-zero segment ids from the input itself, so they always
        # match its actual batch size and integer dtype instead of assuming a
        # fixed per-replica batch.
        token_type_ids = tf.zeros_like(input_ids)

        x = self.emb_layer([input_ids, pos_ids, token_type_ids, None])
        if self.connect_dim > 0:
            x = self.connect_layer(x)
        # The body receives ready-made embeddings instead of token ids.
        x = self.body_layer({'input_ids': None, 'inputs_embeds': x})[0]
        x = self.pooling_layer(x)
        x = self.drop_layer(x)
        x = self.pred_layer(x)
        return x

    def predict_numpy(self, x_test, batch=BATCH_SIZE):
        '''
        Predict on a numpy array of token ids.

        The input is padded so its length is a multiple of ``batch``, predicted
        with that batch size, and the padding rows are dropped from the result.

        NOTE(review): this helper was previously marked as unusable; re-test
        it now that ``call`` no longer assumes a fixed batch size.
        '''
        x_test_new, orig_len = create_xtest_new(x_test, batch=batch)
        pred = self.predict(x_test_new.astype(np.int32), batch_size=batch, verbose=1)

        return pred[:orig_len]
        

In [None]:
# Sample sentence (with a few out-of-vocabulary-looking words) for a smoke-test forward pass.
text = "Hello my name is Jung Прежде всего, это было хорошее Você é especialista? Você não pode asdf asdfljsd fj"
enc = tokenizer.batch_encode_plus([text], return_token_type_ids=True, return_attention_mask=False, pad_to_max_length=True)

with strategy.scope():
    # XLM-R embeddings in front, GPT-2 as the transformer body.
    xlm_gpt2 = XLMTransformers(emb_layer = xlmr_layer.layers[0].embeddings, 
                           body_layer = gpt2_layer)
    
    # One forward pass on the sample before compiling (presumably to create
    # the model's variables inside the strategy scope).
    y = xlm_gpt2(tf.constant(enc['input_ids']))
    # `learning_rate` is the canonical argument name; `lr` is only a legacy alias.
    xlm_gpt2.compile(Adam(learning_rate=LR), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

print(y.shape)
# xlm_gpt2.summary()
gc.collect()

## Train Model

First, we train on the subset of the training set, which is completely in English.

In [None]:
# train_dataset repeats forever, so the length of an epoch is set explicitly:
# one pass over x_train in full batches.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = xlm_gpt2.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
# Loss and AUC on the validation batches after the first training stage.
xlm_gpt2.evaluate(valid_dataset) # 

In [None]:
# Checkpoint the weights obtained from the first training stage.
xlm_gpt2.save_weights('xlm_gpt2_small_fixed_embed_finetuned_en_only.h5')

Now that we have pretty much saturated the learning potential of the model on English-only data, we train it for `EPOCHS` more epochs on the `validation` set, which is significantly smaller but contains a mixture of different languages.

In [None]:
# Second stage: fit on the validation data itself for EPOCHS epochs.
# valid_dataset is already batched, so shuffle() reorders whole batches.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = xlm_gpt2.fit(
    valid_dataset.shuffle(2048).repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)

## Submission

In [None]:
## note that test_dataset contains more elements than original
pred = xlm_gpt2.predict(test_dataset, 
                        verbose=1)
print(pred.shape)

In [None]:
# Keep only the predictions for the real test rows (drop the padding rows),
# then write the submission file and show its last rows.
sub['toxic'] = pred[:orig_len]
sub.to_csv('submission.csv', index=False)
sub.tail()