### The abstract from the paper is the following:

> Position encoding in transformer architecture provides supervision for dependency modeling between elements at different positions in the sequence. We investigate various methods to encode positional information in transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing experiment for English benchmark will soon be updated.

#### Notebook based on this wonderful work: [https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633](https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633)

# Load Libraries

In [None]:
# this library is required if you want to train the roformer tokenizer instead of the bert tokenizer
!pip install rjieba

In [None]:
import os
import random
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from transformers import TFRoFormerForTokenClassification, RoFormerTokenizer, RoFormerTokenizerFast, AutoTokenizer, BertTokenizerFast
from transformers import RoFormerConfig, AutoConfig, TFAutoModel, TFRoFormerModel, AdamWeightDecay, TFRoFormerForSequenceClassification, TFAutoModelForSequenceClassification
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

print('TF version',tf.__version__)

In [None]:
def auto_select_accelerator():
    """Return the distribution strategy to train under.

    Tries to resolve, connect to and initialise a TPU and wrap it in a TPU
    strategy. If any of those steps raises ``ValueError``, the default
    strategy from ``tf.distribute.get_strategy()`` is used instead.

    Returns:
        The selected ``tf.distribute`` strategy object. The number of replicas
        in sync is printed before returning.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # `tf.distribute.experimental.TPUStrategy` is deprecated; use the
        # stable `tf.distribute.TPUStrategy` when this TensorFlow build has
        # it and fall back to the experimental alias otherwise.
        tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
        if tpu_strategy_cls is None:
            tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
        strategy = tpu_strategy_cls(tpu)
        print("Running on TPU:", tpu.master())
    except ValueError:
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy

# Load Train

In [None]:
# Read train.csv from the feedback-prize-2021 input directory into a DataFrame,
# then show its shape and first rows as a quick sanity check.
train = pd.read_csv('../input/feedback-prize-2021/train.csv')
print( train.shape )
train.head()

In [None]:
# Display the distinct values of the `discourse_type` column.
print('The train labels are:')
train.discourse_type.unique()

In [None]:
# Distinct values of the `id` column, in order of first appearance.
# NOTE(review): the train/valid split later indexes the precomputed .npy arrays
# by position in 0..len(IDS)-1, so those arrays presumably have one row per
# entry of IDS in this same order — confirm against how the arrays were built.
IDS = train.id.unique()
print('There are',len(IDS),'train texts.')

In [None]:
# TOKENIZER
# The English version of RoFormer is still under development and BERT has the
# same structure, so the BERT tokenizer is used here.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")

In [None]:
# THE TOKENS AND ATTENTION ARRAYS
# Precomputed arrays loaded from the `roformer-tpu` input dataset.
# NOTE(review): the file names suggest they were produced with the
# bert-large-uncased-whole-word-masking tokenizer at sequence length 1024 —
# confirm they match `tokenizer` and MAX_LEN used elsewhere in this notebook.
targets = np.load('../input/roformer-tpu/roformer_tpu/targets_bert_large_uwwm_1024.npy')
train_tokens = np.load('../input/roformer-tpu/roformer_tpu/tokens_bert_large_uwwm_1024.npy')
train_attention = np.load('../input/roformer-tpu/roformer_tpu/attention_bert_large_uwwm_1024.npy')
print('Loaded NER tokens')

# Build Model

In [None]:
from transformers import RoFormerConfig
def build_model():
    """Build and compile the RoFormer-based token classifier.

    Two int32 inputs of length ``MAX_LEN`` (token ids and attention mask) are
    fed to the pretrained ``junnyu/roformer_chinese_base`` backbone. The first
    element of the backbone output is passed through a 256-unit ReLU layer and
    a 15-unit softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[tokens, attention]`` as inputs,
        using Adam (learning rate 1e-5), categorical cross-entropy loss and
        categorical accuracy as the metric.
    """
    pretrained_name = 'junnyu/roformer_chinese_base'

    tokens = tf.keras.layers.Input(shape=(MAX_LEN,), name='tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention', dtype=tf.int32)

    # Load the checkpoint's own configuration. `RoFormerConfig(name)` would
    # bind the string to the first positional parameter (`vocab_size`) rather
    # than loading anything from the checkpoint.
    config = RoFormerConfig.from_pretrained(pretrained_name)
    backbone = TFRoFormerModel.from_pretrained(pretrained_name, config=config)
    backbone_out = backbone(tokens, attention_mask=attention)

    # Element 0 of the backbone output feeds the classification head.
    x = tf.keras.layers.Dense(256, activation='relu')(backbone_out[0])
    # Output dtype is pinned to float32 (presumably to keep the softmax in
    # full precision if a mixed-precision policy is active — confirm).
    output = tf.keras.layers.Dense(15, activation='softmax', dtype='float32')(x)

    # The original passed the undefined name `output` here while the head
    # tensor was bound to `x`; the head tensor is now named `output`.
    model = tf.keras.Model(inputs=[tokens, attention], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss=[tf.keras.losses.CategoricalCrossentropy()],
                  metrics=[tf.keras.metrics.CategoricalAccuracy()])

    return model

In [None]:
# Sequence length used for the model's input layers (read by build_model).
MAX_LEN = 1024
strategy = auto_select_accelerator()
# BATCH_SIZE = strategy.num_replicas_in_sync/2 #* 16
# Build the model inside the strategy scope so it is created under the
# selected distribution strategy.
with strategy.scope():
    model = build_model()

# Train or Load Model

In [None]:
# TRAIN VALID SPLIT 90% 10%
# Positions 0..len(IDS)-1 are split at random (fixed seed 42): 90% are drawn
# without replacement for training, the remaining positions form validation.
n_texts = len(IDS)
all_idx = np.arange(n_texts)

np.random.seed(42)
train_idx = np.random.choice(all_idx, int(0.9 * n_texts), replace=False)
valid_idx = np.setdiff1d(all_idx, train_idx)
# Re-seed the global NumPy RNG from fresh entropy so later code is not tied
# to the fixed seed used for the split.
np.random.seed(None)

print('Train size', len(train_idx), ', Valid size', len(valid_idx))

In [None]:
# LEARNING RATE SCHEDULE AND MODEL CHECKPOINT
EPOCHS = 15

# One learning rate per epoch index: four epochs at 0.25e-4, three at 0.25e-5,
# then eight at 0.25e-6 (15 entries in total).
LRS = [0.25e-4] * 4 + [0.25e-5] * 3 + [0.25e-6] * 8


def lrfn(epoch):
    """Return the learning rate stored in ``LRS`` at index ``epoch``."""
    return LRS[epoch]

In [None]:
# Steps per epoch for a batch size of 16 (integer division of the train size).
steps_per_epoch = len(train_idx) // 16  # BATCH_SIZE

# Save weights only, and only when validation categorical accuracy improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    './best_roformer_luwwm.h5',
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Alternative learning-rate controls. They are created here but are not part
# of the callbacks list passed to fit() below.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_categorical_accuracy",
    mode='max',
    patience=3,
    min_lr=1e-8,
)

lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

# Train on the rows selected by train_idx and validate on those in valid_idx.
# fit() is left as a bare expression so the notebook shows its return value.
model.fit(
    x=[train_tokens[train_idx], train_attention[train_idx]],
    y=targets[train_idx],
    validation_data=(
        [train_tokens[valid_idx], train_attention[valid_idx]],
        targets[valid_idx],
    ),
    callbacks=[checkpoint],  # [checkpoint, lr_reducer], [checkpoint, lr_callback]
    epochs=10,  # EPOCHS
    steps_per_epoch=steps_per_epoch,
    batch_size=16,  # BATCH_SIZE
    verbose=2,
)

# SAVE MODEL WEIGHTS

# with open('./config.json', 'w') as f1:
#     json.dump(model.get_config(), f1, indent=4)

# model.save_weights(f'./roformer_bert_luwwm_v1_w.h5')
# tokenizer.save_pretrained('./') # works
# model.save(f'./roformer_bert_lowwm_v1_m.h5')

In [None]:
# Show a link to the saved checkpoint file in the notebook output.
from IPython.display import FileLink
FileLink(r'./best_roformer_luwwm.h5')