# DeBERTa LLRD + LastLayerReinit with TensorFlow





**Fold-specific strategy**

|Ver |Fold (seed)| lr_head/lr_base| decay| layer-decay|decay_steps|maxlen| CV |LB|Strategies|
|---|---|---|---|---|---|---|---|---|---|
|5| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.9**|5*len(ds)|512|.452/.464/.468/.456/.459|.43|4-Layer Weighted-Pool + Reinit|
|6| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.9**|5*len(ds)|600|.451/.464/.467/.455/.458|.43|4-Layer Weighted-Pool + Reinit|
|| 0-4 (42)| 1e-4/.9e-5 |0.3|1.0/.9|5*len(ds)|800|.452/.460/.465/.461/.460|.44|4-Layer Weighted-Pool + Reinit|

**Layer-wise LR + Weighted MeanPool**

|Ver |Fold (seed)| lr_head/base| decay| layer-decay|decay_steps|maxlen| CV |LB|Remarks|
|---|---|---|---|---|---|---|---|---|---|
|16| 0-4 (42)| 1e-4/.9e-5 |0.3|0.3/.5|5*len(ds)|512|.451/.461/.462/**.450**/**.459**|.44|**BEST LB**|
|1| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.5**|5*len(ds)|512|.452/.459/.465/.456/.458|.44|Mean Pool|
|2| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.5**|5*len(ds)|512|.454/.461/.465/.455/.459||4-Layer Weighted-Pool|
|3| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.5**|5*len(ds)|512|.454/.462/.467/.458/.457||4-Layer Weighted-Pool + Reinit|
|4| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.7**|5*len(ds)|512|.451/.461/.464/.457/.457|.44|4-Layer Weighted-Pool + Reinit|
|5| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.9**|5*len(ds)|512|.452/.464/.468/.456/.459|.43|4-Layer Weighted-Pool + Reinit|
|6| 0-4 (42)| 1e-4/.9e-5 |0.3|**1.0/.9**|5*len(ds)|600|.451/.464/.467/.455/.458|.43|4-Layer Weighted-Pool + Reinit|
|REF|0-4 (42)| 1e-4/1.e-5 |0.3|1.0/.9|5*len(ds)|512|**.450/.459/.462/.455/.452**|.43|Zhang chang| 


# Initialize strategy for GPU

import tensorflow as tf


# Pick a distribution strategy: mirror across the visible GPUs when at least
# one is present, otherwise use whatever strategy TensorFlow currently has
# in effect.
if tf.config.list_physical_devices('GPU'):
    strategy = tf.distribute.MirroredStrategy()
else:
    strategy = tf.distribute.get_strategy()

# Mixed precision is currently NOT switched on: both options below are
# deliberately left commented out.
#tf.keras.mixed_precision.set_global_policy("float32")

# USE AUTO-MIXED PRECISION
#tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
# https://keras.io/api/mixed_precision/loss_scale_optimizer/

# Report the Keras dtype policy that is actually in effect rather than
# unconditionally claiming that mixed precision is enabled.
print(f'Mixed precision policy: {tf.keras.mixed_precision.global_policy().name}')


# Imports

In [None]:
import os, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print(f'TF version: {tf.__version__}')
import tensorflow_addons as tfa
from tensorflow.keras import layers

import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import sys
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# SEED

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow random generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import: `random` is not part of the file-level imports.
    import random

    # NOTE: changing PYTHONHASHSEED at runtime does not alter string hashing
    # in the interpreter that is already running; the value is only picked
    # up by processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Optional switch, left disabled:
    #     os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed(42)

# Hyperparameters

In [None]:
class Hyperparameters:
    """Container for every tunable setting of this run (class-level constants)."""

    # Run identifier; baked into the checkpoint file names below.
    RUN = 6

    # Training length and the half-open range of folds to train/infer on.
    EPOCHS = 5
    TRAIN_FOLD = (0, 5)

    # Data / batching.
    N_FOLDS = 5
    BATCH_SIZE = 4
    MAX_LENGTH = 600

    # Learning rates.
    LLRDR = 0.9              # layer-wise LR decay rate
    INIT_LR = 0.9e-5         # initial LR of the base model
    INITIAL_LR_HEAD = 1e-4   # initial LR of the head

    # Pooling strategy per fold. Besides 'weighted-mean-4', earlier runs used
    # 'mean' and 'weighted-mean-2' for all folds.
    pool_types = dict.fromkeys(range(N_FOLDS), 'weighted-mean-4')

    # Whether to re-initialize the last encoder layer(s), per fold
    # (earlier runs had this set to False for all folds).
    reinit_last_layers = dict.fromkeys(range(N_FOLDS), True)

    # Local directory of the pretrained checkpoint.
    # Alternatives tried before: "../input/debertav3base",
    # '../input/tf-bart-base/model', '../input/tf-roberta-base/model'.
    DEBERTA_MODEL = "../input/tf-deberta-v3-base/model"

    # Prefix of the per-fold weight files.
    model_weight_name = f'fb3-train-v5-run{RUN}'


hp = Hyperparameters()

# Load DataFrame

In [None]:
# Read the training CSV into `df`; every later cell works on this frame.
df = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
# `display` is not imported in this file — presumably the IPython/Jupyter
# builtin, so this line needs a notebook runtime.
display(df.head())
print('\n---------DataFrame Summary---------')
# DataFrame.info() writes its report to stdout itself, so no print() is needed.
df.info()

# CV Split

In [None]:
# The six score columns that are predicted jointly.
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

skf = MultilabelStratifiedKFold(n_splits=hp.N_FOLDS, shuffle=True, random_state=42)

# Create the column as an integer up front so it never passes through a
# NaN-filled float state while the folds are being assigned.
df['fold'] = -1
fold_col = df.columns.get_loc('fold')
for n, (_, val_index) in enumerate(skf.split(df, df[TARGET_COLS])):
    # split() yields positional row indices, so assign by position (iloc)
    # rather than by label; this stays correct even if df's index is not 0..N-1.
    df.iloc[val_index, fold_col] = n
df['fold'] = df['fold'].astype(int)

# A bare expression in the middle of a cell shows nothing; print it explicitly.
print(df['fold'].value_counts())

df.to_csv('./df_folds.csv', index=False)

# Tokenizer


In [None]:
# Load the tokenizer from the same checkpoint directory that the model uses
# (hp.DEBERTA_MODEL).
tokenizer = transformers.AutoTokenizer.from_pretrained(hp.DEBERTA_MODEL)
# Disabled: export the tokenizer files to ./tokenizer/.
#tokenizer.save_pretrained('./tokenizer/')

# Disabled: build a config with hidden states exposed and dropout set to 0,
# then export it alongside the tokenizer.
#cfg = transformers.AutoConfig.from_pretrained(hp.DEBERTA_MODEL, output_hidden_states=True)
#cfg.hidden_dropout_prob = 0
#cfg.attention_probs_dropout_prob = 0
#cfg.save_pretrained('./tokenizer/')

# Data Process Functions

In [None]:
def deberta_encode(texts, tokenizer=tokenizer):
    """Tokenize a collection of texts into fixed-length model inputs.

    All texts are encoded in a single batched tokenizer call, truncated and
    padded to ``hp.MAX_LENGTH`` tokens.

    Args:
        texts: A pandas Series / NumPy array (anything with ``.tolist()``) or
            any other iterable of strings.
        tokenizer: Callable tokenizer; defaults to the module-level one.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of int32 arrays, each of shape
        ``(number_of_texts, hp.MAX_LENGTH)``.
    """
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    # Nothing to encode: return correctly shaped empty arrays instead of
    # handing an empty batch to the tokenizer.
    if not text_list:
        empty = np.zeros((0, hp.MAX_LENGTH), dtype="int32")
        return empty, empty.copy()

    # One batched call replaces the per-text Python loop; because every row is
    # padded/truncated to the same length, the result is already rectangular.
    encoded = tokenizer(
        text_list,
        add_special_tokens=True,
        max_length=hp.MAX_LENGTH,
        return_attention_mask=True,
        return_tensors="np",
        truncation=True,
        padding='max_length',
    )

    input_ids = np.asarray(encoded['input_ids'], dtype="int32")
    attention_mask = np.asarray(encoded['attention_mask'], dtype="int32")
    return input_ids, attention_mask


def get_dataset(df):
    """Turn a DataFrame into an ``(inputs, targets)`` pair.

    Args:
        df: DataFrame holding a 'full_text' column and the TARGET_COLS columns.

    Returns:
        ``(inputs, targets)`` where ``inputs`` is whatever ``deberta_encode``
        returns for the 'full_text' column and ``targets`` is a float32 array
        built from the TARGET_COLS columns.
    """
    return (
        deberta_encode(df['full_text']),
        np.array(df[TARGET_COLS], dtype="float32"),
    )

## MeanPool and Weighted

In [None]:

## MeanPool
class MeanPool(tf.keras.layers.Layer):
    """Average the inputs over axis 1, skipping positions the mask zeroes out."""

    def call(self, inputs, mask=None):
        """Return the (masked) mean of ``inputs`` along axis 1.

        Args:
            inputs: Tensor to pool; assumed to be (batch, seq_len, hidden) —
                TODO confirm against the caller.
            mask: Optional tensor with one entry per (batch, seq_len)
                position; non-zero marks positions to keep. When ``None``
                every position is averaged.

        Returns:
            ``inputs`` with axis 1 reduced by the mean over kept positions.
        """
        # The signature allows mask=None, so fall back to a plain mean
        # instead of failing inside tf.cast.
        if mask is None:
            return tf.reduce_mean(inputs, axis=1)

        # Add a trailing axis so the mask broadcasts across the feature axis.
        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        # Guard against division by zero for rows whose mask is all zeros.
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum
        
## WeightedLayerPool
class WeightsSumOne(tf.keras.constraints.Constraint):
    """Constraint that replaces a weight tensor by its softmax over axis 0."""

    def __call__(self, w):
        # After the softmax the entries along axis 0 are positive and sum to 1.
        normalized = tf.nn.softmax(w, axis=0)
        return normalized

# Loss

In [None]:
def MSE(y_true, y_pred):
    """Return the mean of the per-target weighted squared errors.

    Args:
        y_true: Ground-truth scores.
        y_pred: Predicted scores.

    Returns:
        Scalar tensor: mean over all elements of ``weights * (y_true - y_pred)**2``.
    """
    # One weight per target; they are all 1, so this is currently a plain MSE.
    target_weights = tf.constant([1.0] * 6, dtype=tf.float32)
    squared_diff = tf.square(y_true - y_pred)
    return tf.reduce_mean(target_weights * squared_diff)


# get_model()

In [None]:
def _reinit_last_encoder_layers(base_model, num_layers=1):
    """Re-initialize, in place, the weights of the last ``num_layers`` encoder blocks.

    Dense kernels receive fresh Glorot-uniform values and Dense biases are
    zeroed; LayerNormalization layers are reset to beta=0 / gamma=1.
    """
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    zeros_initializer = tf.keras.initializers.Zeros()
    ones_initializer = tf.keras.initializers.Ones()

    # To inspect the encoder blocks, print base_model.deberta.encoder.layer.
    for encoder_block in base_model.deberta.encoder.layer[-num_layers:]:
        for layer in encoder_block.submodules:
            if isinstance(layer, tf.keras.layers.Dense):
                layer.kernel.assign(
                    kernel_initializer(shape=layer.kernel.shape, dtype=layer.kernel.dtype))
                if layer.bias is not None:
                    layer.bias.assign(
                        zeros_initializer(shape=layer.bias.shape, dtype=layer.bias.dtype))
            elif isinstance(layer, tf.keras.layers.LayerNormalization):
                layer.beta.assign(
                    zeros_initializer(shape=layer.beta.shape, dtype=layer.beta.dtype))
                layer.gamma.assign(
                    ones_initializer(shape=layer.gamma.shape, dtype=layer.gamma.dtype))


def _build_llrd_optimizer(hp, base_model, head_layers, len_train_df):
    """Build a MultiOptimizer applying layer-wise learning-rate decay (LLRD).

    The last encoder block starts at ``hp.INIT_LR``; each block closer to the
    input (and finally the embeddings) starts ``hp.LLRDR`` times lower.
    ``head_layers`` get their own schedule starting at ``hp.INITIAL_LR_HEAD``.
    """
    # Order from the last encoder block down to the embeddings so that list
    # position i corresponds to the factor LLRDR ** i.
    layer_list = [base_model.deberta.embeddings] + list(base_model.deberta.encoder.layer)
    layer_list.reverse()

    # Never let decay_steps reach 0 (tiny inputs), which would make the
    # exponential-decay schedule divide by zero.
    decay_steps = max(1, hp.EPOCHS * len_train_df // hp.BATCH_SIZE)

    lr_schedules = [
        tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=hp.INIT_LR * hp.LLRDR ** depth,
            decay_steps=decay_steps,
            decay_rate=0.3)
        for depth in range(len(layer_list))
    ]
    lr_schedule_head = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=hp.INITIAL_LR_HEAD,
        decay_steps=decay_steps,
        decay_rate=0.3)

    optimizers = [tf.keras.optimizers.Adam(learning_rate=lr_sch) for lr_sch in lr_schedules]

    # NOTE(review): only the embeddings, the encoder.layer blocks and the head
    # layers are listed here; if the base model owns variables outside those
    # layers they are not assigned to any optimizer — confirm this is intended.
    optimizers_and_layers = [
        (tf.keras.optimizers.Adam(learning_rate=lr_schedule_head), head_layers)
    ] + list(zip(optimizers, layer_list))

    # To check the initial learning rates, iterate optimizers_and_layers and
    # print o._decayed_lr("float32").numpy() next to each layer.
    return tfa.optimizers.MultiOptimizer(optimizers_and_layers)


def get_model(hp, fold, len_train_df=3129):
    """Build and compile the regression model for one fold.

    Args:
        hp: Settings object; reads MAX_LENGTH, DEBERTA_MODEL, pool_types,
            reinit_last_layers, EPOCHS, BATCH_SIZE, LLRDR, INIT_LR and
            INITIAL_LR_HEAD.
        fold: Key into ``hp.pool_types`` / ``hp.reinit_last_layers``.
        len_train_df: Number of training rows; only used to derive the
            learning-rate decay steps.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_masks]``
        and producing 6 outputs per sample.

    Raises:
        ValueError: If ``hp.pool_types[fold]`` is not 'weighted-mean-4' or 'mean'.
    """
    # Validate first: otherwise an unsupported pool type would only surface
    # later as an unbound `output` variable, after the checkpoint was loaded.
    pool_type = hp.pool_types[fold]
    if pool_type not in ('weighted-mean-4', 'mean'):
        raise ValueError(
            f"Unsupported pool type {pool_type!r} for fold {fold}; "
            "expected 'weighted-mean-4' or 'mean'."
        )

    input_ids = tf.keras.layers.Input(
        shape=(hp.MAX_LENGTH,), dtype=tf.int32, name="input_ids"
    )
    attention_masks = tf.keras.layers.Input(
        shape=(hp.MAX_LENGTH,), dtype=tf.int32, name="attention_masks"
    )

    # output_hidden_states=True exposes the per-layer hidden states that the
    # weighted pooling branch consumes.
    cfg = transformers.AutoConfig.from_pretrained(hp.DEBERTA_MODEL, output_hidden_states=True)
    base_model = transformers.TFAutoModel.from_pretrained(hp.DEBERTA_MODEL, config=cfg)

    # Last-layer re-initialization.
    if hp.reinit_last_layers[fold]:
        _reinit_last_encoder_layers(base_model, num_layers=1)

    if pool_type == 'weighted-mean-4':
        # Mean-pool each of the last 4 hidden states, then combine the four
        # pooled vectors with learned weights constrained by WeightsSumOne.
        deberta_output = base_model.deberta(
            input_ids, attention_mask=attention_masks
        )
        hidden_states = deberta_output.hidden_states
        stack_meanpool = tf.stack(
            [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-4:]],
            axis=2)
        weighted_layer_pool = layers.Dense(
            1, use_bias=False, kernel_constraint=WeightsSumOne())(stack_meanpool)
        weighted_layer_pool = tf.squeeze(weighted_layer_pool, axis=-1)
        output = layers.Dense(6)(weighted_layer_pool)
        # Number of trailing model.layers entries handed to the head optimizer.
        trainable_head_layers = 4
    else:
        # 'mean': mean pooling of the first base-model output only.
        x = base_model.deberta(input_ids, attention_mask=attention_masks)[0]
        x = MeanPool()(x, mask=attention_masks)
        output = layers.Dense(6)(x)
        trainable_head_layers = 2

    model = tf.keras.Model(
        inputs=[input_ids, attention_masks],
        outputs=output
    )

    print('Last 4 layers of the model: ')
    print(model.layers[-4:])
    print('')

    # Compile with layer-wise learning-rate decay.
    optimizer = _build_llrd_optimizer(
        hp, base_model, model.layers[-trainable_head_layers:], len_train_df
    )

    model.compile(optimizer=optimizer,
                  loss=MSE,  # alternative tried: 'huber_loss'
                  metrics=[tf.keras.metrics.RootMeanSquaredError()],
                  )
    return model

# 5 Folds Training Loop

In [None]:
# Best validation RMSE of every trained fold, in fold order.
valid_rmses = []

#for fold in range(hp.N_FOLDS):
for fold in range(hp.TRAIN_FOLD[0], hp.TRAIN_FOLD[1]):


    print(f'\n-----------FOLD {fold} ------------')

    #Create dataset: the current fold is held out for validation.
    train_df = df[df['fold'] != fold].reset_index(drop=True)
    valid_df = df[df['fold'] == fold].reset_index(drop=True)

    train_dataset = get_dataset(train_df)
    valid_dataset = get_dataset(valid_df)

    print('Data prepared.')
    print(f'Training data input_ids shape: {train_dataset[0][0].shape} dtype: {train_dataset[0][0].dtype}') 
    print(f'Training data attention_mask shape: {train_dataset[0][1].shape} dtype: {train_dataset[0][1].dtype}')
    print(f'Training data targets shape: {train_dataset[1].shape} dtype: {train_dataset[1].dtype}')
    print(f'Validation data input_ids shape: {valid_dataset[0][0].shape} dtype: {valid_dataset[0][0].dtype}')
    print(f'Validation data attention_mask shape: {valid_dataset[0][1].shape} dtype: {valid_dataset[0][1].dtype}')
    print(f'Validation data targets shape: {valid_dataset[1].shape} dtype: {valid_dataset[1].dtype}')

    #Create model; clear Keras' global state left over from the previous fold first.
    tf.keras.backend.clear_session()

    #with strategy.scope(): 
    model = get_model(hp, fold, len(train_df))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit an extra "None" line.
    model.summary()
    print('Model prepared.')

    #Training model
    print('Start training...')
    filepath=f'{hp.model_weight_name}-fold{fold}.h5' #f"best_model_fold{fold}.h5"
    # Keep only the weights of the epoch with the lowest validation loss.
    callbacks = tf.keras.callbacks.ModelCheckpoint(filepath,
                                           monitor="val_loss",
                                           mode="min",
                                           save_best_only=True,
                                           verbose=1,
                                           save_weights_only=True)
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        validation_data=valid_dataset, 
        epochs=hp.EPOCHS,
        shuffle=True,
        batch_size=hp.BATCH_SIZE,
        callbacks=[callbacks]
    )

    valid_rmses.append(np.min(history.history['val_root_mean_squared_error']))
    print('Training finished.')
    # Drop this fold's arrays before the next fold allocates its own.
    del train_dataset, valid_dataset, train_df, valid_df
    gc.collect()

print(f'{len(valid_rmses)} Folds validation RMSE:\n{valid_rmses}')
print(f'Local CV Average score: {np.mean(valid_rmses)}')

# Inference and Submission



In [None]:
test_df = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
test_df.head()

# (input_ids, attention_mask) tuple for the whole test set.
test_dataset = deberta_encode(test_df['full_text'])


# One prediction array per fold model; averaged below.
fold_preds = []
for fold in range(hp.TRAIN_FOLD[0], hp.TRAIN_FOLD[1]):
    tf.keras.backend.clear_session()

    # Pass the number of rows: len(test_dataset) would be the length of the
    # (input_ids, attention_mask) tuple, i.e. always 2. The value only feeds
    # the LR schedule, which is unused at inference time.
    model = get_model(hp, fold, len(test_df))

    filepath=f'{hp.model_weight_name}-fold{fold}.h5' #f"best_model_fold{fold}.h5"

    model.load_weights(filepath)
    print(f'\nFold {fold} inference...')

    pred = model.predict(test_dataset, batch_size=hp.BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()

# Ensemble by averaging across folds, then clamp predictions to [1, 5].
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission.csv', index=False)

sub_df.head()