# Toxic: Valid Bi-Encoder Train

Train Bi-Encoder on Annotated data from validation_data.csv

---
### <a href='#hyperparameters'> Hyperparameters </a> | <a href='#training'> Training </a>

#### GitHub Repo: https://github.com/sarthak-314/toxic

The GitHub repo contains all the reusable boilerplate code. I use it for all the notebooks in this competition. Some of its features:
- __Auto Imports__: Imports all the commonly used libraries, so there is no need for manual imports such as `import pandas as pd` or `import tensorflow as tf`.
- __Separation of Hyperparameters and Logic__: Separates the hyperparameters from the core logic. You can define all the hyperparameters in YAML, which makes it easy to run multiple experiments without changing the code. You can also easily download the hyperparameter file.

In [None]:
# Sync Notebook with VS Code
# Clone the shared project repo and add it to the import path so that the
# `src` package can be imported below.
!git clone https://github.com/sarthak-314/toxic
import sys; sys.path.append('toxic')

# Star imports: later cells use names such as pd, tf, HP and tf_accelerator without
# importing them, so they are presumably provided here — verify against the repo.
from src import *
from src.tflow import *

## Hyperparameters
---
### <a href='#training'> Training </a> | <a href='#data-factory'> Data Factory </a> 

<a name='hyperparameters'>

In [None]:
%%hyperparameters 

## Model Architecture ## 
max_seq_len: 128 # Use between 64-196
batch_size: 16 # Can go up to 256, but I am using a low batch size for regularization
    
backbone: roberta-base # You can try other backbones from huggingface like albert-base-v2, roberta-large, etc.
attention_dropout: 0.125 # default: 0.10 for roberta | 0 for albert-base-v2 

hidden_dropout: 0.15 # default: 0.10
hidden_layers: [256, 64, 16, 4] # Dense layers with mish activation

## Model Compile ## 
optimizer:
    _target_: AdamW # Only AdamW Implemented for now
    weight_decay: 3e-5 
    epsilon: 1e-6 
    beta_1: 0.9
    beta_2: 0.999
    max_grad_norm: 10.0 # Roberta does not use max grad norm, but uses higher weight decay
    use_swa: True
    use_lookahead: False
    # average_decay: 0.999
    # dynamic_decay: True
    
    
# LR Schedule 
# Use plot_first_epoch to visualize the LR schedule curve with different values 
warmup_epochs: 1
warmup_power: 1.5
lr_cosine:
    max_lr: 16e-6
    min_lr: 0
    lr_gamma: 0.75
    num_cycles: 2
    step_gamma: 2
        
## Model Training ## 
max_epochs: 20
checkpoints_per_epoch: 4 # Multiple checkpoints per epoch
early_stop_patience: 0.5

# Loss Function # 
low_agree_margin: 0.001 # 0.001 margin for 14k comments ~ distance of 14 comments
high_agree_margin: 0.01 # 0.01 margin ~ distance of 140 comments

## Data ## 
fold: 3
random_state: 69420
add_special_tokens: False # Add Special tokens for the features
add_old_hard_pos: True # Augment the data by adding hard positives from 2016 competition data

In [None]:
# (Optional) Download the hyperparameters for this run by clicking on the link below.
# FileLink renders a link to 'experiment.yaml' as the cell output; the file is
# presumably written by the %%hyperparameters cell magic — TODO confirm.
from IPython.display import FileLink
FileLink('experiment.yaml')

In [None]:
# Toxicity sub-type columns, in the order their tokens are listed below.
FEATURES = [
    'severe_toxic',
    'identity_hate',
    'threat',
    'toxic',
    'insult',
    'obscene',
]

# One token per feature (same order as FEATURES), followed by two label-source tokens.
SPECIAL_TOKENS = [
    '[SEVERE_TOXIC]',
    '[IDENTITY_HATE]',
    '[THREAT]',
    '[TOXIC]',
    '[INSULT]',
    '[OBSCENE]',
    '[SOFT]',  # Soft labels for the features generated by a model trained on 2016 data
    '[HARD]',  # Human annotated hard labels
]

# zip stops at the shorter sequence, so only the six feature tokens are mapped;
# '[SOFT]' and '[HARD]' are deliberately left out of this lookup.
FEAT_TO_TOKEN = dict(zip(FEATURES, SPECIAL_TOKENS))

In [None]:
# bfloat16: False to disable mixed precision 
# jit_compile: True to enable JIT compilation (Use for larger training datasets)

# !wandb login 'xxxxxxxx' # Your WandB API Code Here. Uncomment this line to use WandB
STRATEGY = tf_accelerator(bfloat16=True, jit_compile=False)
# Called before the backbone is built; presumably seeds the random number
# generators from the configured random_state — TODO confirm in src.
set_seed(HP.random_state)

# The backbone is created inside the strategy scope; the configured attention
# dropout is forwarded to from_pretrained as attention_probs_dropout_prob.
with STRATEGY.scope(): 
    backbone = TFAutoModel.from_pretrained(
        HP.backbone, 
        attention_probs_dropout_prob=HP.attention_dropout, 
    )
# NOTE(review): SPECIAL_TOKENS are registered with the tokenizer, but no call that
# resizes the backbone's token embeddings is visible in this file. If
# HP.add_special_tokens is enabled, the new token ids may fall outside the
# backbone's embedding table — confirm whether this is handled elsewhere.
tokenizer = AutoTokenizer.from_pretrained(
    HP.backbone, 
    additional_special_tokens=SPECIAL_TOKENS, 
)

In [None]:
# Bare expression: shows the loaded backbone's configuration as the cell output.
backbone.config

## Data Factory
---
### <a href='#training'> Training </a> | <a href='#hyperparameters'> Hyperparameters </a> 
<a name='data-factory'>

In [None]:
def add_old_hard_positive():
    """Build extra (more_toxic, less_toxic) pairs from the old pseudo-labelled data.

    Rows of ``old_pseudo_label.csv`` whose label columns from ``toxic`` through
    ``identity_hate`` sum to 0 are used as the "less toxic" side; rows with
    ``identity_hate == 1`` followed by rows with ``severe_toxic == 1`` are used
    as the "more toxic" side. The i-th positive is paired with the i-th negative.

    Compared with the previous row-by-row loop, this version

    * pairs every positive (the old ``range(len(pos)-1)`` dropped the last one), and
    * stops at the shorter of the two lists instead of raising ``IndexError``
      when there are fewer negatives than positives.

    Returns:
        DataFrame with columns ``more_toxic``, ``less_toxic`` and ``margin``,
        where ``margin`` is ``HP.high_agree_margin`` for every row.
    """
    old = pd.read_csv('../input/toxic-public-dataframes/old_pseudo_label.csv')
    # The label slice relies on the column order of the CSV
    # (everything from 'toxic' up to and including 'identity_hate').
    old['y'] = old.loc[:, 'toxic': 'identity_hate'].sum(axis=1)
    neg = old[old.y == 0]
    # A comment flagged as both identity_hate and severe_toxic appears twice here,
    # exactly as in the original implementation.
    pos = pd.concat([old[old.identity_hate == 1], old[old.severe_toxic == 1]])

    # Pair positionally; never index past the end of either side.
    n_pairs = min(len(pos), len(neg))
    df = pd.DataFrame({
        'more_toxic': pos.comment_text.to_numpy()[:n_pairs],
        'less_toxic': neg.comment_text.to_numpy()[:n_pairs],
    })
    df['margin'] = HP.high_agree_margin
    return df

In [None]:
def _prepend_special_tokens(row, text, prefix):
    """Prefix ``text`` with feature tokens and a label-source token.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding ``{prefix}_{feat}``
            values for every feature in FEATURES.
        text: The comment text to decorate.
        prefix: Column prefix, ``'LT'`` or ``'MT'``.

    Returns:
        ``'[SOFT] '`` or ``'[HARD] '``, then one token for each feature whose
        value is above 0.5 (in FEATURES order), then the original text.
    """
    # Iterate in reverse because each token is prepended: the final string then
    # lists the tokens in FEATURES order.
    for feat in reversed(FEATURES):
        if row[f'{prefix}_{feat}'] > 0.5:
            text = f'{FEAT_TO_TOKEN[feat]} ' + text

    toxic = row[f'{prefix}_toxic']
    # -1 is the fill value used for missing labels; a value strictly between 0
    # and 1 is a fractional (soft) label. The previous condition `1 < x < 0`
    # could never be true, so fractional labels were always tagged [HARD].
    is_soft = (toxic == -1) or (0 < toxic < 1)
    return ('[SOFT] ' if is_soft else '[HARD] ') + text


def add_special_tokens_LT(row):
    """Return ``row.less_toxic`` prefixed with its special tokens (``LT_*`` columns)."""
    return _prepend_special_tokens(row, row.less_toxic, 'LT')


def add_special_tokens_MT(row):
    """Return ``row.more_toxic`` prefixed with its special tokens (``MT_*`` columns)."""
    return _prepend_special_tokens(row, row.more_toxic, 'MT')

def add_feature_values(df):
    """Attach per-feature label columns for both sides of every comment pair.

    Reads the old pseudo-label CSV and, for each feature in FEATURES, looks up
    the value stored for the ``more_toxic`` and ``less_toxic`` texts. The
    results are written to ``MT_<feat>`` and ``LT_<feat>`` columns of ``df``
    (in place); texts that are not present in the CSV map to NaN.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    labelled = pd.read_csv('/kaggle/input/toxic-public-dataframes/old_pseudo_label.csv')
    # {column name -> {comment_text -> value}}
    lookup = labelled.set_index('comment_text').to_dict()
    for feat in FEATURES:
        value_by_text = lookup[feat]
        df[f'MT_{feat}'] = df['more_toxic'].map(value_by_text)
        df[f'LT_{feat}'] = df['less_toxic'].map(value_by_text)
    return df

In [None]:
# Annotated comment pairs; later cells read its more_toxic, less_toxic, agree and fold columns.
df = pd.read_csv('/kaggle/input/toxic-public-dataframes/valid.csv')
# NOTE(review): dfc is not referenced anywhere else in the visible part of this notebook.
dfc = pd.read_csv('/kaggle/input/toxic-public-dataframes/comments.csv')
# Adds the MT_<feat> / LT_<feat> columns to df.
df = add_feature_values(df)

def add_special_tokens(df):
    """Return a copy of ``df`` with NaNs replaced by -1 and, optionally, tagged texts.

    The NaN fill is always applied. When ``HP.add_special_tokens`` is enabled,
    the ``more_toxic`` and ``less_toxic`` texts are additionally rewritten
    row by row with ``add_special_tokens_MT`` / ``add_special_tokens_LT``.
    """
    filled = df.fillna(-1)
    if HP.add_special_tokens:
        filled.more_toxic = filled.apply(add_special_tokens_MT, axis=1)
        filled.less_toxic = filled.apply(add_special_tokens_LT, axis=1)
    return filled

def build_train():
    """Assemble the training dataframe for the configured fold.

    Writes a ``margin`` column onto the module-level ``df`` (low margin when
    ``agree < 1``, high margin otherwise), keeps the rows whose fold differs
    from ``HP.fold``, optionally appends the pairs from
    ``add_old_hard_positive()``, and finally runs ``add_special_tokens``.
    """
    def margin_for(agreement):
        if agreement < 1:
            return HP.low_agree_margin
        return HP.high_agree_margin

    df['margin'] = df.agree.apply(margin_for)
    in_train = df.fold != HP.fold
    train = df[in_train]
    if HP.add_old_hard_pos:
        extra_pairs = add_old_hard_positive()
        train = pd.concat([train, extra_pairs])
    return add_special_tokens(train)

def build_valid():
    """Assemble the validation dataframe for the configured fold.

    Writes a ``margin`` column onto the module-level ``df`` (low margin when
    ``agree < 1``, high margin otherwise), keeps the rows whose fold equals
    ``HP.fold`` and runs ``add_special_tokens`` on them.
    """
    def margin_for(agreement):
        if agreement < 1:
            return HP.low_agree_margin
        return HP.high_agree_margin

    df['margin'] = df.agree.apply(margin_for)
    in_valid = df.fold == HP.fold
    return add_special_tokens(df[in_valid])

# Build both splits for the configured fold.
train, valid = build_train(), build_valid()
# Bare expressions for inspection; a notebook cell only displays the value of its last expression.
train
valid

In [None]:
%%time
def tokenize_text(text):
    """Tokenize ``text`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly ``HP.max_seq_len`` tokens.
    """
    fixed_length_kwargs = dict(
        max_length=HP.max_seq_len,
        padding='max_length',
        truncation=True,
    )
    return tokenizer(text, **fixed_length_kwargs)

def convert_to_features(example_batch):
    """Tokenize both sides of a batch of comment pairs.

    Returns a dict with the input ids and attention masks of the
    ``more_toxic`` texts (``MT_*``) and the ``less_toxic`` texts (``LT_*``).
    """
    more = tokenize_text(example_batch['more_toxic'])
    less = tokenize_text(example_batch['less_toxic'])

    features = {}
    features['MT_ids'] = more['input_ids']
    features['MT_mask'] = more['attention_mask']
    features['LT_ids'] = less['input_ids']
    features['LT_mask'] = less['attention_mask']
    return features

def dataset_to_tfds(dataset):
    """Convert a tokenized dataset into an unbatched ``tf.data.Dataset``.

    The dataset is switched to numpy output format; the four id/mask columns
    are cast to int32 and ``margin`` to float32. Each element of the returned
    dataset is a dict with those five keys.
    """
    dataset.set_format(type='numpy')

    int_columns = ('MT_ids', 'MT_mask', 'LT_ids', 'LT_mask')
    model_inputs = {name: dataset[name].astype(np.int32) for name in int_columns}
    model_inputs['margin'] = dataset['margin'].astype(np.float32)
    return tf.data.Dataset.from_tensor_slices(model_inputs)

def df_to_tfds(df, is_valid=False):
    """Tokenize a pair dataframe and wrap it in a batched ``tf.data`` pipeline.

    Args:
        df: DataFrame with ``less_toxic``, ``more_toxic`` and ``margin`` columns.
        is_valid: When False the data is shuffled (reshuffled every iteration)
            and repeated indefinitely before batching; when True it is batched
            and cached instead.

    Returns:
        ``(dataset, steps)`` where ``dataset`` is prefetched and
        ``steps == number_of_examples // HP.batch_size + 1``.
    """
    pair_columns = ['less_toxic', 'more_toxic', 'margin']
    raw_dataset = datasets.Dataset.from_pandas(df[pair_columns])
    processed_dataset = raw_dataset.map(
        convert_to_features,
        batched=True,
        batch_size=1024,
        num_proc=4,
        desc='Running tokenizer on the dataset',
    )
    num_examples = len(processed_dataset)

    ds = dataset_to_tfds(processed_dataset)
    if is_valid:
        ds = ds.batch(HP.batch_size).cache()
    else:
        # Shuffle buffer covers the whole dataset.
        ds = ds.shuffle(num_examples, reshuffle_each_iteration=True).repeat()
        ds = ds.batch(HP.batch_size)

    steps = num_examples//HP.batch_size + 1
    return ds.prefetch(tf.data.AUTOTUNE), steps

# Training pipeline is shuffled and repeated; validation pipeline is batched and cached.
train_ds, train_steps = df_to_tfds(train, is_valid=False)
valid_ds, valid_steps = df_to_tfds(valid, is_valid=True)

In [None]:
def margin_ranking_loss(MT_pred, LT_pred, margin):
    """Element-wise margin ranking loss: ``max(margin + (LT_pred - MT_pred), 0)``.

    All inputs are cast to float32. The loss is zero for an example once the
    "more toxic" score exceeds the "less toxic" score by at least ``margin``.

    Args:
        MT_pred: Scores for the more-toxic comments.
        LT_pred: Scores for the less-toxic comments (same shape as ``MT_pred``).
        margin: Scalar margin, or one margin per example.

    Returns:
        Un-reduced loss tensor with the shape of ``LT_pred - MT_pred``.

    Note:
        In this notebook the scores come from a ``Dense(1)`` head, i.e. shape
        (batch, 1), while the batched ``margin`` has shape (batch,). Adding
        those directly broadcasts to (batch, batch) and pairs every example
        with every other example's margin. Trailing axes are therefore added
        to ``margin`` so that its batch axis lines up with the scores. The
        summed loss is consequently smaller than before (batch terms instead of
        batch * batch), which may call for re-tuning the learning rate.
    """
    MT_pred = tf.cast(MT_pred, tf.float32)
    LT_pred = tf.cast(LT_pred, tf.float32)
    margin = tf.cast(margin, tf.float32)

    score_gap = LT_pred - MT_pred

    gap_rank = score_gap.shape.rank
    margin_rank = margin.shape.rank
    # Only touch a non-scalar margin whose static rank is known; a scalar
    # margin keeps broadcasting against every element as before.
    if gap_rank is not None and margin_rank:
        for _ in range(gap_rank - margin_rank):
            margin = tf.expand_dims(margin, axis=-1)

    return tf.math.maximum(margin + score_gap, 0)

class ToxicModel(tf.keras.Model):     
    """Keras model with custom train/test steps for pairwise (bi-encoder) ranking.

    Each batch is a dict; the steps read 'MT_ids'/'MT_mask' and 'LT_ids'/'LT_mask'
    and score both sides with the same model. train_step additionally reads 'margin'.
    """
    @tf.function
    def train_step(self, data): 
        """Run one optimisation step on a batch of comment pairs and return the metric results."""
        with tf.GradientTape() as tape:
            # The same weights score both comments of each pair.
            MT_pred = self((data['MT_ids'], data['MT_mask']), training=True)
            LT_pred = self((data['LT_ids'], data['LT_mask']), training=True)
            
            # The compiled loss receives the more-toxic scores in the y_true slot
            # and the less-toxic scores in the y_pred slot.
            loss = self.compiled_loss(MT_pred, LT_pred, regularization_losses=self.losses)
            loss = tf.cast(loss, tf.float32)
            # margin_ranking_loss is added without any reduction here, so `loss` takes
            # the shape of that tensor.
            # NOTE(review): tape.gradient on a non-scalar target differentiates the sum of
            # its elements — confirm that this scaling is intended.
            loss += margin_ranking_loss(MT_pred, LT_pred, data['margin'])
            
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Metrics get the same (more-toxic, less-toxic) argument order as the loss.
        self.compiled_metrics.update_state(MT_pred, LT_pred)
        return {m.name: m.result() for m in self.metrics}
    
    @tf.function
    def test_step(self, data):
        """Score a validation batch and return the metric results.

        Only the compiled loss is evaluated here; the per-example margin term
        used in train_step is not added.
        """
        MT_pred = self((data['MT_ids'], data['MT_mask']), training=False)
        LT_pred = self((data['LT_ids'], data['LT_mask']), training=False)
        # Return value is unused; presumably called so that the loss tracked in
        # self.metrics is updated — TODO confirm.
        self.compiled_loss(MT_pred, LT_pred, regularization_losses=self.losses)
        self.compiled_metrics.update_state(MT_pred, LT_pred)
        return {m.name: m.result() for m in self.metrics}
    
def bert_initalizer(initializer_range = 0.02):
    """Return a truncated-normal initializer whose stddev is ``initializer_range``."""
    truncated_normal = tf.keras.initializers.TruncatedNormal
    return truncated_normal(stddev=initializer_range)

def build_hidden_layer(hidden_layer_units, hidden_dropout):
    """Build the dense head that sits between the backbone and the score layer.

    For every entry of ``hidden_layer_units`` a Dropout(``hidden_dropout``)
    followed by a mish-activated Dense layer of that width is added. Dense
    kernels use ``bert_initalizer`` with the module-level backbone's
    ``initializer_range``.

    Returns:
        A ``tf.keras.Sequential`` named ``'hidden_layer'``.
    """
    stack = []
    for width in hidden_layer_units:
        dropout = tf.keras.layers.Dropout(hidden_dropout)
        dense = tf.keras.layers.Dense(
            width,
            activation=tfa.activations.mish,
            kernel_initializer=bert_initalizer(backbone.config.initializer_range),
        )
        stack += [dropout, dense]
    return tf.keras.Sequential(stack, name='hidden_layer')
    
    
def build_model(backbone):
    """Wire the backbone, the hidden head and the sigmoid score layer into a ToxicModel.

    Inputs are two int32 tensors of length ``HP.max_seq_len`` (token ids and
    attention mask). The backbone's ``pooler_output`` is used when present;
    otherwise the hidden state at the last sequence position is taken.
    The output is a single sigmoid score per input.
    """
    sequence_shape = (HP.max_seq_len,)
    input_ids = tf.keras.Input(sequence_shape, dtype=tf.int32)
    attention_mask = tf.keras.Input(sequence_shape, dtype=tf.int32)

    backbone_outputs = backbone(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True,
    )

    if 'pooler_output' not in backbone_outputs:
        # TODO: Take average instead
        last_position = backbone_outputs.last_hidden_state[:, -1:, :]
        pooled = tf.squeeze(last_position, axis=1)
    else:
        pooled = backbone_outputs.pooler_output

    # The hidden head is created (and called) before the score head, as before.
    hidden_layer = build_hidden_layer(HP.hidden_layers, HP.hidden_dropout)
    hidden = hidden_layer(pooled)

    score_dropout = tf.keras.layers.Dropout(HP.hidden_dropout)
    score_dense = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=bert_initalizer(backbone.config.initializer_range),
    )
    score_layer = tf.keras.Sequential([score_dropout, score_dense], name='score')

    model = ToxicModel([input_ids, attention_mask], outputs=score_layer(hidden))
    model.trainable = True
    return model

# Build the model inside the strategy scope, like the backbone it wraps.
with STRATEGY.scope(): 
    model = build_model(backbone)

In [None]:
from src.tflow.factory import lr_scheduler_factory, plot_first_epoch
# Forwarded to model.compile(steps_per_execution=...) in model_compile below.
HP.steps_per_execution = 1024 # Set None to debug

def loss_fn(y_more_toxic, y_less_toxic):
    """Element-wise ranking hinge with a fixed 1e-4 margin.

    Zero wherever the more-toxic score beats the less-toxic score by at
    least 1e-4; otherwise the size of the violation.
    """
    violation = 1e-4 + (y_less_toxic-y_more_toxic)
    return tf.math.maximum(violation, 0)

def MT(MT_pred, LT_pred): 
    """Metric: mean score of the more-toxic comments (``LT_pred`` is ignored)."""
    mean_more_toxic = tf.reduce_mean(MT_pred)
    return mean_more_toxic
def LT(MT_pred, LT_pred): 
    """Metric: mean score of the less-toxic comments (``MT_pred`` is ignored)."""
    mean_less_toxic = tf.reduce_mean(LT_pred)
    return mean_less_toxic
    
def accuracy(MT_y, LT_y): 
    """Fraction of pairs whose more-toxic score is strictly higher.

    Ties count as wrong. Both counts are integer tensors, so the result
    comes from true division of the two totals.
    """
    is_correct = MT_y > LT_y
    is_wrong = LT_y >= MT_y
    correct = tf.math.reduce_sum(tf.cast(is_correct, tf.int32))
    wrong = tf.math.reduce_sum(tf.cast(is_wrong, tf.int32))
    total = correct + wrong
    return correct / total

def model_compile(): 
    """Compile the module-level ``model`` with the configured optimizer and LR schedule.

    The LR scheduler is built from the warmup and cosine settings in ``HP``
    and from ``train_steps+4``. Eager execution is switched on only when
    ``HARDWARE`` is ``'CPU'``.
    """
    schedule_steps = train_steps+4
    lr_scheduler = lr_scheduler_factory(
        HP.warmup_epochs,
        HP.warmup_power,
        HP.lr_cosine,
        schedule_steps,
    )
    optimizer = optimizer_factory(HP.optimizer, lr_scheduler)

    compile_kwargs = dict(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[accuracy, MT, LT],
        steps_per_execution=HP.steps_per_execution,
        run_eagerly=(HARDWARE == 'CPU'),
    )
    model.compile(**compile_kwargs)
# The optimizer is created inside model_compile, so compile within the strategy scope too.
with STRATEGY.scope(): 
    model_compile()

### Training
---
#### <a href='#hyperparameters'> Hyperparameters </a> | <a href='#dataframes'> Dataframes </a> | <a href='#data-factory'> Data Factory </a> 

<a name='training'>

In [None]:
# Save the weights to 'checkpoint_acc.h5' whenever val_accuracy reaches a new maximum.
accuracy_checkpoint = tf.keras.callbacks.ModelCheckpoint('checkpoint_acc.h5', monitor='val_accuracy', mode='max', save_weights_only=True, save_best_only=True, verbose=1)
# NOTE(review): early_stop is created here but is not in the callbacks list passed to
# fit below, so it has no effect as written — confirm whether that is intentional.
early_stop = tf.keras.callbacks.EarlyStopping(patience=HP.early_stop_patience)

# Each Keras "epoch" covers about 1/checkpoints_per_epoch of train_steps, and the epoch
# count is scaled up by the same factor, so validation (and checkpointing) runs
# checkpoints_per_epoch times per pass over train_steps.
# validation_steps is valid_steps-1, i.e. one batch fewer than df_to_tfds reported.
history = model.fit(
    train_ds, steps_per_epoch=train_steps//HP.checkpoints_per_epoch+1, epochs=HP.max_epochs*HP.checkpoints_per_epoch, 
    validation_data=valid_ds, validation_steps=valid_steps-1, 
    callbacks=[accuracy_checkpoint], 
)

In [None]:
# # Save best model to WandB
# with STRATEGY.scope(): 
#     model.load_weights('checkpoint_acc.h5')
#     model.save_weights('valid_only_robertab_val75xx_3rdDec.h5')
#     wandb.save('valid_only_robertab_val75xx_3rdDec.h5')

### Hyperparameter Optimization
---

### Weights Loader

In [None]:
# def build_dummy_model(backbone):
#     input_ids = tf.keras.Input((HP.max_seq_len,), dtype=tf.int32)
#     attention_mask = tf.keras.Input((HP.max_seq_len,), dtype=tf.int32)
#     backbone_outputs = backbone(
#         input_ids=input_ids, 
#         attention_mask=attention_mask, 
#         return_dict=True,
#     )
#     if 'pooler_output' in backbone_outputs: 
#         x = backbone_outputs.pooler_output
#     else: 
#         x = tf.squeeze(backbone_outputs.last_hidden_state[:, -1:, :], axis=1)
#     score_layer = tf.keras.layers.Dense(1)
#     model = tf.keras.Model([input_ids, attention_mask],  outputs=score_layer(x))
#     return model

# def add_wandb_backbones(wandb_weights):
#     backbone_weights = []
#     for run, weights in tqdm(wandb_weights): 
#         wandb.restore(weights, run)
#         temp_model = build_dummy_model(backbone)
#         backbone.save_weights(weights)
#         backbone_weights.append(weights)
#     return backbone_weights

# def add_huggingface_backbones(huggingface_backbones): 
#     huggingface_backbone_weights = []
#     for huggingface_model in huggingface_backbones: 
#         bb = TFAutoModel.from_pretrained(huggingface_model)
#         backbone_weights = f"{huggingface_model.replace('/', '_')}.h5"
#         bb.save_weights(backbone_weights)
#         huggingface_backbone_weights.append(backbone_weights)
#     return huggingface_backbone_weights

# backbone.save_weights('init_backbone_weights.h5')
# model.get_layer('hidden_layer').save_weights('init_hidden_layer_weights.h5')

# BACKBONE_WEIGHTS = ['init_backbone_weights.h5']
# # WandB Backbone Weights
# if HP.backbone == 'roberta-base': 
#     WANDB_WEIGHTS = [
#         # LB: 0.805 | Trained on Old Only Data
#         ('uncategorized/runs/35gso3iz', 'nolapval755_oldonlyv2_robertab_17thDec.h5'), 
          # Best LB: 0.831: Trained on old data with pseudo labels. 
#     ]
#     BACKBONE_WEIGHTS += add_wandb_backbones(WANDB_WEIGHTS)

# # Huggingface Backbones 
# if HP.backbone == 'roberta-base': 
#     HUGGINGFACE_BACKBONES = [
#         'cardiffnlp/twitter-roberta-base-hate', 
#         'cardiffnlp/twitter-roberta-base-offensive', 
#     ]
#     BACKBONE_WEIGHTS += add_huggingface_backbones(HUGGINGFACE_BACKBONES)

In [None]:
# TRIALS = 100
# max_max_accuracy = 0
# for i in tqdm(range(TRIALS)): 
#     tf.keras.backend.clear_session()
#     print('-'*50, f'RUN #{i} ', '-'*50)
    
#     # Load Weights
#     with STRATEGY.scope(): 
#         backbone_weights = random.choice(BACKBONE_WEIGHTS)
#         print('backbone_weights: ', red(backbone_weights))
#         backbone.load_weights(backbone_weights)
        
#     # Build Model 
#     HP.max_seq_len = random.choice([64, 96, 128, 192, 256])
#     HP.hidden_layers = random.choice([
#         [256, 64, 16, 4],
#         [256], [64], 
#     ])
#     HP.hidden_dropout = random.uniform(0, 0.20)
#     print('max_seq_len:', blue(HP.max_seq_len))
#     print('hidden_layers:', blue(HP.hidden_layers))
#     print('hidden_dropout:', blue(HP.hidden_dropout))
#     with STRATEGY.scope(): model = build_model(backbone)
    
    
#     # OPTIMIZER KWARGS 
#     HP.optimizer.weight_decay = random.choice([1e-4, 3e-4, 1e-4, 3e-5, 1e-5])
#     HP.optimizer.epsilon = random.choice([3e-6, 1e-6, 3e-7, 1e-7, 3e-7, 1e-8])
#     HP.optimizer.beta_1 = random.uniform(0.875, 0.925)
#     HP.optimizer.beta_2 = random.choice([0.995, 0.999, 0.9995])
#     HP.optimizer.max_grad_norm = random.choice([0.3, 1.0, 3, 10, 100])
#     if 'roberta' in HP.backbone: 
#         HP.optimizer.max_grad_norm = random.choice([1.0, 3, 10, 30, 100, 1000])
#         HP.optimizer.weight_decay = random.choice([1e-3, 3e-4, 1e-4, 3e-5])
#     HP.optimizer.use_swa = random.choice([True, True, False])
#     # HP.optimizer.moving_average = random.choice([0.99, 0.999])
#     # HP.optimizer.dynamic_decay = random.choice([True, False])
#     print('weight_decay:', blue(HP.optimizer.weight_decay))
#     print('epsilon:', blue(HP.optimizer.epsilon))
#     print('beta_1:', blue(HP.optimizer.beta_1))
#     print('beta_2:', blue(HP.optimizer.beta_2))
#     print('max_grad_norm:', blue(HP.optimizer.max_grad_norm))
#     print('use_swa:', blue(HP.optimizer.use_swa))
#     # print('moving_average:', blue(HP.optimizer.moving_average))
#     # print('dynamic_decay:', blue(HP.optimizer.dynamic_decay))
    
#     # LR KWARGS 
#     HP.warmup_epochs = random.choice([0.0625, 0.125, 0.25, 0.5, 0.75, 1, 1.5, 2, 3])
#     HP.warmup_power = random.choice([1, 1.25, 1.5, 2])
#     HP.lr_cosine.max_lr = random.uniform(0, 8e-5)
#     max_lr = HP.lr_cosine.max_lr
#     HP.lr_cosine.min_lr = random.choice([0, max_lr/10, max_lr/3, max_lr])
#     HP.lr_cosine.lr_gamma = random.uniform(0.5, 1)
#     HP.lr_cosine.step_gamma = random.choice([2, 3])
#     print('warmup_epochs:', blue(HP.warmup_epochs))
#     print('warmup_power:', blue(HP.warmup_power))
#     print('max_lr:', blue(HP.lr_cosine.max_lr))
#     print('min_lr:', blue(HP.lr_cosine.min_lr))
#     print('lr_gamma:', blue(HP.lr_cosine.lr_gamma))
#     print('step_gamma:', blue(HP.lr_cosine.step_gamma))
    
#     # Loss Function
#     HP.high_agree_margin = random.choice([0.1, 0.01, 0.001])
#     HP.low_agree_margin = random.choice([0.05, 0.01, 0.001, 0.0001])
#     HP.low_agree_margin = min(HP.low_agree_margin, HP.high_agree_margin)
#     print('high_agree_margin:', blue(HP.high_agree_margin))
#     print('low_agree_margin:', blue(HP.low_agree_margin))
#     # HP.steps_per_execution=None
#     with STRATEGY.scope(): model_compile()
    
#     # Model Training 
#     HP.checkpoints_per_epoch = random.randint(1, 3)
#     HP.max_epochs = 100
#     HP.batch_size = random.choice([8, 16, 32, 64])
#     print('checkpoints_per_epoch:', blue(HP.checkpoints_per_epoch))
#     print('batch_size:', blue(HP.batch_size))
    
    
#     # Data HParams
#     HP.add_old_hard_pos = random.choice([True, False])
#     HP.add_test_hard_pos = random.choice([True, False])
#     HP.add_special_tokens = random.choice([True, False, False])
#     print('add_old_hard_pos:', blue(HP.add_old_hard_pos))
#     print('add_test_hard_pos:', blue(HP.add_test_hard_pos))
#     print('add special tokens: ', red(HP.add_special_tokens))
    
#     train, valid = build_train(), build_valid()
#     train_ds, train_steps = df_to_tfds(train, is_valid=False)
#     valid_ds, valid_steps = df_to_tfds(valid, is_valid=True)
    
#     accuracy_checkpoint = tf.keras.callbacks.ModelCheckpoint(
#         f'checkpoint_acc_{i}.h5', monitor='val_accuracy', mode='max', save_weights_only=True, save_best_only=True, verbose=1, 
#     )
#     history = model.fit(
#         train_ds, steps_per_epoch=train_steps//HP.checkpoints_per_epoch, epochs=HP.max_epochs*HP.checkpoints_per_epoch, 
#         validation_data=valid_ds, validation_steps=valid_steps-1,
#         callbacks = [
#             tf.keras.callbacks.EarlyStopping(patience=5*HP.checkpoints_per_epoch), accuracy_checkpoint
#         ]
#     )
#     max_acc = max(history.history['val_accuracy'])
#     print('Maximum accuracy for the run: ', max_acc)
#     if max_acc > max_max_accuracy: 
#         max_max_accuracy = max_acc
#         print(f'NEW BEST ACCURACY ({i}th run): ', max_max_accuracy)