# Toxic: TF Ranking
---


In [None]:
# IDEA: Maybe add one round of full-backbone fine-tuning.

In [None]:
# !pip install -q tensorflow-ranking
# import tensorflow_ranking as tfr

In [None]:
# Notebook-wide configuration.
# SUBMISSION=True scores the competition's comments_to_score.csv;
# SUBMISSION=False scores the local test_comments.csv instead.
SUBMISSION = True

## Model Architecture ##
# Earlier settings kept for reference; the values actually used are assigned
# right before the model is built in the training cell.
# FEATURE_HIDDEN_LAYERS = [256, 64, 16, 4]
# FINAL_HIDDEN_LAYERS = [4, 1]
# HIDDEN_DROPOUT = 0.10

## Model Inputs ##
# LIST_FEATURES = ['old_lb827', 'new_lb777', 'ruddit_lb785', 'fold3_lb820', 'fold3_old_lb805']
# Vector-valued features: one pooled backbone embedding per scorer checkpoint.
LIST_FEATURES = ['old_lb827', 'new_lb777', 'ruddit_lb785']
# Embedding width per list feature. This list is zipped positionally with
# LIST_FEATURES, so only its first len(LIST_FEATURES) entries are used; the
# trailing 1024s line up with the commented-out feature list above.
LIST_FEATURE_DIMS = [768, 768, 768, 1024, 1024]
# Token length every comment is padded/truncated to before the backbone.
MAX_SCORER_LENGTH = 96

# Scalar features: one float per comment.
UNIT_FEATURES = ['tfidf_lb864']
MODEL_INPUTS = UNIT_FEATURES + LIST_FEATURES

## Model Training ##
# Batch size of the pairwise train/validation datasets.
BATCH_SIZE = 8

### Notebook Imports & Setup

In [None]:
import tensorflow_addons as tfa
import tensorflow as tf

import transformers
import tokenizers
import datasets

from functools import partial
from tqdm.auto import tqdm
from pathlib import Path
import tensorflow as tf
from time import time
import pandas as pd
import numpy as np
import joblib    
import re

# Enable Mixed Precision, JIT Compilation & set random seed
def _enable_mixed_precision():
    """Switch Keras to the ``mixed_float16`` global dtype policy.

    Uses the stable ``tf.keras.mixed_precision.set_global_policy`` entry point
    rather than the ``mixed_precision.experimental`` module, which is
    deprecated and no longer present in newer TensorFlow releases.
    """
    # Imported locally so the helper stays self-contained while it is disabled.
    from tensorflow.keras import mixed_precision
    mixed_precision.set_global_policy('mixed_float16')

# Mixed precision is currently left switched off.
# _enable_mixed_precision()
# Turn on XLA JIT compilation and fix the TensorFlow random seed.
tf.config.optimizer.set_jit(True)
tf.random.set_seed(12)

# Cache Paths
# BACKBONES_DIR holds the pretrained backbone folders (roberta_base,
# roberta_large); WTS holds the fine-tuned scorer weight files (*.h5).
BACKBONES_DIR = Path('../input/toxic-internet-deep-model-backbones')
WTS = Path('../input/toxic-monster-model-internet')

## ⚒ Data Factory
---

In [None]:
# Label columns looked up in old_pseudo_label.csv and copied onto the
# submission frame and (as MT_*/LT_* columns) onto the pair frame.
TOXIC_FEATURES = ['severe_toxic', 'identity_hate', 'threat', 'toxic', 'insult', 'obscene']

In [None]:
# old: per-comment label table; df: (more_toxic, less_toxic) pairs;
# dfc: table of individual comments.
old = pd.read_csv('../input/toxic-public-dataframes/old_pseudo_label.csv')
df = pd.read_csv('../input/toxic-dataframes/valid.csv')
dfc = pd.read_csv('../input/toxic-dataframes/comments.csv')

# Keep only pairs whose two comments both appear in `old`, then keep only the
# comments that are still referenced by one of the remaining pairs.
df = df[df.more_toxic.isin(old.comment_text) & df.less_toxic.isin(old.comment_text)]
dfc = dfc[dfc.comment_text.isin(df.more_toxic) | dfc.comment_text.isin(df.less_toxic)] 

# Load the comments to score; both `text` and `comment_text` are made
# available whichever file is read, because later cells use both names.
if SUBMISSION: 
    sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
    sub['comment_text'] = sub.text
else: 
    sub = pd.read_csv('../input/toxic-dataframes/test_comments.csv')
    sub['text'] = sub.comment_text
    sub = sub.drop_duplicates('comment_text')
    
# Map Feature Values
# old_dict[feat] maps comment text -> value of that label column; comments
# that are missing from `old` map to NaN.
old_dict = old.set_index('comment_text').to_dict()
for feat in TOXIC_FEATURES: 
    sub[feat] = sub.comment_text.map(old_dict[feat])
    df[f'MT_{feat}'] = df.more_toxic.map(old_dict[feat])
    df[f'LT_{feat}'] = df.less_toxic.map(old_dict[feat])
    
# Notebook display of the resulting pair frame.
df

## Embedding Extractor
---

In [None]:
def dataset_to_test_ds(dataset):
    """Turn a tokenized Hugging Face dataset into a batched tf.data pipeline.

    The dataset is switched to numpy format in place, and its ``input_ids``
    and ``attention_mask`` columns are cast to int32. Each element is the
    ``(input_ids, attention_mask)`` pair repeated twice, i.e.
    ``(features, features)``; batches hold 1024 rows and are prefetched.
    """
    dataset.set_format(type='numpy')
    token_columns = tuple(
        tf.data.Dataset.from_tensor_slices(dataset[key].astype(np.int32))
        for key in ('input_ids', 'attention_mask')
    )
    features = tf.data.Dataset.zip(token_columns)
    # The second copy only fills the target slot of the (inputs, targets) pair.
    paired = tf.data.Dataset.zip((features, features))
    batched = paired.batch(1024)
    return batched.prefetch(tf.data.AUTOTUNE)

def load_tokenizer_and_backbone(folder):
    """Load the tokenizer and the TF backbone stored under ``folder``.

    Args:
        folder: Path (or string) of a saved pretrained model directory.

    Returns:
        ``(tokenizer, backbone)``; the backbone is instantiated inside a
        ``/device:GPU:0`` device scope.
    """
    print('Loading tokenizer and backbone from', folder)
    checkpoint_dir = str(folder)
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_dir)
    with tf.device('/device:GPU:0'):
        backbone = transformers.TFAutoModel.from_pretrained(checkpoint_dir)
    return tokenizer, backbone

In [None]:
%%time
# 8 min

def build_scorer_model(backbone):
    """Wrap ``backbone`` in a Keras model that emits one sigmoid score.

    The model takes int32 ``input_ids`` and ``attention_mask`` tensors of
    length ``MAX_SCORER_LENGTH`` and feeds the backbone's ``pooler_output``
    through a single-unit sigmoid Dense layer.
    """
    token_ids = tf.keras.Input((MAX_SCORER_LENGTH,), dtype=tf.int32)
    token_mask = tf.keras.Input((MAX_SCORER_LENGTH,), dtype=tf.int32)

    encoded = backbone(
        input_ids=token_ids,
        attention_mask=token_mask,
        return_dict=True,
    )
    pooled = encoded.pooler_output
    head = tf.keras.layers.Dense(1, activation='sigmoid')
    score = head(pooled)
    return tf.keras.Model(inputs=[token_ids, token_mask], outputs=score)

# Load both backbones up front; only the base model is used by the active
# code further down (the large-model section is commented out).
with tf.device('/device:GPU:0'): 
    robertaB_tokenizer, robertaB_backbone = load_tokenizer_and_backbone(BACKBONES_DIR/'roberta_base')
    robertaL_tokenizer, robertaL_backbone = load_tokenizer_and_backbone(BACKBONES_DIR/'roberta_large')

# Tokenize the labelled comments (dfc) to a fixed length of MAX_SCORER_LENGTH.
raw_dataset = datasets.Dataset.from_pandas(dfc)
processed_dataset = raw_dataset.map(
    lambda ex: robertaB_tokenizer(ex['comment_text'], max_length=MAX_SCORER_LENGTH, padding='max_length', truncation=True), 
    batched=True, num_proc=4,
)
# Despite the name, df_ds is built from `dfc` (one row per comment).
df_ds = dataset_to_test_ds(processed_dataset) 
# Same tokenization for the comments to score, read from their `text` column.
raw_dataset = datasets.Dataset.from_pandas(sub)
processed_dataset = raw_dataset.map(
    lambda ex: robertaB_tokenizer(ex['text'], max_length=MAX_SCORER_LENGTH, padding='max_length', truncation=True), 
    batched=True, num_proc=4,
)
sub_ds = dataset_to_test_ds(processed_dataset) 

def predict(backbone, ds):
    """Run ``backbone`` over ``ds`` and return its pooled output as float32.

    Args:
        backbone: Object whose ``predict(ds, verbose=1)`` result exposes a
            numpy ``pooler_output`` attribute.
        ds: Dataset handed straight to ``backbone.predict``.

    Returns:
        A float32 array with one entry per input row. Size-1 axes after the
        first are dropped, but the leading (row) axis is always kept, so a
        single-row input yields ``(1, dim)`` rather than ``(dim,)``.
    """
    preds = backbone.predict(ds, verbose=1).pooler_output.astype(np.float32)
    # A bare np.squeeze would also remove the row axis when there is exactly
    # one row, which breaks the per-row indexing and slicing done downstream.
    kept_trailing = tuple(dim for dim in preds.shape[1:] if dim != 1)
    return preds.reshape(preds.shape[:1] + kept_trailing)

# For each fine-tuned checkpoint: load its weights into the scorer model built
# around robertaB_backbone, then run the backbone itself to collect the
# pooled embeddings (not the scalar score) for the labelled comments (df_ds)
# and for the comments to score (sub_ds).
with tf.device('/device:GPU:0'): 
    model = build_scorer_model(robertaB_backbone)
    
    model.load_weights(WTS/'old_pseudo_label.h5')
    dfc_old_lb827 = predict(robertaB_backbone, df_ds)
    sub_old_lb827 = predict(robertaB_backbone, sub_ds)
    
    model.load_weights(WTS/'2019val737_robertab_scorer.h5')
    dfc_new_lb777 = predict(robertaB_backbone, df_ds)
    sub_new_lb777 = predict(robertaB_backbone, sub_ds)
    
    model.load_weights(WTS/'ruddit_val731_robertab.h5')
    dfc_ruddit_lb785 = predict(robertaB_backbone, df_ds)
    sub_ruddit_lb785 = predict(robertaB_backbone, sub_ds)
    
    # NOTE(review): `model` was built from this backbone and is still alive
    # here, so dropping the name alone may not release its memory - confirm.
    import gc
    del robertaB_backbone; gc.collect()
#     model = build_scorer_model(robertaL_backbone)
    
#     model.load_weights(WTS/'comp_only_fold3_val737_robertal_sentiment_10thDec.h5')
#     dfc_fold3_lb820 = predict(robertaL_backbone, df_ds)
#     sub_fold3_lb820 = predict(robertaL_backbone, sub_ds)
    
#     model.load_weights(WTS/'fold3_old_then_comp_loss571_val750_ep1_roberta_sent_12thDec.h5')
#     dfc_fold3_old_lb805 = predict(robertaL_backbone, df_ds)
#     sub_fold3_old_lb805 = predict(robertaL_backbone, sub_ds)
    
# Scalar feature: the pickled pipeline's prediction for every comment.
# NOTE(review): joblib.load unpickles the file - only load trusted artifacts.
pipeline = joblib.load('../input/toxic-dataframes/pipeline_lb864.pkl')
dfc['tfidf_lb864'] = pipeline.predict(dfc.comment_text)
sub['tfidf_lb864'] = pipeline.predict(sub.text)

In [None]:
# Comment text -> positional row index, used to look up embedding rows.
dfc_comment_to_i = {c: i for i, c in enumerate(dfc.comment_text.values)}
sub_comment_to_i = {c: i for i, c in enumerate(sub.text.values)}

# Every labelled pair is used twice, once in each order:
#   A frame: A = more toxic, B = less toxic -> y = 0.0
#   B frame: A = less toxic, B = more toxic -> y = 1.0
# so y == 1.0 means "comment B is the more toxic one".
A, B = df.copy(), df.copy()

A['A_comment'], A['B_comment'] = A.more_toxic, A.less_toxic
B['A_comment'], B['B_comment'] = B.less_toxic, B.more_toxic
# dfc_dict[feat] maps comment text -> that comment's scalar feature value.
dfc_dict = dfc.set_index('comment_text').to_dict()
for feat in UNIT_FEATURES: 
    df[f'MT_{feat}'] = df.more_toxic.map(dfc_dict[feat])
    df[f'LT_{feat}'] = df.less_toxic.map(dfc_dict[feat])
    
    A[f'A_{feat}'], A[f'B_{feat}'], A['y'] = df[f'MT_{feat}'], df[f'LT_{feat}'], 0.0
    B[f'A_{feat}'], B[f'B_{feat}'], B['y'] = df[f'LT_{feat}'], df[f'MT_{feat}'], 1.0
    

# Stack both orderings, attach the embedding row index of each side, and
# hold out fold 3 for validation.
df_temp = pd.concat([A, B])
df_temp['A_i'] = df_temp.A_comment.map(dfc_comment_to_i)
df_temp['B_i'] = df_temp.B_comment.map(dfc_comment_to_i)
train, valid = df_temp[df_temp.fold!=3], df_temp[df_temp.fold==3]

In [None]:
%%time
def df_to_tfds(df, is_train=False):
    """Build the pairwise tf.data pipeline used for fitting and validation.

    Each element is ``(inputs, label)`` where ``inputs`` is a flat tuple laid
    out as A-unit features, A-list features, B-unit features, B-list features
    (the same order in which ``build_model`` declares its inputs).

    Args:
        df: Pair frame with an ``A_<feat>``/``B_<feat>`` column for every
            entry of ``UNIT_FEATURES``, integer ``A_i``/``B_i`` row indices
            into the module-level ``dfc_*`` arrays, and a label column ``y``.
        is_train: When True the examples are shuffled (buffer = whole frame,
            reshuffled each iteration) and repeated indefinitely. When False
            the batched dataset is cached instead.

    Returns:
        ``(dataset, steps)``: the batched, prefetched dataset and the number
        of batches to draw per pass. For training this is the number of full
        batches; for evaluation it also counts the trailing partial batch so
        that no example is skipped and the cached dataset is read to the end.
    """
    col_to_values = {
        'old_lb827': dfc_old_lb827,
        'new_lb777': dfc_new_lb777,
        'ruddit_lb785': dfc_ruddit_lb785,
        # 'fold3_lb820': dfc_fold3_lb820,
        # 'fold3_old_lb805': dfc_fold3_old_lb805,
    }

    inputs = []
    for side in ('A', 'B'):
        for feat in UNIT_FEATURES:
            inputs.append(tf.data.Dataset.from_tensor_slices(df[f'{side}_{feat}'].values))
        # Row indices of this side's comments inside the embedding arrays.
        row_idx = df[f'{side}_i'].values
        for feat in LIST_FEATURES:
            inputs.append(tf.data.Dataset.from_tensor_slices(col_to_values[feat][row_idx]))
    input_ds = tf.data.Dataset.zip(tuple(inputs))

    label_ds = tf.data.Dataset.from_tensor_slices(df.y.values)
    ds = tf.data.Dataset.zip((input_ds, label_ds))
    if is_train:
        ds = ds.shuffle(len(df), reshuffle_each_iteration=True).repeat()
    ds = ds.batch(BATCH_SIZE)
    if is_train:
        # The stream repeats forever, so an epoch is simply the full batches.
        steps = len(df) // BATCH_SIZE
    else:
        ds = ds.cache()
        # Ceiling division: include the final, smaller batch.
        steps = -(-len(df) // BATCH_SIZE)
    return ds.prefetch(tf.data.AUTOTUNE), steps


def test_df_to_tfds(df):
    """Build the single-comment inference pipeline for the comments to score.

    Scalar features come from the columns of ``df``; vector features come
    from the module-level ``sub_*`` embedding arrays in their stored row
    order. Each element is ``(features, features)`` with ``features`` laid
    out as unit features followed by list features; batches hold 1024 rows.
    """
    embeddings = {
        'old_lb827': sub_old_lb827,
        'new_lb777': sub_new_lb777,
        'ruddit_lb785': sub_ruddit_lb785,
        # 'fold3_lb820': sub_fold3_lb820,
        # 'fold3_old_lb805': sub_fold3_old_lb805,
    }

    unit_parts = [
        tf.data.Dataset.from_tensor_slices(df[name].values)
        for name in UNIT_FEATURES
    ]
    list_parts = [
        tf.data.Dataset.from_tensor_slices(embeddings[name])
        for name in LIST_FEATURES
    ]
    features = tf.data.Dataset.zip(tuple(unit_parts + list_parts))
    # The duplicate fills the target slot of the (inputs, targets) pair.
    paired = tf.data.Dataset.zip((features, features))
    batched = paired.batch(1024)
    return batched.prefetch(tf.data.AUTOTUNE)


# Pairwise datasets for fitting/validation, plus the per-comment dataset that
# feeds feature extraction at inference time.
train_ds, train_steps = df_to_tfds(train, is_train=True)
valid_ds, valid_steps = df_to_tfds(valid, is_train=False)
test_ds = test_df_to_tfds(sub)

## Train Cross Encoder

In [None]:
def build_hidden_layer(hidden_layer_units, hidden_dropout, name='hidden_layer'):
    """Return a Sequential stack of (Dropout -> mish Dense) pairs.

    Args:
        hidden_layer_units: Iterable of Dense widths, one pair per entry.
        hidden_dropout: Dropout rate applied before every Dense layer.
        name: Name given to the resulting Sequential model.
    """
    stacked = [
        layer
        for width in hidden_layer_units
        for layer in (
            tf.keras.layers.Dropout(hidden_dropout),
            tf.keras.layers.Dense(width, activation=tfa.activations.mish),
        )
    ]
    return tf.keras.Sequential(stacked, name=name)

def build_model():
    """Assemble the pairwise comparison network over comments A and B.

    Every feature gets one encoder that is shared between the A and the B
    side: a single-unit mish Dense layer per unit feature (layer name = the
    feature name) and a ``build_hidden_layer`` stack per list feature (also
    named after the feature). The encoded A and B features are concatenated
    and passed through the ``final_hidden_layer`` stack and a sigmoid Dense
    layer named ``y``.

    Returns:
        A Keras model whose inputs are ordered A-unit, A-list, B-unit, B-list
        and whose output is a single sigmoid value.
    """
    unit_inputs_a, unit_inputs_b = [], []
    unit_encoded_a, unit_encoded_b = [], []
    for feat_name in UNIT_FEATURES:
        input_a = tf.keras.Input(shape=(1,), dtype=tf.float32, name=f'A_{feat_name}')
        input_b = tf.keras.Input(shape=(1,), dtype=tf.float32, name=f'B_{feat_name}')

        unit_inputs_a.append(input_a)
        unit_inputs_b.append(input_b)

        # One layer instance applied to both sides, so the weights are shared.
        shared_unit = tf.keras.layers.Dense(1, activation=tfa.activations.mish, name=feat_name)
        unit_encoded_a.append(shared_unit(input_a))
        unit_encoded_b.append(shared_unit(input_b))

    list_inputs_a, list_inputs_b = [], []
    list_encoded_a, list_encoded_b = [], []
    for feat_name, width in zip(LIST_FEATURES, LIST_FEATURE_DIMS):
        input_a = tf.keras.Input(shape=(width,), dtype=tf.float32, name=f'A_{feat_name}')
        input_b = tf.keras.Input(shape=(width,), dtype=tf.float32, name=f'B_{feat_name}')

        list_inputs_a.append(input_a)
        list_inputs_b.append(input_b)

        shared_stack = build_hidden_layer(LIST_FEATURE_HIDDEN_LAYERS, HIDDEN_DROPOUT, name=feat_name)
        list_encoded_a.append(shared_stack(input_a))
        list_encoded_b.append(shared_stack(input_b))

    merged_a = tf.concat(unit_encoded_a + list_encoded_a, axis=-1)
    merged_b = tf.concat(unit_encoded_b + list_encoded_b, axis=-1)
    pair = tf.concat([merged_a, merged_b], axis=-1)

    head = build_hidden_layer(FINAL_HIDDEN_LAYERS, HIDDEN_DROPOUT, name='final_hidden_layer')
    hidden = head(pair)
    prob = tf.keras.layers.Dense(1, activation='sigmoid', name='y')(hidden)

    all_inputs = unit_inputs_a + list_inputs_a + unit_inputs_b + list_inputs_b
    return tf.keras.Model(all_inputs, outputs=prob)

def optimizer_factory(lr):
    """Return an Adam optimizer at learning rate ``lr`` wrapped in tfa's SWA."""
    base = tf.keras.optimizers.Adam(learning_rate=lr)
    return tfa.optimizers.SWA(base)

def loss_fn(y_true, y_pred):
    """Binary cross-entropy between labels and predictions, both cast to float32.

    Alternatives that were tried and left disabled:
    ``tfr.keras.losses.PairwiseHingeLoss()`` and
    ``tfr.keras.losses.PairwiseLogisticLoss(temperature=1.0)``.
    """
    labels = tf.cast(y_true, tf.float32)
    probs = tf.cast(y_pred, tf.float32)
    bce = tf.keras.losses.BinaryCrossentropy()
    return bce(labels, probs)

def model_compile():
    """Compile the module-level ``model`` in place.

    Uses the SWA-wrapped Adam optimizer at a learning rate of 1e-2,
    ``loss_fn`` as the loss, accuracy as the metric, and 1024 steps per
    execution.
    """
    compile_kwargs = dict(
        optimizer=optimizer_factory(1e-2),
        loss=loss_fn,
        metrics='accuracy',
        steps_per_execution=1024,
    )
    model.compile(**compile_kwargs)

# Architecture actually used by build_model(): each list feature is reduced
# to LIST_FEATURE_HIDDEN_LAYERS[-1] units before the final comparison stack.
LIST_FEATURE_HIDDEN_LAYERS = [16]
FINAL_HIDDEN_LAYERS = [256, 64, 32, 16, 4, 2]
HIDDEN_DROPOUT = 0.25


# This rebinds the global `model` (previously the backbone scorer) to the
# pairwise network; model_compile() operates on that global.
with tf.device('/device:GPU:0'): 
    model = build_model()
    model_compile()
    
# Halve the learning rate after 5 epochs without a val_accuracy improvement.
reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.5, patience=5, verbose=1, mode='max', min_lr=1e-6, 
)
# Keep only the weights of the epoch with the best val_accuracy.
accuracy_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'checkpoint_acc.h5', monitor='val_accuracy', mode='max', save_weights_only=True, save_best_only=True, verbose=1
)

history = model.fit(
    train_ds, steps_per_epoch=train_steps, epochs=100, 
    validation_data=valid_ds, validation_steps=valid_steps, 
    callbacks=[reduce_lr_on_plateau, accuracy_checkpoint]
)

In [None]:
# 0.744 without [4], None (No Fine Tuned Layers)

## Inference
---

In [None]:
def extract_features(model):
    """Build a per-comment encoder out of the trained pairwise ``model``.

    The per-feature layers of ``model`` (looked up by feature name) are
    re-applied to fresh, single-comment inputs, and their outputs are
    concatenated: unit-feature encodings first, list-feature encodings after.

    Returns:
        A Keras model mapping one comment's raw features to its encoding.
    """
    scalar_inputs, scalar_encoded = [], []
    for feat_name in UNIT_FEATURES:
        scalar_in = tf.keras.Input(shape=(1,), dtype=tf.float32, name=f'{feat_name}_input')
        scalar_encoded.append(model.get_layer(feat_name)(scalar_in))
        scalar_inputs.append(scalar_in)

    vector_inputs, vector_encoded = [], []
    for feat_name, width in zip(LIST_FEATURES, LIST_FEATURE_DIMS):
        vector_in = tf.keras.Input(shape=(width,), dtype=tf.float32, name=f'{feat_name}_input')
        trained_stack = model.get_layer(feat_name)
        vector_encoded.append(trained_stack(vector_in))
        vector_inputs.append(vector_in)

    encoding = tf.concat(scalar_encoded + vector_encoded, axis=-1)
    return tf.keras.Model(scalar_inputs + vector_inputs, outputs=encoding)


# Restore the best-val_accuracy weights, then encode every comment to score
# once; test_x holds one encoded row per element of test_ds.
with tf.device('/device:GPU:0'): 
    model.load_weights('checkpoint_acc.h5')
    feature_model = extract_features(model)
    test_x = feature_model.predict(test_ds, verbose=1)

In [None]:
def scorer_model(model):
    """Build the comparison head of ``model`` as a standalone Keras model.

    It takes two already-encoded comments, ``A_x`` and ``B_x`` (each of width
    ``len(UNIT_FEATURES) + len(LIST_FEATURES) * LIST_FEATURE_HIDDEN_LAYERS[-1]``),
    concatenates them and applies the trained ``final_hidden_layer`` stack
    followed by the trained ``y`` output layer.
    """
    encoded_width = len(UNIT_FEATURES) + len(LIST_FEATURES)*LIST_FEATURE_HIDDEN_LAYERS[-1]
    side_a = tf.keras.Input(shape=(encoded_width,), dtype=tf.float32, name='A_x')
    side_b = tf.keras.Input(shape=(encoded_width,), dtype=tf.float32, name='B_x')
    pair = tf.concat([side_a, side_b], axis=-1)

    trained_head = model.get_layer('final_hidden_layer')
    hidden = trained_head(pair)
    trained_output = model.get_layer('y')
    prob = trained_output(hidden)
    return tf.keras.Model(inputs=[side_a, side_b], outputs=prob)

# Comparison head that scores a pair of pre-encoded comments.
with tf.device('/device:GPU:0'): 
    scorer = scorer_model(model)

In [None]:
# Stride used by build_test_ds when picking reference comments: every
# SPEED_UP-th encoded comment is compared against (2 -> every second one).
SPEED_UP = 2

def build_test_ds(i):
    """Build the dataset comparing comment ``i`` with the reference comments.

    Side A is the encoded comment ``test_x[i]`` repeated once per reference
    row; side B is every ``SPEED_UP``-th row of ``test_x``.

    Args:
        i: Row index into the module-level ``test_x`` array.

    Returns:
        A batched, prefetched dataset of ``((A_x, B_x), A_x)`` elements; the
        trailing ``A_x`` only fills the target slot.
    """
    B_x = test_x[np.arange(0, len(sub), SPEED_UP)]
    # Tile the single row in one numpy call rather than stacking a Python
    # list of copies; this also stays valid when the reference set is empty.
    A_x = np.repeat(test_x[i][np.newaxis], len(B_x), axis=0)

    A_ds = tf.data.Dataset.from_tensor_slices(A_x)
    B_ds = tf.data.Dataset.from_tensor_slices(B_x)
    ds = tf.data.Dataset.zip((A_ds, B_ds))
    ds = tf.data.Dataset.zip((ds, A_ds))
    ds = ds.batch(4096*4)
    return ds.prefetch(tf.data.AUTOTUNE)

# Score each comment by the mean scorer output over all of its comparisons
# with the reference comments. The pairwise model was trained with y = 1.0
# when the B-side comment is the more toxic one, and comment i is always fed
# as side A here.
sub_scores = []
for i in tqdm(range(len(sub))): 
    # Note: this rebinds the global `test_ds` on every iteration.
    test_ds = build_test_ds(i)
    scores = scorer.predict(test_ds, verbose=0)
    sub_scores.append(scores.mean())
    
sub['score'] = sub_scores

In [None]:
# Flip the sign of the mean comparison output, then replace the scores by
# their ascending rank; method='first' breaks ties by row order so every
# comment receives a distinct value.
sub.score = -sub.score
sub.score = sub.score.rank(method='first')
sub[['comment_id', 'score']].to_csv('submission.csv', index=False)

# Notebook display of the ranked comments.
sub.sort_values(by='score')

In [None]:
# sub.sort_values(by='tfidf_lb864')