In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys

# NOTE(review): this overrides the interpreter-wide `sys.maxsize` before
# TensorFlow / transformers are imported. The reason is not visible in this
# notebook -- presumably a workaround for an integer-overflow error in one of
# the dependencies. Confirm it is still required, since any library that reads
# `sys.maxsize` will see this value instead of the real one.
sys.maxsize = int(1e7)

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig
from transformers import logging
from scipy.stats import rankdata


# `logging` here is `transformers.logging`: only show error-level messages
# from the transformers library.
logging.set_verbosity_error()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)


In [None]:
# Central place for every tunable value used in this notebook.
CONFIG = {
    # --- reproducibility / data pipeline ---
    "seed": 2022,            # seed for shuffling, sampling, initializers and dropout
    "buffer_size": 5000,     # tf.data shuffle buffer size
    "n_fold": 5,             # NOTE(review): not referenced in the visible cells
    "down_frac": 1,          # class-0 down-sampling control; 1 disables resampling
    "train_split": 0.8,      # training share when the data has a single fold
    "max_length": 128,       # number of tokens each text is padded/truncated to
    "batch_size": 64,

    # --- optimisation ---
    "epochs": 3,
    "ft_epochs": 2,                    # NOTE(review): not referenced in the visible cells
    "learning_rate": 5e-4,
    "scheduler": 'CosineAnnealingLR',  # NOTE(review): not referenced in the visible cells
    "min_lr": 1e-6,                    # NOTE(review): not referenced in the visible cells
    "weight_decay": 1e-6,              # NOTE(review): not referenced in the visible cells

    # --- regularisation ---
    "distilbert_dropout": 0.2,
    "distilbert_att_dropout": 0.2,
    "layer-dropout": 0.2,    # dropout rate of the layers added on top of DistilBERT
}

# Tokenizing Text

In [None]:
# Define function to encode text data in batches
def batch_encode(tokenizer, df, text_col_name="comment_text", label_col_name=None,
                 batch_size=CONFIG["batch_size"], max_length=CONFIG["max_length"],
                 frac=CONFIG["down_frac"], shuffle=None):
    """
    Build a batched ``tf.data.Dataset`` of tokenized texts from a dataframe.

    Input:
        - tokenizer:       callable Hugging Face tokenizer; it is called on one text
                           at a time and must return "input_ids" and "attention_mask"
        - df:              dataframe holding the text (and optionally label) column
        - text_col_name:   name of the column containing the raw text
        - label_col_name:  name of the label column, or None for unlabelled
                           (inference) data
        - batch_size:      number of examples per batch
        - max_length:      number of tokens every text is padded / truncated to
        - frac:            target share of class 0 (label == 0) after rejection
                           resampling; only applied to labelled data when frac < 1
        - shuffle:         True / False to force shuffling on or off. The default
                           (None) shuffles labelled data only, so that predictions
                           made on unlabelled data stay aligned with the rows of `df`
    Output:
        - a batched tf.data.Dataset. Each element is a dict with the int64 tensors
          "input_ids" and "attention_mask" of shape (batch, max_length); for
          labelled data each element is a tuple (dict, float32 labels).

    Note: when resampling is active (labelled data and frac < 1) rows are dropped
    or repeated, so the output no longer corresponds one-to-one to the rows of `df`.
    """
    if shuffle is None:
        # Shuffling inference data would silently misalign predictions and rows.
        shuffle = label_col_name is not None

    def encode(text_tensor):
        # Runs eagerly inside tf.py_function: tensor -> Python str -> token ids.
        text = tf.compat.as_str(text_tensor.numpy())
        encoded_text = tokenizer(text,
                                 max_length=max_length,
                                 padding="max_length",
                                 truncation=True,
                                 return_attention_mask=True)
        return encoded_text["input_ids"], encoded_text["attention_mask"]

    def tf_encode(text, label=None):
        input_ids, attention_mask = tf.py_function(encode, [text], [tf.int64, tf.int64])
        # tf.py_function loses static shape information, so restore it. The
        # length must follow `max_length`, not a hard-coded constant.
        input_ids.set_shape((max_length,))
        attention_mask.set_shape((max_length,))
        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        if label is None:
            return features
        label.set_shape([])
        return features, label

    def class_func(text, label):
        # Class id used by rejection resampling: 0 for label == 0, otherwise 1.
        return tf.cast(tf.not_equal(label, 0), tf.int32)

    texts = np.asarray(df[text_col_name]).astype('str')

    if label_col_name is None:
        # Make dataset
        text_input = tf.data.Dataset.from_tensor_slices(texts)
    else:
        # Make dataset
        labels = np.asarray(df[label_col_name]).astype('float32')
        text_input = tf.data.Dataset.from_tensor_slices((texts, labels))

    if shuffle:
        text_input = text_input.shuffle(CONFIG["buffer_size"], seed=CONFIG["seed"])

    # Down sample the majority (not toxic) group
    if label_col_name is not None and frac < 1:
        frac0 = float((df[label_col_name] == 0).mean())
        init_frac = [frac0, 1 - frac0]
        target_frac = [frac, 1 - frac]
        resample = tf.data.experimental.rejection_resample(class_func,
                                                           target_dist=target_frac,
                                                           initial_dist=init_frac)
        text_input = text_input.apply(resample)
        # The resampler yields (class_id, element) pairs; keep only the element.
        text_input = text_input.map(lambda class_id, element: element)

    # Encode and batch data
    token_input = text_input.map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
    token_input = token_input.batch(batch_size)

    return token_input
    

# Defining Model Architecture

In [None]:
def build_model(transformer, max_length=CONFIG["max_length"]):
    """
    Build and compile a regression model on top of a BERT / DistilBERT encoder.

    The [CLS] embedding of the encoder's last hidden state is passed through
    Dropout -> Dense(256) -> Dropout -> Dense(32) -> Dropout -> Dense(1, linear).

    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added head attached.
      - max_length:   integer controlling the maximum number of encoded tokens
                      in a given sequence.

    Output:
      - model:        a tf.keras.Model taking the inputs "input_ids" and
                      "attention_mask" and producing one score per sequence,
                      compiled with Adam, mean-squared-error loss and an
                      R^2 metric.
    """

    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=CONFIG["seed"])

    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name="input_ids",
                                            dtype="int32")
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,),
                                                  name="attention_mask",
                                                  dtype="int32")

    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]

    # We only care about DistilBERT's output for the [CLS] token,
    # which is located at index 0 of every encoded sequence.
    # Slicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]

    drout1 = tf.keras.layers.Dropout(CONFIG["layer-dropout"],
                                     seed=CONFIG["seed"]
                                     )(cls_token)

    dense1 = tf.keras.layers.Dense(256,
                                   activation="relu",
                                   kernel_initializer=weight_initializer,
                                   bias_initializer="zeros"
                                   )(drout1)

    drout2 = tf.keras.layers.Dropout(CONFIG["layer-dropout"],
                                     seed=CONFIG["seed"]
                                     )(dense1)

    # Feed the *dropped-out* activations forward. Previously this layer was
    # connected to `dense1`, which left `drout2` out of the graph entirely.
    dense2 = tf.keras.layers.Dense(32,
                                   activation="relu",
                                   kernel_initializer=weight_initializer,
                                   bias_initializer="zeros"
                                   )(drout2)

    drout3 = tf.keras.layers.Dropout(CONFIG["layer-dropout"],
                                     seed=CONFIG["seed"]
                                     )(dense2)

    # Single linear output node: the model regresses a real-valued score
    output = tf.keras.layers.Dense(1,
                                   activation="linear",
                                   kernel_initializer=weight_initializer,
                                   bias_initializer="zeros"
                                   )(drout3)

    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

    # Compile the model
    model.compile(tf.keras.optimizers.Adam(learning_rate=CONFIG["learning_rate"]),
                  loss="mean_squared_error",
                  metrics=[tfa.metrics.RSquare(name="R^2", dtype=tf.float32, y_shape=(1,))])

    return model
    

# Utils

In [None]:
# Split the dataframe into training and validation datasets
def get_partitions(df, train_split=CONFIG["train_split"], fold=None):
    """
    Split `df` into a training and a validation dataframe.

    Input:
      - df:           dataframe with an integer "kfold" column
      - train_split:  share of rows used for training when `df` holds a single
                      fold (its maximum "kfold" value is 0)
      - fold:         fold id used as the validation set when `df` holds several
                      folds. When omitted, the module-level loop variable `k` is
                      used, which keeps existing `get_partitions(df)` calls working.

    Output:
      - (train_df, valid_df): two dataframes that together cover the rows of `df`.
        In the k-fold branch both are independent copies, so adding columns to
        them does not write into a slice of `df`.
    """
    n_fold = df["kfold"].max()
    if n_fold == 0:
        # Single fold: seeded random split
        train_df = df.sample(frac=train_split, random_state=CONFIG["seed"])
        valid_df = df.drop(train_df.index)
    else:
        if fold is None:
            # Backward compatibility: fall back to the global fold counter `k`.
            fold = k
        # Partition the dataframe that was passed in (not the global `input_df`).
        is_valid = df["kfold"] == fold
        train_df = df[~is_valid].copy()
        valid_df = df[is_valid].copy()

    return train_df, valid_df

# get number of steps per epoch
def get_num_steps(df, label_col_name, down_frac=True):
    """
    Return the number of full batches (of CONFIG["batch_size"]) in one epoch.

    With `down_frac` enabled, rows whose label equals 0 are counted scaled by
    CONFIG["down_frac"] (truncated to an integer) and all other rows are counted
    in full; otherwise every row of `df` is counted.

    NOTE(review): this treats CONFIG["down_frac"] as the share of label-0 rows
    that is kept, while `batch_encode` passes the same value to the resampler as
    the target share of class 0 -- confirm which meaning is intended.
    """
    if not down_frac:
        num_rows = df.shape[0]
    else:
        labels = df[label_col_name]
        zero_rows = int((labels == 0).sum())
        other_rows = int((labels != 0).sum())
        num_rows = int(CONFIG["down_frac"] * zero_rows) + other_rows
    return num_rows // CONFIG["batch_size"]

# validate model
def validate_model(model, valid_df):
    """
    Score the "less_toxic" and "more_toxic" columns of `valid_df` with `model`
    and return the fraction of rows where the "less_toxic" prediction is
    strictly lower than the "more_toxic" prediction.

    Uses the module-level `tokenizer` and `batch_encode`.
    """
    def _predict_column(column, message):
        # Announce the step, encode one text column and score it.
        print(message)
        column_ds = batch_encode(tokenizer, valid_df, text_col_name=column)
        return model.predict(column_ds, workers=4, use_multiprocessing=True)

    pred_less = _predict_column("less_toxic", "      Predict less")
    pred_more = _predict_column("more_toxic", "      Predict more")

    # Element-wise comparison of the two prediction arrays
    correctly_ordered = pred_less < pred_more
    return np.mean(correctly_ordered)

# Instantiate transformer model

In [None]:
# Local copy of the pre-trained DistilBERT files (tokenizer + weights)
DISTILBERT_DIR = "../input/huggingface-transformers-tfpt/distilbert-base-uncased"

# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained(DISTILBERT_DIR)

# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=CONFIG["distilbert_dropout"],
                          attention_dropout=CONFIG["distilbert_att_dropout"],
                          output_hidden_states=True)

# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states
# and without any specific head on top. The configuration has to be passed
# explicitly: without `config=config` the dropout settings above are never applied.
distilBERT = TFDistilBertModel.from_pretrained(DISTILBERT_DIR, config=config)

# Make DistilBERT layers untrainable
distilBERT.trainable = False

# initialize model
model = build_model(distilBERT)

# Training Classification Layer Weights

In [None]:
# Index of the prepared training file to read (train<dataset>_data.csv)
dataset = 1

# Training data: read the file and drop rows whose comment text is exactly "deleted"
input_df = pd.read_csv("../input/toxicity-data-prep/train" + str(dataset) + "_data.csv")
not_deleted = input_df["comment_text"] != "deleted"
input_df = input_df[not_deleted]
print("input data number:", len(input_df))

# Validation data (scored on its "less_toxic" / "more_toxic" columns by validate_model)
validation_df = pd.read_csv("../input/toxicity-data-prep/valid_data.csv")
print("validation data number:", len(validation_df))

# Test data (its "text" column is scored in the training cell)
test_df = pd.read_csv("../input/toxicity-data-prep/test_data.csv")
print("test data number:", len(test_df))

In [None]:
def _predict_in_row_order(model, texts, chunk_size=8192):
    """
    Score `texts` with `model` and return a 1-D array with one prediction per
    text, in the same order as `texts`.

    The texts are tokenized directly (no shuffling, no resampling) in chunks of
    `chunk_size` so that only one chunk of token ids is held in memory at a time.
    """
    texts = np.asarray(texts).astype("str")
    chunk_scores = []
    for start in range(0, len(texts), chunk_size):
        encoded = tokenizer(texts[start:start + chunk_size].tolist(),
                            max_length=CONFIG["max_length"],
                            padding="max_length",
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors="np")
        features = {"input_ids": encoded["input_ids"],
                    "attention_mask": encoded["attention_mask"]}
        chunk_scores.append(
            model.predict(features, batch_size=CONFIG["batch_size"], verbose=0).ravel()
        )
    if not chunk_scores:
        return np.empty(0, dtype="float32")
    return np.concatenate(chunk_scores)


n_fold = int(input_df["kfold"].max())
for k in range(n_fold + 1):
    print('Train Model for fold ' + str(k))
    # make training and validation dataset
    # (get_partitions picks the validation fold from the loop variable `k`)
    print("    Split Data")
    train_fold_df, valid_fold_df = get_partitions(input_df)
    train_fold_ds = batch_encode(tokenizer, train_fold_df, text_col_name="comment_text", label_col_name="y")
    valid_fold_ds = batch_encode(tokenizer, valid_fold_df, text_col_name="comment_text", label_col_name="y", frac=1)

    # Start every fold from a freshly initialised head. Re-using one model for
    # all folds would validate later folds on rows it was already trained on.
    model = build_model(distilBERT)

    # Train the model
    # (no `batch_size` here: the tf.data datasets are already batched)
    print("    Train Model")
    train_history1 = model.fit(
        train_fold_ds.repeat(),
        validation_data=valid_fold_ds.repeat(),
        epochs=CONFIG["epochs"],
        steps_per_epoch=get_num_steps(train_fold_df, "y"),
        validation_steps=get_num_steps(valid_fold_df, "y", down_frac=False)
        )

    # Validate the model
    print("    Validate Model")
    right_order_pred = validate_model(model, validation_df)
    print("        Correctly ordered sentences in the validation data:", np.round(right_order_pred*100, 3), '%' )

    # Make predictions on test data.
    # Predictions must be in row order before they are written back to the
    # dataframe, so they are not taken from a shuffled dataset.
    print("    Predict ")
    test_score = _predict_in_row_order(model, test_df["text"])
    test_df["score"] = rankdata(test_score, method="ordinal")

    # Make predictions on training and validation data (row-aligned; `assign`
    # returns new frames instead of writing into slices of `input_df`)
    train_fold_df = train_fold_df.assign(
        score=_predict_in_row_order(model, train_fold_df["comment_text"]))
    valid_fold_df = valid_fold_df.assign(
        score=_predict_in_row_order(model, valid_fold_df["comment_text"]))
    input_fold_df = pd.concat([train_fold_df, valid_fold_df], sort=True)

    # Save results
    print("    Save \n")
    result_name = str(dataset) + "_fold" + str(k)
    test_df.to_csv("result_test" + result_name + ".csv", index=False)
    input_fold_df.to_csv("result_train" + result_name + ".csv", index=True)