In [None]:
from pathlib import Path
import numpy as np 
import pandas as pd 
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tqdm import tqdm

import transformers

from transformers import PretrainedConfig, TFRobertaModel

from tokenizers import BertWordPieceTokenizer, SentencePieceBPETokenizer

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")




In [None]:
# Pick the tf.distribute strategy that matches the hardware visible to this kernel.
try:
    # Called without arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle. A ValueError raised here is
    # treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # TensorFlow's default strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)


In [None]:
DATA_ROOT = Path("..") / "input" / "jigsaw-multilingual-toxic-comment-classification/"

# Read the five competition CSVs in a fixed order so they can be unpacked by position.
df1, df2, df3, test, sample = map(
    pd.read_csv,
    (
        DATA_ROOT / "jigsaw-toxic-comment-train.csv",
        DATA_ROOT / "jigsaw-unintended-bias-train.csv",
        DATA_ROOT / "validation.csv",
        DATA_ROOT / "test.csv",
        DATA_ROOT / "sample_submission.csv",
    ),
)

valid = df3

# Training frame: every toxic row of df1 plus a seeded random draw of 117700
# non-toxic rows, then a seeded full shuffle so the two classes are interleaved.
# NOTE(review): the draw size is a hard-coded count rather than the number of
# toxic rows, so the resulting class ratio depends on the data -- it is printed
# by the value_counts cell further down.
train = pd.concat([
    df1.query('toxic==1'),
    df1.query('toxic==0').sample(117700, random_state=42),
]).sample(frac=1, random_state=42)

In [None]:
#sum(train.toxic)

In [None]:
# Summary statistics of the subsampled, shuffled training frame.
train.describe()

In [None]:
# #DATA_ROOT = Path("..")/"input"/ "jigsaw-multilingual-toxic-comment-classification/"

# df1,df2,df3,test,sample = [pd.read_csv(fname) for fname in ["jigsaw-toxic-comment-train.csv",
#                                                                         "jigsaw-unintended-bias-train.csv",
#                                                                         "validation.csv",
#                                                                         "test.csv",
#                                                                         "sample_submission.csv"
#                                                                        ]]
# df2.toxic = df2.toxic.round().astype(int)
# train = pd.concat([
#     df1[['comment_text', 'toxic']],
#     df2[['comment_text', 'toxic']].query('toxic==1'),
#     df2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
# ])

# valid = df3


# # train = pd.read_csv('jigsaw-toxic-comment-train.csv')
# # valid = pd.read_csv('validation.csv')
# # test = pd.read_csv('test.csv')
# # submission = pd.read_csv('sample_submission.csv')

# # subsample the train dataframe to 50%-50% 
# # train = pd.concat([
# #     train.query('toxic==1'),
# #     train.query('toxic==0').sample(sum(train.toxic),random_state=42)
# # ])
# # shufle it just to make sure 
# train = train.sample(frac=1, random_state = 42)


## Getting data in new way to improve


In [None]:
# df1,df2,df3 = [pd.read_csv(fname) for fname in ["jigsaw-toxic-comment-train.csv",
#                                                 "jigsaw-unintended-bias-train.csv",
#                                                 "validation.csv"
#                                               ]]



# test, sample= [pd.read_csv(fname) for fname in ["test.csv",
#                                                 "sample_submission.csv"
#                                                ]]

In [None]:
# df2['toxic'] = df2['toxic'].apply(lambda x: 0 if x<0.5 else 1)

In [None]:
# df2 = pd.concat([
    
#     df2[['comment_text', 'toxic']].query('toxic==1'),
#     df2[['comment_text', 'toxic']].query('toxic==0').sample(n=500000, random_state=0)
# ])

In [None]:
# df2.toxic.value_counts(normalize=True)

In [None]:
# # Character length for the rows in the df1 & df2 data
# df1['char_length'] = df1['comment_text'].apply(lambda x: len(str(x)))
# df2['char_length'] = df2['comment_text'].apply(lambda x: len(str(x)))

In [None]:
# # Character length for the rows in the training data
# df1= df1[df1['char_length'] >= 545] 
# df2= df2[df2['char_length'] >= 545] 

In [None]:
# train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# train = train.sample(frac=1).reset_index(drop=True).head(200000)
# train = train.reset_index(drop=True)
# valid = df3
# valid = valid.reset_index(drop=True)

In [None]:
# Fraction of each `toxic` label value in the training frame
# (normalize=True returns proportions, not raw counts).
train.toxic.value_counts(normalize=True)

In [None]:
# Fraction of each `toxic` label value in the validation frame
# (normalize=True returns proportions, not raw counts).
valid.toxic.value_counts(normalize=True)

In [None]:
# Function to encode the text
def encode_fn(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts and return their input ids as one NumPy array.

    Args:
        texts: The texts to encode; forwarded unchanged as the first argument
            of ``tokenizer.batch_encode_plus``.
        tokenizer: Any object exposing ``batch_encode_plus``.
        maxlen: Forwarded as ``max_length``; padding up to that length is
            requested with ``pad_to_max_length=True``.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
        Attention masks and token type ids are explicitly not requested.
    """
    # NOTE(review): these keyword names are the ones the notebook was written
    # against -- confirm they match the installed transformers version.
    encoding_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encoding_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
# Lets tf.data choose the prefetch buffer size (used in create_dataset).
AUTO = tf.data.experimental.AUTOTUNE
# Configuration
#EPOCHS = 3
# Learning rate handed to the optimizer constructors. Note that a later cell
# overrides the live optimizer's rate with optimizer.learning_rate.assign(0.001).
LR = 1e-5
# Per-replica batch size; multiplied by the replica count on the next line.
BATCH_SIZE = 8 
# Batch size across all replicas: used to batch the datasets and to average the loss.
global_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync
# Sequence length passed to encode_fn as `maxlen` for train/valid/test texts.
MAX_LEN = 192
# Stage 1 trains on the training split, stage 2 on the validation split.
# TOTAL_STEPS_* is the number of training steps, VALIDATE_EVERY_* the interval
# (in steps) at which metrics are printed and reset.
TOTAL_STEPS_STAGE1 = 3000
VALIDATE_EVERY_STAGE1 = 500
TOTAL_STEPS_STAGE2 = 1500
VALIDATE_EVERY_STAGE2 = 500

# Path of the model config JSON read by create_model_from_config.
# roberta-base
#CONFIG_PATH = '../input/robertabaseconfig/config.json'
# roberta-large
CONFIG_PATH = '../input/robertalargeconfig/config.json'

In [None]:
# Load the pretrained tokenizer first.
# Base-size alternative, kept for reference:
#tokenizer = transformers.AutoTokenizer.from_pretrained('jplu/tf-xlm-roberta-base')
# Large checkpoint -- the same name load_weights_workaround loads weights from.
tokenizer = transformers.AutoTokenizer.from_pretrained('jplu/tf-xlm-roberta-large')

In [None]:
%%time
# Encode the text column of each split to MAX_LEN token ids. The train and
# validation frames keep their text in `comment_text`, the test frame in `content`.
# astype(str) guards against non-string cells before tokenizing.
x_train = encode_fn(train.comment_text.astype(str), tokenizer, maxlen=MAX_LEN)
x_valid = encode_fn(valid.comment_text.astype(str), tokenizer, maxlen=MAX_LEN)
x_test =  encode_fn(test.content.astype(str), tokenizer, maxlen=MAX_LEN)

# Labels as column vectors of shape (n_rows, 1), matching the model's single output unit.
y_train = train.toxic.values.reshape(-1,1)
y_valid = valid.toxic.values.reshape(-1,1)

In [None]:
def create_dataset(X, y=None, training=False):
    """Build a batched, prefetched and strategy-distributed dataset.

    Args:
        X: Features, sliced along the first axis.
        y: Optional targets; when given, each element is a ``(features, target)``
            pair, otherwise elements are features only.
        training: When true, the data is shuffled with a buffer of ``len(X)``
            elements and repeated indefinitely.

    Returns:
        The result of ``strategy.experimental_distribute_dataset`` applied to
        the pipeline, batched with ``global_BATCH_SIZE``.
    """
    features = tf.data.Dataset.from_tensor_slices(X)

    # Pair features with targets only when targets were supplied.
    if y is None:
        pipeline = features
    else:
        targets = tf.data.Dataset.from_tensor_slices(y)
        pipeline = tf.data.Dataset.zip((features, targets))

    # Training streams never end: the caller decides how many steps to take.
    if training:
        pipeline = pipeline.shuffle(len(X))
        pipeline = pipeline.repeat()

    pipeline = pipeline.batch(global_BATCH_SIZE)
    pipeline = pipeline.prefetch(AUTO)

    # Hand the global batches to the strategy so it can split them across replicas.
    return strategy.experimental_distribute_dataset(pipeline)


In [None]:
# Create datasets
# Training data carries labels and is shuffled + repeated (training=True).
train_dataset = create_dataset(x_train, y_train, True)
# Validation and test datasets carry features only: they are consumed by predict(),
# and validation labels are passed separately (y_valid) when scoring.
valid_dataset   = create_dataset(x_valid)
test_dataset  = create_dataset(x_test)

In [None]:
# Function to build the MODEL
def model_fn(transformer, max_len=512):
    """Wrap a transformer encoder with a single sigmoid output unit.

    Args:
        transformer: Callable layer/model; the first element of its output is
            used as the per-token sequence output.
        max_len: Length of the int32 token-id input named ``input_word_ids``.

    Returns:
        An uncompiled Keras ``Model`` mapping token ids to one sigmoid value
        per example. No optimizer or loss is attached here; the notebook trains
        it with a custom loop.
    """
    token_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    hidden_states = transformer(token_ids)[0]

    # Only the hidden state at the first sequence position feeds the classifier.
    first_token_state = hidden_states[:, 0, :]
    probability = Dense(1, activation='sigmoid')(first_token_state)

    return Model(inputs=token_ids, outputs=probability)
    
    
    
#     cls_token = sequence_output[:, 0, :]
#     out = Dense(1, activation='sigmoid')(cls_token)
    
#     model = Model(inputs=input_word_ids, outputs=out)
#     model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
#     model.compile(
#         loss=tf.keras.losses.BinaryCrossentropy(),
#         optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
#         metrics=[tf.keras.metrics.AUC()]
    
#    )
    

In [None]:
# ## Download config of huggingface roberta base model
# #!wget https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json

In [None]:
%%time
def create_model_from_config():
    """Build the classifier from a config file only (no pretrained weights).

    The transformer is instantiated from ``CONFIG_PATH`` inside the strategy
    scope so its variables are created under the distribution strategy; the
    pretrained weights are copied in afterwards by ``load_weights_workaround``.

    Returns:
        ``(model, optimizer)``: the Keras model from ``model_fn`` and an Adam
        optimizer using ``LR``. A later cell rebinds ``optimizer`` to SGD.
    """
    with strategy.scope():
        ### Load only config no weights ###
        config = PretrainedConfig.from_json_file(CONFIG_PATH)
        transformer_layer = TFRobertaModel(config)

        ### Make the cls model ###
        # Declare the input with the length the data is actually encoded to
        # (MAX_LEN). Leaving model_fn's default of 512 built a model whose
        # declared input shape disagreed with every batch fed to it.
        model = model_fn(transformer_layer, max_len=MAX_LEN)
        optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    model.summary()
    return model, optimizer


def load_weights_workaround():
    """Copy pretrained transformer weights into the global ``model``.

    The pretrained checkpoint is loaded outside the strategy scope and its
    trainable variables are assigned, position by position, to the trainable
    variables of ``model.layers[1]``.

    Raises:
        ValueError: If the two variable lists differ in length. A plain ``zip``
            would silently stop at the shorter list and leave the remaining
            variables at their random initialisation.
    """
    ### Load full pretrained model outside strategy scope ###
    #pretrained = transformers.TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-base')
    pretrained = transformers.TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-large')

    # NOTE(review): assumes model.layers[1] is the transformer layer (layer 0
    # being the input) and that both models list their variables in the same order.
    target_variables = model.layers[1].trainable_variables
    source_variables = pretrained.trainable_variables

    if len(target_variables) != len(source_variables):
        raise ValueError(
            "Cannot copy pretrained weights: target has %d trainable variables, "
            "pretrained model has %d"
            % (len(target_variables), len(source_variables)))

    ### Assign weights
    for target, source in zip(target_variables, source_variables):
        target.assign(source)


# Build the randomly initialised model under the strategy scope, then copy the
# pretrained weights into it. These globals are read by the training and
# prediction functions below.
model, optimizer = create_model_from_config()
load_weights_workaround()
# Second summary, printed after the weights were assigned.
model.summary()

In [None]:
def train(train_dataset, valid_dataset=None, y_valid=None,
          total_steps=5000, validate_every=500):
    """Run the custom training loop for a fixed number of steps.

    NOTE: defining this function rebinds the global name ``train``, which the
    earlier cells use for the training DataFrame. Re-running those cells after
    this one will fail until the DataFrame is reloaded.

    Args:
        train_dataset: Iterable of distributed training batches.
        valid_dataset: Optional dataset scored every ``validate_every`` steps.
        y_valid: Ground-truth labels compared against the predictions for
            ``valid_dataset``.
        total_steps: The loop stops after this many steps (or earlier if the
            dataset runs out).
        validate_every: Interval, in steps, at which the training metric is
            printed and reset and the optional validation is run.
    """
    ### Training loop ###
    for step, batch in enumerate(train_dataset, start=1):
        distributed_train_step(batch)

        if step % validate_every == 0:
            ### Print train metrics ###
            print("Step %d, train AUC: %.5f"
                  % (step, train_accuracy_metric.result().numpy()))

            ### Test loop with exact AUC ###
            if valid_dataset:
                print("     validation AUC: %.5f"
                      % roc_auc_score(y_valid, predict(valid_dataset)))

            ### Reset (train) metrics ###
            train_accuracy_metric.reset_states()

        if step == total_steps:
            break

In [None]:
@tf.function
def distributed_train_step(data):
    """Run ``train_step`` once on every replica of the strategy.

    Args:
        data: One distributed batch from the training dataset.
    """
    # `experimental_run_v2` was renamed to `run` in TF 2.2 and the old name is
    # deprecated; prefer `run` when it exists and fall back for older releases.
    run_on_replicas = getattr(strategy, "run", None) or strategy.experimental_run_v2
    run_on_replicas(train_step, args=(data,))

In [None]:
# def train_step(inputs):
#     features, labels = inputs

#     with tf.GradientTape() as tape:
#         predictions = model(features, training=True)
#         loss = compute_loss(labels, predictions)

#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))

#     train_accuracy_metric.update_state(labels, predictions)

In [None]:
def predict(dataset):
    """Predict every batch of ``dataset`` and return one stacked array.

    Args:
        dataset: Distributed dataset whose elements are features only.

    Returns:
        The per-replica results of each batch stacked vertically, then all
        batches stacked vertically, in iteration order.
    """
    # Inner vstack merges the replicas of one batch, outer vstack merges the batches.
    per_batch = [np.vstack(distributed_prediction_step(batch)) for batch in dataset]
    return np.vstack(per_batch)

@tf.function
def distributed_prediction_step(data):
    """Run ``prediction_step`` on every replica and collect the local results.

    Args:
        data: One distributed batch of features.

    Returns:
        What ``strategy.experimental_local_results`` returns for the
        per-replica predictions.
    """
    # `experimental_run_v2` was renamed to `run` in TF 2.2 and the old name is
    # deprecated; prefer `run` when it exists and fall back for older releases.
    run_on_replicas = getattr(strategy, "run", None) or strategy.experimental_run_v2
    predictions = run_on_replicas(prediction_step, args=(data,))
    return strategy.experimental_local_results(predictions)

def prediction_step(inputs):
    """Forward pass of the global ``model`` in inference mode.

    Args:
        inputs: A batch of features; prediction datasets carry no labels.

    Returns:
        The model output for ``inputs`` with ``training=False``.
    """
    return model(inputs, training=False)

In [None]:
def define_losses_and_metrics():
    """Create the loss function and the training metric under the strategy scope.

    Returns:
        ``(compute_loss, metric)`` where ``compute_loss(labels, predictions)``
        returns the per-example binary cross-entropy averaged with
        ``global_BATCH_SIZE`` as the batch size, and ``metric`` is a Keras AUC
        metric named ``'training_AUC'``.
    """
    with strategy.scope():
        # Reduction.NONE keeps one loss value per example so the averaging
        # below can divide by the global batch size instead of the local one.
        bce = tf.keras.losses.BinaryCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.NONE)
        auc_metric = tf.keras.metrics.AUC(name='training_AUC')

    def compute_loss(labels, predictions):
        """Per-example BCE averaged over the global batch size."""
        return tf.nn.compute_average_loss(
            bce(labels, predictions), global_batch_size=global_BATCH_SIZE)

    return compute_loss, auc_metric

In [None]:
# Globals read by train_step and train(). Despite its name,
# `train_accuracy_metric` is an AUC metric.
compute_loss, train_accuracy_metric = define_losses_and_metrics()

In [None]:
# #%%time
# train(train_dataset, valid_dataset, y_valid,
#       TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)


# ResourceExhaustedError will occur

In [None]:
# Rebind the global `optimizer` (previously the Adam instance returned by
# create_model_from_config) to plain SGD, created under the strategy scope.
# train_step reads this global, so it uses whichever optimizer is bound when it runs.
with strategy.scope():
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR)

In [None]:
CLIP_NORM = 1  # aggressive clipping

@tf.function
def train_step(data):
    """Run one optimisation step on a single replica.

    Computes the loss on this replica's share of the batch, sums the gradients
    over all replicas, clips them by global norm to ``CLIP_NORM`` and applies
    them with the global ``optimizer``. Also updates ``train_accuracy_metric``.

    Args:
        data: ``(inputs, targets)`` pair for this replica.
    """
    inputs, targets = data
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = compute_loss(targets, predictions)

    ### There is an unused pooler head of the transformer with None gradients;
    ### it has to be dropped before clipping.
    trainable_variables = [v for v in model.trainable_variables
                           if 'pooler' not in v.name]

    ### Calculate grads
    gradients = tape.gradient(loss, trainable_variables)

    ### Per-replica gradients cannot be clipped directly (it throws an error),
    ### so first sum the gradients from all replicas manually.
    replica_context = tf.distribute.get_replica_context()
    gradients = replica_context.all_reduce('sum', gradients)

    ### Clip by global norm (does not change the gradient direction)
    gradients, _ = tf.clip_by_global_norm(gradients, CLIP_NORM)

    ### Apply gradients
    ### By default apply_gradients sums the gradients over the replicas once
    ### more. Every replica already holds the full (summed and clipped)
    ### gradient, so without compensation the applied update would be
    ### num_replicas times too large and the clipping bound would not hold.
    ### TF >= 2.2 can switch this off with experimental_aggregate_gradients=False,
    ### but that keyword does not exist in older releases; pre-scaling by
    ### 1/num_replicas gives the same result on every version.
    ### NOTE: the effective step is therefore num_replicas times smaller than
    ### before this fix -- re-tune the learning rate if it was chosen empirically.
    inverse_replicas = 1.0 / replica_context.num_replicas_in_sync
    # scalar_mul (rather than `/`) also accepts IndexedSlices gradients.
    gradients = [tf.math.scalar_mul(inverse_replicas, g) for g in gradients]
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    train_accuracy_metric.update_state(targets, predictions)

In [None]:
# Override the optimizer's learning rate in place: the SGD optimizer was
# constructed with LR (1e-5); from here on it uses 0.001.
optimizer.learning_rate.assign(0.001)

In [None]:
#%%time
# Stage 1: train on the training split for TOTAL_STEPS_STAGE1 steps, printing the
# training AUC and the exact validation AUC every VALIDATE_EVERY_STAGE1 steps.
train(train_dataset, valid_dataset, y_valid,
      TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)


In [None]:
%%time
# Stage 2: continue training on the validation split itself.
# make a new dataset for training with the validation data 
# with targets, shuffling and repeating
valid_dataset_4_training = create_dataset(x_valid, y_valid, training=True)

# train again
# No valid_dataset is passed, so only the training metric is printed; there is
# no held-out score during this stage.
train(valid_dataset_4_training,
      total_steps = TOTAL_STEPS_STAGE2, 
      validate_every = VALIDATE_EVERY_STAGE2)  # not validating but printing now

In [None]:
# Write the model's predictions for the test set into the `toxic` column of the
# sample-submission frame and save it without the index column.
sample['toxic'] = predict(test_dataset)
sample.to_csv('submission.csv', index=False)