In [None]:
import os,re,gc,pickle,random,sys

import numpy as np 
import pandas as pd 
import tensorflow as tf
import transformers
from transformers import TFAutoModel
from tensorflow.data.experimental import sample_from_datasets
from tensorflow.data import Dataset

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Configurations

In [None]:
# Let tf.data pick prefetch buffer sizes automatically.
AUTO = tf.data.experimental.AUTOTUNE
# Try to locate a TPU; TPUClusterResolver raising ValueError is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
# With a TPU: connect, initialise it and distribute over its replicas.
# Without one: fall back to whatever strategy TensorFlow currently reports.
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print(strategy.num_replicas_in_sync)
# Global batch size: 32 examples per replica.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [None]:
# Fix the NumPy and Python RNG seeds so the sampling/shuffling below is repeatable.
np.random.seed(1234)
random.seed(1234)
# Environment flag intended to request deterministic TensorFlow ops.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Seed TensorFlow's global RNG inside the distribution strategy's scope.
with strategy.scope():
    tf.random.set_seed(1234)

In [None]:
MAX_LEN = 192  # input sequence length handed to build_model as max_len
HEAD="cls"  # pooling head; must be a key of `dic` ("mean", "max" or "cls")
MODEL = 'jplu/tf-xlm-roberta-large'  # checkpoint name passed to TFAutoModel.from_pretrained

TRANSFER = "../input/buffer/submission-9461.csv"  # CSV whose `toxic` column is attached to the test set in load_data
PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"  # directory holding sample_submission.csv
INPATH = "../input/jwtc-xlmroberta-encoding-192-pickle/datain/"  # root of the pickled inputs read by pick_load_format
FORM = "training/encode_{}.pkl"  # per-language training pickle template, filled with "<lang>_l1" / "<lang>_l0"
langs = ["en","en2","es","fr","it","pt","ru","tr"]  # language keys iterated by load_balance_shuffle_train

# Load Data

In [None]:
# `toxic` column of the TRANSFER CSV as a float32 array; load_data assigns it to the test
# frame as its `toxic` labels.
# NOTE(review): presumably predictions from an earlier submission used as pseudo-labels — confirm.
trans = pd.read_csv(TRANSFER).toxic.values.astype('float32')

In [None]:
def pick_load_format(path):
    """Unpickle and return the object stored at ``INPATH + path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so this must only be pointed at trusted inputs.
    """
    full_path = INPATH + path
    with open(full_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
    
def get_cong(n,verb=True):
    """Return how many negatives to sample alongside ``n`` positives.

    The combined total is ``round(1 + 2*n/10_000) * 10_000`` (Python's
    round-half-to-even applies); the return value is that total minus ``n``.
    When ``verb`` is truthy the three figures are printed.
    """
    total = round(1 + (n * 2) / 10_000) * 10_000
    n_negative = total - n
    if verb:
        print("Pos: {}, Sample neg: {}, Total: {}".format(n, n_negative, total))
    return n_negative

def load_balance_shuffle_train(seed=1214):
    """Load, balance and shuffle the per-language training pickles.

    For every entry of ``langs`` the ``<lang>_l1`` frame is kept in full and
    the ``<lang>_l0`` frame is down-sampled to ``get_cong(len(l1))`` rows
    (presumably l1 = toxic, l0 = non-toxic — confirm against the data).
    All frames are concatenated and shuffled with ``seed``.

    Returns:
        (tokens, labels): ``comment_text`` stacked into an int32 array and
        the ``toxic`` column as float32.
    """
    frames = []
    for lang in langs:
        positives = pick_load_format(FORM.format(lang + "_l1"))
        n_negatives = get_cong(positives.shape[0])
        negatives = pick_load_format(FORM.format(lang + "_l0"))
        negatives = negatives.sample(n=n_negatives, random_state=seed)
        frames.extend((positives, negatives))
    shuffled = (
        pd.concat(frames)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    tokens = np.stack(shuffled.comment_text.values, axis=0).astype("int32")
    labels = shuffled.toxic.values.astype("float32")
    return tokens, labels

def balance_shuffle(df,col="comment_text",pos_thred=0.6,neg_thred=0.4,balance=True,seed=1214):
    """Split ``df`` by its ``toxic`` score, optionally balance, then shuffle.

    Rows with ``toxic >= pos_thred`` are positives and rows with
    ``toxic <= neg_thred`` are negatives; rows strictly between the two
    thresholds are dropped. When ``balance`` is true and there are more
    negatives than ``get_cong(n_positives)``, the negatives are down-sampled
    to that count.

    Returns:
        (tokens, labels): column ``col`` stacked into an int32 array and the
        ``toxic`` values in their original dtype.
    """
    scores = df.toxic.values
    positives = df[scores >= pos_thred]
    negatives = df[scores <= neg_thred]
    n_pos, n_neg = len(positives), len(negatives)
    # `balance` doubles as get_cong's verbosity flag: counts are printed only when balancing.
    neg_budget = get_cong(n_pos, balance)
    if balance and n_neg > neg_budget:
        negatives = negatives.sample(n=neg_budget, random_state=seed)
    merged = (
        pd.concat([positives, negatives])
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    tokens = np.stack(merged[col].values, axis=0).astype("int32")
    return tokens, merged.toxic.values

def load_data(pos_thred=0.6,neg_thred=0.4):
    """Assemble the train, validation, pseudo-labelled and raw test arrays.

    The test frame receives the module-level ``trans`` scores as its ``toxic``
    column; rows whose score falls outside (neg_thred, pos_thred) form the
    ``quesdo`` set.

    Returns:
        (train, valid, quesdo, test): the first three are ``(tokens, labels)``
        tuples, ``test`` is the int32 token array of every test row.
    """
    train = load_balance_shuffle_train()

    valid_frame = pick_load_format("valid.pkl")
    valid = balance_shuffle(valid_frame, balance=False)

    df_test = pick_load_format("test.pkl")
    df_test["toxic"] = trans
    thresholds = {"pos_thred": pos_thred, "neg_thred": neg_thred}
    quesdo = balance_shuffle(df_test, col="content", balance=False, **thresholds)

    test_tokens = np.stack(df_test.content.values, axis=0).astype("int32")
    return train, valid, quesdo, test_tokens

In [None]:
%%time
# train / valid / quesdo are (tokens, labels) tuples; test is the token array only.
train,valid,quesdo,test =  load_data(pos_thred=0.6,neg_thred=0.4)
# Example counts taken from the label arrays; used later to size the training steps.
train_size,valid_size,quesdo_size = len(train[1]),len(valid[1]),len(quesdo[1])
print(train_size,valid_size,quesdo_size)

# Build Dataset

In [None]:
def make_dataset_pipeline(dataset, cache=False,repeat_and_shuffle=False,shuffle_size=128_000,seed=386491):
    """Turn a per-example dataset into a batched, prefetched pipeline.

    Args:
        dataset: Source ``tf.data`` dataset of individual examples.
        cache: Cache the examples before any further step.
        repeat_and_shuffle: Repeat indefinitely, then shuffle with a buffer of
            ``shuffle_size`` elements seeded by ``seed``.
        shuffle_size: Shuffle buffer size.
        seed: Shuffle seed.

    Returns:
        The dataset batched by the module-level ``BATCH_SIZE`` and prefetched
        with ``AUTO``.
    """
    pipeline = dataset.cache() if cache else dataset
    if repeat_and_shuffle:
        # Repeat comes first, so the shuffle buffer can span epoch boundaries.
        pipeline = pipeline.repeat()
        pipeline = pipeline.shuffle(shuffle_size, seed)
    return pipeline.batch(BATCH_SIZE).prefetch(AUTO)

def build_datasets(train,valid,quesdo,test):
    """Build the training and inference pipelines from the in-memory arrays.

    Returns:
        (train_dataset, valid_dataset, quesdo_dataset, validset, testset):
        the first three are cached, repeated and shuffled for fitting; the
        last two are plain batched passes over the validation and test data.
    """
    dtrain, dvalid, dquesdo, dtest = (
        Dataset.from_tensor_slices(arrays)
        for arrays in (train, valid, quesdo, test)
    )

    # Infinite, shuffled streams used as inputs to model.fit.
    train_dataset, valid_dataset, quesdo_dataset = (
        make_dataset_pipeline(source, True, repeat_and_shuffle=True)
        for source in (dtrain, dvalid, dquesdo)
    )

    # Single ordered passes used for validation metrics and prediction.
    validset, testset = (make_dataset_pipeline(source) for source in (dvalid, dtest))

    return train_dataset, valid_dataset, quesdo_dataset, validset, testset

def mix_dataset(dss,szs,weight=None,seed=1214):
    """Randomly interleave several datasets in proportion to size times weight.

    Args:
        dss: Datasets to sample from.
        szs: Example count of each dataset, in the same order as ``dss``.
        weight: Optional per-dataset multiplier; defaults to all ones.
        seed: Seed for the sampling op.

    Returns:
        (mixed_dataset, total): the sampled dataset and ``sum(szs)``.
    """
    if weight is None:
        weight = np.ones(len(szs))
    scaled = np.array(szs) * weight
    probabilities = scaled / np.sum(scaled)
    mixed = sample_from_datasets(dss, weights=probabilities, seed=seed)
    return mixed, sum(szs)

In [None]:
# Wrap the arrays into batched tf.data pipelines (the *_dataset variants repeat and shuffle).
train_dataset,valid_dataset,quesdo_dataset,validset,testset = build_datasets(train,valid,quesdo,test)
# Blend the training stream with the `quesdo` stream: sampling probability is proportional to
# size * weight, so each quesdo example is drawn 3x as often as a training example.
# NOTE: this rebinds `train_dataset` and `train_size` (now train + quesdo count), so re-running
# only this line without re-running the load cell keeps inflating `train_size`.
# NOTE(review): the "55:5" annotation presumably records the intended mix ratio — confirm.
train_dataset,train_size = mix_dataset([train_dataset,quesdo_dataset],[train_size,quesdo_size],weight=np.array([1,3])) #55:5

# Build the model and check summary

In [None]:
from tensorflow.keras.layers import Input,Dropout,Dense,GlobalAveragePooling1D,GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC 
from tensorflow.keras.initializers import GlorotUniform

def get_cls(x):
    """Return the slice at index 0 of the second axis of a 3-D input."""
    first_position = x[:, 0, :]
    return first_position

# Pooling heads selectable by name in build_model. The two Keras pooling layers are
# instantiated once here, so every model built with "mean"/"max" reuses the same layer object.
dic = {"mean":GlobalAveragePooling1D(),
      "max":GlobalMaxPool1D(),
      "cls":get_cls}

def build_model(transformer, head="cls", loss='binary_crossentropy',
                max_len=512, drop_rate=None, lr=1e-5, seed=940208):
    """Attach a pooled sigmoid classification head to ``transformer`` and compile it.

    Args:
        transformer: Callable model/layer; it is applied to the int32 token-id
            input and element ``[0]`` of its output is used as the sequence output.
        head: Key of the module-level ``dic`` ("mean", "max" or "cls") that
            selects how the sequence output is pooled.
        loss: Loss passed to ``model.compile``.
        max_len: Length of the token-id input.
        drop_rate: Dropout rate applied after pooling; ``None`` skips dropout.
        lr: Learning rate for the Adam optimizer.
        seed: Seed of the Glorot-uniform initializer for the output layer.

    Returns:
        The compiled Keras ``Model`` mapping token ids to one sigmoid output.

    Raises:
        KeyError: If ``head`` is not a key of ``dic``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    x = dic[head](sequence_output)
    if drop_rate is not None:
        x = Dropout(drop_rate)(x)
    out = Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform(seed))(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
    # and rejected by newer Keras releases.
    # The metric name is pinned so callbacks monitoring 'val_auc' keep matching even
    # when the model is rebuilt (an unnamed AUC would be auto-suffixed, e.g. 'auc_1').
    model.compile(Adam(learning_rate=lr), loss=loss, metrics=[AUC(name="auc")])

    return model

In [None]:
%%time
# Create the pretrained transformer and the classifier inside the strategy scope so their
# variables are placed according to the distribution strategy.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer,head=HEAD,loss='binary_crossentropy',
                        max_len=MAX_LEN,lr=1e-5)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler,ReduceLROnPlateau

# Weights file written by the checkpoint callback and reloaded before the second training stage.
model_path = "xlm-roberta.h5"
# Keep only the weights of the epoch with the highest validation AUC.
# Assumes the AUC metric compiled in build_model is reported under the name 'auc'.
checkpoint = ModelCheckpoint(model_path, monitor='val_auc', mode="max", 
                             save_best_only=True, save_weights_only=True, verbose=1)

# Early stopping is currently disabled:
# es = EarlyStopping(monitor='val_auc', mode='max', patience=6, restore_best_weights=False, verbose=1)
# Multiply the learning rate by 0.8 after 3 epochs without a validation-AUC improvement.
rp = ReduceLROnPlateau(monitor='val_auc', factor=0.8, patience=3, verbose=1, mode='max')

callback_list = [checkpoint,rp]

# Training

In [None]:
%%time
# Each "epoch" covers a quarter of the mixed training size, so validation, checkpointing
# and LR scheduling run four times per nominal pass over the data.
N_STEPS = train_size // (BATCH_SIZE*4)
EPOCHS = 8
# train_dataset repeats indefinitely, hence the explicit steps_per_epoch.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=N_STEPS,
    validation_data=validset,
    callbacks=callback_list,
    epochs=EPOCHS
)

In [None]:
# Drop the stage-one model and collect garbage before the model is rebuilt.
del model
gc.collect()
# Re-initialise the TPU only when one was detected; on the CPU/GPU fallback path
# `tpu` is None and there is no TPU system to reset.
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
%%time
# Rebuild the same architecture with a lower learning rate (5e-6) and restore the
# weights saved at `model_path` by the checkpoint callback.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer,head=HEAD,loss='binary_crossentropy', max_len=MAX_LEN,lr=5e-6)
    model.load_weights(model_path)

In [None]:
%%time
# Second stage: one pass' worth of steps over the (repeating, shuffled) validation stream.
# No validation data or callbacks are used here, and EPOCHS is rebound to 1.
n_steps = valid_size // (BATCH_SIZE)
EPOCHS = 1
train_history_2 =model.fit(
    valid_dataset,
    steps_per_epoch=n_steps,
    epochs= EPOCHS
)

In [None]:
# To keep the fine-tuned weights instead, call model.save_weights(model_path)
# and skip the removal below.
# Delete the stage-one checkpoint via `model_path` rather than a hard-coded shell
# `rm`, so the filename stays in sync and a missing file is not an error.
if os.path.exists(model_path):
    os.remove(model_path)

# Make Submission

In [None]:
# Fill the sample submission's `toxic` column with the model's predictions on the
# test pipeline and write it out without the index column.
sub = pd.read_csv(PATH + "sample_submission.csv")
sub['toxic'] = model.predict(testset, verbose=1)
sub.to_csv('submission.csv', index=False)