In [None]:
import os,gc,pickle,random,sys

import numpy as np 
import pandas as pd 
import tensorflow as tf
import transformers
from transformers import TFAutoModel
from tqdm.notebook import tqdm

from tensorflow.data import Dataset
from tensorflow.data.experimental import sample_from_datasets

In [None]:
# Show the installed transformers version so the run environment is recorded in the output.
transformers.__version__

In [None]:
# Show the installed TensorFlow version so the run environment is recorded in the output.
tf.__version__

# Configurations

In [None]:
# Passed to build_model as max_len, i.e. the length of the model's `input_word_ids` input.
MAX_LEN = 192
# Key into `dic` that selects how the transformer's sequence output is reduced ("cls", "mean" or "max").
HEAD = "cls"

# NOTE(review): presumably the competition's raw data directory; no visible cell reads it.
PATH = "../input/jigsaw-multilingual-toxic-comment-classification/"
# Directory prefix that pick_load_format prepends to every pickle path it opens.
INPATH = "../input/jmtc-monolinguish-encoding-pickle/"
# CSV whose `toxic` column supplies pseudo-labels for the test rows; it is also
# re-read at the end as the template for submission.csv.
TRANSFER = "../input/buffer/submission-pt-9470.csv"

# Language code for this run; load_data checks it against lang_valid.
LANG = "fr"
# Languages for which load_data reads a pickled validation split instead of
# holding out a fraction of the training data.
lang_valid = ["it","es","tr"]
# Model identifier given to TFAutoModel.from_pretrained; its last path component
# is also used to build the pickle file names.
MODEL = "flaubert/flaubert_large_cased"

In [None]:
# Let tf.data choose prefetch buffer sizes automatically (used by make_dataset_pipeline).
AUTO = tf.data.experimental.AUTOTUNE
# Probe for a TPU; a ValueError from the resolver is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
if tpu:
    # Connect to the TPU cluster and initialise it before creating the distribution strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU: fall back to whatever strategy TensorFlow currently has in effect.
    strategy = tf.distribute.get_strategy()

print(strategy.num_replicas_in_sync)
# Global batch size: 12 examples per replica.
BATCH_SIZE = 12 * strategy.num_replicas_in_sync

In [None]:
# Seed NumPy and Python's `random`; balance_shuffle draws from NumPy's global generator.
np.random.seed(1234)
random.seed(1234)
# Request deterministic TensorFlow op implementations.
# NOTE(review): this is set after TensorFlow was imported and the TPU initialised —
# confirm it still takes effect at this point.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Set TensorFlow's global seed inside the distribution strategy scope.
with strategy.scope():
    tf.random.set_seed(1234)

In [None]:
# Lookup table from language code to candidate pretrained model identifiers.
# NOTE(review): no visible cell reads this dict — MODEL is hard-coded in the
# configuration cell — so it currently serves as a reference list only.
models = {"pt":["neuralmind/bert-large-portuguese-cased"],
         "it":["dbmdz/bert-base-italian-xxl-uncased",
              "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"],
         "es":["dccuchile/bert-base-spanish-wwm-cased"],
         "tr":["dbmdz/bert-base-turkish-128k-cased",
              "dbmdz/electra-base-turkish-cased-discriminator"],
         "ru":["DeepPavlov/rubert-base-cased"],
          "fr":["flaubert/flaubert_large_cased",
               "camembert/camembert-large"]}

# Load Data

In [None]:
# `toxic` column of the TRANSFER csv as a float32 array; load_data indexes it
# with the test ids to obtain pseudo-labels for the test inputs.
trans = pd.read_csv(TRANSFER).toxic.values.astype('float32')

In [None]:
def fetch_title(x):
    """Return the part of ``x`` after its last ``/`` (all of ``x`` if it has none)."""
    return x.rsplit("/", 1)[-1]

def pick_load_format(path):
    """Unpickle and return the object stored at ``INPATH + path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading, so
    INPATH must only ever point at trusted files.
    """
    full_path = INPATH+path
    with open(full_path,"rb") as handle:
        payload = pickle.load(handle)
    return payload
    
def get_cong(n,verb=True):
    """Return how many negatives to keep alongside ``n`` positives.

    The combined size is ``round(1 + 2*n/10_000) * 10_000`` (Python's built-in
    ``round``), and the result is that total minus ``n``.

    Args:
        n: number of positive examples.
        verb: when true, print the positive count, negative quota and total.

    Returns:
        The negative quota, ``total - n``.
    """
    total = 10_000 * round((n*2)/10_000 + 1)
    n_neg = total - n
    if verb:
        print("Pos: {}, Sample neg: {}, Total: {}".format(n, n_neg, total))
    return n_neg

def balance_shuffle(list_arr,pos_thred=0.6,neg_thred=0.4,balance=True):
    """Filter, optionally down-sample, and shuffle a group of aligned arrays.

    The last array in ``list_arr`` is treated as the target. Rows whose target
    is ``>= pos_thred`` count as positives and rows ``<= neg_thred`` as
    negatives; rows strictly between the thresholds are dropped.

    Args:
        list_arr: sequence of arrays indexed along their first axis, the last
            one being the target.
        pos_thred: lower bound (inclusive) for a positive target.
        neg_thred: upper bound (inclusive) for a negative target.
        balance: when true, negatives are randomly sub-sampled without
            replacement down to the quota from ``get_cong`` if they exceed it.

    Returns:
        A tuple with every array of ``list_arr`` re-indexed by the same
        shuffled selection of rows.
    """
    labels = list_arr[-1]
    positions = np.arange(len(labels))
    positives = positions[labels>=pos_thred]
    negatives = positions[labels<=neg_thred]

    # get_cong is always called; it only prints its summary when balancing is on.
    neg_quota = get_cong(len(positives),balance)
    if balance and len(negatives)>neg_quota:
        negatives = np.random.choice(negatives,neg_quota,replace=False)

    order = np.concatenate((positives, negatives))
    np.random.shuffle(order)

    return tuple(arr[order] for arr in list_arr)

def load_data(model,lang,frac=.05):
    """Load the pickled train/valid/test arrays for ``model`` and build the splits.

    Args:
        model: model identifier; its last path component names the pickles.
        lang: language code. If it is listed in ``lang_valid`` the pickled
            validation split is used, otherwise the first ``frac`` of the
            (already shuffled) training rows is held out as validation data.
        frac: hold-out fraction used when no validation pickle is loaded.

    Returns:
        ``(train, valid, quesdo, x_test, id_test)`` where ``train``, ``valid``
        and ``quesdo`` are tuples of aligned arrays ending with the target.
        ``quesdo`` pairs the test inputs with ``trans[id_test]`` and is passed
        through ``balance_shuffle`` with ``balance=False``.
    """
    name = fetch_title(model)

    tokens,labels = balance_shuffle(pick_load_format("train/train_%s"%name))
    train = (tokens,labels.astype("float32"))

    if lang not in lang_valid:
        # No dedicated validation pickle is used: carve it off the front of train.
        cut = round(len(train[-1])*frac)
        valid = tuple(arr[:cut] for arr in train)
        train = tuple(arr[cut:] for arr in train)
    else:
        valid = balance_shuffle(pick_load_format("valid/valid_%s"%name),balance=False)

    x_test,id_test = pick_load_format("test/test_%s"%name)
    # Pseudo-labelled test set: test inputs with targets taken from `trans`.
    quesdo = balance_shuffle((x_test,trans[id_test]),balance=False)

    return train,valid,quesdo,x_test,id_test

In [None]:
# Load every split for the configured model/language. `test` holds the test
# inputs and `ids` the row ids used later to index `trans` and the submission frame.
train,valid,quesdo,test,ids = load_data(MODEL,LANG)
# Number of examples per split, taken from the length of each split's target array.
train_size,valid_size,quesdo_size = len(train[1]),len(valid[1]),len(quesdo[1])
print(train_size,valid_size,quesdo_size)

# Build Dataset

In [None]:
def make_dataset_pipeline(dataset, cache=False,repeat_and_shuffle=False,shuffle_size=128_000,seed=386491):
    """Apply the optional cache / repeat+shuffle stages, then batch and prefetch.

    Args:
        dataset: input dataset of individual examples.
        cache: when true, cache the dataset before any further stage.
        repeat_and_shuffle: when true, repeat indefinitely and then shuffle.
        shuffle_size: shuffle buffer size.
        seed: shuffle seed.

    Returns:
        The dataset batched by the global ``BATCH_SIZE`` and prefetched with ``AUTO``.
    """
    pipeline = dataset.cache() if cache else dataset
    if repeat_and_shuffle:
        # Repeat first, then shuffle, in that order.
        pipeline = pipeline.repeat()
        pipeline = pipeline.shuffle(shuffle_size,seed)
    return pipeline.batch(BATCH_SIZE).prefetch(AUTO)

def build_datasets(train,valid,quesdo,test):
    """Wrap the in-memory splits as datasets and build the training/eval pipelines.

    Returns:
        ``(train_dataset, valid_dataset, quesdo_dataset, validset, testset)``.
        The first three are cached, repeated and shuffled (for fitting);
        ``validset`` and ``testset`` are plain batched single passes over the
        validation and test data.
    """
    dtrain,dvalid,dquesdo,dtest = (
        Dataset.from_tensor_slices(part) for part in (train,valid,quesdo,test)
    )

    # Infinite, shuffled streams used as inputs to model.fit.
    train_dataset,valid_dataset,quesdo_dataset = (
        make_dataset_pipeline(ds, cache=True, repeat_and_shuffle=True)
        for ds in (dtrain,dvalid,dquesdo)
    )

    # Ordered, finite pipelines used for validation and prediction.
    validset = make_dataset_pipeline(dvalid)
    testset = make_dataset_pipeline(dtest)
    return train_dataset,valid_dataset,quesdo_dataset,validset,testset

def mix_dataset(dss,szs,weight=None,seed=1214):
    """Interleave several datasets, sampling each in proportion to its weighted size.

    Args:
        dss: list of datasets to sample from.
        szs: example count of each dataset, in the same order as ``dss``.
        weight: optional per-dataset multipliers applied to ``szs``; defaults
            to all ones.
        seed: seed passed to ``sample_from_datasets``.

    Returns:
        ``(mixed_dataset, total_size)`` where ``total_size`` is ``sum(szs)``
        (the unweighted total).
    """
    weights = np.ones(len(szs)) if weight is None else weight
    scaled = np.array(szs)*weights
    probs = scaled/np.sum(scaled)
    mixed = sample_from_datasets(dss,probs,seed)
    return mixed,sum(szs)

In [None]:
# Build the three repeating/shuffled training streams plus the single-pass
# validation and test pipelines from the loaded splits.
train_dataset,valid_dataset,quesdo_dataset,validset,testset = build_datasets(train,valid,quesdo,test)

# Build the model and check summary

In [None]:
from tensorflow.keras.layers import Input,Dropout,Dense,GlobalAveragePooling1D,GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC 
from tensorflow.keras.initializers import GlorotUniform

def get_cls(x):
    """Return the slice of ``x`` at index 0 of its second axis (``x[:, 0, :]``)."""
    first_position = x[:, 0, :]
    return first_position

# Pooling heads selectable through build_model's `head` argument. Each value is
# a callable applied to the transformer's sequence output: average pooling,
# max pooling, or the slice at index 0 of the second axis (get_cls).
# The two Keras layers are instantiated once here, so every model built from
# this dict receives the same layer objects.
dic = {"mean":GlobalAveragePooling1D(),
      "max":GlobalMaxPool1D(),
      "cls":get_cls}

def build_model(transformer,head="cls" , loss='binary_crossentropy',
                max_len=512, drop_rate=None, lr=1e-5,seed=940208):
    """Build and compile a single-output sigmoid classifier on top of ``transformer``.

    Args:
        transformer: callable model whose first output is the sequence output.
        head: key into ``dic`` choosing how the sequence output is pooled.
        loss: loss passed to ``model.compile``.
        max_len: length of the integer ``input_word_ids`` input.
        drop_rate: dropout rate applied after pooling; ``None`` disables dropout.
        lr: learning rate for the Adam optimizer.
        seed: seed for the Glorot-uniform initializer of the output layer.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        KeyError: if ``head`` is not a key of ``dic``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    x = dic[head](sequence_output)
    if drop_rate is not None:
        x = Dropout(drop_rate)(x)
    out = Dense(1, activation='sigmoid',kernel_initializer=GlorotUniform(seed))(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    # The metric name is pinned so every model built here logs "auc"/"val_auc".
    # Without it Keras auto-numbers later instances ("auc_1", ...), and a
    # checkpoint monitoring "val_auc" would silently stop matching.
    model.compile(Adam(learning_rate=lr), loss=loss, metrics=[AUC(name="auc")])

    return model

In [None]:
# Create the pretrained transformer (converted from PyTorch weights via from_pt=True)
# and the classifier inside the strategy scope so their variables are created
# under the distribution strategy.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL, from_pt=True)
    model = build_model(transformer_layer,head=HEAD,loss='binary_crossentropy', max_len=MAX_LEN,lr=1e-5)
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# File that receives the best weights seen during the first training stage.
model_path = "model1.h5"
# Save weights only, and only when the monitored 'val_auc' improves (higher is better).
# NOTE(review): 'val_auc' assumes the AUC metric compiled in build_model is named
# 'auc' — confirm if the model-building cell is ever run more than once per kernel.
checkpoint = ModelCheckpoint(model_path, monitor='val_auc', mode='max', save_best_only=True, save_weights_only=True, verbose=1)
callback_list = [checkpoint]

# Training

In [None]:
# Mix the labelled training stream with the pseudo-labelled test stream,
# sampling each in proportion to its size; train_size becomes the combined count.
# NOTE: this cell rebinds train_dataset and train_size, so it is not idempotent —
# running it a second time would mix the already-mixed stream and inflate train_size.
train_dataset,train_size = mix_dataset([train_dataset,quesdo_dataset],[train_size,quesdo_size],weight=None)

In [None]:
%%time
# Steps per epoch: with the factor of 2, each "epoch" draws roughly half of
# train_size examples from the (infinite) mixed training stream.
N_STEPS = train_size // (BATCH_SIZE*2)
EPOCHS = 20
# Stage 1: train on the mixed stream, validating on validset after every epoch;
# the checkpoint callback keeps the best weights by validation AUC.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=N_STEPS,
    validation_data=validset,
     callbacks=callback_list,
    epochs=EPOCHS
) 

In [None]:
# Drop the stage-1 model and force a garbage-collection pass before it is
# rebuilt from the saved checkpoint in the next cell.
del model
gc.collect()

In [None]:
%%time
# Rebuild the same architecture with a smaller learning rate (5e-6) and restore
# the best stage-1 weights written by the checkpoint callback.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL, from_pt=True)
    model = build_model(transformer_layer,head=HEAD,loss='binary_crossentropy', max_len=MAX_LEN,lr=5e-6)
    model.load_weights(model_path)

In [None]:
%%time
# Stage 2: one epoch over the repeating, shuffled validation stream, sized to
# about one pass over the validation examples. No validation data or callbacks
# are supplied, so nothing is checkpointed here.
n_steps = valid_size // (BATCH_SIZE)
EPOCHS = 1
train_history_2 =model.fit(
    valid_dataset,
    steps_per_epoch=n_steps,
    epochs= EPOCHS
)

In [None]:
# Delete the stage-1 checkpoint now that its weights have been loaded.
# Uses model_path rather than repeating the file name, and skips the removal
# quietly if the file is already gone so the cell can be re-run.
if os.path.exists(model_path):
    os.remove(model_path)

# Make Submission

In [None]:
# Start from the TRANSFER csv and overwrite `toxic` only for the rows listed in
# `ids` with this model's predictions; all other rows keep their existing values.
# NOTE(review): `.loc` selects by index label while `trans[id_test]` in load_data
# is positional — these agree only while the frame keeps read_csv's default index.
sub = pd.read_csv(TRANSFER)
sub.loc[ids,'toxic'] = model.predict(testset, verbose=1).reshape(-1)
sub.to_csv('submission.csv', index=False) 