In [1]:
# Start a wall-clock timer so the final print can report how long the imports took.
import time
start_time = time.time()

import gc
gc.enable()
import os 
import numpy as np 
import pandas as pd 
import multiprocessing
from tqdm.notebook import tqdm 
from kaggle_datasets import KaggleDatasets 

import transformers 
from transformers import (TFAutoModel, 
                          AutoTokenizer) 
from tokenizers import (Tokenizer, 
                        models, 
                        pre_tokenizers, 
                        decoders, 
                        processors, 
                        BertWordPieceTokenizer, 
                        SentencePieceBPETokenizer) 
from transformers import AdamW

import tensorflow as tf 
# NOTE: `backend` and `K` are two aliases for the same Keras backend module;
# both names are used further down in the notebook.
from tensorflow.keras import backend
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.models import Model 
from tensorflow.keras.callbacks import ModelCheckpoint 
# Explicitly request that tf.functions are NOT run eagerly.
tf.config.experimental_run_functions_eagerly(False)
from tensorflow.keras.mixed_precision import experimental as mixed_precision

from nltk.tokenize.treebank import TreebankWordTokenizer
# NOTE(review): `Ttokenizer` is not referenced by any other cell visible in this notebook.
Ttokenizer = TreebankWordTokenizer()

from sklearn.metrics import roc_auc_score as roc, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

# Silence every Python warning for the rest of the session.
import warnings
warnings.simplefilter("ignore")
print('Import done! Time past %.2f secs' % (time.time() - start_time))

Import done! Time past 8.01 secs


In [2]:
# Try to locate a TPU cluster. A ValueError raised while resolving (or while
# printing the master address) is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

# Pick the distribution strategy: the default one when no TPU was resolved,
# otherwise connect to the cluster, initialise it and wrap it in a TPUStrategy.
if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [3]:
def binary_focal_loss(gamma=2., alpha=.25):
    """Create a binary focal loss function for use with `model.compile`.

    Args:
        gamma: exponent applied to the modulating factor of each term.
        alpha: weight of the positive-label term; the negative-label term
            is weighted by `1 - alpha`.

    Returns:
        A function `(y_true, y_pred) -> scalar` that sums (not averages) the
        focal loss over every element it is given.
    """
    def binary_focal_loss_fixed(y_true, y_pred):
        eps = K.epsilon()
        is_positive = tf.equal(y_true, 1)
        is_negative = tf.equal(y_true, 0)

        # Where the label is not 1 the "positive" probability is replaced by 1,
        # and where the label is not 0 the "negative" probability is replaced
        # by 0, so those elements contribute (almost) nothing to their term.
        # Clipping keeps both log() arguments strictly inside (0, 1).
        prob_pos = K.clip(tf.where(is_positive, y_pred, tf.ones_like(y_pred)),
                          eps, 1. - eps)
        prob_neg = K.clip(tf.where(is_negative, y_pred, tf.zeros_like(y_pred)),
                          eps, 1. - eps)

        positive_term = K.sum(alpha * K.pow(1. - prob_pos, gamma) * K.log(prob_pos))
        negative_term = K.sum((1 - alpha) * K.pow(prob_neg, gamma) * K.log(1. - prob_neg))
        return -positive_term - negative_term

    return binary_focal_loss_fixed

In [4]:
def transform(tensor):
    """Pool `tensor` over axis 1 with both mean and max, then join the two.

    The mean-pooled and max-pooled results are concatenated along axis 1,
    mean first.
    """
    pooled = (backend.mean(tensor, 1), backend.max(tensor, 1))
    return backend.concatenate(pooled, 1)

def build_model(transformer, max_len=192, lr=1e-5):
    """Wrap a transformer encoder in a single-logit sigmoid classifier.

    Args:
        transformer: callable model invoked as
            `transformer(ids, attention_mask=..., token_type_ids=...)`; its
            first output is indexed as `[:, 0, :]`, i.e. it is expected to be
            a rank-3 tensor.
        max_len: length of each of the three int32 input sequences.
        lr: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras `Model` with inputs `[ids, att, tok]`, one sigmoid
        output, binary focal loss, and accuracy + AUC metrics.
    """
    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    tok = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    # Only the first output is used. Indexing with [0] (instead of unpacking
    # into exactly two names) keeps working when the transformer returns more
    # or fewer outputs, or an output object that supports integer indexing.
    sequence_output = transformer(ids, attention_mask=att, token_type_ids=tok)[0]

    # The classification head reads the first position of the sequence output.
    # (An alternative head using `transform` mean+max pooling followed by
    # dropout was sketched here previously but is not in use.)
    first_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid',
                kernel_initializer=tf.keras.initializers.GlorotUniform(),
                bias_initializer=tf.keras.initializers.Zeros())(first_token)

    model = Model(inputs=[ids, att, tok], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=lr),
                  loss=binary_focal_loss(gamma=2., alpha=.25),
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

In [5]:
# Sequence length used both when encoding the texts and when building the model inputs.
MAX_LEN = 256
# Pretrained checkpoint name; the tokenizer here and the TFAutoModel later are both loaded from it.
MODEL = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [6]:
# Two training sources from the competition dataset.
train1 = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv"
)
train2 = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv"
)
# Round the `toxic` column of the second source and store it as integers.
train2['toxic'] = train2['toxic'].round().astype(int)

# Validation set, test set and the submission template.
valid = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv'
)
test = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv'
)
sub = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv'
)

In [7]:
# Only the text and the label are needed from either training source.
label_cols = ['comment_text', 'toxic']

# Final training frame, in this order:
#   1. every row of train1,
#   2. every train2 row labelled toxic,
#   3. a fixed-size, seeded random sample of the train2 rows labelled non-toxic.
train = pd.concat([
    train1[label_cols],
    train2.loc[train2['toxic'] == 1, label_cols],
    train2.loc[train2['toxic'] == 0, label_cols].sample(n=250465, random_state=0),
])

In [8]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Batch-encode `texts` and return the three model inputs as numpy arrays.

    Args:
        texts: collection of strings handed to `tokenizer.batch_encode_plus`.
        tokenizer: object exposing `batch_encode_plus`.
        maxlen: value forwarded as `max_length`; padding to that length is
            requested through `pad_to_max_length=True`.

    Returns:
        A 3-tuple of numpy arrays built from the `input_ids`,
        `attention_mask` and `token_type_ids` entries of the encoding.

    NOTE(review): the keyword names follow the tokenizer API this notebook was
    written against - confirm them against the installed transformers version.
    """
    encode_options = dict(
        return_attention_masks=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    wanted = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(np.array(encoded[key]) for key in wanted)

In [9]:
# Encode each split to MAX_LEN tokens. Training and validation texts come from
# the `comment_text` column; the test texts come from the `content` column.
input_ids, input_mask, input_type_ids = regular_encode(
    train['comment_text'].values, tokenizer, maxlen=MAX_LEN
)
input_ids_v, input_mask_v, input_type_ids_v = regular_encode(
    valid['comment_text'].values, tokenizer, maxlen=MAX_LEN
)
input_ids_t, input_mask_t, input_type_ids_t = regular_encode(
    test['content'].values, tokenizer, maxlen=MAX_LEN
)

# Labels for the two labelled splits.
y_train = train['toxic'].values
y_valid = valid['toxic'].values

In [10]:
# The DataFrames are no longer needed: later cells only use the encoded arrays
# and label arrays created above, so drop the frames before building the model.
del train, train1, train2, valid, test
# Tag used in the checkpoint file name (assigned again in the training cell).
VER = 'large'
# Reset TF graph / Keras session state, then force a garbage collection pass.
tf.compat.v1.reset_default_graph()
backend.clear_session()
# Last expression of the cell: its return value is displayed as the cell output.
gc.collect()

20

In [11]:
%%time
# Create the transformer and the classifier inside the distribution strategy's
# scope (the strategy was selected in the TPU-detection cell).
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, lr=1e-6)
    # Initialise the whole model from a previously saved weights file rather
    # than from the freshly downloaded pretrained encoder alone.
    model.load_weights('../input/xlmrobertatrained/xlm-roberta-large.h5')
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 256, 1024),  559890432   input_1[0][0]                    
_____________________________________________________________________________________________

In [12]:
VER = 'large'

# Checkpoint: write weights only, and only when val_loss improves on the best so far.
sv = tf.keras.callbacks.ModelCheckpoint(
    'xlm-roberta-%s.h5' % (VER),
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    save_freq='epoch',
)

# Early stopping: give up after 2 epochs without val_loss improvement and
# roll the model back to the best epoch's weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=2,
    restore_best_weights=True,
    mode='auto',
)

# Train on the combined training data, validating on the validation split.
train_history = model.fit(
    [input_ids, input_mask, input_type_ids],
    [y_train],
    epochs=3,
    batch_size=128,
    verbose=1,
    callbacks=[sv, es],
    validation_data=([input_ids_v, input_mask_v, input_type_ids_v], [y_valid]),
)

Train on 586240 samples, validate on 8000 samples
Epoch 1/3
Epoch 00001: val_loss improved from inf to 0.51050, saving model to xlm-roberta-large.h5
Epoch 2/3
Epoch 00002: val_loss did not improve from 0.51050
Epoch 3/3
Epoch 00003: val_loss did not improve from 0.51050
Restoring model weights from the end of the best epoch.
Epoch 00003: early stopping


In [13]:
# Continue training on the validation split itself. No validation data is
# passed here, so `val_loss` is never produced; callbacks that monitor
# `val_loss` (checkpoint-on-best / early stopping) could never fire and would
# only log a warning every epoch, so none are attached to this run.
valid_history = model.fit([input_ids_v, input_mask_v, input_type_ids_v], [y_valid],
                    epochs=5, batch_size=64, verbose=1,
                    validation_data=None)

Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Score the encoded test set.
test_inputs = [input_ids_t, input_mask_t, input_type_ids_t]
preds = model.predict(test_inputs, verbose=1, batch_size=64)

# Fill the submission template's `toxic` column with the predictions and write it out.
sample = pd.read_csv(
    '../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv'
)
sample['toxic'] = preds
sample.to_csv('submission.csv', index=False)

