In [0]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from kaggle_datasets import KaggleDatasets
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tqdm.notebook import tqdm
from transformers import TFAutoModel, AutoTokenizer

In [0]:
def regular_encode(texts, tokenizer, maxlen):
    """Tokenise a batch of texts and return their token ids as an array.

    Args:
        texts: Iterable of strings handed straight to the tokenizer.
        tokenizer: Object exposing ``batch_encode_plus``.
        maxlen: Value forwarded as ``max_length``; padding to that length
            is requested through ``pad_to_max_length=True``.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
        Presumably shaped (len(texts), maxlen) when the tokenizer pads and
        truncates every row to ``maxlen`` -- TODO confirm.
    """
    # NOTE(review): these keyword names look specific to an older
    # transformers release -- confirm against the installed version.
    encode_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    return np.array(encoded['input_ids'])

In [0]:
# Choose a tf.distribute strategy matching the attached hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the
    # strategy that spreads work across its cores.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [0]:
# Configuration
MODEL = 'jplu/tf-xlm-roberta-base'  # pretrained checkpoint for tokenizer and encoder
MAX_LEN = 512                       # max_length passed to the tokenizer
EPOCHS = 1
# Global batch size: 16 examples for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# Lets tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [0]:
# Two training sources plus the translated validation set.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_valid_translated.csv")

# Round the second source's `toxic` column to integer labels.
# Presumably it holds fractional scores in [0, 1] -- TODO confirm.
train2["toxic"] = train2["toxic"].round().astype(int)

In [0]:
# Build the training frame from both sources: every toxic row plus a
# fixed-size random sample of the non-toxic rows.
# NOTE(review): 21384 and 112226 presumably equal the number of toxic rows
# in each source so the classes end up balanced -- confirm.
train = pd.concat([
    train1[['comment_text', 'toxic']].query('toxic==0').sample(n=21384, random_state=0),
    train1[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=112226, random_state=0),
    train2[['comment_text', 'toxic']].query('toxic==1'),
])

# The pieces above are stacked class by class, and the input pipeline only
# shuffles within a 2048-element buffer, which cannot mix runs of tens of
# thousands of same-label rows. Shuffle the whole frame once (seeded, so it
# is reproducible) and drop the duplicated source indices.
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

In [0]:
def clean(text):
    """Normalise a pandas Series of comments before tokenisation.

    Missing values become the literal string "fillna" and everything is
    lowercased. The following substitutions are then applied in order to
    each element (after ``str()`` conversion):

    1. newlines are replaced by a single space;
    2. everything from the first ``[[User`` to the end of the text is dropped;
    3. dotted quads of 1-3 digit groups (IPv4-looking strings) are removed;
    4. spans of the form ``(http://... (http://...)`` are removed.

    Args:
        text: pandas Series of raw comment strings (may contain NaN).

    Returns:
        A new Series of cleaned strings with the same index.
    """
    substitutions = (
        (re.compile(r'\n'), ' '),
        # The text is already lowercased at this point, so the match must
        # ignore case or the capitalised "User" would never be found.
        (re.compile(r'\[\[User.*', re.IGNORECASE), ''),
        (re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'), ''),
        (re.compile(r'\(http://.*?\s\(http://.*\)'), ''),
    )

    def _scrub(value):
        # str() mirrors the per-element conversion of non-string entries.
        value = str(value)
        for pattern, replacement in substitutions:
            value = pattern.sub(replacement, value)
        return value

    return text.fillna("fillna").str.lower().map(_scrub)

# Clean the raw training comments in place; for validation, the cleaned
# text comes from the `translated` column and is stored as `comment_text`.
train["comment_text"] = clean(train["comment_text"])
valid["comment_text"] = clean(valid["translated"])

# Label arrays for training and validation.
y_train = train["toxic"].values
y_valid = valid["toxic"].values

In [0]:
%%time 

# Tokenise both splits with max_length MAX_LEN.
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)

# Number of training examples; used later to derive steps per epoch.
n = len(x_train)

CPU times: user 3min 55s, sys: 1.71 s, total: 3min 57s
Wall time: 3min 57s


In [0]:
# Training stream: repeated indefinitely, shuffled through a 2048-element
# buffer, batched, and prefetched.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation stream: one ordered pass, batched only.
test_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(BATCH_SIZE)

In [0]:
# Drop the names of the encoded arrays and source frames, which are no
# longer referenced directly (`n`, `valid` and `y_valid` are kept).
del x_train, y_train, x_valid
del train, train1, train2

In [0]:
def build_model(transformer, max_len, learning_rate=1e-3):
    """Build and compile a binary classifier on top of a transformer encoder.

    The first output of the transformer is flattened across all positions,
    passed through dropout (rate 0.1) and fed to a single sigmoid unit.

    Args:
        transformer: Callable Keras-compatible encoder; its first output is
            used. Presumably shaped (batch, max_len, hidden) -- TODO confirm.
        max_len: Length of the integer token-id input sequence.
        learning_rate: Step size for the Adam optimizer. Defaults to 1e-3.
            NOTE(review): fine-tuning pretrained transformers commonly uses
            much smaller rates -- confirm this default is intended.

    Returns:
        A compiled ``Model`` with binary cross-entropy loss and an AUC metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    # Every position of the encoder output is used (not just the first
    # token), so it is flattened before the classification head.
    flat = Flatten()(sequence_output)
    drop = Dropout(0.1)(flat)
    out = Dense(1, activation='sigmoid')(drop)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC()],
    )

    return model

In [0]:
%%time
# Load the pretrained encoder and build the classifier inside the
# distribution strategy's scope, then print the layer summary.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, MAX_LEN)

model.summary()

In [0]:
# The training dataset repeats forever, so the length of an epoch is set
# explicitly: as many full batches as fit into the n training examples.
n_steps = n // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=test_dataset,
)

In [0]:
# Score the validation set. The model has a single output unit, so the
# predictions are flattened to 1-D before being passed to the metric.
pre = model.predict(test_dataset, verbose=1)
print("Validation ROC AUC (predicted probabilities):", roc_auc_score(y_valid, pre.ravel()))

# Same metric on predictions rounded to hard 0/1 labels.
pre2 = pre.round()
print("Validation ROC AUC (predictions rounded to 0/1):", roc_auc_score(y_valid, pre2.ravel()))