In [None]:
import pandas as pd
import numpy as np
import sklearn
from transformers import pipeline
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the train/validation split is identical on every
# "Restart & Run All"; without it the split is re-drawn on each run.
SEED = 42

# Read only the two columns that are used and give them the names the rest
# of the notebook expects ("text" for the input, "toxic_score" for the label).
df = pd.read_csv(
    "../input/ruddit-jigsaw-dataset-combined-cleaned/toxic_train.csv",
    usecols=["processed", "isOffensive"],
).rename(columns={"processed": "text", "isOffensive": "toxic_score"})
#jigsaw=pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
#df.to_csv("train.csv",index=False)

# 80/20 split, stratified on the label so both parts keep the same label
# distribution.
df_train, df_val = train_test_split(
    df,
    stratify=df.toxic_score,
    test_size=0.2,
    random_state=SEED,
)

# Frame that is encoded and scored at the end of the notebook.
test = pd.read_csv("../input/jigsaw-toxic-comment-clean-data/clean_test_data.csv")

In [None]:
import transformers
from tokenizers import BertWordPieceTokenizer
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.optimizers import Adam

In [None]:
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable tf.distribute.TPUStrategy; the experimental alias is
    # deprecated and is only used as a fallback on older TensorFlow releases
    # that do not expose the stable name yet.
    tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
    if tpu_strategy_cls is None:
        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# ---- Training configuration ----
# Sequence length used both when encoding the texts and as the model input size.
MAX_LEN = 192
EPOCHS = 3
# Global batch size: 16 examples for each replica reported by the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# Autotune sentinel handed to the tf.data pipelines' prefetch() calls.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    '''
    Tokenize `texts` chunk by chunk and return all token ids as one array.

    Parameters
    ----------
    texts : sliceable sequence of str
        Must support `len()` and slicing with `texts[a:b]`.
    tokenizer : object exposing `batch_encode_plus`
        Called once per chunk; its result must provide an 'input_ids' entry
        holding one id list per input text.
    chunk_size : int
        Number of texts sent to the tokenizer per call.
    maxlen : int
        Passed as `max_length`; every text is padded and truncated to it.

    Returns
    -------
    numpy.ndarray
        The collected 'input_ids', one row per text, in input order.
    '''
    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = list(texts[start:start + chunk_size])
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the cut to `maxlen` explicit instead of
        # relying on the tokenizer's implicit fallback. Together they request
        # exactly `maxlen` ids per row so the result stacks into a 2-D array.
        encs = tokenizer.batch_encode_plus(
            text_chunk,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
        )
        all_ids.extend(encs['input_ids'])

    return np.array(all_ids)

In [None]:
# Tokenizer loaded from the same checkpoint name as the model built further down.
fast_tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-multilingual-cased'
)

In [None]:
# Encode train, validation and test texts (in that order) with identical
# settings; the generator is consumed left to right by the unpacking.
x_train, x_valid, x_test = (
    fast_encode(frame[column].astype(str), fast_tokenizer, maxlen=MAX_LEN)
    for frame, column in (
        (df_train, "text"),
        (df_val, "text"),
        (test, "clean_text"),
    )
)

In [None]:
# Label arrays taken from the same frames (and row order) as x_train / x_valid.
y_train, y_valid = (
    df_train["toxic_score"].values,
    df_val["toxic_score"].values,
)


In [None]:
# Training pipeline: repeats indefinitely, shuffles with a 2048-element
# buffer, then batches and prefetches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: single pass, with the batched data cached.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
def build_model(transformer, maxlen=512, learning_rate=1e-5):
    """Build and compile a single-output sigmoid classifier on top of `transformer`.

    Parameters
    ----------
    transformer : callable Keras-compatible layer/model
        Called on the int32 id tensor; element ``[0]`` of its result is used
        as the per-position output and must be indexable as
        ``[batch, position, feature]``.
    maxlen : int
        Length of the input id sequence.
    learning_rate : float
        Learning rate for the Adam optimizer (defaults to the previously
        hard-coded 1e-5).

    Returns
    -------
    tf.keras.Model
        Compiled with binary cross-entropy loss and an accuracy metric.
    """
    input_word_ids = tf.keras.layers.Input(
        shape=(maxlen,), dtype=tf.int32, name='input_word_ids'
    )
    sequence_output = transformer(input_word_ids)[0]

    # Only the vector at position 0 of each sequence feeds the classifier head.
    first_token_output = sequence_output[:, 0, :]
    out = tf.keras.layers.Dense(1, activation='sigmoid')(first_token_output)

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Load the pretrained encoder and build the classifier inside the strategy
# scope selected earlier in the notebook.
with strategy.scope():
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'
    )
    model = build_model(transformer_layer, maxlen=MAX_LEN)

model.summary()

In [None]:
# train_dataset repeats forever, so the epoch length has to be given
# explicitly: the number of full batches in the training set.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

In [None]:
# Save options that pin the I/O device to '/job:localhost', then write the
# trained model to ./model.
save_locally = tf.saved_model.SaveOptions(
    experimental_io_device='/job:localhost'
)
model.save('./model', options=save_locally)

In [None]:
# Load the submission template and fill its 'score' column with the model's
# predictions for the encoded test set.
sample = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
test_scores = model.predict(test_dataset, verbose=1)
sample['score'] = test_scores
#sample.to_csv('submission.csv', index=False)

#sample.head(1)

In [None]:
# Preview the first rows only instead of rendering the entire frame.
sample.head()