In [None]:
import os

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Dropout, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
import transformers

from tokenizers import BertWordPieceTokenizer

from tqdm.notebook import tqdm

from kaggle_datasets import KaggleDatasets

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Tokenize texts in chunks and return their token ids as one array.

    Source: https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Parameters
    ----------
    texts : sliceable sequence of str
        Anything supporting ``len()`` and slicing (pandas Series, numpy
        array, list, ...).
    tokenizer : object
        Must provide ``enable_truncation``, ``enable_padding`` and
        ``encode_batch``; each encoding returned must expose ``.ids``.
        Note that truncation/padding are enabled on the tokenizer itself,
        so the object passed in is modified.
    chunk_size : int
        Number of texts handed to ``encode_batch`` per call.
    maxlen : int
        Length passed to both truncation and padding.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of per-text id lists.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        # Keyword used by newer `tokenizers` releases.
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        # Older releases only know the legacy `max_length` keyword.
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Series/ndarray expose tolist(); plain sequences are copied as-is.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
def build_model(transformer, max_len=512):
    """
    Build and compile a binary classifier on top of a transformer.

    Source: https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Parameters
    ----------
    transformer : callable
        Called on the int32 token-id input; element ``[0]`` of its result
        is used and indexed as a rank-3 tensor (presumably the last hidden
        state of a HuggingFace TF model -- confirm against the caller).
    max_len : int
        Length of the token-id input.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam, binary cross-entropy loss and an
        accuracy metric; the output is a single sigmoid unit.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only position 0 along the sequence axis as the pooled features.
    cls_token = sequence_output[:, 0, :]

    # Classification head: Dense -> Dropout -> Dense -> sigmoid.
    dense_layer = Dense(224, activation='relu')(cls_token)
    dense_layer = Dropout(0.2)(dense_layer)
    out = Dense(224, activation='relu')(dense_layer)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # versions no longer accept.
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained transformer
    # -- confirm this value is intentional.
    model.compile(
        Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

# def build_model(transformer, max_len=512):
#     model = Sequential()
#     model.add(Embedding(119547, 500, input_length=MAX_LEN))
#     model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))
#     model.add(Dense(256, activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])



# TPU Configs

In [None]:
# Pick a tf.distribute strategy for whatever hardware is available.
try:
    # With no arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# tf.data autotune sentinel; used below as the prefetch buffer size of the
# training and validation pipelines.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# GCS_DS_PATH = KaggleDatasets().get_gcs_path()

In [None]:
# Configuration
# NOTE(review): EPOCHS is not referenced by the model.fit calls visible in
# this notebook; they hard-code their own epoch counts.
EPOCHS = 2
# Global batch size: 64 rows per replica of the distribution strategy.
BATCH_SIZE = 64 * strategy.num_replicas_in_sync
# Sequence length passed to fast_encode (maxlen) and build_model (max_len).
MAX_LEN = 200 #192

# Create fast tokenizer

In [None]:
# Download the pretrained tokenizer and write its files (including vocab.txt)
# to the working directory so the fast tokenizer can load the same vocabulary.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
tokenizer.save_pretrained('.')
# The checkpoint is *cased*: lowercasing the input would produce tokens that
# do not match the vocabulary the model was pretrained with.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

# Load text data

In [None]:
# Directory containing the competition CSV files read below.
DATA_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"

In [None]:
# Two training sources.
train1 = pd.read_csv(os.path.join(DATA_PATH, "jigsaw-toxic-comment-train.csv"))
train2 = pd.read_csv(os.path.join(DATA_PATH, "jigsaw-unintended-bias-train.csv"))
# Round train2's `toxic` column and cast it to int so it can serve as a
# binary label.
train2['toxic'] = train2['toxic'].round().astype(int)

# Validation set, test set and the submission template.
valid = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

In [None]:
# Preview the first rows of the first training file.
train1.head()

In [None]:
# Training frame = every row of train1 plus only the rows of train2 whose
# label is 1. Both parts are reduced to the text and label columns.
# No sampling happens here, so there is no random_state to set.
train = pd.concat(
    [
        train1.loc[:, ['comment_text', 'toxic']],
        train2.loc[train2['toxic'] == 1, ['comment_text', 'toxic']],
    ]
)

In [None]:
# Class balance of the combined training labels.
train.toxic.value_counts()

In [None]:
# Encode each split's text column to MAX_LEN token ids per row.
# The test file keeps its text in `content` rather than `comment_text`.
x_train = fast_encode(
    train['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN
)
x_valid = fast_encode(
    valid['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN
)
x_test = fast_encode(
    test['content'].astype(str), fast_tokenizer, maxlen=MAX_LEN
)

# Labels as plain arrays, row-aligned with the encoded inputs.
y_train = train['toxic'].values
y_valid = valid['toxic'].values

# Build datasets objects

In [None]:
# Training pipeline: repeat -> shuffle (buffer of 2048) -> batch -> prefetch.
# It repeats indefinitely, so fit() must be given steps_per_epoch.
# NOTE(review): the 2048-row shuffle buffer is small relative to a training
# frame built by concatenating two sources -- confirm mixing is sufficient.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batch -> cache -> prefetch, no shuffling.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache()
valid_dataset = valid_dataset.prefetch(AUTO)

# Test pipeline: inputs only, batched in file order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Load model into the TPU

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

# Train Model

First, we train on the subset of the training set, which is completely in English.

In [None]:
# train_dataset repeats forever, so an epoch is defined explicitly as the
# number of full batches in the encoded training array.
n_steps = len(x_train) // BATCH_SIZE
train_history = model.fit(
    x=train_dataset,
    epochs=1,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

Now that we have pretty much saturated the learning potential of the model on English-only data, we train it further on the `validation` set (three more epochs in the code below), which is significantly smaller but contains a mixture of different languages.

# Submission

In [None]:
# sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
# # print(submi.shape)
# sub = model.predict(test_dataset, verbose=1)
# # sub['toxic1'].to_csv('submission.csv', index=True)

In [None]:
# sub
# submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
# submission.toxic = sub
# submission.toxic.shape

In [None]:
# submission

In [None]:
# max(submission.toxic)

In [None]:
#submission.to_csv('submission.csv', index=False)

In [None]:
# Continue training on the validation split.
# valid_dataset yields batches of BATCH_SIZE rows, so one pass over it is
# x_valid.shape[0] // BATCH_SIZE steps. Using the raw row count as the step
# count would make every "epoch" BATCH_SIZE passes over the data.
# max(1, ...) keeps at least one step when the split is smaller than a batch.
history = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=max(1, x_valid.shape[0] // BATCH_SIZE),
    epochs=3,
)

In [None]:
# Load the submission template.
submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

# Score the test batches and write the predictions into the `toxic` column.
sub = model.predict(test_dataset, verbose=1)
submission['toxic'] = sub
submission['toxic'].shape

In [None]:
# Write the filled-in submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)