Over the past year, the field has seen impressive multilingual capabilities from the latest model innovations, including few- and zero-shot learning. We're excited to learn whether these results "translate" (pun intended!) to toxicity classification. Our training data will be the English data provided for our previous two competitions and your test data will be Wikipedia talk page comments in several different languages.
We will be using DistilBERT, as it is twice as fast and 25% lighter than multilingual BERT base while retaining 92% of its performance. This model lets you quickly experiment with different ideas, and when you are ready for the real thing, just change two lines of code to use bert-base-multilingual-cased.
* You can try other models such as BERT large, XLNet, RoBERTa, etc., and compare their performance.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing all the necessary libraries

In [None]:
!pip install -q googletrans
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

import transformers
import tensorflow as tf
from tqdm.notebook import tqdm
from wordcloud import WordCloud, STOPWORDS
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
from tokenizers import BertWordPieceTokenizer


sns.set(style="darkgrid")

Get the training data of both previous competitions and concatenate them into a single training dataset.

In [None]:
# Root folder of the competition data. Named `data_dir` rather than `dir` so
# that the builtin `dir()` is not shadowed for the rest of the session.
data_dir = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification'

# Training data of the two previous competitions.
train_set1 = pd.read_csv(os.path.join(data_dir, 'jigsaw-toxic-comment-train.csv'))
train_set2 = pd.read_csv(os.path.join(data_dir, 'jigsaw-unintended-bias-train.csv'))
# Round `toxic` to the nearest integer so it can be used as a class label.
train_set2['toxic'] = train_set2['toxic'].round().astype(int)

# Validation and test splits.
valid = pd.read_csv(os.path.join(data_dir, 'validation.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [None]:
# Assemble the training frame from three parts, keeping only text and label:
#   1. every row of `train_set1`,
#   2. every row of `train_set2` labelled toxic,
#   3. a fixed-seed random sample of 100,000 non-toxic rows of `train_set2`.
columns = ['comment_text', 'toxic']
bias_rows = train_set2[columns]
toxic_rows = bias_rows.query('toxic==1')
clean_sample = bias_rows.query('toxic==0').sample(n=100000, random_state=0)
train = pd.concat([train_set1[columns], toxic_rows, clean_sample])

In [None]:
# Print the (rows, columns) shape of the combined training frame, then preview its first rows.
print(train.shape)
train.head()

In [None]:
# Print the (rows, columns) shape of the validation frame, then preview its first rows.
print(valid.shape)
valid.head()

In [None]:
# List the distinct values of the `lang` column in the validation set.
valid.lang.unique()

In [None]:
# Print the languages explicitly: a notebook cell only echoes its last
# expression, so a bare `test.lang.unique()` placed before `test.head()`
# would compute the values without ever showing them.
print(test.lang.unique())
test.head()

# EDA on textual data and toxic distribution

In [None]:
# Count how many training comments fall under each `toxic` value.
print(train.toxic.value_counts())

In [None]:
# Bar chart of the label counts. The column is passed as `x=` because current
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x=train.toxic)

In [None]:
# Compare the sizes of the train and test sets, in rows and as a rounded
# percentage of the combined total. The total is named `total_rows` so that
# the builtin `sum()` is not shadowed.
nrow_train = train.shape[0]
nrow_test = test.shape[0]
total_rows = nrow_train + nrow_test
print("       : train : test")
print("rows   :", nrow_train, ":", nrow_test)
print("perc   :", round(nrow_train * 100 / total_rows), "   :", round(nrow_test * 100 / total_rows))

In [None]:
# Label columns present in `train`. The frame is built with only
# `comment_text` and `toxic`, so a positional slice `iloc[:, 2:]` selects no
# label column at all (every comment would count as clean, with zero tags) and
# would pick up the derived `clean` column when the cell is run a second time.
label_cols = ['toxic']
x = train[label_cols].sum()
# Mark comments without any tag as "clean".
rowsums = train[label_cols].sum(axis=1)
train['clean'] = (rowsums == 0)
# Report the totals.
print("Total comments = ", len(train))
print("Total clean comments = ", train['clean'].sum())
print("Total tags =", x.sum())

In [None]:
# Report the number of missing values per column in both frames.
print("Check for missing values in Train dataset")
null_check = train.isnull().sum()
print(null_check)
print("Check for missing values in Test dataset")
null_check = test.isnull().sum()
print(null_check)
print("filling NA with \"unknown\"")
# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on the selected column: the in-place form operates on an intermediate object
# and is not guaranteed to update `train` (pandas warns about this pattern).
train["comment_text"] = train["comment_text"].fillna("unknown")
# NOTE: the test text (column `content`) is not filled here; it is cast with
# `astype(str)` right before encoding.

In [None]:
# Number of validation comments per language.
plt.figure(figsize=(8, 4))
# Pass the column as `x=`: current seaborn releases treat the first positional
# argument as `data`, not `x`.
ax = sns.countplot(x=valid.lang, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type of Language', fontsize=12)


In [None]:
# Number of test comments per language.
plt.figure(figsize=(8, 4))
# Pass the column as `x=`: current seaborn releases treat the first positional
# argument as `data`, not `x`.
ax = sns.countplot(x=test.lang, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type of Language', fontsize=12)


In [None]:
# Number of validation comments per `toxic` label.
plt.figure(figsize=(8, 4))
# Pass the column as `x=`: current seaborn releases treat the first positional
# argument as `data`, not `x`.
ax = sns.countplot(x=valid.toxic, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type ', fontsize=12)


In [None]:
def get_ax(rows=1, cols=2, size=7):
    """Create a grid of subplots, allotting a `size` x `size` area to each cell.

    Args:
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        size: Side length given to each subplot when sizing the figure.

    Returns:
        The `(figure, axes)` pair produced by `plt.subplots`.
    """
    width, height = size * cols, size * rows
    return plt.subplots(rows, cols, figsize=(width, height))

In [None]:
# Distribution of comment length (in characters) per class:
# label 0 is drawn on the first axis, label 1 on the second.
fig, ax = get_ax()
for label, axis in zip((0, 1), ax):
    comment_lengths = train[train["toxic"] == label]["comment_text"].str.len()
    sns.distplot(comment_lengths, ax=axis)

In [None]:
def wordcloud(data):
    """Render a word cloud of the text in `data` with matplotlib.

    Args:
        data: A pandas Series, list or tuple of texts, whose items are joined
            with spaces; any other object is converted with `str()`.
    """
    # `str(series)` yields only the abbreviated repr of a Series (a handful of
    # truncated rows plus index labels and a name/dtype footer), so the cloud
    # must be built from the joined items instead.
    if isinstance(data, (pd.Series, list, tuple)):
        text = " ".join(str(item) for item in data)
    else:
        text = str(data)
    cloud = WordCloud(background_color='Black',
                      max_words=50,
                      max_font_size=40,
                      scale=5,
                      random_state=5).generate(text)
    plt.figure(1, figsize=(10, 10))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


# Word cloud built from the training comments.
wordcloud(train["comment_text"])

In [None]:
# Word cloud built from the validation comments.
wordcloud(valid["comment_text"])

In [None]:
# Word cloud built from the test comments (stored in the `content` column).
wordcloud(test["content"])

# Modelling

Setting a fixed encoding size, i.e., tokenizing each input string and padding (or truncating) it to the same length

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize `texts` in chunks and return fixed-length token ids.

    Based on
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable collection of strings (pandas Series, NumPy array
            or plain list).
        tokenizer: Tokenizer exposing `enable_truncation`, `enable_padding`
            and `encode_batch`.
        chunk_size: Number of texts passed to each `encode_batch` call.
        maxlen: Length every sequence is truncated and padded to.

    Returns:
        NumPy array built from the token ids, one entry per input text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        # Keyword used by current `tokenizers` releases.
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        # Older releases named the same option `max_length`.
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Series and arrays provide `tolist()`; plain lists are copied as-is.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

Building the model: an input layer for the encoded string, a transformer layer for processing, and a final dense layer to get the predictions. Since it is a binary classification task, we use binary cross-entropy as the loss.

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of a transformer encoder.

    Based on
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: Callable transformer layer; element 0 of its output is
            used as the per-position sequence output.
        max_len: Number of token ids in each input sequence.

    Returns:
        A compiled Keras `Model` mapping token ids to a single sigmoid score.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the output at the first position, used here as the summary
    # of the whole sequence.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # optimizers no longer accept.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

Configuring TPU
* [Read the TPU documentation](https://www.kaggle.com/docs/tpu) one-pager

In [None]:
# Look for a TPU. The resolver takes no arguments when the TPU_NAME
# environment variable is set, which is always the case on Kaggle; a
# ValueError is taken to mean that no TPU is available.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Passed to `prefetch()` so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced anywhere else in this notebook.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 3  # epochs of the first `model.fit` call (the second one uses twice as many)
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
MAX_LEN = 192  # sequence length used for encoding and for the model input

Instantiating the tokenizer from the DistilBERT model and then building a fast WordPiece tokenizer from its vocabulary

In [None]:
# Load the pretrained tokenizer of the multilingual DistilBERT checkpoint.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Build the fast WordPiece tokenizer from 'vocab.txt' (presumably written by
# `save_pretrained` above -- verify); lowercasing is disabled.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Echo the tokenizer as the cell output.
fast_tokenizer

In [None]:
# Encode the text column of each split to MAX_LEN token ids; values are cast
# to str first. The test set keeps its text in `content`.
x_train, x_valid, x_test = (
    fast_encode(texts.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    for texts in (train.comment_text, valid.comment_text, test.content)
)

# Labels of the two splits that have a `toxic` column.
y_train = train.toxic.values
y_valid = valid.toxic.values

Converting the data into `tf.data.Dataset` objects for TPU processing.

In [None]:
# Training pipeline. The rows of `x_train` follow the order in which the
# source frames were concatenated, so a small shuffle buffer would only mix
# neighbouring rows and leave long single-source stretches in place. Shuffle
# with a buffer covering the whole set, and do it before `repeat()` so that
# every example is drawn once per pass over the data.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(len(x_train))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: original order, cached after batching.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only, original order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

Model initialization and fitting on train and valid sets

In [None]:
%%time
# Create the transformer layer and the classifier inside the distribution
# strategy's scope.
with strategy.scope():
    # Pretrained multilingual DistilBERT checkpoint.
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# First stage: fit on the training dataset for EPOCHS epochs, evaluating on
# the validation dataset. Each epoch runs as many whole batches as fit into
# the training set.
n_steps = len(x_train) // BATCH_SIZE
train_history = model.fit(
    x=train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

In [None]:
# Second stage: continue fitting on the validation dataset itself (repeated
# so it never runs out) for twice as many epochs as the first stage.
n_steps = len(x_valid) // BATCH_SIZE
train_history_2 = model.fit(
    x=valid_dataset.repeat(),
    epochs=EPOCHS * 2,
    steps_per_epoch=n_steps,
)

In [None]:
    preds = model.predict(test_dataset,verbose = 1)
    # # final = pd.DataFrame({"test_content":test.content,"Preds":preds})
    # # final.head()

In [None]:

# Write the predictions to CSV. The default index of a new DataFrame already
# runs 0..n-1 and the index is not written (`index=False`), so no explicit
# index list is needed.
# NOTE(review): earlier commented-out code filled a `toxic` column and wrote
# 'submission.csv'; confirm which file name and layout the submission expects.
sub = pd.DataFrame(preds)
sub.to_csv("submiss.csv", index=False)

Thank you for visiting this notebook! If you like it, please upvote! It motivates me to learn more.