## Imports

In [None]:
import os
import re
import numpy as np
import pandas as pd


import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from transformers import TFAutoModel, AutoTokenizer, RobertaTokenizerFast

from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors



## Helper Functions

In [None]:
def clean_sentence(sentence):
    '''
    Author: louilghada@gmail.com | kaggle.com/swannnn
    Strips noise from a single piece of text.

    Steps, in order:
      1. split on whitespace and drop every word that is one character long
         or contains a digit;
      2. delete these punctuation characters: . , : ; ? ! / | @ # $ % ^ & - _ ( ) { }
      3. delete everything from an occurrence of "http" or "www" up to the
         next whitespace.
    Letter case is left untouched and stopwords are NOT removed.

    Args:
        sentence: the text to clean (str).

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Removing a URL from the middle of the text leaves the surrounding
        spaces in place.
    '''
    kept_words = [
        word for word in sentence.split()
        if len(word) > 1 and not re.search(r'[0-9]', word)
    ]
    clean_sent = ' '.join(kept_words)
    # The hyphen must be escaped: a bare `&-_` inside a character class is a
    # range that also matches digits, uppercase letters, quotes and brackets.
    clean_sent = re.sub(r'[.,:;?!/\|@#$%^&\-_(){}]', '', clean_sent)
    clean_sent = re.sub(r'(http|www)\S*', '', clean_sent)
    return clean_sent.strip()

def clean_df(df, column):
    '''
    louilghada@gmail.com | kaggle.com/swannnn
    Lower-cases and cleans the text in one column of dataframe df, in place.

    Only `column` is touched; every other column keeps its values and dtype.
    Values are converted to str first, so missing values become the text
    produced by str() for them.

    Args:
        df: the dataframe to modify.
        column: name of the text column to clean.

    Returns:
        The same dataframe object, so the call can be used either for its
        side effect or for its return value.
    '''
    # Assign back into the caller's frame. Rebinding the local name `df` to a
    # transformed copy would leave the caller's data uncleaned.
    df[column] = df[column].astype(str).str.lower().apply(clean_sentence)
    return df

def regular_encode(texts, tokenizer, maxlen=512):
    """Encode a batch of texts into an array of token ids.

    Args:
        texts: the batch of texts handed to the tokenizer.
        tokenizer: object exposing ``batch_encode_plus``; its result must be
            indexable by ``'input_ids'``.
        maxlen: value forwarded to the tokenizer as ``max_length``.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the tokenizer output.
    """
    # Ask only for the ids, padded out to max_length.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

def build_model(transformer, max_len=512):
    """Wrap a transformer in a binary classifier and compile it.

    Args:
        transformer: callable applied to the int32 id tensor; element ``[0]``
            of its result is used as the per-token sequence output.
        max_len: length of the ``input_word_ids`` input.

    Returns:
        A compiled Keras ``Model`` with one sigmoid output unit, trained with
        binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Classify from the vector at the first sequence position only.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and not accepted by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

def build_CNN_model():
    """Build and compile a small 1-D CNN binary classifier over token ids.

    Reads two module-level globals that must be defined before the call:
    ``MAX_LEN`` (input length) and ``VOCAB_SIZE`` (a 2-item sequence used as
    the Embedding layer's ``(input_dim, output_dim)``).

    Layer stack: Embedding -> Conv1D(256, kernel 100) -> MaxPooling1D ->
    Conv1D(128, kernel 5) -> MaxPooling1D -> Dense(128) -> Dense(128) ->
    Flatten -> Dense(1, sigmoid). The two Dense(128) layers sit before
    Flatten, so they are applied to the still-sequential tensor.

    Returns:
        A compiled Keras ``Model`` trained with binary cross-entropy and
        reporting accuracy.
    """
    input_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_words_ids")
    embedding = Embedding(VOCAB_SIZE[0], VOCAB_SIZE[1], input_length=MAX_LEN, name='embed')(input_layer)
    conv_1 = Conv1D(256, (100), activation='relu')(embedding)
    max_pool = MaxPooling1D()(conv_1)
    conv_1 = Conv1D(128, (5), activation='relu')(max_pool)
    max_pool = MaxPooling1D()(conv_1)
    dense = Dense(128, activation='relu')(max_pool)
    dense = Dense(128, activation='relu')(dense)
    flatten = Flatten()(dense)
    out = Dense(1, activation='sigmoid')(flatten)

    model = Model(inputs = input_layer, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and not accepted by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from either line above is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used by the configuration cell to scale BATCH_SIZE.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Passed to prefetch() so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Base epoch count; the first CNN training cell uses EPOCHS*5.
EPOCHS = 1
# 16 examples per replica of the active distribution strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Sequence length used for tokenizer output and for the model inputs.
MAX_LEN = 192
# Pretrained checkpoint name, used for both the tokenizer and the transformer.
MODEL = 'jplu/tf-xlm-roberta-large'

## Load the tokenizer

In [None]:
# Load the pretrained tokenizer for the MODEL checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Load text data into memory

In [None]:
# Training data shipped with the competition dataset.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
# Additional training files from a separate dataset; judging by the file names
# these are es/fr/it/pt/ru/tr translations of the toxic-comment train file.
train3 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-es-cleaned.csv")
train4 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-fr-cleaned.csv")
train5 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-it-cleaned.csv")
train6 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-pt-cleaned.csv")
train7 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru-cleaned.csv")
train8 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-tr-cleaned.csv")

# Validation and test sets, each loaded from the competition dataset (valid1,
# test1) and from a "translated" dataset (valid2, test2), plus the submission template.
# NOTE(review): the column layout of the translated files is not visible here — confirm
# they expose the columns used later (comment_text / content / toxic).
valid1 = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
valid2 = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_valid_translated.csv')
test1 = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
test2 = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')


In [None]:
# Assemble the training frame. For each source, in order train1..train8, take
# every row with toxic==1 followed by a fixed-seed sample of 50,000 rows with
# toxic==0, keeping only the text and label columns.
# NOTE(review): the equality filters run before any rounding of `toxic`, so
# rows with fractional labels (if a source has them) are left out — confirm intended.
train = pd.concat([
    part
    for source_df in (train1, train2, train3, train4, train5, train6, train7, train8)
    for part in (
        source_df[['comment_text', 'toxic']].query('toxic==1'),
        source_df[['comment_text', 'toxic']].query('toxic==0').sample(n=50000, random_state=0),
    )
])

# Stack both validation frames into one.
valid = pd.concat([valid1, valid2])

In [None]:
# Run the text cleaner over the training comments. The return value is not
# captured, so this relies on clean_df updating the frame it is handed.
clean_df(train, 'comment_text')
# Round the labels and store them as integers.
train.toxic = train.toxic.round().astype(int)
# Shuffle all rows; no random_state is given, so the order differs between runs.
train = train.sample(frac = 1)

clean_df(valid, 'comment_text')

# The test frames keep their text in the 'content' column.
clean_df(test1, 'content')
clean_df(test2, 'content')

In [None]:
# Tokenize train/validation text into arrays of token ids, using MAX_LEN as max length.
x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)

# Same encoding for both test frames (text lives in 'content').
x_test1 = regular_encode(test1.content.values, tokenizer, maxlen=MAX_LEN)
x_test2 = regular_encode(test2.content.values, tokenizer, maxlen=MAX_LEN)

# Binary labels aligned row-for-row with x_train / x_valid.
y_train = train.toxic.values
y_valid = valid.toxic.values


In [None]:
# Encode the full, unsampled train1 comments.
# NOTE(review): x_train1 is not referenced anywhere else in the visible notebook;
# confirm whether this (potentially slow) encode is still needed.
x_train1 = regular_encode(train1.comment_text.values, tokenizer, maxlen=MAX_LEN)

# Endless, shuffled, batched training stream.
# NOTE(review): the next cell assigns train_dataset again with the same pipeline.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)


## Build dataset objects

In [None]:
# Training stream: repeat() makes it endless, so fit() is given steps_per_epoch;
# shuffling uses a 2048-element buffer before batching.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation stream: fixed order, batched, then cached and prefetched.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test streams carry inputs only (no labels) and keep the row order of the
# encoded arrays, which the blending cell relies on.
test1_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test1)
    .batch(BATCH_SIZE)
)

test2_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test2)
    .batch(BATCH_SIZE)
)

## Load models into the TPU

In [None]:
# (input_dim, output_dim) for the CNN's Embedding layer, read by build_CNN_model.
# input_dim must cover every token id the tokenizer can emit, so it comes from
# the tokenizer's vocabulary size rather than from the number of training rows.
# output_dim stays at MAX_LEN, which equals the previous x_train.shape[1].
VOCAB_SIZE = (len(tokenizer), MAX_LEN)

In [None]:
# Create the transformer and the classifier inside the strategy scope so their
# variables are created under the chosen distribution strategy.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    roberta_model = build_model(transformer_layer, max_len=MAX_LEN)
roberta_model.summary()

In [None]:
# Build the CNN under the same strategy scope; build_CNN_model reads the
# MAX_LEN and VOCAB_SIZE globals defined in earlier cells.
with strategy.scope():
    cnn_model = build_CNN_model()
cnn_model.summary()

## Train Models

First, we train the XLM-Roberta Model.

In [None]:
# One epoch = one pass over the training rows (train_dataset itself is endless).
n_steps = x_train.shape[0] // BATCH_SIZE

# Train on the training stream, evaluating on the validation set each epoch.
train_history = roberta_model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
# One epoch = one pass over the validation rows.
n_steps = x_valid.shape[0] // BATCH_SIZE

# Continue training the same model on the validation data itself. After this
# cell the validation set is no longer held out for this model.
train_history_2 = roberta_model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)

Then we train the CNN model.

In [None]:
# One epoch = one pass over the training rows (train_dataset itself is endless).
n_steps = x_train.shape[0] // BATCH_SIZE

# Train the CNN for five times the base epoch count.
# Note: this reuses the name train_history, replacing the XLM-RoBERTa history.
train_history = cnn_model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS*5
)

In [None]:
# One epoch = one pass over the validation rows.
n_steps = x_valid.shape[0] // BATCH_SIZE

# Continue training the CNN on the validation data itself.
# Note: this reuses the name train_history_2, replacing the XLM-RoBERTa history.
train_history_2 = cnn_model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)

## Blending

In [None]:
# Score both test sets with the XLM-RoBERTa model.
multi_ling_sub = roberta_model.predict(test1_dataset, verbose=1)
eng_sub = roberta_model.predict(test2_dataset, verbose=1)

# Equal-weight average of the two prediction arrays, written to the submission frame.
# NOTE(review): assumes test1 and test2 hold the same examples in the same row order — confirm.
# NOTE(review): cnn_model is trained above but contributes nothing to this blend — confirm intended.
sub['toxic'] = multi_ling_sub*0.5 + eng_sub*0.5

## Submission

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)