In [8]:
# Installs the third-party `translate` package, which provides the
# `Translator` class imported in the next cell.
!pip install translate



In [24]:

from translate import Translator
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras import layers
from keras.engine import InputSpec, Layer
from tensorflow.keras.layers import Dropout, Dense, Input, Embedding,concatenate, SpatialDropout1D, GlobalAveragePooling1D, Conv1D, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, BertWordPieceTokenizer

In [10]:
def fast_encode(texts, tokenizer, chunk_size=240, maxlen=512):
    """Tokenize `texts` in batches and return all token ids as one array.

    The tokenizer is first configured (in place) to truncate and pad with
    `max_length=maxlen`. `texts` is then sliced into consecutive chunks of at
    most `chunk_size` items and each chunk is encoded with `encode_batch`.

    Args:
        texts: sliceable sequence of strings whose slices expose `.tolist()`
            (e.g. a NumPy array).
        tokenizer: object providing `enable_truncation`, `enable_padding` and
            `encode_batch`; the encodings it returns must have an `.ids`
            attribute.
        chunk_size: number of texts handed to each `encode_batch` call.
        maxlen: value forwarded as `max_length` to truncation and padding.

    Returns:
        `np.array` built from the `.ids` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    chunk_starts = range(0, len(texts), chunk_size)
    token_rows = [
        encoding.ids
        for start in chunk_starts
        for encoding in tokenizer.encode_batch(texts[start:start + chunk_size].tolist())
    ]
    return np.array(token_rows)

In [36]:
def build_cnn_model(transformer, max_len):
    """Build and compile a baseline classifier over frozen transformer embeddings.

    Despite the name, no convolution is applied: the embedded tokens go
    through spatial dropout, are flattened, pass a second dropout and feed a
    single sigmoid unit.

    Args:
        transformer: model whose first weight (`transformer.weights[0]`) is
            used to initialise the frozen `Embedding` layer (presumably the
            word-embedding table -- TODO confirm for the checkpoint in use).
        max_len: number of token ids per input sequence.

    Returns:
        A compiled Keras `Model` taking `(max_len,)` int32 token ids and
        producing one sigmoid output, trained with binary cross-entropy and
        monitored with AUC.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Copy the transformer's first weight matrix into a non-trainable lookup.
    embed = transformer.weights[0].numpy()
    embedding = Embedding(np.shape(embed)[0], np.shape(embed)[1],
                          input_length=max_len, weights=[embed],
                          trainable=False)(input_word_ids)

    # Drops whole embedding channels; 0.8 is the rate chosen by the author.
    dropped_embedding = SpatialDropout1D(0.8)(embedding)

    flat = layers.Flatten()(dropped_embedding)
    drop = Dropout(0.5)(flat)
    probability = Dense(1, activation='sigmoid')(drop)

    model = Model(inputs=input_word_ids, outputs=probability)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=0.01),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC()])

    return model

In [12]:
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average attention mechanism from:
        Zhou, Peng, Wei Shi, Jun Tian, Zhenyu Qi, Bingchen Li, Hongwei Hao and Bo Xu.
        “Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification.”
        ACL (2016). http://www.aclweb.org/anthology/P16-2034

    How to use:
        Apply the layer to a 3-D tensor (the build step asserts rank 3), e.g.
        the output of a recurrent layer created with `return_sequences=True`.
        The result is `tanh` of the attention-weighted sum over axis 1.
        With `return_attention=True` the layer returns `[h_star, alpha]`,
        where `alpha` holds the per-timestep attention weights.

    NOTE(review): `Layer`, `InputSpec` and `K` are imported from the standalone
    `keras` package while the model builders in this notebook use
    `tensorflow.keras`; confirm the two are compatible before wiring this
    layer into those models.
    """

    def __init__(self, return_attention=False, **kwargs):
        # Initialise the base layer first so that it cannot overwrite the
        # attributes assigned below (e.g. `supports_masking`).
        super(AttentionWeightedAverage, self).__init__(**kwargs)
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # `add_weight` registers the variable with the layer, so no manual
        # assignment to `trainable_weights` is needed.
        self.w = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_w'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, h, mask=None):
        h_shape = K.shape(h)
        batch, timesteps = h_shape[0], h_shape[1]

        logits = K.dot(h, self.w)  # w^T h
        logits = K.reshape(logits, (batch, timesteps))
        # Subtract the row maximum before exponentiating for numerical stability.
        alpha = K.exp(logits - K.max(logits, axis=-1, keepdims=True))  # exp

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            alpha = alpha * mask
        # epsilon keeps the division finite when every timestep of a row is masked
        alpha = alpha / (K.sum(alpha, axis=1, keepdims=True) + K.epsilon())  # softmax
        r = K.sum(h * K.expand_dims(alpha), axis=1)  # r = h*alpha^T
        h_star = K.tanh(r)  # h^* = tanh(r)
        if self.return_attention:
            return [h_star, alpha]
        return h_star

    def get_output_shape_for(self, input_shape):
        # Alias that simply delegates to `compute_output_shape`.
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        # The time axis is reduced away, so no mask is propagated downstream.
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created
        # from its config.
        config = super(AttentionWeightedAverage, self).get_config()
        config['return_attention'] = self.return_attention
        return config

In [51]:
def build_lstm_model(transformer, max_len):
    """Build and compile an LSTM classifier over frozen transformer embeddings.

    Token ids are embedded with a non-trainable copy of the transformer's
    first weight matrix, passed through spatial dropout and a 16-unit LSTM
    that returns the full sequence; the sequence is flattened, regularised
    with dropout and mapped to a single sigmoid unit.

    Args:
        transformer: model whose first weight (`transformer.weights[0]`) is
            used to initialise the frozen `Embedding` layer (presumably the
            word-embedding table -- TODO confirm for the checkpoint in use).
        max_len: number of token ids per input sequence.

    Returns:
        A compiled Keras `Model` taking `(max_len,)` int32 token ids and
        producing one sigmoid output, trained with binary cross-entropy and
        monitored with accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    embed = transformer.weights[0].numpy()
    embedding = Embedding(np.shape(embed)[0], np.shape(embed)[1],
                          input_length=max_len, weights=[embed],
                          trainable=False)(input_word_ids)

    embedding = SpatialDropout1D(0.3)(embedding)
    lstm_1 = LSTM(16, return_sequences=True)(embedding)

    # Flatten all per-timestep LSTM outputs into one feature vector.
    flat = layers.Flatten()(lstm_1)
    drop = Dropout(0.5)(flat)
    conc = Dense(1, activation='sigmoid')(drop)

    model = Model(inputs=input_word_ids, outputs=conc)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=["accuracy"])

    return model

In [14]:
# Load the two training CSVs and the validation CSV.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')

# Turn the second file's `toxic` column into rounded integer labels.
train2["toxic"] = train2["toxic"].round().astype(int)

# For each source: a fixed-size, seeded sample of the toxic==0 rows followed
# by every toxic==1 row. Resulting order: train1 neg, train1 pos, train2 neg,
# train2 pos.
train = pd.concat([
    part
    for frame, n_negative in ((train1, 21384), (train2, 112226))
    for part in (
        frame[['comment_text', 'toxic']].query('toxic==0').sample(n=n_negative, random_state=0),
        frame[['comment_text', 'toxic']].query('toxic==1'),
    )
])

In [15]:
%%time 

valid["lang"].replace("tr","turkish",inplace=True)
valid["lang"].replace("es","spanish",inplace=True)
valid["lang"].replace("it","italian",inplace=True)
for i in range(len(valid["lang"])):
    translator= Translator(from_lang=valid.iloc[i,2],to_lang="English")
    valid.iloc[i,0] = translator.translate(valid.iloc[i,1])
    if(i%1000==0):
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
CPU times: user 46.9 s, sys: 3.52 s, total: 50.4 s
Wall time: 32min 47s


In [16]:
def clean(text):
    """Normalise a Series of comments before tokenisation.

    Steps, applied in order to every element:
      1. missing values become the literal string "fillna" and the text is
         lower-cased;
      2. newlines become spaces;
      3. everything from a ``[[User`` wiki link to the end of the comment is
         dropped -- matched case-insensitively, because step 1 has already
         lower-cased the text and a case-sensitive ``User`` could never match;
      4. dotted-quad IP addresses are removed;
      5. ``(http://... (http://...)`` link pairs are removed.

    Args:
        text: pandas Series of comments (may contain missing values).

    Returns:
        A new Series with the same index holding the cleaned strings.
    """
    # Raw strings avoid invalid-escape warnings for \[, \d, \( and \s.
    substitutions = (
        (re.compile(r"\n"), " "),
        (re.compile(r"\[\[User.*", re.IGNORECASE), ""),
        (re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"), ""),
        (re.compile(r"\(http://.*?\s\(http://.*\)"), ""),
    )

    def _scrub(value):
        # str() keeps non-string leftovers (e.g. NaN) from breaking re.sub.
        value = str(value)
        for pattern, replacement in substitutions:
            value = pattern.sub(replacement, value)
        return value

    return text.fillna("fillna").str.lower().map(_scrub)

# NOTE(review): `valid["id"]` is read here, not `valid["comment_text"]`. The
# translation loop earlier in the notebook writes the English text into
# positional column 0 (`valid.iloc[i, 0]`), which is presumably the `id`
# column -- verify against the CSV's column order.
valid["comment_text"] = clean(valid["id"])
train["comment_text"] = clean(train["comment_text"])

In [17]:
# Drop the raw training frames; the cells below only use `train`, which was
# built from them.
del train1,train2

In [18]:
# Resolve the TPU, connect to its cluster, initialise it, then create the
# distribution strategy used by the `strategy.scope()` blocks further down.
# No fallback is provided here if the resolver cannot find a TPU.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

# GCS path of the competition dataset, looked up via `KaggleDatasets`.
GCS_DS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')

EPOCHS = 1
# Global batch size: 32 examples for each replica kept in sync.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [19]:
# Fetch the pretrained tokenizer and write its files (including vocab.txt)
# to disk so that `BertWordPieceTokenizer` can be built from the vocabulary.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

save_path = '/kaggle/working/distilbert_base_uncased/'
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

# Read the vocabulary from the directory it was just saved to, instead of a
# path relative to the current working directory.
# NOTE(review): `lowercase=True` is combined with a "cased" checkpoint name;
# confirm this normalisation matches the vocabulary.
fast_tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'), lowercase=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [20]:

# Encode the cleaned comments into fixed-length (512) token-id arrays.
x_train = fast_encode(
    train["comment_text"].astype(str).values, fast_tokenizer, maxlen=512
)
x_valid = fast_encode(
    valid["comment_text"].astype(str).values, fast_tokenizer, maxlen=512
)

# Labels, in the same row order as the encoded inputs.
y_train = train["toxic"].values
y_valid = valid["toxic"].values

# Number of training examples.
n = len(x_train)

In [21]:
AUTO = tf.data.experimental.AUTOTUNE

# `train` is a concatenation of class-sorted blocks (non-toxic, toxic,
# non-toxic, toxic). A 2048-element shuffle buffer only mixes neighbouring
# rows, so without a global shuffle almost every batch would contain a single
# class. Permute the rows once (seeded, and with the same permutation for
# inputs and labels) before building the pipeline. This makes a temporary
# copy of the arrays.
shuffle_order = np.random.RandomState(0).permutation(len(x_train))

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train[shuffle_order], y_train[shuffle_order]))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation data is neither shuffled nor repeated, so predictions made on
# `valid_dataset` stay aligned with `y_valid`.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

In [22]:
# The training arrays were handed to `train_dataset` above; drop these names.
# `y_valid` is kept because the ROC AUC cell further down still reads it.
del x_train,y_train

In [37]:
# Checkpoint name shared by the three model-building cells.
MODEL='distilbert-base-multilingual-cased'
# Create the transformer and the baseline model inside the strategy scope.
# `build_cnn_model` only reads `transformer_layer.weights[0]`; `max_len=512`
# matches the `maxlen=512` used when encoding the inputs.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_cnn = build_cnn_model(transformer_layer, max_len=512)

model_cnn.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 512, 768)          91812096  
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 512, 768)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 393216)            0         
_________________________________________________________________
dropout_117 (Dropout)        (None, 393216)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 393217    
Total params: 92,205,313
Trainable params: 393,217
Non-trainable params: 91,812,096
_________________________________________

In [39]:

# Train the embedding baseline for EPOCHS passes of 1000 batches each and
# evaluate on the validation set after every pass.
train_history = model_cnn.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=1000,
    validation_data=valid_dataset,
)


Train for 1000 steps, validate for 32 steps


In [52]:
# Reload the transformer and build the LSTM model inside the strategy scope.
# `build_lstm_model` only reads `transformer_layer.weights[0]`; `max_len=512`
# matches the `maxlen=512` used when encoding the inputs.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_lstm = build_lstm_model(transformer_layer, max_len=512)

model_lstm.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 512, 768)          91812096  
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 512, 768)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 512, 16)           50240     
_________________________________________________________________
flatten_5 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dropout_195 (Dropout)        (None, 8192)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 8193

In [53]:

# Train the LSTM model for EPOCHS passes of 50 batches each and evaluate on
# the validation set after every pass.
train_history2 = model_lstm.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=50,
    validation_data=valid_dataset,
)


Train for 50 steps, validate for 32 steps


In [54]:
# Score the LSTM model's validation predictions with scikit-learn's ROC AUC.
# `valid_dataset` is built from (x_valid, y_valid) without shuffling, so the
# rows of `pre` line up with `y_valid`.
from sklearn.metrics import roc_auc_score
pre=model_lstm.predict(valid_dataset)
print(roc_auc_score(y_valid,pre))

0.5


In [46]:
# NOTE(review): this cell's execution count (46) is lower than those of the
# `build_lstm_model` definition (51) and the `predict` call (54), so the
# recorded output may not describe the current `pre` -- re-run to confirm.
pre.shape

(8000, 512, 1)

In [55]:
def build_distilbert_model(transformer, max_len=512):
    """Build and compile a classifier on top of the transformer itself.

    The transformer is called on the token ids, the vector at sequence
    position 0 of its first output is taken, passed through dropout and
    mapped to a single sigmoid unit.

    Args:
        transformer: callable Keras-compatible model; element 0 of its output
            is indexed as a `(batch, sequence, hidden)` tensor.
        max_len: number of token ids per input sequence.

    Returns:
        A compiled Keras `Model` taking `(max_len,)` int32 token ids and
        producing one sigmoid output, trained with binary cross-entropy and
        monitored with AUC.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the first token's representation as the sequence summary.
    cls_token = sequence_output[:, 0, :]
    cls_token = Dropout(0.2)(cls_token)
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    # NOTE(review): 1.5e-3 looks high for a model whose transformer weights
    # are all trainable -- confirm this value is intended.
    model.compile(Adam(learning_rate=1.5e-3),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC()])

    return model

In [56]:
# Reload the transformer and wrap it in the classification head inside the
# strategy scope. Unlike the two models above, the transformer itself is part
# of this model (the recorded summary lists all parameters as trainable).
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_distilbert = build_distilbert_model(transformer_layer, max_len=512)

model_distilbert.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_distil_bert_model_10 (TFD ((None, 512, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dropout_215 (Dropout)        (None, 768)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Train the DistilBERT model for EPOCHS passes of 90 batches each and
# evaluate on the validation set after every pass.
train_history3 = model_distilbert.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=90,
    validation_data=valid_dataset,
)

Train for 80 steps, validate for 32 steps
