In [13]:
!pip install contractions transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [37]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
import os

import re
import tensorflow as tf
from matplotlib import pyplot as plt

import contractions
from contractions import contractions_dict

from transformers import AutoTokenizer, pipeline,DistilBertTokenizer,TFDistilBertForSequenceClassification,TFDistilBertModel, DistilBertConfig, XLNetConfig, XLNetTokenizer, TFXLNetModel, RobertaConfig, RobertaTokenizer, TFRobertaModel
from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [2]:
# Folder holding the competition CSV files.
# NOTE(review): the name `dir` shadows the Python builtin; it is kept here
# because other cells may refer to it.
dir = '../input/jigsaw-multilingual-toxic-comment-classification'

toxic_comment_csv = os.path.join(dir, 'jigsaw-toxic-comment-train.csv')
unintended_bias_csv = os.path.join(dir, 'jigsaw-unintended-bias-train.csv')

# Read both training sets into DataFrames.
Jigsaw_Toxic_Comment_TrainData = pd.read_csv(toxic_comment_csv)
Jigsaw_Unintended_Bias_TrainData = pd.read_csv(unintended_bias_csv)

print('Jigsaw_Toxic_Comment_TrainData and Jigsaw_Unintended_Bias_TrainData are loaded')

Jigsaw_Toxic_Comment_TrainData and Jigsaw_Unintended_Bias_TrainData are loaded


In [3]:
# Build a single `ToxicComment` label per comment from both training sets,
# then stack the two sets into `TrainData` (columns: id, comment_text, ToxicComment).
#
# Jigsaw_Toxic_Comment_TrainData has binary data for toxic variables
# Jigsaw_Unintended_Bias_TrainData has Numeric/float data for toxic variables

# Jigsaw_Toxic_Comment_TrainData
# `OtherToxic` is the row-wise maximum of the sub-category flags; the final
# label is the maximum of that and `toxic`. DataFrame.max(axis=1) is the
# vectorized equivalent of a row-wise apply(np.max) and avoids a Python-level
# loop over every row.
Jigsaw_Toxic_Comment_TrainData['OtherToxic'] = Jigsaw_Toxic_Comment_TrainData[
    ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].max(axis=1)
Jigsaw_Toxic_Comment_TrainData['ToxicComment'] = np.maximum(
    Jigsaw_Toxic_Comment_TrainData['toxic'],
    Jigsaw_Toxic_Comment_TrainData['OtherToxic'],
)
Jigsaw_Toxic_Comment_TrainData = Jigsaw_Toxic_Comment_TrainData[['id', 'comment_text', 'ToxicComment']]

# Jigsaw_Unintended_Bias_TrainData
# Convert numeric data to binary data
# NOTE(review): round() rounds half to even, so a score of exactly 0.5 becomes
# 0 — confirm that this is the intended threshold.
bias_score_cols = ['toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
Jigsaw_Unintended_Bias_TrainData[bias_score_cols] = Jigsaw_Unintended_Bias_TrainData[bias_score_cols].round()

Jigsaw_Unintended_Bias_TrainData['OtherToxic'] = Jigsaw_Unintended_Bias_TrainData[
    ['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
].max(axis=1)
Jigsaw_Unintended_Bias_TrainData['ToxicComment'] = np.maximum(
    Jigsaw_Unintended_Bias_TrainData['toxic'],
    Jigsaw_Unintended_Bias_TrainData['OtherToxic'],
)
Jigsaw_Unintended_Bias_TrainData = Jigsaw_Unintended_Bias_TrainData[['id', 'comment_text', 'ToxicComment']]

# Combine Train data
TrainData = pd.concat(
    [Jigsaw_Toxic_Comment_TrainData, Jigsaw_Unintended_Bias_TrainData], axis=0
).reset_index(drop=True)

In [33]:
# Keep a random 10% of the combined data to make tokenization and training
# tractable. The seed makes the subset reproducible across runs.
# NOTE: this overwrites TrainData, so re-running the cell shrinks it again.
TrainData = TrainData.sample(frac=0.1, random_state=42)

In [11]:
def clean(text):
    """Normalise a Series of raw comments before tokenization.

    Steps, applied to every element in this order:
      1. missing values are replaced by the literal string "fillna" and the
         text is lower-cased;
      2. newlines become single spaces;
      3. everything from "[[user" to the end of the comment is dropped, and
         IPv4-looking addresses are removed (leaky features);
      4. "(http://... (http://...)" / "(https://... (https://...)" link
         spans are removed;
      5. contractions are expanded with `contractions.fix`.

    Parameters
    ----------
    text : pandas.Series
        Raw comment strings (may contain missing values).

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as the input.
    """
    text = text.fillna("fillna").str.lower()

    # Patterns are raw strings so the regex escapes are not treated as
    # (invalid) Python string escapes. The text is already lower-cased at this
    # point, so the user-ID pattern must be lower-case too; matching "[[User"
    # here would never succeed.
    substitutions = (
        (re.compile(r"\n"), ' '),
        # Remove Leaky features - IP addresses or user IDs
        (re.compile(r"\[\[user.*"), ''),
        (re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"), ''),
        # Remove any links (whole link is replaced)
        (re.compile(r"\(http://.*?\s\(http://.*\)"), ''),
        (re.compile(r"\(https://.*?\s\(https://.*\)"), ''),
    )

    def _scrub(value):
        # str() guards against non-string leftovers (e.g. NaN from .str.lower()).
        value = str(value)
        for pattern, replacement in substitutions:
            value = pattern.sub(replacement, value)
        # Expand contractions
        return contractions.fix(value)

    # One pass over the Series instead of one pass per substitution.
    return text.map(_scrub)

In [34]:
# Replace the raw comments with their cleaned form (see `clean`).
cleaned_comments = clean(TrainData["comment_text"])
TrainData["comment_text"] = cleaned_comments

##### Tokenizer - Every transformer-based model has its own tokenization technique and its own set of special tokens. The transformers library takes care of this for us: it provides a matching tokenizer for every model it supports.
* add_special_tokens: Adds the special tokens (such as `<cls>`, `<sep>`, `<unk>`, etc.) that the pretrained model in use expects. It should always be kept True.
* max_length: Maximum length of any tokenized sentence; it is a hyperparameter (the original BERT has a maximum length of 512).
* pad_to_max_length: Pads every tokenized sentence up to max_length.
    
##### Any transformer model generally needs three inputs:
* input ids: the vocabulary id of each token
* attention mask: which ids must be paid attention to; 1 = pay attention. In simple terms, it tells the model which positions hold real tokens and which are padding.
* token type ids: used by models that consume multiple sentences, such as question-answering models. They tell the model which sentence each token belongs to. (This notebook feeds only the input ids and the attention mask to the model.)

In [26]:
#tokenizing input text
def tokenize(sentences, tokenizer, max_length=128):
    """Encode sentences into fixed-length id and attention-mask arrays.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode; a progress bar is shown while iterating.
    tokenizer : object
        Tokenizer exposing `encode_plus(...)` that returns a mapping with
        'input_ids' and 'attention_mask'.
    max_length : int, default 128
        Length passed to the tokenizer as `max_length`; sequences are padded
        to it via `pad_to_max_length=True`.

    Returns
    -------
    list of numpy.ndarray
        `[input_ids, attention_masks]`, both with dtype int32.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # Token type ids are not requested: they were never returned by this
        # function, so asking for them only cost time and memory.
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_length,
                                       pad_to_max_length=True, return_attention_mask=True,
                                       return_token_type_ids=False)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return [np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')]

In [27]:
# Defining the DistilBERT tokenizer and the pretrained base model
distil_bert = 'distilbert-base-uncased'

# NOTE(review): add_special_tokens / max_length / pad_to_max_length look like
# encode-time options; confirm whether from_pretrained actually uses them.
tokenizer_options = dict(do_lower_case=True, add_special_tokens=True,
                         max_length=128, pad_to_max_length=True)
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, **tokenizer_options)

# Model configuration: dropout overrides, hidden states not returned.
config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False

transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

In [35]:
# Binary targets, taken in the same row order as the comments encoded below.
labels = TrainData['ToxicComment'].values

# `tokenize` returns [input_ids, input_masks] for the cleaned comments.
input_ids_masks = tokenize(TrainData["comment_text"], tokenizer)

100%|██████████| 212574/212574 [09:36<00:00, 368.82it/s]


In [38]:
# Split ids, masks and labels together into Train (70%) & Validation (30%),
# with a fixed seed so the partition is repeatable.
all_ids, all_masks = input_ids_masks
(xtrain_id, xval_id,
 xtrain_mask, xval_mask,
 ytrain, yval) = train_test_split(all_ids, all_masks, labels,
                                  test_size=0.3, random_state=42)

In [None]:
# Disabled variant (kept for reference): BiLSTM + max-pooling head with 6 sigmoid outputs.
# def create_model_dense():
#     input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
#     input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32') 
    
#     embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
#     # cls_token = embedding_layer[:,0,:]
#     X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
#     X = tf.keras.layers.GlobalMaxPool1D()(X)
#     # X = tf.keras.layers.BatchNormalization()(cls_token)
#     X = tf.keras.layers.Dense(50, activation='relu')(X)
#     X = tf.keras.layers.Dropout(0.2)(X)
#     X = tf.keras.layers.Dense(6, activation='sigmoid')(X)
#     model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

#     for layer in model.layers[:3]:
#       layer.trainable = False
    
#     optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4, epsilon=1e-08, clipnorm=1.0)
#     # loss = tf.keras.losses.binary_crossentropy(from_logits=True)
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [39]:
def create_model_dense(max_length=128):
    """Build and compile the binary toxic-comment classifier.

    The module-level `transformer_model` produces one vector per token; these
    are averaged over the sequence axis, passed through dropout and mapped to
    a single sigmoid unit. Every layer except the final Dense layer is frozen,
    so only the classification head is trained.

    Parameters
    ----------
    max_length : int, default 128
        Sequence length of the two integer inputs (token ids and attention
        mask); must match the length used when tokenizing.

    Returns
    -------
    tf.keras.Model
        Compiled model taking `[input_ids, attention_mask]` and returning a
        probability of shape (batch, 1).
    """
    input_ids_in = tf.keras.layers.Input(shape=(max_length,), name='input_token', dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(max_length,), name='masked_token', dtype='int32')

    # Element [0] of the transformer output is the per-token hidden state.
    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    # NOTE(review): the average also includes padded positions; consider
    # masking them out.
    X = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    # Freeze everything but the final Dense layer.
    for layer in model.layers[:-1]:
        layer.trainable = False

    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4, epsilon=1e-08, clipnorm=1.0)
    # The head emits one sigmoid probability, so the matching loss is binary
    # cross-entropy on probabilities. Categorical cross-entropy over a single
    # class is constant and gives no training signal, and from_logits=True
    # would be wrong for an output that has already been through a sigmoid.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model


model = create_model_dense()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti ((None, 128, 768),)  66362880    input_token[0][0]                
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 768)          0           tf_distil_bert_model_1[0][0]     
______________________________________________________________________________________________

In [None]:
# Callbacks: stop early and restore the best weights, and lower the learning
# rate when the monitored metric plateaus.
# NOTE(review): with epochs=3, a patience of 3 (early stop) or 2 (LR
# reduction) can hardly take effect — confirm the intended values.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=3, verbose=1, restore_best_weights=True)
lr_on_pla = tf.keras.callbacks.ReduceLROnPlateau(patience=2)

# Inputs are [token ids, attention mask] pairs for each split.
train_inputs = [xtrain_id, xtrain_mask]
val_inputs = [xval_id, xval_mask]

hist = model.fit(
    train_inputs,
    ytrain,
    validation_data=(val_inputs, yval),
    epochs=3,
    batch_size=128,
    callbacks=[early_stop, lr_on_pla],
)

Epoch 1/3
