In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score

train = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
test = pd.read_csv('/kaggle/input/unbalanced/testu.csv')

In [3]:
#Balancing the dataset by undersampling from the majority class
df_train_1 = train[train['Target']== 1]
df_train_0 = train[train['Target']== 0]
df_train_0_downsampled = df_train_0.sample(df_train_1.shape[0])
df_balanced = pd.concat([df_train_0_downsampled, df_train_1])
df_balanced['Target'].value_counts()

0    50715
1    50715
Name: Target, dtype: int64

In [18]:
data = df_balanced[['comment_text','severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack','insult','threat','other','Target']]
data.head(5)

Unnamed: 0,comment_text,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,other,Target
32264,Hebrew and Abrahamic males took many wives (it...,0,0,0,0,0,0,1,0
206701,What fake news? What do you even mean?,0,0,0,0,0,0,1,0
199181,"""St. Paul never proposed that two believers go...",0,0,0,0,0,0,1,0
278882,Good luck. Just let Alberta change it and be d...,0,0,0,0,0,0,1,0
293622,"(Part two of four) So, if election night prov...",0,0,0,0,0,0,1,0


In [19]:
sentences=data['comment_text']
labels=data['Target']
len(sentences),len(labels)

(101430, 101430)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
#loop through the input sentences as part of the dataset and find the input ids and attention masks by storing it in an array
input_ids=[]
attention_masks=[]

for sent in sentences:
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)

len(input_ids),len(attention_masks),len(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(101430, 101430, 101430)

In [26]:
#split the dataset into train(80%) and validation(20%)
from sklearn.model_selection import train_test_split
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (81144, 64) Val input shape (20286, 64)
Train label shape (81144,) Val label shape (20286,)
Train attention mask shape (81144, 64) Val attention mask shape (20286, 64)


In [27]:
# pickle the input ids and labels along with the labels of the dataset
import pickle
print('Preparing the pickle file.....')

pickle_inp_path='/kaggle/working/bert_inp.pkl'
pickle_mask_path='/kaggle/working/bert_mask.pkl'
pickle_label_path='/kaggle/working/bert_label.pkl'

pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((labels),open(pickle_label_path,'wb'))


print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)

Preparing the pickle file.....
Pickle files saved as  /kaggle/working/bert_inp.pkl /kaggle/working/bert_mask.pkl /kaggle/working/bert_label.pkl


In [28]:
print('Loading the saved pickle files..')

pickle_inp_path='/kaggle/input/pickled-files/bert_inp.pkl'
pickle_mask_path='/kaggle/input/pickled-files/bert_mask.pkl'
pickle_label_path='/kaggle/input/pickled-files/bert_label.pkl'

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Loading the saved pickle files..


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/pickled-files/bert_inp.pkl'

In [7]:
from sklearn.model_selection import train_test_split
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (81144, 64) Val input shape (20286, 64)
Train label shape (81144,) Val label shape (20286,)
Train attention mask shape (81144, 64) Val attention mask shape (20286, 64)


In [33]:
from transformers import TFBertModel,BertConfig, BertTokenizerFast

config = BertConfig.from_pretrained('bert-base-uncased')
config.output_hidden_states = False

tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = 'bert-base-uncased', config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained('bert-base-uncased', config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [38]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(100,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output
severe_toxicity = Dense(units=len(data.severe_toxicity.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='severe_toxicity')(pooled_output)
obscene = Dense(units=len(data.obscene.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='obscene')(pooled_output)
sexual_explicit = Dense(units=len(data.sexual_explicit.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='sexual_explicit')(pooled_output)
identity_attack = Dense(units=len(data.identity_attack.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='identity_attack')(pooled_output)
insult = Dense(units=len(data.insult.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='insult')(pooled_output)
threat = Dense(units=len(data.threat.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='threat')(pooled_output)
outputs = {'severe_toxicity': severe_toxicity, 'obscene': obscene,'sexual_explicit': sexual_explicit, 'identity_attack':identity_attack,'insult':insult, 'threat':threat}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]']              
                                thPoolingAndCrossAt                                               
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                               
                                 768),                                                            
                                 pooler_output=(Non                      

In [41]:
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08)
# Set loss and metrics
loss = {'severe_toxicity': CategoricalCrossentropy(from_logits = True), 'obscene': CategoricalCrossentropy(from_logits = True), 'sexual_explicit': CategoricalCrossentropy(from_logits = True), 'identity_attack': CategoricalCrossentropy(from_logits = True), 'insult': CategoricalCrossentropy(from_logits = True), 'threat': CategoricalCrossentropy(from_logits = True) }
metric = {'severe_toxicity': CategoricalAccuracy('accuracy'), 'obscene': CategoricalAccuracy('accuracy'), 'sexual_explicit': CategoricalAccuracy('accuracy'), 'identity_attack': CategoricalAccuracy('accuracy'), 'insult': CategoricalAccuracy('accuracy'), 'threat': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)
# Ready output data for the model
y_severe_toxicity = to_categorical(data['severe_toxicity'])
y_obscene = to_categorical(data['obscene'])
y_sexual_explicit = to_categorical(data['sexual_explicit'])
y_identity_attack = to_categorical(data['identity_attack'])
y_insult = to_categorical(data['insult'])
y_threat = to_categorical(data['threat'])

# Tokenize the input (takes some time)
x = tokenizer(
    text=data['comment_text'].to_list(),
    add_special_tokens=True,
    max_length=100,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Fit the model
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'severe_toxicity': y_severe_toxicity, 'obscene': y_obscene, 'sexual_explicit': y_sexual_explicit, 'identity_attack': y_identity_attack, 'insult': y_insult, 'threat': y_threat},
    validation_split=0.2,
    batch_size=64,
    epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [44]:
test_y_severe_toxicity = to_categorical(test['severe_toxicity'])
test_y_obscene = to_categorical(test['obscene'])
test_y_sexual_explicit = to_categorical(test['sexual_explicit'])
test_y_identity_attack = to_categorical(test['identity_attack'])
test_y_insult = to_categorical(test['insult'])
test_y_threat = to_categorical(test['threat'])
test_x = tokenizer(
    text=test['comment_text'].to_list(),
    add_special_tokens=True,
    max_length=100,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)


In [45]:
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'severe_toxicity': test_y_severe_toxicity, 'obscene': test_y_obscene, 'sexual_explicit': test_y_sexual_explicit, 'identity_attack': test_y_identity_attack, 'insult': test_y_insult, 'threat': test_y_threat},
)

print("Accuracy: {:.2f}%".format(model_eval[1] * 100))
print("Precision: {:.2f}%".format(model_eval[2] * 100))
print("Recall: {:.2f}%".format(model_eval[3] * 100))
print("F1 score: {:.2f}%".format(2 * (model_eval[2] * model_eval[3]) / (model_eval[2] + model_eval[3]) * 100))

Accuracy: 9.44%
Precision: 17.08%
Recall: 1.64%
F1 score: 3.00%


In [46]:
print(model_eval)

[0.3100249171257019, 0.09441576898097992, 0.17075058817863464, 0.016420559957623482, 0.00025641440879553556, 0.01722324825823307, 0.010958373546600342, 0.9612177014350891, 0.9402413964271545, 0.9947633743286133, 0.9999776482582092, 0.9940770864486694, 0.9969714283943176]


In [None]:
test_input_ids=[]
test_attention_masks=[]
test_bert_inp=[]

for sent in test_sentences:
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
    test_bert_inp.append(bert_inp)
    test_input_ids.append(bert_inp['input_ids'])
    test_attention_masks.append(bert_inp['attention_mask'])

test_input_ids=np.asarray(test_input_ids)
test_attention_masks=np.array(test_attention_masks)
test_labels=np.array(test_labels)

len(test_input_ids),len(test_attention_masks),len(test_labels)


In [None]:
test_preds = bert_model.predict([test_input_ids, test_attention_masks])
test_preds1 = np.argmax(test_preds, axis=-1)

In [None]:
test_preds[0]

probs = tf.nn.softmax(test_preds[0])
print(probs)
test_preds1 = np.argmax(test_preds[0], axis=-1)
print(len(test_preds1))

In [None]:
print(test_preds1)
from sklearn.metrics import accuracy_score
test_acc = accuracy_score(test_labels, test_preds1)
print('Test accuracy:', test_acc)