In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer, TFBertModel
import tensorflow as tf
import transformers



In [2]:
import zipfile

# Unpack the competition archive into the working directory, then load the
# extracted train.csv from there and preview the first rows.
with zipfile.ZipFile(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip",
    "r",
) as zip_ref:
    zip_ref.extractall("./")

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Two-stage split with a fixed seed: first hold out 10% of the rows as the test
# set, then take 20% of the remaining rows as the validation set.
trainset, testset = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)
trainset, valset = train_test_split(
    trainset,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Inputs: the raw comment strings as a Python list.
x_train = trainset["comment_text"].tolist()
# Targets: every column from position 2 onward as a NumPy array
# (presumably the six label columns shown by df.head() -- verify column order).
y_train = trainset.iloc[:, 2:].to_numpy()

In [5]:
# Inputs: the raw comment strings as a Python list.
x_test = testset["comment_text"].tolist()
# Targets: every column from position 2 onward as a NumPy array
# (presumably the six label columns shown by df.head() -- verify column order).
y_test = testset.iloc[:, 2:].to_numpy()

In [6]:
# Inputs: the raw comment strings as a Python list.
x_val = valset["comment_text"].to_list()
# Targets: every column from position 2 onward as a NumPy array
# (presumably the six label columns shown by df.head() -- verify column order).
y_val = valset.iloc[:, 2:].to_numpy()

In [7]:
# Tokenizer for the same `bert-base-uncased` checkpoint the model is built from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [8]:
max_seq_length = 256


def encode_texts(texts, max_length=max_seq_length):
    """Tokenize a list of strings with the notebook's shared tokenizer settings.

    Args:
        texts: list of raw text strings to encode.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer's output for the whole list, returned as TensorFlow
        tensors and padded to the longest sequence in ``texts``.
    """
    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding=True,  # pads to the longest sequence of this call, not to max_length
        return_tensors='tf',
    )


# One helper for all three splits, so their tokenizer settings cannot drift apart.
train_encodings = encode_texts(x_train)
val_encodings = encode_texts(x_val)
test_encodings = encode_texts(x_test)

In [9]:
# Pair each split's encoded inputs (converted to a plain dict) with its label
# array; every dataset element is one (features, labels) example.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_val))

# Define model

In [10]:
#model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
# Issue: with the stock classification head, training runs normally, but at the end of the first epoch it raises
#ValueError: logits and labels must have the same shape ((256, 6) vs (6, 1))
# A custom multi-label model is defined in the next cell instead.

In [16]:
class BertforMultiLabelClassification(tf.keras.Model):
    """BERT encoder followed by dropout and a sigmoid output layer.

    The final dense layer uses a sigmoid activation, so the model emits one
    independent value in [0, 1] per label rather than a softmax distribution.

    Args:
        num_labels: width of the output layer (number of labels).
        dropout: dropout rate applied to the encoder's pooled output.
        model_name: name or path of the pretrained BERT checkpoint to load.
            Defaults to the checkpoint that was previously hard-coded.
        **kwargs: forwarded to ``tf.keras.Model.__init__``.
    """

    def __init__(self, num_labels, dropout=0.3, model_name='bert-base-uncased', **kwargs):
        super().__init__(**kwargs)
        # return_dict=False makes the encoder return a plain tuple of outputs.
        self.bert = transformers.TFBertModel.from_pretrained(model_name, return_dict=False)
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.dense = tf.keras.layers.Dense(
            num_labels,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            bias_initializer=tf.zeros_initializer(),
            activation=tf.keras.activations.sigmoid,
        )

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: encoder inputs as accepted by ``TFBertModel`` (the notebook
                passes the tokenizer output converted to a dict).
            training: Keras training flag; forwarded explicitly to the encoder
                and to the dropout layer instead of relying on implicit
                propagation from the outer call.

        Returns:
            Tensor of per-label sigmoid outputs with ``num_labels`` columns.
        """
        encoder_outputs = self.bert(inputs, training=training)
        # Index instead of unpacking so extra encoder outputs (e.g. hidden
        # states) do not break the call; element 1 is the pooled output.
        pooled_output = encoder_outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        return self.dense(pooled_output)

In [17]:
# Six outputs -- presumably one per label column of the training frame.
model = BertforMultiLabelClassification(num_labels=6)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [26]:
import tensorflow_addons as tfa

# Micro-averaged F1 over all label decisions, thresholding outputs at 0.5.
# num_classes is set to the 6 model outputs (it was 1, which contradicted the
# output width even though micro-averaging pools everything into one score).
f1 = tfa.metrics.F1Score(num_classes=6, average='micro', threshold=0.5)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# The model's last layer already applies a sigmoid, hence from_logits=False.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[
        # The 'acc' shortcut is resolved from the output shape and, for a
        # 6-wide output, becomes categorical (argmax) accuracy, which is
        # misleading for multi-label targets. Use element-wise binary accuracy
        # explicitly; the name stays 'acc' so the history/log keys do not change.
        tf.keras.metrics.BinaryAccuracy(name='acc', threshold=0.5),
        f1,
    ],
)
history = model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=3,
    validation_data=val_dataset.batch(16),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# Build the held-out dataset from the encoded test inputs and their labels,
# then report the compiled loss and metrics on it in batches of 16.
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

model.evaluate(test_dataset.batch(16))



[0.04282236099243164, 0.9914776086807251, 0.7793024182319641]

In [30]:
# Write the trained weights to an HDF5 file in the working directory.
model.save_weights(
    "TFBertforMultilabelclassification.h5",
    save_format="h5",
)

===================================================================