In [1]:
%%capture

!wget https://raw.githubusercontent.com/ipavlopoulos/toxic_spans/master/SemEval2021/data/tsd_test.csv
!wget https://raw.githubusercontent.com/ipavlopoulos/toxic_spans/master/SemEval2021/data/tsd_train.csv

! pip install -r requirements.txt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading Libraries

In [3]:
# Basic Libraries
import pandas as pd
import numpy as np
import json
import gc

# Tensorflow modules
from tensorflow.keras import *
import tensorflow as tf
from tensorflow.keras import *
import tensorflow.keras.backend as K

# Transformer module
from transformers import TFElectraModel, ElectraTokenizer

# Result Visualization
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from keras import callbacks


## Load dataset


In [4]:
# Training
train_dataset = pd.read_csv('tsd_train.csv')

train_dataset

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."
...,...,...
7934,"[8, 9, 10, 11]",Another fool pipes in.
7935,"[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 5...",So if a restaurant owner puts up a sign saying...
7936,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Any faith that can't stand up to logic and rea...
7937,"[5, 6, 7, 8, 9, 10, 11]",This idiotic. Use the surplus to pay down the ...


In [5]:
# Creating python dictionary from json string for spans
train_dataset['spans'] = train_dataset['spans'].apply(lambda x : json.loads(x))

# Converting spans to numpy
spans = train_dataset['spans'].to_numpy()

# Converting texts to numpy
texts = train_dataset['text'].to_numpy()

In [6]:
from crf import *


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



## Tokenize Using Electra

In [7]:
# Tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

## Data Preprocessing

In [8]:
max_length = 400

# Preprocessing the train data['spans']
# For identifying the correct words of span and converting to 1 and 0 of of a boolean vector of whether the word is toxic or not. 
def create_outputs(texts,spans,max_length,tokenizer):
    outputs = []
    for text,span in zip(texts,spans):
        output = np.zeros(max_length*3,dtype=np.float).reshape((max_length,3))
        tokens = tokenizer.tokenize(text)[:max_length]
        length = 0
        start = True
        for i in range(len(tokens),max_length):
            output[i,0] = 1.0
        for index,token in enumerate(tokens):
            sub = False
            if "##" in token:
                sub = True
                token = token[2:]
            if not start:
                next_index = text[length:].find(token)
                if next_index == 0:
                    sub = True
                length += next_index
            if length in span:
                output[index,2] = 1.0
                output[index,0] = 0.0
            else:
                output[index,1] = 1.0
                output[index,0] = 0.0
            length += len(token)
            start = False
        outputs.append(output)
    return np.array(outputs)

In [9]:
# processing the input data['texts'] using the tokenizer
def create_inputs(texts,max_length,tokenizer):

  tokens = tokenizer(texts, max_length=max_length, padding="max_length", return_tensors="tf",truncation=True)
  input_length = []
  for text in texts:
    input_length.append(min(max_length,len(tokenizer.tokenize(text))))

  tokenized_input_ids = np.array(tokens['input_ids'])
  tokenized_token_type_ids = np.array(tokens['token_type_ids'])
  tokenized_attention_mask = np.array(tokens['attention_mask'])
  nparray_input_length = np.array(input_length)
  
  inputs = [tokenized_input_ids, tokenized_token_type_ids, tokenized_attention_mask, nparray_input_length]
  return inputs

# Helper Functions

Function to calculate the F1 score

In [14]:
# This class helps us to call the f1 score 
class F1Metric(callbacks.Callback):
    def __init__(self,inputs,labels,spans,texts,test=True):
        self.inputs = inputs
        self.spans = spans
        self.tokenizer = tokenizer
        self.texts = texts
        self.test = test

    def on_epoch_end(self, epoch, logs={}):
        preds = self.model.predict(self.inputs,verbose=0)
        indices = createIndicesForNERModel(preds,texts,tokenizer)
        f1 = avg_f1(indices,self.spans)
        if self.test:
            print()
            print("test f1 = "+str(f1))
        else:
            print()
            print("train f1 = "+str(f1))

In [13]:
# function to calculate the average of f1 score after each epoch
def avg_f1(preds,trues):
    avg_f1_total = 0.0
    for pred,true in zip(preds,trues):
        avg_f1_total += f1(pred,true)
    return avg_f1_total/len(preds)

In [12]:
# function to calculate the f1 score using the given standard formula 
def f1(preds,trues):
    if len(trues) == 0:
        return 1. if len(preds) == 0 else 0.
    if len(preds) == 0:
        return 0.
    predictions_set = set(preds)
    gold_set = set(trues)
    nom = 2 * len(predictions_set.intersection(gold_set))
    denom = len(predictions_set) + len(gold_set)
    return float(nom)/float(denom)

Input for Name Entity Recognition

In [11]:
# creates the indices for the NER model for a list of predictions and true labels
def createIndicesForNERModel(predicts,texts,tokenizer):
    outputs = []
    for text,pred in zip(texts,predicts):
         indices = NERGetIndicesSingleText(pred,text,tokenizer)
         outputs.append(indices)
    return outputs

In [10]:
# It is taking the tokens and convert to array of indixes of each and every text in the input data 
def NERGetIndicesSingleText(outputs,text,tokenizer):
    outputs = tf.argmax(outputs,axis=-1)
    tokens = tokenizer.tokenize(text)
    index = 0
    indexes = []
    sub = False
    prev = False
    for token,output in zip(tokens,outputs):
        if token[:2] == "##":
            token = token[2:]
            sub = True
        else:
            sub = False
        temp_index = text[index:].find(token)
        temp_start = index+temp_index
        if output == 2 or (sub and prev and output != 0):
            prev = True
            indexes = indexes + list(range(temp_start,temp_start+len(token)))
        else:
            prev = False
        index = temp_start+len(token)
    return np.array(indexes)

## Embed Model on top of Electra Transformer

In [15]:

def CRF_BiGRU(max_input_length, base_model):
    input_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_input_ids",dtype=tf.int32)
    input_type_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_token_type_ids",dtype=tf.int32)
    input_attention_mask_layer = layers.Input(shape=(max_input_length,),name="encoder_attention_mask",dtype=tf.int32)
    input_length = layers.Input(shape=(1,),name="length",dtype=tf.int32)
    base_model.trainable = True
    # loading the base model
    base_model = base_model(input_ids_layer,token_type_ids=input_type_ids_layer,attention_mask=input_attention_mask_layer,return_dict=True)
    # Embedding the bidirectional GRU model 
    output = layers.Bidirectional(layers.GRU(512,return_sequences=True))(base_model.last_hidden_state)
    # adding the dropout and dense layers
    output = layers.Dropout(0.1)(base_model.last_hidden_state)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dropout(0.1)(output)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dense(3,activation="linear")(output)
    # embedding the CRF layer
    crf = CRFLayer()
    output = crf(inputs=[output,input_length])
    model = models.Model(inputs=[input_ids_layer,input_type_ids_layer,input_attention_mask_layer,input_length],outputs=output)
    # compiling the model with adam optimizer and learning rate and loss as crf.loss with metric as accuracy
    model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),loss=crf.loss,metrics=['accuracy'])
    return model


In [25]:
def CRF_biLSTM(max_input_length,base_model):
    input_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_input_ids",dtype=tf.int32)
    input_type_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_token_type_ids",dtype=tf.int32)
    input_attention_mask_layer = layers.Input(shape=(max_input_length,),name="encoder_attention_mask",dtype=tf.int32)
    input_length = layers.Input(shape=(1,),name="length",dtype=tf.int32)
    base_model.trainable = True
    # loading the base model
    base_model = base_model(input_ids_layer,token_type_ids=input_type_ids_layer,attention_mask=input_attention_mask_layer,return_dict=True)
     # Embedding the bidirectional LSTM model 
    output = layers.Bidirectional(layers.LSTM(512,return_sequences=True))(base_model.last_hidden_state)
# adding the dropout and dense layers
    output = layers.Dropout(0.1)(base_model.last_hidden_state)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dropout(0.1)(output)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dense(3,activation="linear")(output)
# embedding the CRF layer
    crf = CRFLayer()
    output = crf(inputs=[output,input_length])
    model = models.Model(inputs=[input_ids_layer,input_type_ids_layer,input_attention_mask_layer,input_length],outputs=output)
    # compiling the model with adam optimizer and learning rate and loss as crf.loss with metric as accuracy
    model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),loss=crf.loss,metrics=['accuracy'])
    return model

## K-Fold Cross Validation

In [18]:
outputs = create_outputs(texts,train_dataset['spans'],max_length,tokenizer)
kf = KFold(n_splits=5)
train_validation_indices = []
for train_index,validation_index in kf.split(texts):
    train_validation_indices.append((train_index,validation_index))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  output = np.zeros(max_length*3,dtype=np.float).reshape((max_length,3))


## Split dataset into training and validation

In [19]:
train_index, validation_index = train_validation_indices.pop()
x_train , x_validation = list(texts[train_index]) , list(texts[validation_index])
y_train , y_validation = outputs[train_index] , outputs[validation_index]

## Apply preprocessing functions on train and Validation data

In [20]:
train_data = create_inputs(x_train,max_length,tokenizer)
validation_data = create_inputs(x_validation,max_length,tokenizer)

spans_validation = spans[validation_index]
spans_train = spans[train_index]

#Electra Embedded + BiGRU + CRF 

## Bi-GRU model

In [21]:
# Creating Model Instance for Bi-GRU
gc.collect()
tf.keras.backend.clear_session()
base_model = TFElectraModel.from_pretrained('google/electra-base-discriminator')
model_bigru = CRF_BiGRU(max_length, base_model)

# Fitting Bi-GRU Model
history = model_bigru.fit(train_data,y_train,batch_size=16,epochs=3,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/T2_Checkpoints_Sriram/gru/checkpoints/bigru",save_weights_only=True)])

# Creating predictions by Bi-GRU
preds_bigru = model_bigru.predict(validation_data)
indices = createIndicesForNERModel(preds_bigru,x_validation,tokenizer)

# Calculating F1
f1_toxic_bigru = avg_f1(indices,spans_validation)
print("validation F1 = %f"%(f1_toxic_bigru))

Downloading tf_model.h5:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some layers from the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraModel: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFElectraModel were initialized from the model checkpoint at google/electra-base-discriminator.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further training.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Epoch 1/3
Epoch 2/3
Epoch 3/3
validation F1 = 0.408439


In [22]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(model_bigru, show_shapes = False).create(prog = 'dot', format = 'svg'))
plot_model(
    model = model_bigru, show_shapes = False,
    to_file = 'bigru_network.pdf'
)

In [23]:
model_bigru.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_ids (InputLayer)  [(None, 400)]       0           []                               
                                                                                                  
 encoder_attention_mask (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                                  
 encoder_token_type_ids (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                              

# Electra Embedded + BiLSTM + CRF 

In [26]:
# Creating Model Instance for Bi-LSTM
gc.collect()
tf.keras.backend.clear_session()
base_model = TFElectraModel.from_pretrained('google/electra-base-discriminator')
model_bilstm = CRF_biLSTM(max_length,base_model)

# Fitting Bi-LSTM Model
model_bilstm.fit(train_data,y_train,batch_size=16,epochs= 3,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/T2_Checkpoints_Sriram/lstm/checkpoints/bilstm",save_weights_only=True)])

# Creating predictions by Bi-LSTM 
preds_bilstm = model_bilstm.predict(validation_data)
indices = createIndicesForNERModel(preds_bilstm,x_validation,tokenizer)

# Calculating F1
f1_toxic = avg_f1(indices,spans_validation)
print("validation F1 = %f"%(f1_toxic))

Some layers from the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraModel: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFElectraModel were initialized from the model checkpoint at google/electra-base-discriminator.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
validation F1 = 0.394736


In [28]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(model_bilstm, show_shapes = False).create(prog = 'dot', format = 'svg'))
plot_model(
    model = model_bilstm, show_shapes = False,
    to_file = 'bilstm_network.pdf'
)
model_bilstm.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_ids (InputLayer)  [(None, 400)]       0           []                               
                                                                                                  
 encoder_attention_mask (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                                  
 encoder_token_type_ids (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                              