In [1]:
%%capture
!wget https://raw.githubusercontent.com/ipavlopoulos/toxic_spans/master/SemEval2021/data/tsd_test.csv
!wget https://raw.githubusercontent.com/ipavlopoulos/toxic_spans/master/SemEval2021/data/tsd_train.csv


In [2]:
%%capture
!pip install -r requirements.txt

## Loading Libraries

In [4]:
# Basic Libraries
import pandas as pd
import numpy as np
import json
import gc

# Tensorflow modules
from tensorflow.keras import *
import tensorflow as tf
from tensorflow.keras import *
import tensorflow.keras.backend as K

# Transformer module
from transformers import TFElectraModel, ElectraTokenizer

# Result Visualization
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from keras import callbacks


## Load dataset


In [5]:
# Training
train_dataset = pd.read_csv('tsd_train.csv')
test_dataset = pd.read_csv('tsd_test.csv')

In [6]:
# Creating python dictionary from json string for spans
train_dataset['spans'] = train_dataset['spans'].apply(lambda x : json.loads(x))
test_dataset['spans'] = test_dataset['spans'].apply(lambda x : json.loads(x))

In [7]:
# Converting spans to numpy
spans = train_dataset['spans'].to_numpy()
# spans

## Tokenize Using Electra

In [8]:
# Tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

## Data Preprocessing

In [9]:
# processing the input data['texts'] using the tokenizer
def create_inputs(texts,max_length,tokenizer):
  # Tokenize the input texts
  tokens = tokenizer(texts, max_length=max_length, padding="max_length", return_tensors="tf",truncation=True)
  # Get input length for each text
  input_length = []
  for text in texts:
    input_length.append(min(max_length,len(tokenizer.tokenize(text))))

  # Generate arrays of the tokenized inputs
  # Input IDs
  tokenized_input_ids = np.array(tokens['input_ids'])
  # Attention mask
  tokenized_token_type_ids = np.array(tokens['token_type_ids'])
  # Token type IDs
  tokenized_attention_mask = np.array(tokens['attention_mask'])
  # Input length
  nparray_input_length = np.array(input_length)
  
  inputs = [tokenized_input_ids, tokenized_token_type_ids, tokenized_attention_mask, nparray_input_length]
  return inputs

In [10]:
max_length = 400
# Preprocessing the train data['spans']
# For identifying the correct words of span and converting to 1 and 0 of of a boolean vector of whether the word is toxic or not. 
def create_outputs(texts,spans,max_length,tokenizer):
    outputs = []
    for text,span in zip(texts,spans):
        # initialize outputs with 0
        output = np.zeros(max_length*3,dtype=np.float).reshape((max_length,3))
        tokens = tokenizer.tokenize(text)[:max_length]
        length = 0
        start = True
        for i in range(len(tokens),max_length):
            output[i,0] = 1.0
        for index,token in enumerate(tokens):
            sub = False
            if "##" in token:
                sub = True
                token = token[2:]
            if not start:
                next_index = text[length:].find(token)
                if next_index == 0:
                    sub = True
                length += next_index
            if length in span:
                output[index,2] = 1.0
                output[index,0] = 0.0
            else:
                output[index,1] = 1.0
                output[index,0] = 0.0
            length += len(token)
            start = False
        outputs.append(output)
    return np.array(outputs)

In [11]:
# Converting texts to numpy
texts = train_dataset['text'].to_numpy()
# texts

## Split testing datasets into x and y

In [12]:
# Get texts for train and test set
x_train, x_test= list(train_dataset['text'].to_numpy()), list(test_dataset['text'].to_numpy())

# Get spans for train and test set
y_train = create_outputs(train_dataset['text'].to_numpy(),train_dataset['spans'],max_length,tokenizer)
y_test = create_outputs(test_dataset['text'].to_numpy(),test_dataset['spans'],max_length,tokenizer)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  output = np.zeros(max_length*3,dtype=np.float).reshape((max_length,3))


## Build Model

In [13]:
from crf import *


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [14]:
def build_bigru_model(max_input_length, base_model):
    input_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_input_ids",dtype=tf.int32)
    input_type_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_token_type_ids",dtype=tf.int32)
    input_attention_mask_layer = layers.Input(shape=(max_input_length,),name="encoder_attention_mask",dtype=tf.int32)
    input_length = layers.Input(shape=(1,),name="length",dtype=tf.int32)
    base_model.trainable = True
    # loading the base model
    base_model = base_model(input_ids_layer,token_type_ids=input_type_ids_layer,attention_mask=input_attention_mask_layer,return_dict=True)
        # Embedding the bidirectional GRU model 
    output = layers.Bidirectional(layers.GRU(512,return_sequences=True))(base_model.last_hidden_state)
    # adding the dropout and dense layers
    output = layers.Dropout(0.1)(base_model.last_hidden_state)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dropout(0.1)(output)
    output = layers.Dense(1024,activation="relu")(output)

    output = layers.Dense(3,activation="linear")(output)
        # embedding the CRF layer
    crf = CRFLayer()
    output = crf(inputs=[output,input_length])
    model = models.Model(inputs=[input_ids_layer,input_type_ids_layer,input_attention_mask_layer,input_length],outputs=output)
    model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),loss=crf.loss,metrics=['accuracy'])
    return model


In [15]:
def build_bilstm_model(max_input_length,base_model):
    input_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_input_ids",dtype=tf.int32)
    input_type_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_token_type_ids",dtype=tf.int32)
    input_attention_mask_layer = layers.Input(shape=(max_input_length,),name="encoder_attention_mask",dtype=tf.int32)
    input_length = layers.Input(shape=(1,),name="length",dtype=tf.int32)
    base_model.trainable = True
    # loading the base model
    base_model = base_model(input_ids_layer,token_type_ids=input_type_ids_layer,attention_mask=input_attention_mask_layer,return_dict=True)
    # Embedding the bidirectional GRU model 
    output = layers.Bidirectional(layers.LSTM(512,return_sequences=True))(base_model.last_hidden_state)
    # adding the dropout and dense layers
    output = layers.Dropout(0.1)(base_model.last_hidden_state)
    output = layers.Dense(1024,activation="relu")(output)
    output = layers.Dropout(0.1)(output)
    output = layers.Dense(1024,activation="relu")(output)

    output = layers.Dense(3,activation="linear")(output)
    # embedding the CRF layer
    crf = CRFLayer()
    output = crf(inputs=[output,input_length])
    model = models.Model(inputs=[input_ids_layer,input_type_ids_layer,input_attention_mask_layer,input_length],outputs=output)
    model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),loss=crf.loss,metrics=['accuracy'])
    return model

In [16]:
# tokenize texts for training
train_data = create_inputs(x_train,max_length,tokenizer)

# tokenize texts for testing
test_data = create_inputs(x_test,max_length,tokenizer)

In [17]:
# Get spans for testing
spans_test = test_dataset['spans']

# Fet spans for training
spans_train = train_dataset['spans']

# Helper Functions

Calculate the F1 score

In [18]:
# This class helps us to call the f1 score 

class F1Metric(callbacks.Callback):
    def __init__(self,inputs,labels,spans,texts,test=True):
        self.inputs = inputs
        self.spans = spans
        self.tokenizer = tokenizer
        self.texts = texts
        self.test = test

    def on_epoch_end(self, epoch, logs={}):
        preds = self.model.predict(self.inputs,verbose=0)
        indices = createIndicesForNERModel(preds,texts,tokenizer)
        f1 = avg_f1(indices,self.spans)
        if self.test:
            print()
            print("test f1 = "+str(f1))
        else:
            print()
            print("train f1 = "+str(f1))

In [19]:
def avg_f1(preds,trues):
  # Initialize average f1
    average_f1_total = 0.0
    for pred,true in zip(preds,trues):
      # Calculate average of f1 scores
        average_f1_total += f1(pred,true)
    return average_f1_total/len(preds)

In [20]:
def f1(preds,trues):
    
    if len(trues) == 0:
        # return 1 if true is empty
        return 1. if len(preds) == 0 else 0.
    if len(preds) == 0:
        # return 0 if preds is empty
        return 0.
    predictions_set = set(preds)
    gold_set = set(trues)
    nom = 2 * len(predictions_set.intersection(gold_set))
    denom = len(predictions_set) + len(gold_set)
    return float(nom)/float(denom)

Input for Name Entity Recognition Model

In [21]:
# creates the indices for the NER model for a list of predictions and true labels

def createIndicesForNERModel(predicts,texts,tokenizer):
    outputs = []
    for text,pred in zip(texts,predicts):
         indices = NERGetIndicesSingleText(pred,text,tokenizer)
         outputs.append(indices)
    return outputs

In [22]:
# It is taking the tokens and convert to array of indixes of each and every text in the input data 

def NERGetIndicesSingleText(outputs,text,tokenizer):
    outputs = tf.argmax(outputs,axis=-1)
    tokens = tokenizer.tokenize(text)
    index = 0
    indexes = []
    sub = False
    prev = False
    for token,output in zip(tokens,outputs):
      # end token for pretrained embeddings from Electra
        if token[:2] == "##":
            token = token[2:]
            sub = True
        else:
            sub = False
        temp_index = text[index:].find(token)
        temp_start = index+temp_index
        if output == 2 or (sub and prev and output != 0):
            prev = True
            indexes = indexes + list(range(temp_start,temp_start+len(token)))
        else:
            prev = False
        index = temp_start+len(token)
    return np.array(indexes)

# CRF + Bi GRU Predict

In [23]:
# load model.h5

load_bigru = tf.keras.models.load_model(
    'bigru.h5', 
    custom_objects = {
        'TFElectraModel' : TFElectraModel,
        'CRFLayer':CRFLayer,
        'loss':CRFLayer.loss
    }
)
load_bigru.summary()


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_ids (InputLayer)  [(None, 400)]       0           []                               
                                                                                                  
 encoder_attention_mask (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                                  
 encoder_token_type_ids (InputL  [(None, 400)]       0           []                               
 ayer)                                                                                            
                                                                                              

In [24]:
# Get predictions
preds_bigru = load_bigru.predict(test_data)

# Generate indices of the toxic spans
indices = createIndicesForNERModel(preds_bigru,x_test,tokenizer)

# Calculate F1 score of the prediction
f1_toxic = avg_f1(indices,spans_test)



In [25]:
print("test F1 = %f"%(f1_toxic))

test F1 = 0.462212


## CRF + Bi LSTM Predict

In [None]:
# load model.h5

load_bilstm = tf.keras.models.load_model(
    'bilstm.h5', 
    custom_objects = {
        'TFElectraModel' : TFElectraModel,
        'CRFLayer':CRFLayer,
        'loss':CRFLayer.loss
    }
)
load_bilstm.summary()



# Get predictions
preds_bilstm = load_bilstm.predict(test_data)

# Generate indices for the toxic spans
indices = createIndicesForNERModel(preds_bilstm,x_test,tokenizer)

# Calculate f1 score
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))

test F1 = 0.459520
