<a href="https://colab.research.google.com/github/rahulmeghwal/LiPi/blob/master/Roberta_%2B_LSTM_%2B_crf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the next cell copies the dataset CSVs
# out of "My Drive" from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the two CSVs from Drive into the working directory under the names the
# loading cell reads: tsd_test.csv -> toxic_span_practice.csv,
# tsd_train.csv -> toxic_span_train.csv.
!cp "/content/drive/My Drive/tsd_test.csv" "./toxic_span_practice.csv"
!cp "/content/drive/My Drive/tsd_train.csv" "./toxic_span_train.csv"

In [None]:
# Install the notebook's dependencies. Only tensorflow is version-pinned.
# NOTE(review): the CRF cell's output shows tensorflow-addons warning that
# TF 2.3.0 is not a supported combination; consider pinning stanza,
# transformers and tensorflow-addons to versions compatible with TF 2.3.0.
!pip install tensorflow==2.3.0
!pip install stanza
!pip install transformers
!pip install tensorflow-addons



In [None]:
import pandas as pd
import numpy as np
import gc
import json
import stanza
from tensorflow.keras import *
import tensorflow as tf
from tensorflow.keras import *
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from transformers import TFRobertaModel,RobertaTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load both labelled files. The `spans` column is stored as a JSON list of
# toxic character offsets, so it is decoded into real Python lists.
test_set = pd.read_csv("toxic_span_practice.csv")
test_set['spans'] = test_set['spans'].apply(json.loads)
train_set = pd.read_csv("toxic_span_train.csv")
train_set['spans'] = train_set['spans'].apply(json.loads)

# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0). Row order is unchanged: practice rows first, then the training rows.
toxic_span_dataset = pd.concat([test_set, train_set], ignore_index=True)
# Lower-casing keeps string length, so the character offsets stay valid.
toxic_span_dataset['text'] = toxic_span_dataset['text'].str.lower()
print(toxic_span_dataset)


### Char CNN Processing

# Character-level vocabulary fitted on the (lower-cased) texts.
tk=Tokenizer(num_words=None,char_level=True,oov_token='unk')
tk.fit_on_texts(toxic_span_dataset['text'])
print(tk.word_index)
# The Keras Tokenizer has no `texts` attribute (the original line raised
# AttributeError). Encode every text as its sequence of character ids instead.
# NOTE(review): presumably this is what `charcnntext` was meant to hold; it is
# not consumed anywhere else in the visible cells.
charcnntext = tk.texts_to_sequences(toxic_span_dataset['text'])

                                                  spans                                               text
0     [84, 85, 86, 87, 88, 89, 90, 91, 133, 134, 135...  that's right. they are not normal. and i am st...
1                              [81, 82, 83, 84, 85, 86]  "watch people die from taking away their healt...
2                                                    []  tens years ago i contacted the pdr and suggest...
3                                                    []  the parallels between the anc and the sicilian...
4                                                    []  intel community: ‘how can we work for a presid...
...                                                 ...                                                ...
9934                                     [8, 9, 10, 11]                             another fool pipes in.
9935  [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 5...  so if a restaurant owner puts up a sign saying...
9936  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 

In [None]:
def createNEROutputs(texts,spans,max_length,tokenizer):
    """Build one-hot per-token tag targets for every text.

    Classes on the last axis: 0 = padding position, 1 = non-toxic token,
    2 = toxic token. A token is tagged toxic when the character offset at
    which it starts is contained in the text's span.

    Args:
        texts: iterable of strings.
        spans: iterable parallel to ``texts``; each item is a collection of
            toxic character offsets.
        max_length: number of token positions per text. Longer tokenisations
            are truncated, shorter ones are padded with class 0.
        tokenizer: object exposing ``tokenize(text) -> list[str]`` where a
            leading "Ġ" marks a token that was preceded by a space.

    Returns:
        ``np.ndarray`` of float64 with shape ``(len(texts), max_length, 3)``.
    """
    outputs = []
    for text, span in zip(texts, spans):
        toxic_offsets = set(span)  # O(1) membership instead of scanning the list per token
        # np.float was removed in NumPy 1.24; float64 is what it aliased.
        output = np.zeros((max_length, 3), dtype=np.float64)
        tokens = tokenizer.tokenize(text)[:max_length]
        output[len(tokens):, 0] = 1.0  # everything after the last real token is padding

        cursor = 0  # character offset in `text` where the current token starts
        for index, token in enumerate(tokens):
            # Only a *leading* "Ġ" is the space marker; stripping the first
            # character because the marker occurs elsewhere would corrupt the token.
            if token.startswith("Ġ"):
                token = token[1:]

            located = True
            if index > 0:  # the first token is assumed to start at offset 0
                offset = text.find(token, cursor)
                if offset < 0:
                    # Token text is not present verbatim (e.g. the byte-level
                    # rendering of a non-ASCII character). Keep the cursor where
                    # it is rather than letting -1 drag it backwards.
                    located = False
                else:
                    cursor = offset

            output[index, 2 if cursor in toxic_offsets else 1] = 1.0

            if located:
                cursor += len(token)
        outputs.append(output)
    return np.array(outputs)

In [None]:
def NERGetIndicesSingleText(outputs,text,tokenizer):
    """Convert per-token class scores for one text into toxic character offsets.

    Args:
        outputs: array-like of shape ``(num_positions, num_classes)``; the
            arg-max over the last axis is the predicted class
            (0 = padding, 1 = non-toxic, 2 = toxic).
        text: the string that was tokenised.
        tokenizer: object exposing ``tokenize(text) -> list[str]`` where a
            leading "Ġ" marks a token preceded by a space.

    Returns:
        ``np.ndarray`` with the character offsets covered by every token that
        is predicted toxic, or that is an alphabetic continuation piece (no
        leading "Ġ") directly following a toxic token and not predicted as
        padding.
    """
    # np.argmax accepts the NumPy array returned by model.predict directly.
    labels = np.argmax(outputs, axis=-1)
    tokens = tokenizer.tokenize(text)
    index = 0      # search cursor into `text`
    indexes = []
    prev = False   # was the previous token marked toxic?
    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):  # also safe for an empty token, unlike token[0]
            token = token[1:]
            sub = False
        else:
            # Only purely alphabetic pieces count as a word continuation.
            sub = token.isalpha()

        offset = text.find(token, index)
        if offset < 0:
            # The token text does not occur verbatim (e.g. byte-level rendering
            # of a non-ASCII character). It cannot be mapped to characters, so
            # emit nothing and keep the cursor where it is; previously the -1
            # produced shifted or even negative offsets.
            prev = label == 2
            continue

        toxic = label == 2 or (sub and prev and label != 0)
        if toxic:
            indexes.extend(range(offset, offset + len(token)))
        prev = toxic
        index = offset + len(token)
    return np.array(indexes)

In [None]:
def createIndicesForNERModel(predicts,texts,tokenizer):
    """Decode a batch of predictions into one offset array per text.

    ``texts`` and ``predicts`` are walked in parallel and every pair is handed
    to ``NERGetIndicesSingleText``; the results are returned as a list in the
    same order.
    """
    return [
        NERGetIndicesSingleText(pred, text, tokenizer)
        for text, pred in zip(texts, predicts)
    ]

In [None]:
def f1(preds,trues):
    """F1 between predicted and gold offsets for a single example.

    Both arguments are collections of offsets and are compared as sets. When
    the gold collection is empty the score is 1.0 for an empty prediction and
    0.0 otherwise; an empty prediction against non-empty gold scores 0.0.
    """
    n_gold, n_pred = len(trues), len(preds)
    if n_gold == 0:
        return 0. if n_pred else 1.
    if n_pred == 0:
        return 0.
    predicted, gold = set(preds), set(trues)
    overlap = len(predicted & gold)
    return float(2 * overlap) / float(len(predicted) + len(gold))

In [None]:
def avg_f1(preds,trues):
    """Mean per-example ``f1`` over a batch.

    Args:
        preds: sequence of predicted offset collections, one per example.
        trues: sequence of gold offset collections, parallel to ``preds``.

    Returns:
        The sum of ``f1(pred, true)`` over the paired examples divided by
        ``len(preds)``; ``0.0`` for an empty batch (previously this raised
        ``ZeroDivisionError``).
    """
    if len(preds) == 0:
        return 0.0
    return sum(f1(pred, true) for pred, true in zip(preds, trues)) / len(preds)

In [None]:
class F1Metric(callbacks.Callback):
    """Keras callback that prints the span-level average F1 after every epoch.

    Args:
        inputs: model inputs passed to ``self.model.predict``.
        labels: accepted for call compatibility; not used.
        spans: gold offset collections, one per text.
        texts: the raw texts that ``inputs`` were built from.
        test: selects the printed prefix ("test f1" when true, else "train f1").
    """

    def __init__(self,inputs,labels,spans,texts,test=True):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.inputs = inputs
        self.spans = spans
        # `tokenizer` is the notebook-level tokenizer; it must exist when the
        # callback is constructed.
        self.tokenizer = tokenizer
        self.texts = texts
        self.test = test

    def on_epoch_end(self, epoch, logs=None):
        preds = self.model.predict(self.inputs,verbose=0)
        # Decode against the texts/tokenizer this callback was given. The
        # original read the notebook globals, which scored predictions against
        # whatever `texts` happened to be bound to at the time.
        indices = createIndicesForNERModel(preds,self.texts,self.tokenizer)
        score = avg_f1(indices,self.spans)
        print()
        if self.test:
            print("test f1 = "+str(score))
        else:
            print("train f1 = "+str(score))

In [None]:
def createInputForNER(texts,max_length,tokenizer):
    """Encode texts into the three arrays the model takes as input.

    Returns a list ``[input_ids, attention_mask, lengths]`` where the first
    two come from calling ``tokenizer`` on the whole batch (padded/truncated
    to ``max_length``) and ``lengths[i]`` is
    ``min(max_length, len(tokenizer.tokenize(texts[i])))``.

    NOTE(review): the lengths come from ``tokenize()`` while the ids come from
    the tokenizer's ``__call__``; if the latter adds special tokens the two
    are offset from each other — confirm this is intended.
    """
    token_counts = [
        min(max_length, len(tokenizer.tokenize(text)))
        for text in texts
    ]
    encoded = tokenizer(texts,padding="max_length",max_length=max_length,return_tensors="tf",truncation=True)
    return [
        np.array(encoded['input_ids']),
        np.array(encoded['attention_mask']),
        np.array(token_counts),
    ]

# CRF Layer

In [None]:
#https://github.com/Hironsan/keras-crf-layer/blob/master/crf.py
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer, InputSpec
import tensorflow_addons as tfa

# Use the contrib implementation of crf_decode when it can be imported;
# otherwise define an equivalent locally from TensorFlow's internal op modules.
# NOTE(review): tensorflow_addons is already imported above as `tfa` and may
# provide a maintained decoder — confirm before relying on these private
# `tensorflow.python` imports.
try:
    from tensorflow.contrib.crf import crf_decode
except ImportError:
    from tensorflow.python.framework import dtypes
    from tensorflow.python.ops import array_ops, gen_array_ops, math_ops, rnn, rnn_cell


    class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
        """RNN cell for the forward pass of the decode.

        Each step adds the transition scores to the running state, keeps the
        maximum over the first tag axis as the new state and emits the
        arg-max over that axis as int32 backpointers.
        """

        def __init__(self, transition_params):
            # Leading axis of size 1 so the matrix broadcasts against the
            # batched state in __call__.
            self._transition_params = array_ops.expand_dims(transition_params, 0)
            self._num_tags = transition_params.get_shape()[0]

        @property
        def state_size(self):
            return self._num_tags

        @property
        def output_size(self):
            return self._num_tags

        def __call__(self, inputs, state, scope=None):
            state = array_ops.expand_dims(state, 2)  # [B, O, 1]
            transition_scores = state + self._transition_params  # [B, O, O]
            new_state = inputs + math_ops.reduce_max(transition_scores, [1])  # [B, O]
            backpointers = math_ops.argmax(transition_scores, 1)
            backpointers = math_ops.cast(backpointers, dtype=dtypes.int32)  # [B, O]
            return backpointers, new_state


    class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
        """RNN cell for the backward pass: follows backpointers one step at a time.

        The state is the current tag per batch row; each step gathers, from
        the step's backpointers, the entry selected by that tag.
        """

        def __init__(self, num_tags):
            self._num_tags = num_tags

        @property
        def state_size(self):
            return 1

        @property
        def output_size(self):
            return 1

        def __call__(self, inputs, state, scope=None):
            state = array_ops.squeeze(state, axis=[1])  # [B]
            batch_size = array_ops.shape(inputs)[0]
            b_indices = math_ops.range(batch_size)  # [B]
            # Pair every batch row with its current tag for gather_nd.
            indices = array_ops.stack([b_indices, state], axis=1)  # [B, 2]
            new_tags = array_ops.expand_dims(
                gen_array_ops.gather_nd(inputs, indices),  # [B]
                axis=-1)  # [B, 1]

            # The gathered tag is both this step's output and the next state.
            return new_tags, new_tags


    def crf_decode(potentials, transition_params, sequence_length):
        """Decode the highest-scoring tag sequence.

        Args:
            potentials: score tensor; the code slices it as
                [batch, time, tags] and reads the tag count from axis 2.
            transition_params: square tag-to-tag transition score matrix.
            sequence_length: per-row sequence lengths.

        Returns:
            ``(decode_tags, best_score)``: int32 tags per position and the
            maximum final score per batch row.
        """
        num_tags = potentials.get_shape()[2]

        # Computes forward decoding. Get last score and backpointers.
        crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
        # The first time step seeds the state; the remaining steps are the inputs.
        initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
        initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
        inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
        backpointers, last_score = rnn.dynamic_rnn(
            crf_fwd_cell,
            inputs=inputs,
            sequence_length=sequence_length - 1,
            initial_state=initial_state,
            time_major=False,
            dtype=dtypes.int32)  # [B, T - 1, O], [B, O]
        # Reverse in time so the backward cell can walk from the last step to the first.
        backpointers = gen_array_ops.reverse_sequence(backpointers, sequence_length - 1, seq_dim=1)  # [B, T-1, O]

        # Computes backward decoding. Extract tag indices from backpointers.
        crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
        # Start from the best-scoring tag at the final step.
        initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1), dtype=dtypes.int32)  # [B]
        initial_state = array_ops.expand_dims(initial_state, axis=-1)  # [B, 1]
        decode_tags, _ = rnn.dynamic_rnn(
            crf_bwd_cell,
            inputs=backpointers,
            sequence_length=sequence_length - 1,
            initial_state=initial_state,
            time_major=False,
            dtype=dtypes.int32)  # [B, T - 1, 1]
        decode_tags = array_ops.squeeze(decode_tags, axis=[2])  # [B, T - 1]
        decode_tags = array_ops.concat([initial_state, decode_tags], axis=1)  # [B, T]
        # Undo the time reversal so tags are in forward order again.
        decode_tags = gen_array_ops.reverse_sequence(decode_tags, sequence_length, seq_dim=1)  # [B, T]

        best_score = math_ops.reduce_max(last_score, axis=1)  # [B]
        return decode_tags, best_score


class CRFLayer(Layer):
    """Keras layer wrapping a linear-chain CRF on top of per-token scores.

    The layer is called on ``[potentials, sequence_lengths]`` where
    ``potentials`` is rank 3 and ``sequence_lengths`` is rank 2. It owns a
    square ``transition`` weight created in ``build``.

    In the training phase ``call`` passes the potentials through unchanged;
    otherwise it returns the one-hot encoding of the Viterbi-decoded tags.
    ``loss`` is meant to be used as the model's loss and relies on the
    sequence lengths stored by the most recent ``call``.
    """

    def __init__(self, transition_params=None, **kwargs):
        super(CRFLayer, self).__init__(**kwargs)
        # Replaced by a trainable weight in build().
        self.transition_params = transition_params
        self.input_spec = [InputSpec(ndim=3), InputSpec(ndim=2)]
        self.supports_masking = True

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape[0]) == 3

        # Output has the same shape as the potentials input.
        return input_shape[0]

    def build(self, input_shape):
        assert len(input_shape) == 2
        assert len(input_shape[0]) == 3
        assert len(input_shape[1]) == 2
        n_steps = input_shape[0][1]
        n_classes = input_shape[0][2]
        # crf_decode consumes the first step as initial state, so at least
        # two steps are required when the length is static.
        assert n_steps is None or n_steps >= 2

        self.transition_params = self.add_weight(shape=(n_classes, n_classes),
                                                 initializer='uniform',
                                                 name='transition')
        self.input_spec = [InputSpec(dtype=K.floatx(), shape=(None, n_steps, n_classes)),
                           InputSpec(dtype='int32', shape=(None, 1))]
        self.built = True

    def viterbi_decode(self, potentials, sequence_length):
        """Return the decoded tag ids; the best score is discarded."""
        decode_tags, best_score = crf_decode(potentials, self.transition_params, sequence_length)
        return decode_tags

    def call(self, inputs, mask=None, **kwargs):
        inputs, sequence_lengths = inputs
        # Kept on the instance because loss() needs the lengths as well.
        self.sequence_lengths = K.flatten(sequence_lengths)
        y_pred = self.viterbi_decode(inputs, self.sequence_lengths)
        nb_classes = self.input_spec[0].shape[2]
        y_pred_one_hot = K.one_hot(y_pred, nb_classes)

        # Training: raw potentials (consumed by loss). Inference: decoded tags.
        return K.in_train_phase(inputs, y_pred_one_hot)

    def loss(self, y_true, y_pred):
        """Mean negative CRF log-likelihood of one-hot ``y_true`` given potentials ``y_pred``."""
        y_true = K.cast(K.argmax(y_true, axis=-1), dtype='int32')
        # The returned transition parameters are deliberately ignored: the
        # original rebound self.transition_params to them, which could replace
        # the layer's weight attribute with whatever the call handed back.
        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
            y_pred, y_true, self.sequence_lengths, self.transition_params)
        loss = tf.reduce_mean(-log_likelihood)

        return loss

    def get_config(self):
        transition = self.transition_params
        config = {
            # An unbuilt layer created without transition_params holds None,
            # which cannot be evaluated.
            'transition_params': None if transition is None else K.eval(transition),
        }
        base_config = super(CRFLayer, self).get_config()

        return dict(list(base_config.items()) + list(config.items()))


def create_custom_objects():
    """Build a mapping with a ``CRFLayer`` subclass and a matching ``loss``.

    The returned subclass records each instance as it is constructed; the
    returned ``loss`` forwards its arguments to the ``loss`` method of the
    most recently recorded instance.
    """
    holder = {'instance': None}

    class ClassWrapper(CRFLayer):
        def __init__(self, *args, **kwargs):
            # Record the instance first, then run the normal CRFLayer init.
            holder['instance'] = self
            super().__init__(*args, **kwargs)

    def loss(*args):
        return holder['instance'].loss(*args)

    return dict(CRFLayer=ClassWrapper, loss=loss)

 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# NER model

In [None]:
# Load the pretrained 'roberta-base' tokenizer once; later cells pass it to
# the target-building, input-encoding and decoding helpers.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# base_model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
#Char CNN
def charCNNProcessing(inputs=None, max_input_length=400):
    """Build the embedding + Conv1D branch and return its output tensor.

    The original body referenced an undefined ``max_input_length`` (NameError
    on every call), never connected the convolution to the embedding and
    returned the bare layer object. It now mirrors the inline "char CNN"
    section of ``createToxicModelWithGivenBaseModel``.

    Args:
        inputs: optional int32 id tensor of shape (batch, max_input_length).
            Pass the model's own ``Input`` here so the branch can be wired
            into that model; when omitted a new ``Input`` named
            "charcnn_input_ids" is created.
        max_input_length: sequence length used for the created ``Input`` and
            the embedding. Defaults to 400, the ``max_length`` this notebook
            uses.

    Returns:
        Output tensor of the Conv1D layer (512 filters, 'same' padding).
    """
    if inputs is None:
        inputs=layers.Input(shape=(max_input_length,),name="charcnn_input_ids",dtype=tf.int32)
    # Embedding layer: ids must stay below 70, the table size.
    x = layers.Embedding(70, 128, input_length=max_input_length)(inputs)
    # Convolution over the embedded sequence; 'same' padding keeps the length.
    charcnn=layers.Conv1D(filters=512,kernel_size=4, activation='relu',padding='same')(x)
    return charcnn

In [None]:
def createToxicModelWithGivenBaseModel(max_input_length,base_model,embedding_vocab_size=None):
    """Assemble and compile the encoder + CNN + LSTM + CRF tagging model.

    Three feature branches are concatenated per token position:
      * Conv1D over the encoder's last hidden state,
      * LSTM over the encoder's last hidden state,
      * Conv1D over a separate, freshly initialised embedding of the input ids
        (the branch the notebook labels "char CNN").
    They feed Dense(500) -> Dropout(0.3) -> Dense(3) -> ``CRFLayer``.

    Args:
        max_input_length: number of token positions per example.
        base_model: pretrained encoder called as
            ``base_model(ids, attention_mask=..., return_dict=True)`` and
            exposing ``last_hidden_state`` on the result; it is made trainable.
        embedding_vocab_size: number of rows of the extra embedding table.
            Defaults to ``base_model.config.vocab_size``. The original
            hard-coded 70 even though the table is indexed with the encoder's
            token ids, so almost every lookup was out of range. Pass 70
            explicitly to rebuild the old architecture, e.g. to load weights
            saved with it.

    Returns:
        The compiled ``models.Model`` taking
        ``[input_ids, attention_mask, length]``.
    """
    input_ids_layer = layers.Input(shape=(max_input_length,),name="encoder_input_ids",dtype=tf.int32)
    input_attention_mask_layer = layers.Input(shape=(max_input_length,),name="encoder_attention_mask",dtype=tf.int32)
    input_length = layers.Input(shape=(1,),name="length",dtype=tf.int32)

    if embedding_vocab_size is None:
        embedding_vocab_size = base_model.config.vocab_size

    base_model.trainable = True
    # Keep the encoder and its output under separate names instead of
    # rebinding `base_model` to the output object.
    encoded = base_model(input_ids_layer,attention_mask=input_attention_mask_layer,return_dict=True)
    hidden_states = encoded.last_hidden_state

    ### char CNN starts
    embed = layers.Embedding(embedding_vocab_size, 128, input_length=max_input_length)(input_ids_layer)
    charcnn=layers.Conv1D(filters=512,kernel_size=4, activation='relu',padding='same')(embed)
    ### char CNN ends

    print(hidden_states.shape)
    cnn1=layers.Conv1D(filters=512,kernel_size=4, activation='relu',padding='same')(hidden_states)
    lstm = layers.LSTM(512,return_sequences=True)(hidden_states)
    concat=layers.Concatenate()([cnn1,lstm,charcnn])
    dense=layers.Dense(500,activation='relu')(concat)
    drop=layers.Dropout(0.3)(dense)
    # Linear scores: the CRF layer consumes unnormalised potentials.
    output = layers.Dense(3,activation="linear")(drop)
    crf = CRFLayer()
    output = crf(inputs=[output,input_length])
    model = models.Model(inputs=[input_ids_layer,input_attention_mask_layer,input_length],outputs=output)
    model.compile(optimizer=optimizers.Adam(learning_rate=3e-5),loss=crf.loss,metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Number of token positions per example; passed to the target builder, the
# input encoder and the model constructor in the cells below.
max_length = 400

In [None]:
# Arrays shared by every fold cell below.
texts = toxic_span_dataset['text'].to_numpy()
targets = createNEROutputs(texts,toxic_span_dataset['spans'],max_length,tokenizer)
all_spans = toxic_span_dataset['spans'].to_numpy()

# Per-fold F1 scores are appended to these by the fold cells.
result_test, result_train = [], []

# Unshuffled 5-fold split; each entry is a (train_index, test_index) pair that
# the fold cells take off the end of the list.
kf = KFold(n_splits=5)
train_test_indices = []
for train_index,test_index in kf.split(texts):
    train_test_indices += [(train_index,test_index)]

In [None]:
# Capture the output lines of `nvidia-smi` and print them; if the output
# contains 'failed', report that no GPU is attached instead.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 29 21:04:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# pop() takes the LAST remaining split, so folds are processed in reverse order.
train_index,test_index = train_test_indices.pop()
print(train_index)
print(test_index)
# Texts become plain Python lists because createInputForNER hands them
# straight to the tokenizer call.
x_train , x_test = list(texts[train_index]) , list(texts[test_index])
y_train , y_test = targets[train_index] , targets[test_index]
# Drop references to any previous fold's model and reset Keras state before
# building a fresh one.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length,base_model)


[   0    1    2 ... 7949 7950 7951]
[7952 7953 7954 ... 9936 9937 9938]


Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
_________________________________________________________________________

In [None]:
# Encode this fold's texts and pick the matching gold spans.
train_data = createInputForNER(x_train,max_length,tokenizer)
test_data = createInputForNER(x_test,max_length,tokenizer)
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]
# NOTE(review): every fold cell writes its checkpoint to this same path, so
# presumably only the most recently trained fold's weights remain — confirm
# that is intended.
model.fit(train_data,y_train,batch_size=16,epochs=2,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",save_weights_only=True)])
# Span-level F1 on the held-out part of the fold.
preds = model.predict(test_data)
indices = createIndicesForNERModel(preds,x_test,tokenizer)
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))
result_test.append(f1_toxic)
# Same metric on the training part, for comparison.
preds = model.predict(train_data)
indices = createIndicesForNERModel(preds,x_train,tokenizer)
f1_toxic = avg_f1(indices,spans_train)
print("train F1 = %f"%(f1_toxic))
result_train.append(f1_toxic)

Epoch 1/2
Epoch 2/2
test F1 = 0.646134
train F1 = 0.683065


In [None]:
# Next fold: same procedure as the previous two cells, in one cell.
# NOTE(review): this cell is repeated verbatim for each remaining fold;
# consider a single function called in a loop.
# pop() takes the last remaining split from the list.
train_index,test_index = train_test_indices.pop()
x_train , x_test = list(texts[train_index]) , list(texts[test_index])
y_train , y_test = targets[train_index] , targets[test_index]
# Release the previous fold's model and reset Keras state before rebuilding.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length,base_model)
train_data = createInputForNER(x_train,max_length,tokenizer)
test_data = createInputForNER(x_test,max_length,tokenizer)
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]
# Same checkpoint path as the other fold cells.
model.fit(train_data,y_train,batch_size=16,epochs=2,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",save_weights_only=True)])
# Held-out F1 for this fold.
preds = model.predict(test_data)
indices = createIndicesForNERModel(preds,x_test,tokenizer)
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))
result_test.append(f1_toxic)
# Training-part F1 for this fold.
preds = model.predict(train_data)
indices = createIndicesForNERModel(preds,x_train,tokenizer)
f1_toxic = avg_f1(indices,spans_train)
print("train F1 = %f"%(f1_toxic))
result_train.append(f1_toxic)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 512)     1573376     tf_ro

In [None]:
# Next fold: identical procedure to the preceding fold cell.
# NOTE(review): repeated verbatim per fold; consider a function plus a loop.
# pop() takes the last remaining split from the list.
train_index,test_index = train_test_indices.pop()
x_train , x_test = list(texts[train_index]) , list(texts[test_index])
y_train , y_test = targets[train_index] , targets[test_index]
# Release the previous fold's model and reset Keras state before rebuilding.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length,base_model)
train_data = createInputForNER(x_train,max_length,tokenizer)
test_data = createInputForNER(x_test,max_length,tokenizer)
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]
# Same checkpoint path as the other fold cells.
model.fit(train_data,y_train,batch_size=16,epochs=2,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",save_weights_only=True)])
# Held-out F1 for this fold.
preds = model.predict(test_data)
indices = createIndicesForNERModel(preds,x_test,tokenizer)
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))
result_test.append(f1_toxic)
# Training-part F1 for this fold.
preds = model.predict(train_data)
indices = createIndicesForNERModel(preds,x_train,tokenizer)
f1_toxic = avg_f1(indices,spans_train)
print("train F1 = %f"%(f1_toxic))
result_train.append(f1_toxic)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 512)     1573376     tf_ro

In [None]:
# Next fold: identical procedure to the preceding fold cell.
# NOTE(review): repeated verbatim per fold; consider a function plus a loop.
# pop() takes the last remaining split from the list.
train_index,test_index = train_test_indices.pop()
x_train , x_test = list(texts[train_index]) , list(texts[test_index])
y_train , y_test = targets[train_index] , targets[test_index]
# Release the previous fold's model and reset Keras state before rebuilding.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length,base_model)
train_data = createInputForNER(x_train,max_length,tokenizer)
test_data = createInputForNER(x_test,max_length,tokenizer)
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]
# Same checkpoint path as the other fold cells.
model.fit(train_data,y_train,batch_size=16,epochs=2,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",save_weights_only=True)])
# Held-out F1 for this fold.
preds = model.predict(test_data)
indices = createIndicesForNERModel(preds,x_test,tokenizer)
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))
result_test.append(f1_toxic)
# Training-part F1 for this fold.
preds = model.predict(train_data)
indices = createIndicesForNERModel(preds,x_train,tokenizer)
f1_toxic = avg_f1(indices,spans_train)
print("train F1 = %f"%(f1_toxic))
result_train.append(f1_toxic)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 512)     1573376     tf_ro

In [None]:
# Next fold: identical procedure to the preceding fold cell.
# NOTE(review): repeated verbatim per fold; consider a function plus a loop.
# pop() takes the last remaining split from the list.
train_index,test_index = train_test_indices.pop()
x_train , x_test = list(texts[train_index]) , list(texts[test_index])
y_train , y_test = targets[train_index] , targets[test_index]
# Release the previous fold's model and reset Keras state before rebuilding.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length,base_model)
train_data = createInputForNER(x_train,max_length,tokenizer)
test_data = createInputForNER(x_test,max_length,tokenizer)
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]
# Same checkpoint path as the other fold cells.
model.fit(train_data,y_train,batch_size=16,epochs=2,callbacks=[callbacks.ModelCheckpoint("/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",save_weights_only=True)])
# Held-out F1 for this fold.
preds = model.predict(test_data)
indices = createIndicesForNERModel(preds,x_test,tokenizer)
f1_toxic = avg_f1(indices,spans_test)
print("test F1 = %f"%(f1_toxic))
result_test.append(f1_toxic)
# Training-part F1 for this fold.
preds = model.predict(train_data)
indices = createIndicesForNERModel(preds,x_train,tokenizer)
f1_toxic = avg_f1(indices,spans_train)
print("train F1 = %f"%(f1_toxic))
result_train.append(f1_toxic)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 512)     1573376     tf_ro

In [None]:
# Average the per-fold F1 scores accumulated in `result_test` / `result_train`
# (each run of the fold-training cell appends one value to each list).
# The mean is taken over the number of scores actually recorded instead of a
# hard-coded 5, so the figure stays correct when fewer (or more) folds were
# run, e.g. after an interrupted training cell. With no recorded scores the
# average is reported as NaN rather than a misleading 0.
f1_toxic = sum(result_test)/len(result_test) if result_test else float("nan")
print("final test F1 = %f"%(f1_toxic))
f1_toxic = sum(result_train)/len(result_train) if result_train else float("nan")
print("final train F1 = %f"%(f1_toxic))

final test F1 = 0.658758
final train F1 = 0.691268


# Train on a random part of the dataset to save a checkpoint

In [None]:
# Reshuffle the whole dataset and rebuild the 5-fold split consumed by the
# training cell below.
# A fixed seed makes both the row shuffle and the fold assignment reproducible
# across kernel restarts; change SPLIT_SEED to draw a different random split.
SPLIT_SEED = 42
# reset_index(drop=True) renumbers the shuffled rows 0..n-1 so that label-based
# and positional access to the 'spans' column agree with the order of `texts`.
# NOTE(review): createNEROutputs is defined elsewhere; if it indexes `spans`
# by integer label this reset is what keeps texts and spans aligned - confirm.
toxic_span_dataset = (
    toxic_span_dataset
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
texts = toxic_span_dataset['text'].to_numpy()
targets = createNEROutputs(texts,toxic_span_dataset['spans'],max_length,tokenizer)
all_spans = toxic_span_dataset['spans'].to_numpy()
# Per-fold scores; the training cell appends to these lists.
result_test = []
result_train = []
kf = KFold(n_splits=5,shuffle=True,random_state=SPLIT_SEED)
# One (train_index, test_index) pair per fold; the training cell pops from it.
train_test_indices = list(kf.split(texts))

In [None]:
# --- Select the fold ---------------------------------------------------------
# pop() removes one (train, test) index pair from the list, so every execution
# of this cell consumes a different fold.
train_index, test_index = train_test_indices.pop()

# Slice the raw texts, the targets and the gold spans for this fold.
x_train = list(texts[train_index])
x_test = list(texts[test_index])
y_train = targets[train_index]
y_test = targets[test_index]
spans_test = all_spans[test_index]
spans_train = all_spans[train_index]

# --- Rebuild the model from scratch ------------------------------------------
# Drop the references to any previously built model, run the garbage collector
# and reset the Keras backend session before loading a fresh RoBERTa encoder.
model = None
base_model = None
gc.collect()
tf.keras.backend.clear_session()
base_model = TFRobertaModel.from_pretrained('roberta-base')
model = createToxicModelWithGivenBaseModel(max_length, base_model)

# --- Encode the inputs -------------------------------------------------------
train_data = createInputForNER(x_train, max_length, tokenizer)
test_data = createInputForNER(x_test, max_length, tokenizer)

# --- Train -------------------------------------------------------------------
# Only the weights are written to the checkpoint path (save_weights_only=True).
checkpoint_callback = callbacks.ModelCheckpoint(
    "/content/drive/MyDrive/toxic span/final saved models/NER/roberta/LSTM_crf/ner",
    save_weights_only=True,
)
model.fit(
    train_data,
    y_train,
    batch_size=16,
    epochs=2,
    callbacks=[checkpoint_callback],
)

# --- Evaluate ----------------------------------------------------------------
# Score the held-out split first and the training split second; after the loop
# `preds`, `indices` and `f1_toxic` hold the training-split values.
evaluation_plan = (
    ("test F1 = %f", test_data, x_test, spans_test, result_test),
    ("train F1 = %f", train_data, x_train, spans_train, result_train),
)
for report_format, split_inputs, split_texts, split_spans, split_results in evaluation_plan:
    preds = model.predict(split_inputs)
    indices = createIndicesForNERModel(preds, split_texts, tokenizer)
    f1_toxic = avg_f1(indices, split_spans)
    print(report_format % (f1_toxic))
    split_results.append(f1_toxic)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


(None, 400, 768)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_ids (InputLayer)  [(None, 400)]        0                                            
__________________________________________________________________________________________________
encoder_attention_mask (InputLa [(None, 400)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   encoder_input_ids[0][0]          
                                                                 encoder_attention_mask[0][0]     
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 512)     1573376     tf_ro

KeyboardInterrupt: ignored