In [1]:
from transformers import TFRobertaModel, RobertaConfig, RobertaTokenizer
import tensorflow as tf
from tqdm import tqdm
import numpy as np
import os
import random
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import random
import sys
import math
from scipy import stats
from scipy.spatial import distance
import random
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Limit GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


In [3]:
BERT_MODEL = "roberta-base"  # Specify the RoBERTa model
MAX_LENGTH = 128
TEST_SIZE = 0.15
SEED = 42

USE_ATT = True

BERT_DROPOUT = 0.2
LSTM_UNITS = 512
DENSE_UNITS = 50
LSTM_DROPOUT = 0.1
DENSE_DROPOUT = 0.2
EPOCHS = 10 #(Default 10)
BATCH_SIZE = 32


In [4]:
def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

random_seed(SEED)

In [5]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True, max_length=MAX_LENGTH,
                                       pad_to_max_length=True, return_attention_mask=True,
                                       return_token_type_ids=False)  # Roberta doesn't use token_type_ids
        
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return (np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'))

## Define base Roberta configs
# config = RobertaConfig(dropout=BERT_DROPOUT,
#                        attention_dropout=BERT_DROPOUT,
#                        output_attentions=True)

config = RobertaConfig.from_pretrained(BERT_MODEL, output_hidden_states=False, attention_probs_dropout_prob=BERT_DROPOUT, output_attentions = True)
config.output_hidden_states = False

transformer_model = TFRobertaModel.from_pretrained(BERT_MODEL, config=config)

# Freeze the pre-trained layers for fine-tuning
for layer in transformer_model.layers[:3]:
    layer.trainable = False

# Defining tokenizer
tokenizer = RobertaTokenizer.from_pretrained(BERT_MODEL, add_special_tokens=True, max_length=MAX_LENGTH, pad_to_max_length=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [6]:
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]

X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS, return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT,  kernel_initializer='normal')
    )(embedding_layer)

if USE_ATT:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.

X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(DENSE_UNITS, activation='relu', kernel_initializer='normal')(X)

X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(1, activation='linear', kernel_initializer='normal')(X) # Can be with activation="sigmoid" here.

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])  # Treat HIP as a regression problem
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_token[0][0]',            
 el)                            thPoolingAndCrossAt               'masked_token[0][0]']           
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [7]:
input_data = pd.read_csv("../../datasets/hate_norm_with_span.csv")

In [8]:
intensity_value = []
hate_sentences = []

for i in range(len(input_data)):
    intensity_value.append(int(input_data['Original_Intensity'][i]))
    hate_sentences.append(input_data['Sentence'][i])
    intensity_value.append(int(input_data['Normalized_Intensity'][i]))
    hate_sentences.append(input_data['Normalized_Sentence'][i])

c = list(zip(intensity_value, hate_sentences))
random.shuffle(c)
intensity_value, hate_sentences = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(hate_sentences, intensity_value, test_size=TEST_SIZE, random_state=1)

train_input_ids, train_input_masks = tokenize(X_tr, tokenizer)
test_input_ids, test_input_masks = tokenize(X_te, tokenizer)

y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

  0%|                                                                                               | 0/5145 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5145/5145 [00:01<00:00, 4213.76it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

In [9]:
model.fit(x=[train_input_ids, train_input_masks], y=y_tr, epochs=10, validation_split=0.15, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27b7ce681f0>

In [10]:
print("TEST split", TEST_SIZE)
results = model.evaluate(x=[test_input_ids, test_input_masks], y=y_te)

print(results)
result = model.predict(x=[test_input_ids, test_input_masks])
result = np.array(result, dtype=np.float64)
result = result.flatten()
print("pear", stats.pearsonr(result, y_te))
print("cosine", 1 - distance.cosine(result, y_te))

TEST split 0.15
[1.4486263990402222, 0.005500549916177988, 1.2035889625549316]
pear PearsonRResult(statistic=0.8189982709589667, pvalue=5.041461623706612e-221)
cosine 0.9794501067442702


In [12]:
model.save('model_weights/bilstm')



INFO:tensorflow:Assets written to: model_weights/bilstm\assets


INFO:tensorflow:Assets written to: model_weights/bilstm\assets


In [14]:
input_data

Unnamed: 0,Sentence,Normalized_Sentence,Original_Intensity,Normalized_Intensity,Span,spanbio,postags
0,Islam looks like a cult more than like a relig...,Islam looks like a group more than like a reli...,9.0,5.0,"{'start': [0, 13, 22], 'end': [4, 19, 27]}",B I I I I O O O O O O O O B I I I I I I O O B ...,NNP VBZ IN DT NN JJR IN IN DT NN PRP VBZ JJ JJ...
1,We must send Islamics back to their native cou...,We must send some muslims who misinterpret Isl...,7.0,4.0,"{'start': [2, 11], 'end': [4, 14]}",O O B I I O O O O O O B I I I,PRP MD VB NNP RB TO PRP$ JJ NN PRP VBP RB JJ I...
2,"Europe is civilised, Muslims should not stay t...","Europe is more civilised, some Muslims who hav...",7.0,6.0,"{'start': [3], 'end': [7]}",O O O B I I I I,NNP VBZ JJ NNP MD RB VB NNS
3,"If they love Sharia law so much, why do not th...","If they love Sharia law so much, why do not th...",8.0,7.0,"{'start': [2, 11], 'end': [4, 18]}",O O B I I O O O O O O B I I I I I I I,IN PRP VBP NNP NN RB JJ WRB VBP RB PRP VBP RB ...
4,Islam=evil. Islam is invading us and trying to...,Some Muslims who misinterpret their scriptures...,7.0,5.0,"{'start': [0, 1, 6, 25], 'end': [0, 4, 10, 26]}",B I I I I O B I I I I O O O O O O O O O O O O ...,NNP NNP VBZ VBG PRP CC VBG TO VB RP NNS PRP VB...
...,...,...,...,...,...,...,...
3022,@user you shouldn't be leaving your kitchen un...,you shouldn't be leaving your kitchen unattended,8.0,2.0,"{'start': [8], 'end': [8]}",O O O O O O O O B,NN PRP VBP VB VBG PRP$ NN VBD NN
3023,@user a faggot that needs to grow up,@user a person that needs to grow up,7.0,2.0,"{'start': [2], 'end': [2]}",O O B O O O O O,VB DT NN WDT VBZ TO VB RP
3024,Women ... Can't live with them ... Can't shoot...,Women Can't live with them.Can't live without ...,9.0,7.0,"{'start': [8], 'end': [8]}",O O O O O O O O B O O,NNS : NNP VBP IN PRP : NNP VBD PRP .
3025,""" Who the fuck you callin jiggaboo , nigga ? ! ""","""Who the you callin out, black man""",7.0,3.0,"{'start': [2, 8], 'end': [2, 8]}",O O B O O O O O B O O O,"NN WP DT NN PRP VBP NNS , RB . . NN"


In [45]:
# Extract the sentences
sentences = []
for i in range(len(input_data)):
    sentences.append(input_data['Sentence'][i])
    sentences.append(input_data['Normalized_Sentence'][i])

# Tokenize the sentences
input_ids, input_masks = tokenize(sentences, tokenizer)

# Make predictions
predictions = model.predict([input_ids, input_masks])

# Round the predictions to the nearest integer
predicted_labels = np.round(predictions).astype(int)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6054/6054 [00:01<00:00, 5998.79it/s]




In [46]:
# Print predictions
for sentence, predicted_label, prediction in zip(sentences, predicted_labels, predictions):
    print(f"Sentence: {sentence}")
    print(f"Predicted Intensity: {predicted_label[0]}")
    print(f"Predicted Intensity: {prediction}")
    
    print()

Sentence: Islam looks like a cult more than like a religion. It is beliefs make people rape children and commit violence. It should not be accepted in our country.
Predicted Intensity: 8
Predicted Intensity: [8.471718]

Sentence: Islam looks like a group more than like a religion. Its beliefs can be misinterpreted to make people commit crimes. It should be scrutinized in our country.
Predicted Intensity: 6
Predicted Intensity: [5.5841956]

Sentence: We must send Islamics back to their native countries. They are too different from us.
Predicted Intensity: 7
Predicted Intensity: [6.9469223]

Sentence: We must send some muslims who misinterpret Islam back to their native countries. They have different values from us.
Predicted Intensity: 3
Predicted Intensity: [3.4575527]

Sentence: Europe is civilised, Muslims should not stay there.
Predicted Intensity: 7
Predicted Intensity: [7.2113185]

Sentence: Europe is more civilised, some Muslims who have misinterpreted thoughts regarding Islam sh

In [28]:
intensity = []
for i in range(len(input_data)):
    intensity.append(input_data['Original_Intensity'][i])
    intensity.append(input_data['Normalized_Intensity'][i])


In [47]:
output_data = pd.DataFrame({'Sentence':sentences, 'Predicted_Intensity': predicted_labels.flatten(), 'Original_Predicted_Intensity': predictions.flatten(), 'Intensity': intensity})

output_data

Unnamed: 0,Sentence,Predicted_Intensity,Original_Predicted_Intensity,Intensity
0,Islam looks like a cult more than like a relig...,8,8.471718,9.0
1,Islam looks like a group more than like a reli...,6,5.584196,5.0
2,We must send Islamics back to their native cou...,7,6.946922,7.0
3,We must send some muslims who misinterpret Isl...,3,3.457553,4.0
4,"Europe is civilised, Muslims should not stay t...",7,7.211318,7.0
...,...,...,...,...
6049,Women Can't live with them.Can't live without ...,4,4.388725,7.0
6050,""" Who the fuck you callin jiggaboo , nigga ? ! """,8,7.551882,7.0
6051,"""Who the you callin out, black man""",4,4.122194,3.0
6052,@user @user @user @user @user @user ape ? Fuck...,6,6.428796,7.0


In [48]:
# Save the results to a new CSV file
output_data.to_csv('predicted_intensities.csv', index=False)

print("Predictions saved to 'predicted_intensities.csv'.")

Predictions saved to 'predicted_intensities.csv'.


In [49]:
rmse_original = np.sqrt(np.mean((output_data['Original_Predicted_Intensity'] - output_data['Intensity']) ** 2))
rmse = np.sqrt(np.mean((output_data['Predicted_Intensity'] - output_data['Intensity']) ** 2))

rmse_original, rmse

(0.9977495826280334, 1.0352237149875765)

In [53]:
import json

file_path = '../../datasets/json_datasets/gpt_35_ft_intensity.json'

# Load the JSON content from the file
with open(file_path, 'r') as file:
    data = json.load(file)

# Convert JSON data to a pandas DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,sentence,intensity,gpt_intensity
0,1,Islam looks like a cult more than like a relig...,9.0,9.0
1,2,We must send Islamics back to their native cou...,7.0,11.0
2,3,"Europe is civilised, Muslims should not stay t...",7.0,7.0
3,4,"If they love Sharia law so much, why do not th...",8.0,7.0
4,5,Islam=evil. Islam is invading us and trying to...,7.0,11.0


In [61]:
merged_df = pd.merge(df, output_data, left_on='sentence', right_on='Sentence', how='inner')
merged_df.drop(columns=['id', 'Intensity', 'Sentence'], inplace=True)


merged_df

Unnamed: 0,sentence,intensity,gpt_intensity,Predicted_Intensity,Original_Predicted_Intensity
0,Islam looks like a cult more than like a relig...,9.0,9.0,8,8.471718
1,We must send Islamics back to their native cou...,7.0,11.0,7,6.946922
2,"Europe is civilised, Muslims should not stay t...",7.0,7.0,7,7.211318
3,"If they love Sharia law so much, why do not th...",8.0,7.0,7,6.576067
4,Islam=evil. Islam is invading us and trying to...,7.0,11.0,7,7.047411
...,...,...,...,...,...
6209,you shouldn't be leaving your kitchen unattended,2.0,5.0,3,3.311737
6210,@user a person that needs to grow up,2.0,2.0,3,2.952168
6211,Women Can't live with them.Can't live without ...,7.0,5.0,4,4.388725
6212,"""Who the you callin out, black man""",3.0,4.0,4,4.122194


In [62]:
merged_df.to_csv('merged_output.csv', index=False)