In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaModel
from sklearn.model_selection import train_test_split
import pickle
import random
import sys
import math
from scipy import stats
from scipy.spatial import distance
import random
import os
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Limit GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


In [3]:
base_folder = "data/"
input_file = "hate_int_prof_SVO.tsv"

output_folder = "output_weights/"
output_file = "output_weights"

roberta_model = "roberta-base"
max_length = 256
TEST_SIZE = 0.2
seed = 42

use_attention = True

roberta_dropout = 0.2
lstm_units = 256
dense_units = 50
lstm_dropout = 0.1
dense_dropout = 0.2
epochs = 30 #(Default 10)
batch_size = 32

def random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

random_seed(seed)

In [4]:
dataframe = pd.read_table(base_folder + input_file)
dataframe.head()

Unnamed: 0,Sentence,Intensity,Profanity,Subject,Verb,Object
0,Islam looks like a cult more than like a relig...,9.0,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,We must send Islamics back to their native cou...,7.0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"Europe is civilised, Muslims should not stay t...",7.0,0,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"If they love Sharia law so much, why do not th...",8.0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Islam=evil. Islam is invading us and trying to...,7.0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."


In [5]:
## Define base Roberta configs
config = RobertaConfig(dropout=roberta_dropout, attention_dropout=roberta_dropout, output_attentions=True)
config.output_hidden_states = False

transformer_model = TFRobertaModel.from_pretrained(roberta_model, config=config, ignore_mismatched_sizes=True)
for layer in transformer_model.layers[:3]:  ## Freeze first 3 layers
    layer.trainable = False

# Defining tokonizer
tokenizer = RobertaTokenizer.from_pretrained(roberta_model, do_lower_case=True, add_special_tokens=True, max_length=max_length, pad_to_max_length=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.embeddings.position_embeddings.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.bias', 'roberta.embeddings.token_type_embeddings.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You 

In [6]:
input_ids_in = tf.keras.layers.Input(shape=(max_length, ), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(max_length, ), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]

X = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, return_sequences=True, dropout=lstm_dropout, 
                             recurrent_dropout=lstm_dropout, kernel_initializer='normal')
    )(embedding_layer)

if use_attention:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
    
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(dense_units, activation='relu',  kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(dense_dropout)(X)
X = tf.keras.layers.Dense(1, activation='linear', kernel_initializer='normal')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 256)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 256)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124644864  ['input_token[0][0]',            
 el)                            thPoolingAndCrossAt               'masked_token[0][0]']           
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [7]:
def tokenize(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings, tokenizer, sentence_length):
    input_ids, input_masks, input_segments = [], [], []
    
    for sentence, subj_emb, verb_emb, obj_emb, svo_emb in tqdm(zip(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings)):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=sentence_length,
                                       pad_to_max_length=True, return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

        input_ids[-1].extend(svo_emb.tolist())
        input_masks[-1].extend([1] * len(svo_emb))
        input_segments[-1].extend([1] * len(svo_emb))
    
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')

input_data = pd.read_table(base_folder + input_file)

sentences = input_data['Sentence'].tolist()
intensity_value = input_data['Intensity'].astype(int).tolist()

SVO_length = 128
subject_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Subject"].tolist()], maxlen=SVO_length, padding='post'))
verb_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Verb"].tolist()], maxlen=SVO_length, padding='post'))
object_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Object"].tolist()], maxlen=SVO_length, padding='post'))

# adding all the embeddings together to truncate them when putting them in a model (instead of 128 * 3, it will be 128)
svo_embeddings = subject_embeddings + verb_embeddings + object_embeddings

c = list(zip(intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings))
random.shuffle(c)
intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(sentences, intensity_value, test_size=TEST_SIZE, random_state=1)

sentence_length = 128
train_input_ids, train_input_masks, train_input_segment = tokenize(X_tr, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings, tokenizer, sentence_length)
test_input_ids, test_input_masks, test_input_segment = tokenize(X_te, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings, tokenizer, sentence_length)

y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
4843it [00:01, 3788.60it/s]
1211it [00:00, 4727.81it/s]


In [8]:
# print(X_tr) # Training sentences
print("Train input ids: \n", len(train_input_ids[100]), 
      "\n\nAttention masks: \n", len(train_input_masks[0]),  
      "\n\nToken type ids: \n", len(train_input_segment[0])) # input_ids, attention_masks, token_type_ids

Train input ids: 
 256 

Attention masks: 
 256 

Token type ids: 
 256


### LSTM Units = 128
![image.png](attachment:20b76c0d-9837-4a37-a190-0936d0ccc7d8.png)

### LSTM Units = 256
![image.png](attachment:ee15312f-c04d-432d-b485-c249d6ed4bcd.png)


In [9]:
model.fit(x=[train_input_ids, train_input_masks], y=y_tr, epochs=epochs, validation_split=0.1, batch_size=batch_size)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1d429a4ce80>

In [10]:
results = model.evaluate(x=[test_input_ids, test_input_masks], y=y_te)
print(results)

result = model.predict(x=[test_input_ids, test_input_masks])
result = np.array(result, dtype=np.float64)
result = result.flatten()

[4.2812042236328125, 0.008257638663053513, 2.0691070556640625]


In [11]:
_, _, rmse = results
print("RMSE", rmse)
print("Pearson", stats.pearsonr(result, y_te))
print("Cosine", 1 - distance.cosine(result, y_te))

RMSE 2.0691070556640625
Pearson PearsonRResult(statistic=0.35425755286069494, pvalue=4.009937125442718e-37)
Cosine 0.9358549503338093
