## CNN

In [1]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs
    return embeddings

# Create embedding matrix
def create_embedding_matrix(embeddings, tokenizer, embedding_dim=100):
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 128
embedding_dim = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Load GloVe embeddings
glove_file = '../glove/glove.6B.100d.txt'  # Update the path to your GloVe file
glove_embeddings = load_glove_embeddings(glove_file)
embedding_matrix = create_embedding_matrix(glove_embeddings, tokenizer, embedding_dim)

# Build the model
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                             output_dim=embedding_dim,
                             weights=[embedding_matrix],
                             trainable=False)(input_layer)

conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.5)(pooling_layer)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(train_padded, np.array(train_intensity), epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, mse = model.evaluate(test_padded, np.array(test_intensity))
print(f'Test MSE: {mse}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 2.2745494842529297


In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity


# Predictions
predictions = model.predict(test_padded)

# Ensure predictions are a NumPy array
predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Mean Squared Error: 2.274549398407914
Pearson Correlation Coefficient: 0.6992633994535645
Cosine Similarity: 0.9683171795946206


## BiLSTM

In [4]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Bidirectional, LSTM
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs
    return embeddings

# Create embedding matrix
def create_embedding_matrix(embeddings, tokenizer, embedding_dim=100):
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 128
embedding_dim = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Load GloVe embeddings
glove_file = '../glove/glove.6B.100d.txt'  # Update the path to your GloVe file
glove_embeddings = load_glove_embeddings(glove_file)
embedding_matrix = create_embedding_matrix(glove_embeddings, tokenizer, embedding_dim)

# Build the model
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                             output_dim=embedding_dim,
                             weights=[embedding_matrix],
                             trainable=False)(input_layer)

# Bi-directional LSTM layer
bilstm_layer = Bidirectional(LSTM(128, return_sequences=False))(embedding_layer)
dropout_layer = Dropout(0.5)(bilstm_layer)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(train_padded, np.array(train_intensity), epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, mse = model.evaluate(test_padded, np.array(test_intensity))
print(f'Test MSE: {mse}')

# Predictions
predictions = model.predict(test_padded)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 2.104975461959839


In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

# Ensure predictions are a NumPy array
predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Mean Squared Error: 2.1049753536589706
Pearson Correlation Coefficient: 0.7213071418065663
Cosine Similarity: 0.9703194180768979


## BiLSTM + CNN

In [2]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs
    return embeddings

# Create embedding matrix
def create_embedding_matrix(embeddings, tokenizer, embedding_dim=100):
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 128
embedding_dim = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Load GloVe embeddings
glove_file = '../glove/glove.6B.100d.txt'  # Update the path to your GloVe file
glove_embeddings = load_glove_embeddings(glove_file)
embedding_matrix = create_embedding_matrix(glove_embeddings, tokenizer, embedding_dim)

# Build the model
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                             output_dim=embedding_dim,
                             weights=[embedding_matrix],
                             trainable=False)(input_layer)

# Convolutional layer
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)

# Bi-directional LSTM layer
bilstm_layer = Bidirectional(LSTM(128, return_sequences=False))(embedding_layer)

# Concatenate CNN and BiLSTM outputs
combined = tf.keras.layers.concatenate([pooling_layer, bilstm_layer])
dropout_layer = Dropout(0.5)(combined)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(train_padded, np.array(train_intensity), epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, mse = model.evaluate(test_padded, np.array(test_intensity))
print(f'Test MSE: {mse}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 2.0995113849639893


In [4]:

# Predictions
predictions = model.predict(test_padded)

import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

# Ensure predictions are a NumPy array
predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')

Mean Squared Error: 2.099511431687822
Pearson Correlation Coefficient: 0.7227299781401003
Cosine Similarity: 0.9705215784110299


## CNN

In [6]:
import os
import random
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Load BERT tokenizer
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# Tokenization and padding
max_length = 128

def tokenize_and_pad(sentences):
    inputs = tokenizer(sentences, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    return inputs['input_ids'], inputs['attention_mask']

train_input_ids, _ = tokenize_and_pad(train_sentences)
test_input_ids, _ = tokenize_and_pad(test_sentences)

# Build the model
vocab_size = tokenizer.vocab_size  # Use the BERT tokenizer vocabulary size
embedding_dim = 768  # Typically 768 for BERT embeddings

# Input layer
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')

# Embedding layer (using random weights, can be initialized with pretrained embeddings if desired)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(input_ids)

# Convolutional layer
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)

# Pooling layer
pooling_layer = GlobalMaxPooling1D()(conv_layer)

# Dropout and output layer
dropout_layer = Dropout(0.5)(pooling_layer)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_ids, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(
    x=train_input_ids,
    y=np.array(train_intensity),
    epochs=10,
    batch_size=32,
    validation_split=0.1
)

# Evaluate the model
loss, mse = model.evaluate(
    x=test_input_ids,
    y=np.array(test_intensity)
)
print(f'Test MSE: {mse}')

# Predictions
predictions = model.predict(test_input_ids)

# Calculate metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MSE: 1.9263602495193481
Mean Squared Error: 1.9263600466285273
Pearson Correlation Coefficient: 0.7517681438471154
Cosine Similarity: 0.9731643350323889


## BiLSTM

In [3]:
import os
import random
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Embedding, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Load BERT tokenizer
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# Tokenization and padding
max_length = 128

def tokenize_and_pad(sentences):
    inputs = tokenizer(sentences, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    return inputs['input_ids'], inputs['attention_mask']

train_input_ids, _ = tokenize_and_pad(train_sentences)
test_input_ids, _ = tokenize_and_pad(test_sentences)

# Build the model
vocab_size = tokenizer.vocab_size  # Use the BERT tokenizer vocabulary size
embedding_dim = 768  # Typically 768 for BERT embeddings

# Input layer
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')

# Embedding layer (using random weights, can be initialized with pretrained embeddings if desired)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(input_ids)

# BiLSTM layer
bilstm_output = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)

# Pooling layer
pooling_layer = GlobalMaxPooling1D()(bilstm_output)

# Dropout and output layer
dropout_layer = Dropout(0.5)(pooling_layer)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_ids, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(
    x=train_input_ids,
    y=np.array(train_intensity),
    epochs=10,
    batch_size=32,
    validation_split=0.1
)

# Evaluate the model
loss, mae = model.evaluate(
    x=test_input_ids,
    y=np.array(test_intensity)
)
print(f'Test MSE: {mse}')

predictions = model.predict(test_input_ids)

# Calculate metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 1.0151128768920898
Mean Squared Error: 1.8676682983354544
Pearson Correlation Coefficient: 0.7659235157145619
Cosine Similarity: 0.9737162920060269


## BiLSTM + CNN (BERT)

In [9]:
import os
import random
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Embedding
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Load BERT tokenizer
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# Tokenization and padding
max_length = 128

def tokenize_and_pad(sentences):
    inputs = tokenizer(sentences, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    return inputs['input_ids'], inputs['attention_mask']

train_input_ids, _ = tokenize_and_pad(train_sentences)
test_input_ids, _ = tokenize_and_pad(test_sentences)

# Build the model
vocab_size = tokenizer.vocab_size  # Use the BERT tokenizer vocabulary size
embedding_dim = 768  # Typically 768 for BERT embeddings

# Input layer
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')

# Embedding layer (using random weights, can be initialized with pretrained embeddings if desired)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(input_ids)

# Convolutional layer
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)

# BiLSTM layer
bilstm_output = Bidirectional(LSTM(128, return_sequences=True))(conv_layer)

# Pooling layer
pooling_layer = GlobalMaxPooling1D()(bilstm_output)

# Dropout and output layer
dropout_layer = Dropout(0.5)(pooling_layer)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_ids, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(
    x=train_input_ids,
    y=np.array(train_intensity),
    epochs=10,
    batch_size=32,
    validation_split=0.1
)

# Evaluate the model
loss, mse = model.evaluate(
    x=test_input_ids,
    y=np.array(test_intensity)
)
print(f'Test MSE: {mse}')

# Predictions
predictions = model.predict(test_input_ids)

# Calculate metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MSE: 1.800592303276062
Mean Squared Error: 1.8005924766931944
Pearson Correlation Coefficient: 0.7698896371740548
Cosine Similarity: 0.9747284936749111


In [5]:
import os
import random
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data
train_data = pd.read_csv("../dataset/i_train.csv")
train_sentences = train_data['sentence'].tolist()
train_intensity = train_data['intensity'].tolist() 

test_data = pd.read_csv("../dataset/i_test.csv")
test_sentences = test_data['sentence'].tolist()
test_intensity = test_data['intensity'].tolist() 

# Load BERT tokenizer and model
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
bert_model = TFBertModel.from_pretrained(BERT_MODEL)

# Tokenization and padding
max_length = 128

def tokenize_and_pad(sentences):
    inputs = tokenizer(sentences, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    return inputs['input_ids'], inputs['attention_mask']

train_input_ids, train_attention_masks = tokenize_and_pad(train_sentences)
test_input_ids, test_attention_masks = tokenize_and_pad(test_sentences)

# Build the model
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_masks = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_masks')

# BERT layer
bert_output = bert_model(input_ids, attention_mask=attention_masks)[1]  # Use the pooled output

# Dropout and output layer
dropout_layer = tf.keras.layers.Dropout(0.5)(bert_output)
output_layer = tf.keras.layers.Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(
    x=[train_input_ids, train_attention_masks],
    y=np.array(train_intensity),
    epochs=10,  # Reduce epochs for quicker training; adjust as needed
    batch_size=16,
    validation_split=0.1
)



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2936161de50>

In [6]:
# Evaluate the model
loss, mse = model.evaluate(
    x=[test_input_ids, test_attention_masks],
    y=np.array(test_intensity)
)
print(f'Test MSE: {mse}')

# Predictions
predictions = model.predict([test_input_ids, test_attention_masks])

# Calculate metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

predictions = np.array(predictions).flatten()
test_intensity = np.array(test_intensity).flatten()

# Calculate MSE
mse = mean_squared_error(test_intensity, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate Pearson correlation coefficient
pearson_corr, _ = pearsonr(test_intensity, predictions)
print(f'Pearson Correlation Coefficient: {pearson_corr}')

# Calculate Cosine Similarity
cosine_sim = cosine_similarity(test_intensity.reshape(1, -1), predictions.reshape(1, -1))[0][0]
print(f'Cosine Similarity: {cosine_sim}')


Test MSE: 4.421870231628418
Mean Squared Error: 4.421870623404657
Pearson Correlation Coefficient: 0.02498798820637999
Cosine Similarity: 0.9372334711621242


In [1]:
import os
import random
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from sklearn.model_selection import train_test_split
import pickle
from tqdm import tqdm
from scipy import stats
from scipy.spatial import distance

BASE_FOLDER = "../dataset/"
INPUT_FILE = "hate_norm_combined.pkl"
BATCH_SIZE = 8
EPOCHS = 10
TEST_SIZE = 0.2
SEED = 42
LSTM_UNITS = 50
DENSE_UNITS = 50
LSTM_DROPOUT = 0.1
DENSE_DROPOUT = 0.2

def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

random_seed(SEED)

# Load ELMo model
elmo = hub.load("https://tfhub.dev/google/elmo/3")

def elmo_embedding(sentences):
    return elmo.signatures['default'](tf.constant(sentences))['elmo']

with open(BASE_FOLDER + INPUT_FILE, 'rb') as f:
    input_data = pickle.load(f)

intensity_value = []
hate_sentences = []

for i in range(len(input_data)):
    intensity_value.append(int(input_data['Original_Intensity'][i]))
    hate_sentences.append(input_data['Sentence'][i])
    intensity_value.append(int(input_data['Normalized_Intensity'][i]))
    hate_sentences.append(input_data['Normalized_Sentence'][i])

c = list(zip(intensity_value, hate_sentences))
random.shuffle(c)
intensity_value, hate_sentences = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(hate_sentences, intensity_value, test_size=TEST_SIZE, random_state=1)

train_embeddings = elmo_embedding(X_tr)
test_embeddings = elmo_embedding(X_te)

input_shape = train_embeddings.shape[1:]

input_layer = tf.keras.layers.Input(shape=input_shape, dtype=tf.float32, name='elmo_embeddings')
bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_UNITS, return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=LSTM_DROPOUT))(input_layer)
att_layer = tf.keras.layers.Attention(use_scale=True)([bi_lstm, bi_lstm])
global_max_pool = tf.keras.layers.GlobalMaxPool1D()(att_layer)
dense_layer = tf.keras.layers.Dense(DENSE_UNITS, activation='relu')(global_max_pool)
dropout_layer = tf.keras.layers.Dropout(DENSE_DROPOUT)(dense_layer)
output_layer = tf.keras.layers.Dense(1, activation='linear')(dropout_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()

y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

model.fit(x=train_embeddings, y=y_tr, epochs=EPOCHS, validation_split=0.1, batch_size=BATCH_SIZE)

results = model.evaluate(x=test_embeddings, y=y_te)
print(results)
predictions = model.predict(test_embeddings).flatten()

mse = np.mean((predictions - y_te) ** 2)
pearson_corr, _ = stats.pearsonr(predictions, y_te)
cosine_sim = 1 - distance.cosine(predictions, y_te)

print(f'Mean Squared Error: {mse}')
print(f'Pearson Correlation Coefficient: {pearson_corr}')
print(f'Cosine Similarity: {cosine_sim}')


ResourceExhaustedError: Graph execution error:

Detected at node 'bilm/CNN/Conv2D_2' defined at (most recent call last):
    File "C:\Users\Administrator\anaconda3\envs\research\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\Administrator\anaconda3\envs\research\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Administrator\AppData\Local\Temp\2\ipykernel_9024\117836428.py", line 33, in <module>
      elmo = hub.load("https://tfhub.dev/google/elmo/3")
    File "C:\Users\Administrator\anaconda3\envs\research\lib\site-packages\tensorflow_hub\module_v2.py", line 126, in load
      obj = tf.compat.v1.saved_model.load_v2(module_path, tags=tags)
Node: 'bilm/CNN/Conv2D_2'
OOM when allocating tensor with shape[4843,64,112,48] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bilm/CNN/Conv2D_2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_pruned_4791]

In [4]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from transformers import DistilBertTokenizer, BertConfig, TFBertModel
from sklearn.model_selection import train_test_split
import pickle
import random
from scipy import stats
from scipy.spatial import distance
import os

BASE_FOLDER = "../dataset/"
INPUT_FILE = "hate_norm_combined.pkl"
BERT_MODEL = "distilbert-base-uncased"
MAX_LENGTH = 128
TEST_SIZE = 0.2
SEED = 42

USE_ATT = True

BERT_DROPOUT = 0.2
LSTM_UNITS = 50
DENSE_UNITS = 50
LSTM_DROPOUT = 0.1
DENSE_DROPOUT = 0.2
EPOCHS = 10  #(Default 10)
BATCH_SIZE = 32

def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

random_seed(SEED)

def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       padding='max_length',
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(
        input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


## Define base bert configs
config = BertConfig(dropout=BERT_DROPOUT,
                    attention_dropout=BERT_DROPOUT,
                    output_attentions=True)
config.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(BERT_MODEL, config=config)
for layer in transformer_model.layers[:3]:  ## We are freezing first 3 layers
    layer.trainable = False

# Defining tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL,
                                                do_lower_case=True,
                                                add_special_tokens=True,
                                                max_length=MAX_LENGTH,
                                                padding='max_length')

input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS,
                         return_sequences=True,
                         dropout=LSTM_DROPOUT,
                         recurrent_dropout=LSTM_DROPOUT,
                         kernel_initializer='normal'))(embedding_layer)
if USE_ATT:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(DENSE_UNITS,
                          activation='relu',
                          kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(
    1,
    activation='sigmoid',  # Using sigmoid instead of linear here.
    kernel_initializer='normal')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()

with open(BASE_FOLDER + INPUT_FILE, 'rb') as f:
    input_data = pickle.load(f)

intensity_value = []
hate_sentences = []

for i in range(len(input_data)):
    intensity_value.append(int(input_data['Original_Intensity'][i]))
    hate_sentences.append(input_data['Sentence'][i])
    intensity_value.append(int(input_data['Normalized_Intensity'][i]))
    hate_sentences.append(input_data['Normalized_Sentence'][i])

c = list(zip(intensity_value, hate_sentences))
random.shuffle(c)
intensity_value, hate_sentences = zip(*c)


X_tr, X_te, y_tr, y_te = train_test_split(hate_sentences,
                                          intensity_value,
                                          test_size=TEST_SIZE,
                                          random_state=1)

y_tr = (y_tr - 1) / 9  # Scale from [1, 10] to [0, 1]
y_te = (y_te - 1) / 9  # Scale from [1, 10] to [0, 1]

train_input_ids, train_input_masks, train_input_segment = tokenize(
    X_tr, tokenizer)
test_input_ids, test_input_masks, test_input_segment = tokenize(
    X_te, tokenizer)
y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

model.fit(x=[train_input_ids, train_input_masks],
          y=y_tr,
          epochs=EPOCHS,
          validation_split=0.1,
          batch_size=BATCH_SIZE)

print("TEST split", TEST_SIZE)
results = model.evaluate(x=[test_input_ids, test_input_masks], y=y_te)
print(results)
result = model.predict(x=[test_input_ids, test_input_masks])

result = np.array(result, dtype=np.float64).flatten()

# Scale back to [1, 10]
scaled_result = result * 9 + 1
y_te = y_te * 9 + 1

print("Pearson correlation:", stats.pearsonr(scaled_result, y_te))
print("Cosine similarity:", 1 - distance.cosine(scaled_result, y_te))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.2.attention.k_lin.weight', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.4.attention.q_lin.weight', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.bias', 'distilbert.transformer.layer.2.output_layer_norm.bias', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.4.sa_layer_norm.bias', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'vocab_transform.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias'

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_3 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_token[0][0]',            
                                thPoolingAndCrossAt               'masked_token[0][0]']           
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

100%|████████████████████████████████████████████████████████████████████████████| 4843/4843 [00:03<00:00, 1446.56it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1211/1211 [00:00<00:00, 1546.50it/s]

Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TEST split 0.2
[0.031141238287091255, 0.004954583011567593, 0.17646880447864532]
Pearson correlation: PearsonRResult(statistic=0.6707108436443342, pvalue=4.498327054140168e-159)
Cosine similarity: 0.9655380382758498


## Unscaled Sigmoid

In [5]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from transformers import DistilBertTokenizer, BertConfig, TFBertModel
from sklearn.model_selection import train_test_split
import pickle
import random
from scipy import stats
from scipy.spatial import distance
import os

BASE_FOLDER = "../dataset/"
INPUT_FILE = "hate_norm_combined.pkl"
BERT_MODEL = "distilbert-base-uncased"
MAX_LENGTH = 128
TEST_SIZE = 0.2
SEED = 42

USE_ATT = True

BERT_DROPOUT = 0.2
LSTM_UNITS = 50
DENSE_UNITS = 50
LSTM_DROPOUT = 0.1
DENSE_DROPOUT = 0.2
EPOCHS = 10  #(Default 10)
BATCH_SIZE = 32

def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

random_seed(SEED)

def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       padding='max_length',
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(
        input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


## Define base bert configs
config = BertConfig(dropout=BERT_DROPOUT,
                    attention_dropout=BERT_DROPOUT,
                    output_attentions=True)
config.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(BERT_MODEL, config=config)
for layer in transformer_model.layers[:3]:  ## We are freezing first 3 layers
    layer.trainable = False

# Defining tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL,
                                                do_lower_case=True,
                                                add_special_tokens=True,
                                                max_length=MAX_LENGTH,
                                                padding='max_length')

input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')l
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS,
                         return_sequences=True,
                         dropout=LSTM_DROPOUT,
                         recurrent_dropout=LSTM_DROPOUT,
                         kernel_initializer='normal'))(embedding_layer)
if USE_ATT:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(DENSE_UNITS,
                          activation='relu',
                          kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='normal')(X)

# Custom scaling layer
def scale_output(x):
    return x * 9 + 1

scaled_output = tf.keras.layers.Lambda(scale_output)(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=scaled_output)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()

with open(BASE_FOLDER + INPUT_FILE, 'rb') as f:
    input_data = pickle.load(f)

intensity_value = []
hate_sentences = []

for i in range(len(input_data)):
    intensity_value.append(int(input_data['Original_Intensity'][i]))
    hate_sentences.append(input_data['Sentence'][i])
    intensity_value.append(int(input_data['Normalized_Intensity'][i]))
    hate_sentences.append(input_data['Normalized_Sentence'][i])

c = list(zip(intensity_value, hate_sentences))
random.shuffle(c)
intensity_value, hate_sentences = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(hate_sentences,
                                          intensity_value,
                                          test_size=TEST_SIZE,
                                          random_state=1)

train_input_ids, train_input_masks, train_input_segment = tokenize(X_tr, tokenizer)
test_input_ids, test_input_masks, test_input_segment = tokenize(X_te, tokenizer)
y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

model.fit(x=[train_input_ids, train_input_masks],
          y=y_tr,
          epochs=EPOCHS,
          validation_split=0.1,
          batch_size=BATCH_SIZE)

print("TEST split", TEST_SIZE)
results = model.evaluate(x=[test_input_ids, test_input_masks], y=y_te)
print(results)
result = model.predict(x=[test_input_ids, test_input_masks])

result = np.array(result, dtype=np.float64).flatten()

print("Pearson correlation:", stats.pearsonr(result, y_te))
print("Cosine similarity:", 1 - distance.cosine(result, y_te))


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.2.attention.k_lin.weight', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.4.attention.q_lin.weight', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.bias', 'distilbert.transformer.layer.2.output_layer_norm.bias', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.4.sa_layer_norm.bias', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'vocab_transform.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias'

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_4 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_token[0][0]',            
                                thPoolingAndCrossAt               'masked_token[0][0]']           
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

100%|████████████████████████████████████████████████████████████████████████████| 4843/4843 [00:03<00:00, 1461.22it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1211/1211 [00:00<00:00, 1482.06it/s]

Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TEST split 0.2
[2.28839111328125, 0.005780346691608429, 1.5127428770065308]
Pearson correlation: PearsonRResult(statistic=0.698103728362522, pvalue=1.2697512063633943e-177)
Cosine similarity: 0.9683089233253874
