# Sentiment Analysis with BERT

## Preparing TensorFlow API

In [1]:
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

Import the Bert text model.

In [2]:
preprocessor = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [3]:
preprocessor_model = hub.KerasLayer(preprocessor)
encoder_model = hub.KerasLayer(encoder)

Read the data file.

In [13]:
with open("frame_v3.csv", 'r') as input_file:
    data = pd.read_csv(input_file)

In [14]:
data

Unnamed: 0,Story,Higher-level,Main,Sub,Comments,Prompt
0,The sales office was bustling with activity as...,Openness to change,Hedonism,,,
1,The sales office was buzzing with activity as ...,,,,,
2,Alice had been working in the sales office for...,Conservation,Conformity-Interpersonal,,,
3,The office of salesperson John Smith was bustl...,,,,,
4,The sales office was bustling with activity as...,Self-Enhancement,Power Dominance,,,
5,The sun shone brightly through the large windo...,Conservation,Humility,,,it is important to her that the weak and vulne...
6,John had been working in the sales office for ...,Self-Transcendence,Universalism-Nature,,,it is important to her that people do what she...
7,"It was a typical day at the sales office, but ...",,,,,It is important to her never to think she dese...
8,The office was abuzz with activity. Salespeopl...,,,,,it is important to her to care for nature
9,The sales office was bustling with activity as...,,Face,,,it is important to her that no one should ever...


## Label Representation

Extract higher-level values.

In [16]:
values = list(data.get("Higher-level"))

In [28]:
print(values)

['Openness to change', nan, 'Conservation', nan, 'Self-Enhancement', 'Conservation', 'Self-Transcendence', nan, nan, nan, 'Openness to change', 'Self-Transcendence', 'Self-Enhancement', 'Conservation', 'Self-Transcendence', 'Conservation', 'Openness to change', 'Self-Enhancement', 'Conservation', 'Self-Transcendence', 'Self-Enhancement', 'Conservation', 'Openness to change', nan, 'Self-Transcendence', 'Conservation', 'Self-Transcendence', 'Openness to change', 'Self-Enhancement', 'Openness to change', 'Conservation', 'Self-Enhancement', 'Conservation', 'Self-Transcendence', 'Conservation', 'Openness to change', 'Self-Transcendence', 'Conservation', 'Openness to change', 'Conservation', 'Self-Enhancement', 'Conservation', 'Openness to change', 'Self-Enhancement', 'Self-Transcendence', 'Openness to change', 'Self-Transcendence', 'Self-Enhancement', nan, 'Conservation', 'Conservation', 'Self-Transcendence', 'Conservation', 'Conservation', 'Self-Transcendence', 'Openness to change', 'Self-

Value-int mapping.

In [34]:
value_vector_map = {
    "nan": -1,
    "Self-Transcendence": 0,
    "Self-Enhancement": 1,
    "Openness to change": 2,
    "Conservation": 3
}

Map values to unique integers.

In [41]:
labels = [value_vector_map[str(vector_value)] for vector_value in values if str(vector_value) in value_vector_map]

In [42]:
print(labels)

[2, -1, 3, -1, 1, 3, 0, -1, -1, -1, 2, 0, 1, 3, 0, 3, 2, 1, 3, 0, 1, 3, 2, -1, 0, 3, 0, 2, 1, 2, 3, 1, 3, 0, 3, 2, 0, 3, 2, 3, 1, 3, 2, 1, 0, 2, 0, 1, -1, 3, 3, 0, 3, 3, 0, 2, 0]


## Data Split

Extract stories.

In [18]:
stories = list(data.get("values"))

Create train-test data splits.

In [27]:
from sklearn.model_selection import train_test_split

In [29]:
x_train, x_test = train_test_split(stories, test_size=0.2, random_state=42)

## Testing Embeddings

Testing the embeddings, created by Bert.

In [37]:
testing_preprocessor = preprocessor_model(stories)
testing_encoder = encoder_model(testing_preprocessor)

56 stories overall, each encoded as a vector of size 768.

In [38]:
testing_encoder["pooled_output"]

<tf.Tensor: shape=(56, 768), dtype=float32, numpy=
array([[-0.50290674, -0.6103573 , -0.9853746 , ..., -0.9730395 ,
        -0.62223715,  0.4646754 ],
       [-0.44730604, -0.5027853 , -0.94899756, ..., -0.85607076,
        -0.6244724 ,  0.44547993],
       [-0.6147948 , -0.68217784, -0.99331486, ..., -0.9635989 ,
        -0.762658  ,  0.80841565],
       ...,
       [-0.6546136 , -0.5529941 , -0.958877  , ..., -0.85563534,
        -0.66113573,  0.745502  ],
       [-0.684393  , -0.58833706, -0.9438508 , ..., -0.8941437 ,
        -0.6452566 ,  0.74640054],
       [-0.6528005 , -0.6333736 , -0.98649347, ..., -0.95632315,
        -0.65573937,  0.4990547 ]], dtype=float32)>

## Neural Networks

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

Initialize the ML model.

In [34]:
input_text = Input(shape=(), dtype=tf.string)
preprocessed_text = preprocessor_model(input_text)
text_encoding = encoder_model(preprocessed_text)
# The single hidden layer has only 1 neuron.
# I think this is not a bad practice when it comes to text classification.
output = Dense(1, activation="sigmoid")(text_encoding["pooled_output"])
model = Model(inputs=input_text, outputs=output)

Adam is a generally good optimization algorithm. \
Categorical Cross Entropy is used as we are dealing with multi-labelled data.

In [35]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), 
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=["accuracy"])