# Creat dataframe for ploting

In [3]:
import yaml
import json
import pandas as pd
import time

In [None]:
# Collecting data from rasa nlu file
with open('nlu.yml') as fp:
  data = yaml.load(fp)

common_examples =[]
intents = []

# mention the intent name in intent_list
intent_list=["angry","afraid","anxious"]

for category in data["nlu"]:
    intent = category["intent"]
    if intent in intent_list:
      intents.append(intent)
      sentences  = category["examples"].split("\n-")
      for index,example in enumerate(sentences):
        if index ==0:
          example = example[1:]
        if index == len(sentences)-1:
          example = example [:-1]
        common_examples.append({"text": example,
                    "text_emotion": intent
                },)

In [None]:
# !pip freeze

In [6]:
# creating data frame use nlu data
df_analysis = pd.DataFrame(common_examples)

In [7]:
df_analysis

Unnamed: 0,text,text_emotion
0,"I was so scared by the movie the Grudge, even...",afraid
1,Yesterday I was doing shores outside and was ...,afraid
2,yesterday at work some guy ran his work truck...,afraid
3,"With Turkish currency Lira plummeting now, I ...",afraid
4,While working at a restaurant I was robbed,afraid
...,...,...
1269,"Anxiety and stress can stay with you all day,...",anxious
1270,It's normal to have some anxiety about giving...,anxious
1271,First-time moms might feel even more anxious.,anxious
1272,As a sufferer of generalized anxiety disorder...,anxious


## Transfer Learning – BERT

In [8]:
import tensorflow_hub as hub

In [9]:
# load BERT 
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"

In [10]:
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [11]:
# install the dependencies
!pip install bert-tensorflow &> /dev/null

In [12]:
# install the dependencies
!pip install sentencepiece &> /dev/null

In [13]:
# import tokenization
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from bert.tokenization import FullTokenizer

In [14]:
# The 4 next lines allows to prevent an error due to Bert version
import sys
from absl import flags
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

['preserve_unused_tokens=False']

In [15]:
# process our data
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [16]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [17]:
train_input = bert_encode(df_analysis.text.values, tokenizer, max_len=100)
train_labels = df_analysis.text_emotion.values

In [18]:
def build_model(bert_layer, max_len=512):
    input_word_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    
    # define the Embedding layer
    flatten = layers.Flatten(name='flatten') 

    #use the previously created embedding layer
    output_flatten = flatten(clf_output)

    # the “out” layer, where instead of having Dense(2, …) we have Dense(len(np.unique(train_labels)) …).
    # In fact this number indicates the number of outputs of the model. We used 2 for a Binary Classification.
    out = layers.Dense(len(np.unique(train_labels)), activation='sigmoid')(output_flatten)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = build_model(bert_layer, max_len=100)

  super(Adam, self).__init__(name, **kwargs)


## Training BERT

> Indented block



In [19]:
# get_dummies() function convert categorical variables to dummy variables, which are numeric variables that are used to represent categorical data.
label_dummy = pd.get_dummies(train_labels)

In [20]:
label_dummy.head(2)

Unnamed: 0,afraid,angry,anxious
0,1,0,0
1,1,0,0


In [21]:
start = time.time()

# train our model 
train_history = model.fit(
    train_input, label_dummy,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

end = time.time()
t_time = end - start
print("\ntotal time", t_time)

start 1660624982.6263072
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
end 1660625267.9931605

total time 285.3668532371521


## TSNE – Visualization of Embedding of sentences

In [22]:
# recreate a model from the old one by including all its layers up to “flatten”, the Embedding layer
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('flatten').output)

In [23]:
# send our data into the newly created model
sentence_embedded = intermediate_layer_model.predict(train_input)

In [24]:
labels_emotion = df_analysis.text_emotion

In [25]:
sentence_embedded.shape

(1274, 768)

In [26]:
labels_emotion.shape

(1274,)

# TSNE

In [27]:
import numpy as np
from sklearn.manifold import TSNE

In [28]:
# create a dataframe containing the 2D Embedding of the sentences and their emotions
X = list(sentence_embedded)

X_embedded = TSNE(n_components=2).fit_transform(X)



In [29]:
df_embeddings = pd.DataFrame(X_embedded)
df_embeddings = df_embeddings.rename(columns={0:'x',1:'y'})
df_embeddings = df_embeddings.assign(label=df_analysis.text_emotion.values)

In [30]:
# add the unmodified base sentences, to make the visualization easier
df_embeddings = df_embeddings.assign(text=df_analysis.text.values)

In [31]:
print(df_embeddings)

              x          y    label  \
0    -35.056683  15.546683   afraid   
1    -16.554531  13.487644   afraid   
2    -12.443484  31.655052   afraid   
3    -21.840658  -3.141288   afraid   
4    -18.628744  30.978590   afraid   
...         ...        ...      ...   
1269 -40.450806  -7.815174  anxious   
1270 -38.135921  -7.792165  anxious   
1271 -39.131641  -9.100816  anxious   
1272 -38.031898  -6.314645  anxious   
1273 -30.692741 -13.840830  anxious   

                                                   text  
0      I was so scared by the movie the Grudge, even...  
1      Yesterday I was doing shores outside and was ...  
2      yesterday at work some guy ran his work truck...  
3      With Turkish currency Lira plummeting now, I ...  
4            While working at a restaurant I was robbed  
...                                                 ...  
1269   Anxiety and stress can stay with you all day,...  
1270   It's normal to have some anxiety about giving...  
1271     

# Display Embedding

In [32]:
import plotly.express as px

In [33]:
import plotly.io as pio
pio.renderers.default = 'colab'

In [34]:
fig = px.scatter(
    df_embeddings, x='x', y='y',
    color='label', labels={'color': 'label'},
    hover_data=['text'], title = 'GoEmotions Embedding Visualization')

In [35]:
fig.show()

In [36]:
  # add for example the size of each sentence in our dataframe
  df_embeddings['length_text'] = df_embeddings[['text']].applymap(lambda x : len(x))

In [37]:
# We can integrate this dimension in our display. So the longer the sentence will be, the bigger the point that represents it will be
fig = px.scatter(
    df_embeddings, x='x', y='y',
    color='label', labels={'color': 'label'},
    size = 'length_text', size_max = 10, template = 'simple_white',
    hover_data=['text'], title = 'GoEmotions Embedding Visualization')
fig.show()