In [1]:
!pip install -q transformers
!pip install pydot
!pip install tensorflow==2.11.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.11.0
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint 
from transformers import BertTokenizer, TFBertModel

In [3]:
# Pretrained checkpoint identifier shared by the tokenizer and the model(s);
# it is also the default `checkpoint` argument of create_bert_multiclass_model.
model_checkpoint = 'bert-base-cased'
# Tokenizer loaded from the same checkpoint; used below to encode the text splits.
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# Standalone copy of the BERT model.
# NOTE(review): create_bert_multiclass_model loads its own TFBertModel, so this instance
# appears unused in the visible cells — confirm nothing later depends on it before removing.
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [4]:
# Token length every text is padded/truncated to by the tokenizer calls below;
# the classification model's Input layers must be built with the same length.
max_length = 200

In [5]:
# Load the pre-split train / validation / test sets (paths are relative to the working directory).
train = pd.read_csv('train_data_imbalanced_stratified_no_dupe_17k_5k.csv')
val = pd.read_csv('validation_data_imbalanced_stratified_no_dupe_17k_5k.csv')
test = pd.read_csv('test_data_imbalanced_stratified_no_dupe.csv')

In [6]:
# Extract the 'text' column of each split as a plain Python list for the tokenizer.
train_text, val_text, test_text = (
    split['text'].tolist() for split in (train, val, test)
)

In [7]:
# Tokenize every split into fixed-length TensorFlow tensors: texts longer than
# max_length tokens are truncated, shorter ones are padded up to max_length.
train_encodings = bert_tokenizer(
    train_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
)
valid_encodings = bert_tokenizer(
    val_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
)
test_encodings = bert_tokenizer(
    test_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
)

In [8]:
# Encode the emotion labels as integer class ids.
#
# The label -> id mapping is derived ONCE from the training split and reused for the
# validation and test splits. Building a separate pd.Categorical per split numbers that
# split's own sorted unique labels, so a label that is absent from one split would shift
# the ids there and silently misalign the classes between training and evaluation.
emotion_categories = pd.Categorical(train['emotions']).categories
train_labels = pd.Categorical(train['emotions'], categories=emotion_categories).codes
npval_labels = pd.Categorical(val['emotions'], categories=emotion_categories).codes
nptest_labels = pd.Categorical(test['emotions'], categories=emotion_categories).codes

# pandas assigns code -1 to missing values and to values outside `categories`;
# fail loudly here rather than feeding an invalid class id to the loss function.
for split_name, split_codes in (('train', train_labels),
                                ('validation', npval_labels),
                                ('test', nptest_labels)):
    if (split_codes < 0).any():
        raise ValueError(
            f"{split_name} split contains missing labels or labels not present in the training split"
        )

In [9]:
def create_bert_multiclass_model(checkpoint = model_checkpoint,
                                 num_classes = 7,
                                 hidden_size = 201, 
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 max_length=200):
    """Build and compile a BERT-based multi-class text classifier.

    A fresh TFBertModel is loaded from `checkpoint` and left trainable, so the
    whole encoder is fine-tuned together with the classification head. The head
    takes the vector at sequence position 0 of BERT's first output, then applies
    Dense(relu) -> Dropout -> Dense(softmax).

    Args:
        checkpoint: Name or path passed to `TFBertModel.from_pretrained`.
        num_classes: Number of units in the softmax output layer.
        hidden_size: Number of units in the hidden Dense layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate for the Adam optimizer.
        max_length: Length of the three input sequences. It must equal the
            length the tokenizer pads/truncates to. Defaults to 200, the value
            that was previously hard-coded inside this function.

    Returns:
        A compiled `tf.keras.Model` whose inputs are, in order,
        `[input_ids, token_type_ids, attention_mask]` and whose output is a
        `(batch, num_classes)` softmax distribution. It is compiled with
        sparse categorical cross-entropy (integer labels) and accuracy.
    """
    # Local name deliberately differs from any notebook-level `bert_model` to avoid shadowing it.
    bert_encoder = TFBertModel.from_pretrained(checkpoint)
    bert_encoder.trainable = True

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_encoder(bert_inputs)
    # Keep only sequence position 0 of the first BERT output as the sentence representation.
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[classification])
    # The output layer already applies softmax, hence from_logits=False.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics=['accuracy'])
    return classification_model

### Part 1: Experiment with different hidden sizes

In [10]:
# Build one classifier per candidate hidden-layer width. Models are created in the
# order mod_1 .. mod_5 (note that mod_4 is the 300-unit model and mod_5 the 250-unit one).
mod_1, mod_2, mod_3, mod_4, mod_5 = (
    create_bert_multiclass_model(checkpoint=model_checkpoint, num_classes=7, hidden_size=units)
    for units in (50, 100, 150, 300, 250)
)
# mod_6 relies on the function's default hidden_size.
mod_6 = create_bert_multiclass_model(checkpoint=model_checkpoint, num_classes=7)


Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initiali

In [None]:
# Fine-tune mod_1 for two epochs, evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_1_history = mod_1.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2


In [None]:
# Fine-tune mod_2 for two epochs, evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_2_history = mod_2.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2


In [None]:
# Fine-tune mod_3 for two epochs, evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_3_history = mod_3.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2

In [11]:
# Fine-tune mod_4 for two epochs, evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_4_history = mod_4.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2


In [12]:
# Fine-tune mod_5 for two epochs, evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_5_history = mod_5.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2


In [13]:
# Fine-tune mod_6 (built with the default hidden size) for two epochs,
# evaluating on the validation split after each epoch.
# Inputs are passed in the order the model's Input layers were declared.
mod_6_history = mod_6.fit(
    x=[train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y=train_labels,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        npval_labels,
    ),
    batch_size=8,
    epochs=2,
)

Epoch 1/2




Epoch 2/2


There is not much difference in loss between the models tested; however, model 6 has the highest validation accuracy and the lowest loss, so we use it going forward.