<a href="https://colab.research.google.com/github/srilamaiti/spring_2023_w266_final_project_heesuk_iris_srila/blob/main/iris/W266_essay_evaluation_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing new libraries**

In [None]:
# Third-party packages installed into the notebook runtime before the imports below.
!pip install transformers
# NOTE(review): pinned to an old release — presumably needed by the BERTweet tokenizer; confirm.
!pip install emoji==0.6.0
!pip install scikit-multilearn
!pip install iterative-stratification
!pip install tensorflow==2.11.0
!pip install yellowbrick
# NOTE(review): this pins a TensorFlow 1.x GPU build right after tensorflow==2.11.0 was
# installed above; the two pins look contradictory — confirm which version is intended.
!pip install tensorflow_gpu==1.15.5

# **Importing libraries**

In [None]:
import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
from transformers import BertTokenizer, TFBertModel
from transformers import TFAutoModel, AutoTokenizer
hf_logging.set_verbosity_error()
'''
import nltk
from nltk.tokenize import sent_tokenize
import spacy      
from spacy import displacy
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from wordcloud import ImageColorGenerator
nltk.download('punkt')
'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from transformers import RobertaTokenizer, TFRobertaModel
ROBERTA_MODEL_CHKPT = "roberta-base"
BERTWEET_MODEL_CHKPT = "vinai/bertweet-base"
BERT_MODEL_CHKPT = 'bert-base-cased'

# Other required libraries
import math
import os
import pandas as pd
import numpy as np
import re
import copy
import sys
import gc
import pprint
import statistics

# data visualization
from matplotlib import cm
import matplotlib.pyplot as plt
from IPython.display import Image

# others
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import set_link_color_palette
from scipy.cluster.hierarchy import linkage
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn import preprocessing

# distances
from scipy.spatial.distance import pdist, squareform

import warnings
warnings.filterwarnings("ignore")

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.layer_utils import count_params
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.losses import mae
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.losses import binary_crossentropy
from keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.regularizers import l1
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from keras.models import load_model
from tensorflow.keras.optimizers import Adam, SGD

import torch
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
print(tf.__version__)

# **General functions**

## **Rounding Off to Custom Decimal Places**

In [None]:
def roundPartial(value, resolution):
    """Round ``value`` to the nearest multiple of ``resolution``.

    The value is converted to a number of ``resolution``-sized steps, that
    count is rounded with the built-in ``round`` (so ties follow its rules),
    and the result is scaled back.
    """
    step_count = round(value / resolution)
    return step_count * resolution

## **Set parameters**

In [None]:
def set_config_param(seed = 99):
    """Seed the random generators and apply the notebook-wide settings.

    Args:
        seed: Value used to seed NumPy and TensorFlow; its string form is also
            written to the ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Reset Keras' global state so a new model does not build on a previous one.
    tf.keras.backend.clear_session()
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so setting it here may affect child processes only — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/Colab Notebooks"
    # Show every column when a DataFrame is displayed. This option used to be
    # set twice in a row; one call is enough.
    pd.set_option('display.max_columns', None)


set_config_param(20230214)

## **Plot loss and accuracy**

In [None]:
def plot_loss_accuracy(history, col_list):
    """Plot train/validation loss and accuracy curves for each output.

    The figure has two rows of axes: losses on top, accuracies below, with one
    column of axes per entry of ``col_list``.

    Args:
        history: Object indexable by string key that provides the series
            ``<col>_loss``, ``val_<col>_loss``, ``<col>_accuracy`` and
            ``val_<col>_accuracy`` for every ``col`` in ``col_list``.
        col_list: Names of the outputs to plot.
    """
    col_list = list(col_list)
    # The grid used to be fixed at 2 x 6, so a seventh output raised an
    # IndexError. Six columns remain the minimum (unchanged layout for up to
    # six outputs); the grid and figure width grow beyond that.
    n_cols = max(6, len(col_list))
    fig, ax = plt.subplots(2, n_cols, figsize=(16 * n_cols / 6, 6), sharex='col', sharey='row')
    fig.tight_layout(pad=5.0)

    # (history key suffix, title prefix) for the top and bottom row.
    curves = (('loss', 'Loss: '), ('accuracy', 'Accuracy: '))
    for idx, col in enumerate(col_list):
        for row, (suffix, title_prefix) in enumerate(curves):
            axis = ax[row, idx]
            axis.plot(history[col + '_' + suffix], lw=2, color='darkgoldenrod')
            axis.plot(history['val_' + col + '_' + suffix], lw=2, color='indianred')
            axis.legend(['Train', 'Validation'], fontsize=5)
            axis.set_xlabel('Epochs', size=10)
            axis.set_title(title_prefix + col)

## **Plot Loss and other KPI specified**

In [None]:
def custom_plot(df, model_name, kpi_name, kpi_string):
    """Draw the loss curves and one KPI's curves side by side, then show them.

    Args:
        df: Object indexable by string key providing ``loss``, ``val_loss``,
            ``<kpi_name>`` and ``val_<kpi_name>``, one value per epoch.
        model_name: Prefix used in the legend entries.
        kpi_name: Key of the KPI series; also used as the y-axis label.
        kpi_string: Human-readable KPI name used in the legend entries.
    """
    epoch_numbers = np.arange(len(df['loss'])) + 1
    fig = plt.figure(figsize=(12, 4))

    # One tuple per panel:
    # (series key, y-axis label, train separator, validation separator, legend tail)
    panels = (
        ('loss', 'Loss', ' : Train ', ' :  Validation ', 'loss'),
        (kpi_name, kpi_name, ' : Train ', ' : Validation ', kpi_string),
    )
    for position, (key, y_label, train_sep, val_sep, tail) in enumerate(panels, start=1):
        panel = fig.add_subplot(1, 2, position)
        panel.plot(epoch_numbers, df[key], '-o', label = model_name + train_sep + tail)
        panel.plot(epoch_numbers, df['val_' + key], '--<', label = model_name + val_sep + tail)
        panel.legend(fontsize = 15)
        panel.set_xlabel('Epoch', size = 15)
        panel.set_ylabel(y_label, size = 15)

    plt.show()

## **Text Encode**

In [None]:
def text_encode(texts, tokenizer, max_len):
    """Tokenize every text to a fixed length and stack the results.

    Args:
        texts: Iterable of strings.
        tokenizer: Callable that accepts a text plus the keyword arguments
            ``max_length``, ``truncation``, ``padding`` and
            ``add_special_tokens`` and returns a mapping with ``input_ids``
            and ``attention_mask`` entries.
        max_len: Length every sequence is truncated / padded to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays, one row per text.
    """
    # Token type ids are intentionally not collected.
    encodings = [
        tokenizer(text,
                  max_length = max_len,
                  truncation = True,
                  padding = 'max_length',
                  add_special_tokens = True)
        for text in texts
    ]
    ids = np.array([encoding['input_ids'] for encoding in encodings])
    masks = np.array([encoding['attention_mask'] for encoding in encodings])
    return ids, masks

## **Custom metric**

In [None]:
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    Computes one RMSE per target column over the samples of the batch and
    averages those RMSEs.

    Args:
        y_true: Tensor of true scores; in this notebook it is 2-D with one row
            per essay and one column per score.
        y_pred: Tensor of predicted scores with the same shape.

    Returns:
        Tensor holding the averaged RMSE (the last axis is kept with size 1).
    """
    # Average the squared error over the sample axis (axis 0) so that every
    # target column gets its own MSE. Reducing over axis 1 on these 2-D
    # tensors averaged across the targets of each essay instead, which is a
    # row-wise RMSE rather than the column-wise metric.
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis = 0)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis = -1, keepdims = True)

## **Build Base Model**

In [None]:
def build_regression_model(loss = 'MCRMSE',
                           model_name = 'Roberta', 
                           dense_dim = 6, 
                           MAX_LEN = 512,
                           learning_rate = 1e-5,
                           dropout = .1,
                           number_of_hidden_layers = 1,
                           hidden_layer_node_count = 64,
                           retrain_layer_count = 0):
    """Build a regression head on top of a pretrained transformer encoder.

    Args:
        loss: Only 'MCRMSE' is recognised: the model is then compiled with
            Adam and the MCRMSE function as loss and metric. Any other value
            returns the model uncompiled.
        model_name: 'Roberta', 'Bertweet' or 'Bert'; selects the encoder.
        dense_dim: Number of units of the output layer.
        MAX_LEN: Length of the ``input_ids`` / ``attention_masks`` inputs.
        learning_rate: Learning rate of the Adam optimizer.
        dropout: Rate of the dropout layer that follows each hidden layer.
        number_of_hidden_layers: Number of Dense(relu) + Dropout pairs; with 0
            the output layer is connected to the CLS vector directly.
        hidden_layer_node_count: Units of every hidden layer.
        retrain_layer_count: 0 freezes the encoder, 12 makes the whole encoder
            trainable, any other value goes through the partial-unfreeze
            branch below.

    Returns:
        The assembled ``tf.keras.Model``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Encoder class and checkpoint per supported model. The tokenizers that
    # used to be loaded here were never used, so they are no longer fetched.
    encoder_sources = {
        'Roberta'  : (TFRobertaModel, ROBERTA_MODEL_CHKPT),
        'Bertweet' : (TFRobertaModel, BERTWEET_MODEL_CHKPT),
        'Bert'     : (TFBertModel, BERT_MODEL_CHKPT),
    }
    if model_name not in encoder_sources:
        # Previously an unknown name only failed later with an unbound `model`.
        raise ValueError(f"Unsupported model_name {model_name!r}; "
                         f"expected one of {sorted(encoder_sources)}")
    encoder_class, checkpoint = encoder_sources[model_name]

    # Define inputs
    input_ids = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'input_ids')
    attention_masks = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'attention_masks')

    model = encoder_class.from_pretrained(checkpoint)

    # Adjust the trainable weights based on retrain_layer_count:
    #   0       -> the pretrained encoder is frozen;
    #   12      -> the whole encoder is trainable (fine-tuned starting from
    #              the pretrained weights);
    #   1 .. 11 -> the partial-unfreeze branch.
    if retrain_layer_count == 0:
        model.trainable = False

    elif retrain_layer_count == 12:
        model.trainable = True

    else:
        # Name fragments ('_11', '_10', ...) of the last `retrain_layer_count`
        # encoder layers, counted down from layer 11.
        retrain_layer_list = ['_' + str(11 - retrain_layer_number)
                              for retrain_layer_number in range(retrain_layer_count)]
        model.trainable = False

        print('Retrain layers: \n', retrain_layer_list)
        print(f"Number of trainable parameters : {count_params(model.trainable_weights)}")
        print(f"Number of non-trainable parameters : {count_params(model.non_trainable_variables)}")

        # NOTE(review): this flips the private `_trainable` flag of individual
        # variables while `model.trainable` is False. Confirm with the counts
        # printed below that Keras actually treats these weights as trainable.
        # NOTE(review): every weight whose name lacks 'layer_' is also marked
        # trainable, not only the selected encoder layers — confirm intent.
        for weight in model.weights:
            weight._trainable = False
            if 'layer_' in weight.name and weight.name.split(".")[1].split("/")[0] in retrain_layer_list:
                weight._trainable = True
            elif 'layer_' not in weight.name :
                weight._trainable = True
        model.compile()

        for weight_details in model.weights:
            print(weight_details.name, weight_details.trainable)

    print(f"Number of trainable parameters : {count_params(model.trainable_weights)}")
    print(f"Number of non-trainable parameters : {count_params(model.non_trainable_variables)}")

    # Insert pretrained model layer
    pretrained_transformer = model([input_ids, attention_masks])

    # Take the vector at sequence position 0 (the CLS token) of the first output
    cls_token = pretrained_transformer[0][:, 0, :]

    # Stack the hidden Dense + Dropout pairs. `head` starts at the CLS vector,
    # so zero hidden layers no longer raises an IndexError.
    head = cls_token
    for layer in range(number_of_hidden_layers):
        head = tf.keras.layers.Dense(units      = hidden_layer_node_count
                                   , activation = 'relu'
                                   , name       = 'hidden_layer_' + str(layer + 1)
                                    )(head)
        head = tf.keras.layers.Dropout(dropout,
                                       name = 'dropout_layer_' + str(layer + 1)
                                      )(head)

    # Add the output layer; its width now follows `dense_dim` (default 6)
    # instead of a hard-coded 6.
    output = tf.keras.layers.Dense(dense_dim)(head)

    # Build the final model
    regression_model = tf.keras.Model(inputs = [input_ids, attention_masks], outputs = output)

    # Model compile
    if loss == 'MCRMSE':
        regression_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
                                 loss      = MCRMSE,
                                 metrics   = MCRMSE
                                )

    # summary() prints the table itself; wrapping it in print() added a stray "None".
    regression_model.summary()
    # The return value of plot_model is not used here.
    keras.utils.plot_model(regression_model, 
                           show_shapes = False, 
                           show_dtype = False, 
                           show_layer_names = True, 
                           dpi = 90)
    return regression_model

In [None]:
def model_fit(model, 
              df_train, 
              train_indices,
              val_indices,
              model_name = 'Roberta', 
              MAX_LEN = 512,
              epochs = 5,
              batch_size = 4,
              validation_split = .2):
    """Fit ``model`` on one train/validation split of ``df_train``.

    Args:
        model: Compiled Keras model taking ``[input_ids, attention_masks]``.
        df_train: DataFrame with a ``full_text`` column and the label columns
            named by the module-level ``label_cols``.
        train_indices: Positional row indices used for training.
        val_indices: Positional row indices used for validation.
        model_name: 'Roberta', 'Bertweet' or 'Bert'; selects the tokenizer.
        MAX_LEN: Token length every text is truncated / padded to.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        validation_split: Not used; validation data comes from ``val_indices``.
            Kept so existing calls keep working.

    Returns:
        DataFrame built from the Keras training history.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Building the tokenizer for the given model
    if model_name == 'Roberta':
        tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_MODEL_CHKPT)
    elif model_name == 'Bertweet':
        tokenizer = AutoTokenizer.from_pretrained(BERTWEET_MODEL_CHKPT)
    elif model_name == 'Bert':
        tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_CHKPT)
    else:
        # Previously an unknown name only failed later with an unbound `tokenizer`.
        raise ValueError(f"Unsupported model_name {model_name!r}; "
                         "expected 'Roberta', 'Bertweet' or 'Bert'")

    # Select each split once and reuse it for both the texts and the labels.
    train_rows = df_train.iloc[list(train_indices)]
    val_rows = df_train.iloc[list(val_indices)]

    train_encoded_input_ids, train_encoded_attention_masks = text_encode(train_rows['full_text'], tokenizer, MAX_LEN)
    val_encoded_input_ids, val_encoded_attention_masks = text_encode(val_rows['full_text'], tokenizer, MAX_LEN)

    y_train = np.array(train_rows[label_cols], dtype = "float32")
    y_val = np.array(val_rows[label_cols], dtype = "float32")

    hist = model.fit([train_encoded_input_ids, train_encoded_attention_masks],
                     y_train,
                     validation_data = ([val_encoded_input_ids, val_encoded_attention_masks], 
                                        y_val
                                       ),
                     batch_size = batch_size,        
                     epochs = epochs
                    )

    return pd.DataFrame(hist.history)

In [None]:
def build_base_model(model_layer, learning_rate, dense_dim = 6):
    """Put a linear output layer on a transformer layer and compile the model.

    The input length (``MAX_LEN``) and the loss / metric (``mse_loss``,
    ``mse_metrics``) are read from module scope.

    Args:
        model_layer: Callable transformer layer applied to
            ``[input_ids, attention_masks]``.
        learning_rate: Learning rate of the Adam optimizer.
        dense_dim: Number of units of the output layer.

    Returns:
        The compiled Keras model.
    """
    # Two integer inputs of length MAX_LEN
    token_input = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'input_ids')
    mask_input = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'attention_masks')

    # Run the transformer layer on both inputs
    encoder_outputs = model_layer([token_input, mask_input])

    # Feed element 1 of the transformer output into the output layer.
    # NOTE(review): presumably this is the pooled output rather than the last
    # hidden state — confirm against the model's output order.
    predictions = tf.keras.layers.Dense(dense_dim)(encoder_outputs[1])

    model = tf.keras.models.Model(inputs = [token_input, mask_input], outputs = predictions)
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    model.compile(optimizer, loss = mse_loss, metrics = mse_metrics)

    return model

## **Build a model with custom loss**

In [None]:
def build_base_model_with_custom_loss(model_layer, learning_rate, dense_dim = 6):
    """Put a linear output layer on a transformer layer, compiled with MCRMSE.

    The input length ``MAX_LEN`` is read from module scope; ``MCRMSE`` is used
    both as the loss and as the metric.

    Args:
        model_layer: Callable transformer layer applied to
            ``[input_ids, attention_masks]``.
        learning_rate: Learning rate of the Adam optimizer.
        dense_dim: Number of units of the output layer.

    Returns:
        The compiled Keras model.
    """
    # Two integer inputs of length MAX_LEN
    token_input = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'input_ids')
    mask_input = tf.keras.Input(shape = (MAX_LEN ,), dtype = 'int64', name = 'attention_masks')

    # Run the transformer layer on both inputs
    encoder_outputs = model_layer([token_input, mask_input])

    # Feed element 1 of the transformer output into the output layer.
    # NOTE(review): presumably this is the pooled output rather than the last
    # hidden state — confirm against the model's output order.
    predictions = tf.keras.layers.Dense(dense_dim)(encoder_outputs[1])

    model = tf.keras.models.Model(inputs = [token_input, mask_input], outputs = predictions)
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    model.compile(optimizer, loss = MCRMSE, metrics = MCRMSE)

    return model

## **Evaluate**

In [None]:
def evaluate_model(model, y_test, test_encoded_input_ids, test_encoded_attention_masks):
    """Evaluate ``model`` on the test inputs and print the first two results.

    Args:
        model: Object whose ``evaluate(inputs, labels)`` returns a sequence
            with the loss at index 0 and a metric at index 1.
        y_test: Test labels.
        test_encoded_input_ids: Encoded input ids of the test texts.
        test_encoded_attention_masks: Attention masks of the test texts.

    Returns:
        Tuple ``(score[0], score[1])`` of the evaluation result.
    """
    score = model.evaluate([test_encoded_input_ids, test_encoded_attention_masks], 
                           y_test
                          ) 
    test_loss, test_metric = score[0], score[1]
    # The values are printed as returned by evaluate(); they are not scaled by
    # 100, so the '%' suffix that used to follow them was misleading.
    print('\nTest Loss : {:.2f}'.format(test_loss))
    print('\nTest Accuracy :  {:.2f}'.format(test_metric))
    return test_loss, test_metric

## **Predict**

In [None]:
def predict_model(model, df_test, test_encoded_input_ids, test_encoded_attention_masks, label_cols):
    """Predict the scores of the test essays and join them to ``df_test``.

    Args:
        model: Object whose ``predict(inputs)`` returns one row of predictions
            per test essay and one column per entry of ``label_cols``.
        df_test: DataFrame of the test essays, in the same row order as the
            encoded inputs.
        test_encoded_input_ids: Encoded input ids of the test texts.
        test_encoded_attention_masks: Attention masks of the test texts.
        label_cols: Names of the score columns.

    Returns:
        Tuple ``(df_predictions, df_comparison)``. ``df_predictions`` holds the
        raw ``pred_<col>`` columns and the ``transformed_pred_<col>`` columns
        rounded to the nearest 0.5, with a 0..n-1 index. ``df_comparison`` is
        ``df_test`` with those columns attached row by row.
    """
    predictions = model.predict([test_encoded_input_ids, test_encoded_attention_masks])
    df_predictions = pd.DataFrame(predictions, columns=['pred_' + c for c in label_cols])

    # Round every prediction to the nearest multiple of 0.5 (same result as
    # round(x / .5) * .5 per value, computed on the whole column at once).
    step = .5
    for col in label_cols:
        raw_scores = df_predictions['pred_' + col].astype('float64')
        df_predictions['transformed_pred_' + col] = (raw_scores / step).round() * step

    # The predictions are in the row order of df_test but carry a 0..n-1
    # index. Merging on the index therefore needs df_test's own labels;
    # otherwise rows are dropped or paired with another essay's prediction
    # whenever df_test does not have a 0..n-1 index itself.
    aligned_predictions = df_predictions.copy()
    aligned_predictions.index = df_test.index
    df_comparison = pd.merge(df_test, aligned_predictions, left_index = True, right_index = True)
    return df_predictions, df_comparison

## **Plot Model Structure**

In [None]:
def plot_model_structure(model):
    """Render the layer graph of ``model`` with ``keras.utils.plot_model``.

    Returns:
        Whatever ``plot_model`` returns. It used to be discarded, so calling
        this function could not display anything; returning it lets a
        notebook cell show the result.
    """
    return keras.utils.plot_model(model, show_shapes = False, show_dtype = False, show_layer_names = True, dpi = 90)

## **Samples of predictions**

In [None]:
def hall_of_fame(df, component, num):
    """Print up to ``num`` random essays whose rounded prediction is correct.

    Args:
        df: DataFrame with the columns ``<component>``,
            ``transformed_pred_<component>`` and ``full_text``.
        component: Name of the score column to inspect.
        num: Maximum number of essays to print.
    """
    # A boolean mask replaces the string-built query, so column names that
    # are not valid Python identifiers work as well.
    matches = df[df["transformed_pred_" + component] == df[component]]

    # Sampling more rows than are available raises, so cap the sample size
    # and print nothing when no row qualifies.
    sample_size = min(num, len(matches))
    if sample_size == 0:
        return

    samp = matches.sample(sample_size).reset_index()
    for _, row in samp.iterrows():
        print("predicted: ", row["transformed_pred_" + component])
        print("original: ", row[component])
        pprint.pprint(row["full_text"])
        print("**********")

def hall_of_shame(df, component, num):
    """Print up to ``num`` random essays whose rounded prediction is wrong.

    Args:
        df: DataFrame with the columns ``<component>``,
            ``transformed_pred_<component>`` and ``full_text``.
        component: Name of the score column to inspect.
        num: Maximum number of essays to print.
    """
    # A boolean mask replaces the string-built query, so column names that
    # are not valid Python identifiers work as well.
    misses = df[df["transformed_pred_" + component] != df[component]]

    # Sampling more rows than are available raises, so cap the sample size
    # and print nothing when no row qualifies.
    sample_size = min(num, len(misses))
    if sample_size == 0:
        return

    samp = misses.sample(sample_size).reset_index()
    for _, row in samp.iterrows():
        print("predicted: ", row["transformed_pred_" + component])
        print("original: ", row[component])
        pprint.pprint(row["full_text"])
        print("**********")

# **Read input files**

In [None]:
# data
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=True remounts it even
# if it is already mounted.
drive.mount('/content/gdrive', force_remount=True)

# IPython magic: switch the working directory to the notebook folder on the
# mounted drive, so the relative CSV paths used below resolve there.
# NOTE(review): the path is relative — presumably to /content; confirm.
%cd "gdrive/MyDrive/Colab Notebooks/"

In [None]:
input_train_df = pd.read_csv('train.csv')
input_test_df = pd.read_csv('test.csv')
# Cleaning up full_text: replace newline, carriage-return and tab characters with a space
input_train_df['full_text'] = input_train_df["full_text"].replace(re.compile(r'[\n\r\t]'), ' ', regex = True)
input_test_df['full_text'] = input_test_df["full_text"].replace(re.compile(r'[\n\r\t]'), ' ', regex = True)

# Every column from the third one onwards is treated as a score label.
# label_cols is captured before 'score_sum' is added, so it excludes that column.
label_cols = input_train_df.columns[2:]
# Row-wise total of all label scores
input_train_df['score_sum'] = np.sum(input_train_df[label_cols], axis = 1)
# Names of the rounded-prediction columns, one per label
pred_col_list = ['transformed_pred_' + col for col in label_cols]

# Work on an independent copy so input_train_df stays untouched
orig_train_df = copy.deepcopy(input_train_df)
orig_train_df.head()

# **Model building**

Because the provided test data has no labels, we repurpose the training data by splitting it 80:20 into a train set and a test set.

The train portion then goes through k-fold cross-validation, with each fold evaluated on its validation set, and the final evaluation is done on the held-out test set. The final test score is the average MCRMSE across the k folds.



In [None]:
# Shuffle the rows with a random permutation of their positions
# (uses NumPy's global random state, seeded by set_config_param).
shuffle = np.random.permutation(np.arange(orig_train_df.shape[0]))
orig_train_df = orig_train_df.iloc[shuffle]

# Splitting the data in 80:20 split
split = (0.8, 0.2)
# Row counts of the two parts, truncated to integers; only splits[0] is used as the cut point
splits = np.multiply(len(orig_train_df), split).astype(int)
# NOTE: both parts keep the shuffled index labels of orig_train_df, so their
# index is not 0..n-1; positional (.iloc) and label-based access differ.
df_train, df_test = orig_train_df[ : splits[0]], orig_train_df[splits[0] : ]
y_test = np.array(df_test[label_cols], dtype = "float32")

print(f"Length of train data : {len(df_train)}")
print(f"Length of test data : {len(df_test)}")

In [None]:
# Settings that stay the same for every experiment
dense_dim = 6
number_of_splits = 2
random_state = 2023
MAX_LEN = 128
mse_loss = mse_metrics = MCRMSE
model_name_list = ['Bert', 'Bertweet'] # Roberta

# Default values of the tunable settings
epochs = 5
batch_size = 4
learning_rate = 1e-5
validation_split = .2
dropout = .1
number_of_hidden_layers = 1
hidden_layer_node_count = 64
retrain_layer_count = 0

# Settings shared by every entry of the parameter grid
_shared_params = {'epochs'                  : 5,
                  'batch_size'              : 4,
                  'learning_rate'           : 1e-5,
                  'validation_split'        : .2,
                  'dropout'                 : .1,
                  'number_of_hidden_layers' : 1,
                  'hidden_layer_node_count' : 64,
                 }

# Parameter grid: the entries differ only in how much of the base model is
# retrained -- 0 (completely frozen), 6 (partially frozen) and
# 12 (completely unfrozen).
param_list = [{**_shared_params, 'retrain_layer_count' : layer_count}
              for layer_count in (0, 6, 12)]

BERT-base-cased

In [None]:
# Train and evaluate a bert-base-cased regression model once per entry of
# param_list, then print diagnostics on the held-out split.
for idx, param_entry in enumerate(param_list):
    # NOTE: this rebinds the notebook-level MAX_LEN for all later cells.
    MAX_LEN = 512
    epoch_val = param_entry['epochs']
    batch_size_val = param_entry['batch_size']
    learning_rate_val = param_entry['learning_rate']
    validation_split_val = param_entry['validation_split']
    dropout_val = param_entry['dropout']
    number_of_hidden_layers_val = param_entry['number_of_hidden_layers']
    hidden_layer_node_count_val = param_entry['hidden_layer_node_count']
    retrain_layer_count_val = param_entry['retrain_layer_count']

    print("************************")
    print(f"Iteration : {idx + 1}")
    print("Parameters...")
    print(f"Epochs : {epoch_val}")
    print(f"Batch size : {batch_size_val}")
    print(f"Learning rate : {learning_rate_val}")
    print(f"Validation split : {validation_split_val}")
    print(f"Dropout : {dropout_val}")
    print(f"Number of hidden layers : {number_of_hidden_layers_val}")
    print(f"Hidden layer node count : {hidden_layer_node_count_val}")
    print(f"Retrain layer count : {retrain_layer_count_val}")
    print("************************")
    set_config_param(20230214)

    # Only the tokenizer is needed here: build_regression_model loads its own
    # encoder. The extra encoder that used to be loaded at this point was
    # never used, and it loaded the BERT checkpoint with TFRobertaModel.
    model_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_CHKPT)

    train_input_ids, train_attention_masks = text_encode(df_train['full_text'], model_tokenizer, MAX_LEN)
    test_input_ids, test_attention_masks = text_encode(df_test['full_text'], model_tokenizer, MAX_LEN)

    y_train = np.array(df_train[label_cols], dtype = "float32")

    bert = build_regression_model(loss = "MCRMSE",
                                  model_name = "Bert",
                                  dense_dim = 6,
                                  MAX_LEN = MAX_LEN,
                                  learning_rate = learning_rate_val,
                                  dropout = dropout_val,
                                  number_of_hidden_layers = number_of_hidden_layers_val,
                                  hidden_layer_node_count = hidden_layer_node_count_val,
                                  retrain_layer_count = retrain_layer_count_val)
    bert.summary()

    history_v1 = bert.fit((train_input_ids, train_attention_masks),
                          y_train,
                          batch_size = batch_size_val,
                          epochs = epoch_val,
                          validation_split = validation_split_val
                         )

    history_v1_df = pd.DataFrame(history_v1.history)

    score_v1 = bert.evaluate([test_input_ids, test_attention_masks],
                             y_test
                            )

    predictions_v1 = bert.predict([test_input_ids, test_attention_masks])
    # The predictions are in the row order of df_test. Give them df_test's
    # index so the index-based merge below pairs every essay with its own
    # prediction; with the default 0..n-1 index the merge dropped most rows
    # and paired the remaining ones with another essay's prediction, because
    # df_test keeps the shuffled labels of the original frame.
    df_pred_v1 = pd.DataFrame(predictions_v1,
                              columns = ['pred_' + c for c in label_cols],
                              index = df_test.index)

    # Round each prediction to the nearest 0.5
    for col in label_cols:
        df_pred_v1['transformed_pred_' + col] = df_pred_v1['pred_' + col].apply(lambda x : roundPartial(x, .5))

    df_compare_v1 = pd.merge(df_test, df_pred_v1, left_index = True, right_index = True)
    n_rows = len(df_compare_v1)

    # RMSE of the rounded predictions per label, averaged over the labels
    # (the list is no longer called `all`, which shadowed the builtin).
    rmse_per_label = [
        ((df_compare_v1[col] - df_compare_v1["transformed_pred_" + col]).pow(2).mean()) ** (0.5)
        for col in label_cols
    ]
    RMSE_scaled = statistics.fmean(rmse_per_label)
    print(f"RMSE_scaled: {RMSE_scaled}")

    # One correctly and one incorrectly predicted essay per label
    for label in label_cols:
        print(label)
        hall_of_fame(df_compare_v1, label, 1)
        print("************************")
        hall_of_shame(df_compare_v1, label, 1)
        print("************************")

    # Share of essays predicted too high / correctly / too low per label
    # (printed as fractions of the test rows).
    for component in label_cols:
        truth = df_compare_v1[component]
        rounded = df_compare_v1["transformed_pred_" + component]
        print(component)
        print("% Predicting too high: " + str(len(df_compare_v1[rounded > truth]) / n_rows))
        print("% Predicted correctly: " + str(len(df_compare_v1[rounded == truth]) / n_rows))
        print("% Predicting too low: " + str(len(df_compare_v1[rounded < truth]) / n_rows))
        print("****")

    # Predicting too high or low for every score?
    for component in label_cols:
        print(component)

        for score in [1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0]:
            rows_at_score = df_compare_v1[df_compare_v1[component] == score]
            n_at_score = len(rows_at_score)
            if n_at_score == 0:
                print(component + "==" + str(score) + " has 0 rows")
            else:
                rounded_at_score = rows_at_score["transformed_pred_" + component]
                print(f'length: {n_at_score}')
                print("% Predicting the above (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score > score]) / n_at_score))
                print("% Predicting the same (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score == score]) / n_at_score))
                print("% Predicting the below (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score < score]) / n_at_score))
            print("****")

    # Share of essays whose rounded prediction is within 0.5 of the label
    for component in label_cols:
        truth = df_compare_v1[component]
        rounded = df_compare_v1["transformed_pred_" + component]
        print(component)
        within_half = (rounded <= (truth + .5)) & (rounded >= (truth - .5))
        print("% Predicting within .5: " + str(len(df_compare_v1[within_half]) / n_rows))

BERTweet

In [None]:
# Train and evaluate a BERTweet regression model once per entry of
# param_list, then print diagnostics on the held-out split.
for idx, param_entry in enumerate(param_list):
    # NOTE: this rebinds the notebook-level MAX_LEN for all later cells.
    # NOTE(review): vinai/bertweet-base is presumably limited to 128-token
    # inputs; confirm that sequences of 512 tokens are actually supported.
    MAX_LEN = 512
    epoch_val = param_entry['epochs']
    batch_size_val = param_entry['batch_size']
    learning_rate_val = param_entry['learning_rate']
    validation_split_val = param_entry['validation_split']
    dropout_val = param_entry['dropout']
    number_of_hidden_layers_val = param_entry['number_of_hidden_layers']
    hidden_layer_node_count_val = param_entry['hidden_layer_node_count']
    retrain_layer_count_val = param_entry['retrain_layer_count']

    print("************************")
    print(f"Iteration : {idx + 1}")
    print("Parameters...")
    print(f"Epochs : {epoch_val}")
    print(f"Batch size : {batch_size_val}")
    print(f"Learning rate : {learning_rate_val}")
    print(f"Validation split : {validation_split_val}")
    print(f"Dropout : {dropout_val}")
    print(f"Number of hidden layers : {number_of_hidden_layers_val}")
    print(f"Hidden layer node count : {hidden_layer_node_count_val}")
    print(f"Retrain layer count : {retrain_layer_count_val}")
    print("************************")
    set_config_param(20230214)

    # Only the tokenizer is needed here: build_regression_model loads its own
    # encoder, so the extra, unused encoder copy is no longer loaded.
    model_tokenizer = AutoTokenizer.from_pretrained(BERTWEET_MODEL_CHKPT)

    train_input_ids, train_attention_masks = text_encode(df_train['full_text'], model_tokenizer, MAX_LEN)
    test_input_ids, test_attention_masks = text_encode(df_test['full_text'], model_tokenizer, MAX_LEN)

    y_train = np.array(df_train[label_cols], dtype = "float32")

    bertweet = build_regression_model(loss = "MCRMSE",
                                      model_name = "Bertweet",
                                      dense_dim = 6,
                                      MAX_LEN = MAX_LEN,
                                      learning_rate = learning_rate_val,
                                      dropout = dropout_val,
                                      number_of_hidden_layers = number_of_hidden_layers_val,
                                      hidden_layer_node_count = hidden_layer_node_count_val,
                                      retrain_layer_count = retrain_layer_count_val)
    bertweet.summary()

    history_v1 = bertweet.fit((train_input_ids, train_attention_masks),
                              y_train,
                              batch_size = batch_size_val,
                              epochs = epoch_val,
                              validation_split = validation_split_val
                             )

    history_v1_df = pd.DataFrame(history_v1.history)

    score_v1 = bertweet.evaluate([test_input_ids, test_attention_masks],
                                 y_test
                                )

    predictions_v1 = bertweet.predict([test_input_ids, test_attention_masks])
    # The predictions are in the row order of df_test. Give them df_test's
    # index so the index-based merge below pairs every essay with its own
    # prediction; with the default 0..n-1 index the merge dropped most rows
    # and paired the remaining ones with another essay's prediction, because
    # df_test keeps the shuffled labels of the original frame.
    df_pred_v1 = pd.DataFrame(predictions_v1,
                              columns = ['pred_' + c for c in label_cols],
                              index = df_test.index)

    # Round each prediction to the nearest 0.5
    for col in label_cols:
        df_pred_v1['transformed_pred_' + col] = df_pred_v1['pred_' + col].apply(lambda x : roundPartial(x, .5))

    df_compare_v1 = pd.merge(df_test, df_pred_v1, left_index = True, right_index = True)
    n_rows = len(df_compare_v1)

    # RMSE of the rounded predictions per label, averaged over the labels
    # (the list is no longer called `all`, which shadowed the builtin).
    rmse_per_label = [
        ((df_compare_v1[col] - df_compare_v1["transformed_pred_" + col]).pow(2).mean()) ** (0.5)
        for col in label_cols
    ]
    RMSE_scaled = statistics.fmean(rmse_per_label)
    print(f"RMSE_scaled: {RMSE_scaled}")

    # One correctly and one incorrectly predicted essay per label
    for label in label_cols:
        print(label)
        hall_of_fame(df_compare_v1, label, 1)
        print("************************")
        hall_of_shame(df_compare_v1, label, 1)
        print("************************")

    # Share of essays predicted too high / correctly / too low per label
    # (printed as fractions of the test rows).
    for component in label_cols:
        truth = df_compare_v1[component]
        rounded = df_compare_v1["transformed_pred_" + component]
        print(component)
        print("% Predicting too high: " + str(len(df_compare_v1[rounded > truth]) / n_rows))
        print("% Predicted correctly: " + str(len(df_compare_v1[rounded == truth]) / n_rows))
        print("% Predicting too low: " + str(len(df_compare_v1[rounded < truth]) / n_rows))
        print("****")

    # Predicting too high or low for every score?
    for component in label_cols:
        print(component)

        for score in [1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0]:
            rows_at_score = df_compare_v1[df_compare_v1[component] == score]
            n_at_score = len(rows_at_score)
            if n_at_score == 0:
                print(component + "==" + str(score) + " has 0 rows")
            else:
                rounded_at_score = rows_at_score["transformed_pred_" + component]
                print(f'length: {n_at_score}')
                print("% Predicting the above (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score > score]) / n_at_score))
                print("% Predicting the same (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score == score]) / n_at_score))
                print("% Predicting the below (" + str(score) + "): " +
                      str(len(rows_at_score[rounded_at_score < score]) / n_at_score))
            print("****")

    # Share of essays whose rounded prediction is within 0.5 of the label
    for component in label_cols:
        truth = df_compare_v1[component]
        rounded = df_compare_v1["transformed_pred_" + component]
        print(component)
        within_half = (rounded <= (truth + .5)) & (rounded >= (truth - .5))
        print("% Predicting within .5: " + str(len(df_compare_v1[within_half]) / n_rows))

In [None]:
# Total number of rows in the prediction-vs-truth comparison frame.
df_compare_v1.shape[0]

In [None]:
# Store each essay's length in characters next to its scores and predictions.
df_compare_v1["character_count"] = df_compare_v1["full_text"].str.len()

In [None]:
# Number of essays whose text is shorter than 280 characters.
int((df_compare_v1["character_count"] < 280).sum())

Concat

In [None]:
# Apply the shared run configuration before the clustering section.
# NOTE(review): 20230214 is presumably a random seed — confirm in set_config_param.
set_config_param(20230214)

# **Clustering**

In [None]:
# Histogram of the per-essay total score over the training set.
df_train["score_sum"].hist()
plt.gca().set(
    title="Train Dataset's Total Scores",
    xlabel="Total Score",
    ylabel="Number of Essays",
)
plt.show()

In [None]:
# Work on an independent copy of the rating columns and expose it as a plain
# NumPy matrix for clustering.
df_rating = df_train[label_cols].copy(deep=True)
rating_values_array = np.array(df_rating[label_cols])

# Standardize the ratings (fit the scaler and transform in a single step).
sc = StandardScaler()
rating_values_array_std = sc.fit_transform(rating_values_array)

In [None]:
# Elbow analysis: fit KMeans for several candidate cluster counts on the
# standardized ratings and plot the result to help choose k.
km = KMeans(random_state=42)
# NOTE(review): k=(2,10) is presumably interpreted by yellowbrick as a range of
# candidate cluster counts starting at 2 — confirm whether 10 itself is included.
visualizer = KElbowVisualizer(km, k=(2,10))
 
visualizer.fit(rating_values_array_std)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

This is what the Elbow / SSE plot looks like. According to the plot, n_clusters = 3 represents the elbow: beyond that point, increasing k yields diminishing returns and the curve starts to look linear.

In [None]:
# One silhouette plot per candidate cluster count, drawn side by side.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for idx, i in enumerate((2, 3, 4)):
    # A fresh KMeans estimator for this number of clusters.
    km = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        max_iter=100,
        random_state=42,
    )
    # Wrap the estimator in a SilhouetteVisualizer bound to its own subplot,
    # then fit the visualizer on the standardized ratings.
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[idx])
    visualizer.fit(rating_values_array_std)

Based on the plots above, the Silhouette plot for n_clusters = 3 looks more appropriate than the others, as it holds up well against all three measuring criteria (clusters scoring below the average Silhouette score, wide fluctuations in the size of the plots, and non-uniform thickness).

In [None]:
# Final clustering model: three clusters, k-means++ initialisation, fixed seed.
km_base = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=1234,
)

# Fit on the standardized ratings and obtain each essay's cluster label.
y_km_base = km_base.fit_predict(rating_values_array_std)

# Store the cluster label on the training dataframe.
df_train['cluster_id'] = y_km_base

In [None]:
# Preview the first five rows, now including the cluster_id column.
df_train.head(n=5)

We can see that an increase in *k* is associated with a decrease in the within-cluster SSE. 

This is because the examples are closer to the centroid they are assigned to.

**The elbow solution**: the optimal *k* is where the within-cluster SSE begins to increase most rapidly.

For this particular example the elbow is at *k=2* so we started with a good number of clusters.


In [None]:
# One dataframe per k-means cluster label.
df_train_cluster0 = df_train.loc[df_train["cluster_id"] == 0]
df_train_cluster1 = df_train.loc[df_train["cluster_id"] == 1]
df_train_cluster2 = df_train.loc[df_train["cluster_id"] == 2]

# Report how many essays landed in each cluster.
print(f"Length of cluster 0 : {df_train_cluster0.shape[0]}")
print(f"Length of cluster 1 : {df_train_cluster1.shape[0]}")
print(f"Length of cluster 2 : {df_train_cluster2.shape[0]}")

In [None]:
# Range of the total score inside each cluster.
print(f"Min and max score in cluster 0 are : {df_train_cluster0['score_sum'].min()} and {df_train_cluster0['score_sum'].max()}")
print(f"Min and Max score in cluster 1 are : {df_train_cluster1['score_sum'].min()} and {df_train_cluster1['score_sum'].max()}")
print(f"Min and Max score in cluster 2 are : {df_train_cluster2['score_sum'].min()} and {df_train_cluster2['score_sum'].max()}")

The clusters are divided based on the distribution of the data. Low scores are in one bucket, medium scores are placed in another, and top scores are placed in the highest bucket.

# **Model parameter setup**

In [None]:
# Recombine the three per-cluster frames into a single training frame
df_train = pd.concat([df_train_cluster0, df_train_cluster1, df_train_cluster2])

# Shuffle the rows so the clusters are interleaved again
shuffle = np.random.permutation(np.arange(df_train.shape[0]))
df_train = df_train.iloc[shuffle]


# One record per (model, hyper-parameter set, fold) is appended here.
MCRMSE_list = []

# cluster_id holds the k-means cluster label of every essay (0, 1 or 2).
# The targets are several continuous scores per essay, which a stratified
# k-fold split cannot stratify on directly, so the folds are stratified on
# cluster_id instead. This keeps the three score groups represented in every
# fold.
for param_val_model_name in model_name_list:

    for idx, param_entry in enumerate(param_list):

        # Unpack the hyper-parameters of this configuration.
        param_val_epoch = param_entry['epochs']
        param_val_batch_size = param_entry['batch_size']
        param_val_learning_rate = param_entry['learning_rate']
        param_val_validation_split = param_entry['validation_split']
        param_val_dropout = param_entry['dropout']
        param_val_number_of_hidden_layers = param_entry['number_of_hidden_layers']
        param_val_hidden_layer_node_count = param_entry['hidden_layer_node_count']
        param_val_retrain_layer_count = param_entry['retrain_layer_count']

        for kfold, (train_indices, val_indices) in enumerate(StratifiedKFold(n_splits     = number_of_splits,
                                                                             shuffle      = True,
                                                                             random_state = random_state
                                                                             ).split(df_train['cluster_id'].values.tolist(),
                                                                                     df_train['cluster_id'].values.tolist()
                                                                                    )
                                                            ):
            print("************************")
            print(f"Model : {param_val_model_name}")
            print(f"Iteration : {idx + 1}")
            print("Parameters...")
            print(f"Epochs : {param_val_epoch}")
            print(f"Batch size : {param_val_batch_size}")
            print(f"Learning rate : {param_val_learning_rate}")
            print(f"Validation split : {param_val_validation_split}")
            print(f"Dropout : {param_val_dropout}")
            print(f"Number of hidden layers : {param_val_number_of_hidden_layers}")
            print(f"Hidden layer node count : {param_val_hidden_layer_node_count}")
            print(f"Retrain layer count : {param_val_retrain_layer_count}")
            print(f"k-fold : {kfold + 1}")
            print(f"length of train data : {len(train_indices)}")
            print(f"length of validation data : {len(val_indices)}")
            print("************************")
            set_config_param(20230214)

            # Building the tokenizer for the given model
            if param_val_model_name == 'Roberta':
                tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_MODEL_CHKPT)
            elif param_val_model_name == 'Bertweet':
                tokenizer = AutoTokenizer.from_pretrained(BERTWEET_MODEL_CHKPT)
            elif param_val_model_name == 'Bert':
                tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_CHKPT)
            else:
                # Fail before training: otherwise the test set would be encoded
                # with a tokenizer left over from a previous model, or the run
                # would die with a NameError only after the model was fitted.
                raise ValueError(f"Unsupported model name: {param_val_model_name!r}")

            # Model building
            print("Building model...")
            regression_model = build_regression_model(loss                    = 'MCRMSE',
                                                      model_name              = param_val_model_name,
                                                      dense_dim               = dense_dim,
                                                      MAX_LEN                 = MAX_LEN,
                                                      learning_rate           = param_val_learning_rate,
                                                      dropout                 = param_val_dropout,
                                                      number_of_hidden_layers = param_val_number_of_hidden_layers,
                                                      hidden_layer_node_count = param_val_hidden_layer_node_count,
                                                      retrain_layer_count     = param_val_retrain_layer_count
                                                     )

            # Model fitting
            print("Fitting model...")
            df_history = model_fit(model            = regression_model,
                                   df_train         = df_train,
                                   train_indices    = train_indices,
                                   val_indices      = val_indices,
                                   model_name       = param_val_model_name,
                                   MAX_LEN          = MAX_LEN,
                                   epochs           = param_val_epoch,
                                   batch_size       = param_val_batch_size,
                                   validation_split = param_val_validation_split
                                  )
            print(df_history.T)

            print("Plotting loss and MCRMSE...")
            custom_plot(df         = df_history,
                        model_name = param_val_model_name,
                        kpi_name   = 'MCRMSE',
                        kpi_string = 'MCRMSE'
                       )

            # Prep for model evaluation with test data
            print("Evaluating mode...")
            test_encoded_input_ids, test_encoded_attention_masks = text_encode(texts      = df_test['full_text'],
                                                                               tokenizer = tokenizer,
                                                                               max_len    = MAX_LEN
                                                                              )
            # Model evaluation
            test_loss, test_accuracy = evaluate_model(regression_model,
                                                      y_test,
                                                      test_encoded_input_ids,
                                                      test_encoded_attention_masks
                                                     )

            # Model prediction
            print("Model prediction...")
            df_prediction, df_comparison = predict_model(regression_model,
                                                         df_test,
                                                         test_encoded_input_ids,
                                                         test_encoded_attention_masks,
                                                         label_cols
                                                        )

            print("Plotting model structure...")
            keras.utils.plot_model(regression_model,
                                   show_shapes      = False,
                                   show_dtype       = False,
                                   show_layer_names = True,
                                   dpi              = 90
                                  )

            print("Appending to kpi list...")
            # Metrics of the last recorded epoch, read by column position
            # (0..3) exactly as before. .iloc is used because plain integer
            # keys on a label-indexed Series are deprecated in recent pandas.
            last_epoch = df_history.iloc[-1]
            temp_dict = {'model_name'                  : param_val_model_name,
                         'iteration'                   : idx + 1,
                         'epoch'                       : param_val_epoch,
                         'batch_size'                  : param_val_batch_size,
                         'learning_rate'               : param_val_learning_rate,
                         'validation_split'            : param_val_validation_split,
                         'dropout'                     : param_val_dropout,
                         'number_of_hidden_layers'     : param_val_number_of_hidden_layers,
                         'hidden_layer_node_count'     : param_val_hidden_layer_node_count,
                         'retrain_layer_count'         : param_val_retrain_layer_count,
                         'fold'                        : kfold + 1,
                         'train_loss'                  : last_epoch.iloc[0],
                         'train_accuracy'              : last_epoch.iloc[1],
                         'val_loss'                    : last_epoch.iloc[2],
                         'val_accuracy'                : last_epoch.iloc[3],
                         'test_loss'                   : test_loss,
                         'test_accuracy'               : test_accuracy
                        }
            MCRMSE_list.append(temp_dict)

            # Saving the model
            print("Saving the model...")
            model_file_name = 'regression_model_' + param_val_model_name.lower() + '_iter_' + str(idx + 1) + '_kfold_' + str(kfold + 1) + ".h5"
            regression_model.save(model_file_name)

            # Best and worst predictions for every scored component
            for label in label_cols:
                print(label)
                hall_of_fame(df_comparison,label,1)
                print("************************")
                hall_of_shame(df_comparison,label,1)
                print("************************")
            # Share of over-, exactly- and under-predicted essays per component
            for component in label_cols:
                print(component)
                print("% Predicted too high: " + str(len(df_comparison.query("transformed_pred_"+component+">"+component))/len(df_comparison)))
                print("% Predicted correctly: " + str(len(df_comparison.query("transformed_pred_"+component+"=="+component))/len(df_comparison)))
                print("% Predicted too low: " + str(len(df_comparison.query("transformed_pred_"+component+"<"+component))/len(df_comparison)))
                print("****")

In [None]:
# Column order of the exported KPI table.
kpi_col_list = ['model_name',
                'iteration',
                'epoch_val',
                'batch_size_val',
                'learning_rate_val',
                'validation_split_val',
                'dropout_val',
                'number_of_hidden_layers_val',
                'hidden_layer_node_count_val',
                'retrain_layer_count_val',
                'fold',
                'train_loss',
                'train_accuracy',
                'val_loss',
                'val_accuracy',
                'test_loss',
                'test_accuracy'
               ]
# The per-fold records store the hyper-parameters under their bare names
# ('epoch', 'batch_size', ...) while the export uses a '_val' suffix. Selecting
# the suffixed names directly would give all-NaN columns, so every record key
# whose suffixed form is an expected column is renamed first; reindex then
# fixes the column order (and still yields the expected columns for an empty
# record list).
df_MCRMSE = (pd.DataFrame(MCRMSE_list)
             .rename(columns=lambda name: name + '_val' if name + '_val' in kpi_col_list else name)
             .reindex(columns=kpi_col_list))
df_MCRMSE.to_csv("kpi_stats_bertweet.csv", index = False)
df_MCRMSE

In [None]:
print("Average test accuracy and loss...")
# Mean / min / max of the test metrics per model and hyper-parameter iteration.
# Aggregations are named as strings: passing NumPy callables to agg is
# deprecated in recent pandas and labels the columns after the NumPy function
# name, which differs between NumPy versions ('amin' vs 'min').
df_MCRMSE.groupby(['model_name', 'iteration']).agg({'test_loss'      : ['mean', 'min', 'max'],
                                                    'test_accuracy'  : ['mean', 'min', 'max']
                                                   }
                                                  )

# Appendix
Other experiments with the hyperparameters

In [None]:
def regression_model_with_bert(num_classes=9,                  # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
                               num_train_layers=0,
                               num_hidden_layer=1,
                               num_hidden_units=256,
                               dropout=0.3,
                               learning_rate=0.00005,
                               activation = 'relu',
                               optimizer='adam'):
    """
    Build a simple regression model with BERT. Use the CLS Output for regression purposes.

    The module-level ``bert_model`` is used as the encoder and its trainable
    state is (re)configured on every call, so consecutive calls with different
    ``num_train_layers`` do not inherit each other's freezing.

    Args:
        num_classes: accepted for interface compatibility; not used by the
            model (the output layer always has 6 units).
        num_train_layers: 0 freezes the whole encoder, 12 trains all of it,
            any other value keeps only weights whose name contains
            '_11', '_10', ... (``num_train_layers`` such codes) trainable.
        num_hidden_layer: number of Dense+Dropout pairs on top of the CLS vector.
        num_hidden_units: units of every hidden Dense layer.
        dropout: dropout rate after every hidden layer.
        learning_rate: learning rate handed to the optimizer.
        activation: activation of the hidden Dense layers.
        optimizer: 'adam' or 'sgd' (case-insensitive).

    Returns:
        Tuple of (compiled Keras model, count_params of its trainable weights,
        count_params of its non-trainable weights).

    Raises:
        ValueError: if ``optimizer`` is neither 'adam' nor 'sgd'.
    """
    # =========== Select which weights of the pre-trained model are trainable ===========
    if num_train_layers == 0:
        bert_model.trainable = False                 # Freeze all layers of pre-trained BERT model
    else:
        # Start from a fully trainable encoder so that a freeze applied by an
        # earlier call (e.g. num_train_layers=0) does not silently persist.
        bert_model.trainable = True

        if num_train_layers == 12:                   # Train all layers of the BERT model
            retrain_layers = None
        else:                                        # Restrict training to the num_train_layers outer transformer layers
            retrain_layers = ['_' + str(11 - retrain_layer_number)
                              for retrain_layer_number in range(num_train_layers)]

        # NOTE(review): assumes every encoder weight is trainable by design, so
        # resetting the per-weight flag to True is safe — confirm for bert_model.
        for w in bert_model.weights:
            w._trainable = (retrain_layers is None
                            or any(code in w.name for code in retrain_layers))

    # Input Layer
    input_ids = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'attention_mask': attention_mask
                  }

    # Bert output: used as the input feature of the regression head below
    bert_out = bert_model(bert_inputs)
    cls_token = bert_out[0][:, 0, :]          # first-position vector of the first output

    # Regression head: num_hidden_layer x (Dense -> Dropout). With zero hidden
    # layers the CLS vector feeds the output layer directly.
    features = cls_token
    for hidden_layer_number in range(1, num_hidden_layer + 1):
        features = tf.keras.layers.Dense(units = num_hidden_units
                                         , activation = activation
                                         , name = 'hidden_layer_' + str(hidden_layer_number)
                                         )(features)
        features = tf.keras.layers.Dropout(dropout, name = 'dropout_layer_' + str(hidden_layer_number))(features)

    # NOTE(review): 6 output units, presumably one per label column — confirm.
    output = tf.keras.layers.Dense(6,)(features)
    regression_model = tf.keras.Model(inputs = [input_ids, attention_mask], outputs = output)

    def selected_optimizer(optimizer):
        """Return the optimizer instance for the given (case-insensitive) name."""
        optimizer_name = optimizer.lower()
        if optimizer_name == 'sgd':
            return SGD(learning_rate=learning_rate)
        if optimizer_name == 'adam':
            return Adam(learning_rate=learning_rate)
        raise ValueError(f"Unsupported optimizer: {optimizer!r} (expected 'adam' or 'sgd')")

    regression_model.compile(optimizer = selected_optimizer(optimizer),
                             loss=MCRMSE,
                             metrics=[MCRMSE])

    return regression_model, count_params(regression_model.trainable_weights), count_params(regression_model.non_trainable_weights)
def train_regression(model, batch_size, epochs,
                     checkpoint_filepath='/content/gdrive/MyDrive/Kaggle/Model_Checkpoint'):
  """Fit *model* on the module-level training encodings and return the history.

  Training uses ``train_encodings`` / ``y_train`` from the notebook scope, holds
  out 10% of them for validation, checkpoints the best weights (lowest
  validation loss) to ``checkpoint_filepath`` and stops early when the
  validation loss stops improving.

  Args:
    model: compiled Keras model taking [input_ids, attention_mask].
    batch_size: batch size passed to ``fit``.
    epochs: maximum number of epochs.
    checkpoint_filepath: where the best weights are written. The default is a
      Google Drive directory that has to exist before training.

  Returns:
    DataFrame built from the Keras history (one row per epoch); it is also
    displayed transposed.
  """
  model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                                  save_weights_only=True,
                                                                  monitor='val_loss',
                                                                  mode='min',
                                                                  save_best_only=True)
  # "If there hasn't been at least an improvement of 0.001 in the validation
  # loss over the previous 3 epochs, then stop the training and keep the best
  # model you found." The monitor is the validation loss, consistent with the
  # checkpoint callback above (it previously watched the training loss).
  early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              min_delta=0.001, # minimum amount of change to count as an improvement
                                              patience=3,      # how many epochs to wait before stopping
                                              restore_best_weights=True)

  print('Training Regression with BERT.....\n====================================='  )
  regression_model_history = model.fit([train_encodings.input_ids,
                                        train_encodings.attention_mask
                                        ],
                                        y_train,
                                        validation_split = .1,
                                        batch_size = batch_size,
                                        callbacks=[early_stopping_callback, model_checkpoint_callback],
                                        epochs = epochs
                                        )
  df_regression_model_history = pd.DataFrame(regression_model_history.history)
  display(df_regression_model_history.T)
  return df_regression_model_history
def plot_loss_mcrmse(df, eval_metric):
    """Plot training/validation loss and one metric per epoch, side by side.

    Args:
        df: per-epoch history with columns 'loss', 'val_loss', ``eval_metric``
            and ``'val_' + eval_metric``.
        eval_metric: name of the metric column shown in the right-hand panel;
            it also labels that panel's y-axis.
    """
    x_arr = np.arange(len(df['loss'])) + 1          # epochs are numbered from 1
    fig = plt.figure(figsize=(12, 4))

    # Left panel: loss curves
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(x_arr, df['loss'], '-o', label = 'Train Loss')
    ax.plot(x_arr, df['val_loss'], '--<', label = 'Validation Loss')
    ax.legend(fontsize = 12)
    ax.set_xlabel('Epoch', size = 12)
    ax.set_ylabel('Loss', size = 12)

    # Right panel: metric curves
    ax = fig.add_subplot(1, 2, 2)
    ax.plot(x_arr, df[eval_metric], '-o', label = 'Train ' + eval_metric)
    ax.plot(x_arr, df['val_' + eval_metric], '--<', label = 'Validation ' + eval_metric)
    ax.legend(fontsize = 12)
    ax.set_xlabel('Epoch', size = 12)
    # Label with the plotted metric rather than a fixed 'MCRMSE', so the axis
    # stays correct for any eval_metric (identical for eval_metric='MCRMSE').
    ax.set_ylabel(eval_metric, size = 12)
    plt.show()
def evaluate_test_labels(model):
  """Evaluate *model* on the module-level test encodings and print the scores.

  Returns whatever ``model.evaluate`` returns; its first two entries are
  printed as the test loss and the test MCRMSE score.
  """
  test_inputs = [test_encodings.input_ids, test_encodings.attention_mask]
  score_regression = model.evaluate(test_inputs, y_test)

  print('\nEvaluate Test Metrics:\n=================================')
  print(f'\nTest loss: {score_regression[0]:.4f}')
  print(f'\nTest MCRMSE score: {score_regression[1]:.4f}', '\n')
  return score_regression
def predict_test_labels(model):
  """Predict on the module-level test encodings.

  Returns a dataframe with one 'pred_<label>' column per entry of label_cols.
  """
  test_inputs = [test_encodings.input_ids, test_encodings.attention_mask]
  raw_predictions = model.predict(test_inputs)
  prediction_columns = ['pred_' + label for label in label_cols]
  return pd.DataFrame(raw_predictions, columns=prediction_columns)
def scaled_pred(df):
  """Round every column of *df* to a multiple of 0.5.

  For each original column ``c`` a new column ``c_scaled`` is written into *df*
  itself (the argument is modified in place), and the new columns are also
  returned together as a separate dataframe.
  """
  def to_half_step(val):
    # Python's round() on val / 0.5, scaled back to the 0.5 grid.
    return round(val/0.5) * 0.5

  # Snapshot the column names first: columns are added while looping.
  original_columns = list(df.columns)
  snapped = []
  for name in original_columns:
    scaled_name = name + '_scaled'
    df[scaled_name] = df[name].apply(to_half_step)
    snapped.append(df[scaled_name])
  return pd.DataFrame(snapped).T
def run_regression_experiment(num_train_layers=0,
                              num_hidden_layer=1,
                              num_hidden_units=256,
                              dropout=0.3,
                              learning_rate=0.00005,
                              batch_size=8,
                              csv_filename='perf_summary_regression_w_BERT.csv',
                              activation = 'relu',                                    # 'relu', 'leaky_relu', 'gelu'
                              optimizer='adam',                                       # 'adam', 'sgd'
                              epochs=1): ### UPDATE AT THE END
  """Train and evaluate one BERT regression model per unfrozen-layer setting.

  For every entry of ``num_train_layers`` a model is built, trained, plotted,
  evaluated and used for prediction on the test set; one summary row per model
  is collected, written to ``csv_filename`` and displayed.

  Args:
    num_train_layers: iterable of unfrozen-layer counts, or a single count
      (a scalar is treated as a one-element list).
    num_hidden_layer, num_hidden_units, dropout, learning_rate, activation,
      optimizer: forwarded to ``regression_model_with_bert``.
    batch_size, epochs: forwarded to ``train_regression``.
    csv_filename: path of the CSV summary written at the end.
  """
  set_config_param(20230214)

  # The default is a scalar; accept it as well as any iterable of counts.
  try:
    layer_counts = list(num_train_layers)
  except TypeError:
    layer_counts = [num_train_layers]

  perf_rows = []

  for layer in layer_counts:
    print('\n******************************************************')
    print(f'Regression with BERT: Number of Unfrozen Layers = {layer}')
    print('******************************************************\n')

    # build a regression model
    regression_with_bert, num_trainable_params, num_non_trainable_params = regression_model_with_bert(num_classes = 9,                          # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
                                                                                                      num_train_layers = layer,
                                                                                                      num_hidden_layer = num_hidden_layer,
                                                                                                      num_hidden_units = num_hidden_units,
                                                                                                      dropout = dropout,
                                                                                                      learning_rate = learning_rate,
                                                                                                      activation = activation,
                                                                                                      optimizer=optimizer)

    # model summary and plot model structure
    # summary() prints by itself and returns None, so it is not passed to display().
    regression_with_bert.summary()
    display(keras.utils.plot_model(regression_with_bert, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90))

    # train model
    df_regression_model_history = train_regression(regression_with_bert, batch_size, epochs)
    print("\nPlotting loss and MCRMSE...")
    plot_loss_mcrmse(df_regression_model_history, 'MCRMSE')

    # Evaluate test set
    score_regression = evaluate_test_labels(regression_with_bert)

    # Predict test set
    df_pred = predict_test_labels(regression_with_bert)
    df_pred_scaled = scaled_pred(df_pred)

    # ========== Performance metrics summary ===================================
    # Last recorded epoch, read by column position (0..3) exactly as before.
    # .iloc is used because plain integer keys on a label-indexed Series are
    # deprecated in recent pandas.
    last_epoch = df_regression_model_history.iloc[-1]
    perf_metrics = pd.DataFrame({'NLP Model':"bert-base-cased",
                                'Num_Trainable_layers': layer,
                                'Epochs':epochs,
                                'Test_MCRMSE':round(score_regression[1], 4),
                                'Test_Loss':round(score_regression[0], 4),
                                'Train_MCRMSE':round(last_epoch.iloc[1], 4),
                                'Train_Loss':round(last_epoch.iloc[0], 4),
                                'Val_MCRMSE':round(last_epoch.iloc[3], 4),
                                'Val_Loss':round(last_epoch.iloc[2], 4),
                                'Optimizer': optimizer,
                                'Activation': activation,
                                'Learning_Rate':learning_rate,
                                'Num_Hidden_Layers':num_hidden_layer,
                                'Num_hidden_Units':num_hidden_units,
                                'Dropout': dropout,
                                'Batch_Size': batch_size}, index=[0])
    perf_rows.append(perf_metrics)

  # DataFrame.append no longer exists in pandas 2.x; concatenate the rows once.
  df_perf_summary = pd.concat(perf_rows) if perf_rows else pd.DataFrame()
  df_perf_summary.to_csv(csv_filename, index=False)
  display(df_perf_summary.reset_index(drop=True))
def generate_final_table(df_pred):
  """Display and return the test essays, their true scores and *df_pred* side by side.

  The text and label columns come from the module-level df_test with their
  index reset, so rows line up with df_pred by position.
  """
  print('\nFinal Table: y_true vs. y_pred_raw vs. y_pred_scaled\n======================================================')
  essay_text = df_test[['full_text']].reset_index(drop=True)
  true_scores = df_test[label_cols].reset_index(drop=True)
  df_final = pd.concat([essay_text, true_scores, df_pred], axis=1)
  display(df_final)
  return df_final
def run_regression_experiment_1(num_train_layers=0,
                              num_hidden_layer=1,
                              num_hidden_units=256,
                              dropout=0.3,
                              learning_rate=0.00005,
                              batch_size=8,
                              csv_filename='perf_summary_regression_w_BERT.csv',
                              activation='relu',                                      # 'relu', 'leaky_relu', 'gelu'
                              optimizer='adam',                                       # 'adam', 'sgd'
                              epochs=10):
  """Run a single BERT regression experiment and save its results.

  Steps, in order: build the model with ``regression_model_with_bert``, show
  its summary and structure plot, train it with ``train_regression``, plot the
  training history, evaluate and predict on the test set, and write three CSV
  files: ``df_pred.csv``, ``df_final.csv`` and ``csv_filename``.

  Args:
    num_train_layers: value forwarded to ``regression_model_with_bert`` and
      reported in the banner as the number of unfrozen layers.
    num_hidden_layer: forwarded to the model builder.
    num_hidden_units: forwarded to the model builder.
    dropout: forwarded to the model builder.
    learning_rate: forwarded to the model builder.
    batch_size: forwarded to ``train_regression``.
    csv_filename: path of the one-row performance summary CSV.
    activation: forwarded to the model builder.
    optimizer: forwarded to the model builder.
    epochs: forwarded to ``train_regression``.

  Returns:
    None. Results are displayed and written to disk.
  """
  print('\n******************************************************')
  print(f'Regression with BERT: Number of Unfrozen Layers = {num_train_layers}')
  print('******************************************************\n')

  # Build the regression model. The builder also returns the trainable and
  # non-trainable parameter counts, which are not used here.
  regression_with_bert, _, _ = regression_model_with_bert(
      num_classes=9,                          # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
      num_train_layers=num_train_layers,
      num_hidden_layer=num_hidden_layer,
      num_hidden_units=num_hidden_units,
      dropout=dropout,
      learning_rate=learning_rate,
      activation=activation,
      optimizer=optimizer)

  # Model summary and structure plot. summary() prints the table itself and
  # returns None, so it is not wrapped in display().
  regression_with_bert.summary()
  display(keras.utils.plot_model(regression_with_bert, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90))

  # Train the model and plot the history.
  df_regression_model_history = train_regression(regression_with_bert, batch_size, epochs)
  print("\nPlotting loss and MCRMSE...")
  plot_loss_mcrmse(df_regression_model_history, 'MCRMSE')

  # Evaluate on the test set; index 0 is read as the loss and index 1 as MCRMSE.
  score_regression = evaluate_test_labels(regression_with_bert)

  # Predict on the test set.
  df_pred = predict_test_labels(regression_with_bert)
  # NOTE(review): the return value of scaled_pred was never used. The call is
  # kept in case it adds scaled columns to df_pred in place -- TODO confirm.
  scaled_pred(df_pred)
  df_pred.to_csv('df_pred.csv', index=False)

  # Build the final table once; generate_final_table already prints the
  # header and displays the table.
  df_final = generate_final_table(df_pred)
  df_final.to_csv('df_final.csv', index=False)

  # ========== Performance metrics summary ==================================
  # Metrics of the last epoch, read by column position as
  # 0 = train loss, 1 = train MCRMSE, 2 = val loss, 3 = val MCRMSE.
  # .iloc keeps the lookup positional whatever the column labels are.
  last_epoch = df_regression_model_history.iloc[-1]
  perf_metrics = pd.DataFrame({'NLP Model': "bert-base-cased",
                               'Num_Trainable_layers': num_train_layers,
                               'Epochs': epochs,
                               'Test_MCRMSE': round(score_regression[1], 4),
                               'Test_Loss': round(score_regression[0], 4),
                               'Train_MCRMSE': round(last_epoch.iloc[1], 4),
                               'Train_Loss': round(last_epoch.iloc[0], 4),
                               'Val_MCRMSE': round(last_epoch.iloc[3], 4),
                               'Val_Loss': round(last_epoch.iloc[2], 4),
                               'Optimizer': optimizer,
                               'Activation': activation,
                               'Learning_Rate': learning_rate,
                               'Num_Hidden_Layers': num_hidden_layer,
                               'Num_hidden_Units': num_hidden_units,
                               'Dropout': dropout,
                               'Batch_Size': batch_size}, index=[0])
  perf_metrics.to_csv(csv_filename, index=False)
  display(perf_metrics.reset_index(drop=True))

In [None]:
# Experiment 1: num_train_layers = 0, 6, 12 (np.arange(0, 13, 6)), relu + adam.
# Arguments not listed here are left at the function's defaults.
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 6),
    activation='relu',
    optimizer='adam',
    csv_filename='perf_summary_regression_w_BERT_1.csv',
)

In [None]:
# Experiment 2: num_train_layers = 0, 2, ..., 12 (np.arange(0, 13, 2)) with
# every hyperparameter spelled out explicitly.
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 2),
    # regression head
    num_hidden_layer=2,
    num_hidden_units=64,
    dropout=0.1,
    activation='relu',
    # training
    optimizer='adam',
    learning_rate=0.00001,
    batch_size=16,
    epochs=10,
    # output
    csv_filename='perf_summary_regression_w_BERT_2.csv',
)

In [None]:
# num_train_layers = 0, 2, ..., 12 (np.arange(0, 13, 2)); every other argument
# is left at the function's default.
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 2),
    csv_filename='perf_summary_regression_w_BERT_5.csv',
)

In [None]:
# num_train_layers = 0, 6, 12 with batch size 16, relu + adam.
# num_hidden_layer, num_hidden_units, dropout and learning_rate are not passed,
# so the function defaults apply (this cell previously carried them commented
# out as 1, 256, 0.3 and 0.00005).
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 6),
    activation='relu',
    optimizer='adam',
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_4.csv',
)

In [None]:
# num_train_layers = 0, 6, 12 with batch size 16, gelu + adam.
# num_hidden_layer, num_hidden_units, dropout and learning_rate are not passed,
# so the function defaults apply (this cell previously carried them commented
# out as 1, 256, 0.3 and 0.00005).
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 6),
    activation='gelu',
    optimizer='adam',
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_batch16_gelu.csv',
)

In [None]:
# num_train_layers = 0, 2, 4, 6 (np.arange(0, 7, 2)) with batch size 16,
# leaky_relu + adam.
# num_hidden_layer, num_hidden_units, dropout and learning_rate are not passed,
# so the function defaults apply (this cell previously carried them commented
# out as 1, 256, 0.3 and 0.00005).
run_regression_experiment(
    num_train_layers=np.arange(0, 7, 2),
    activation='leaky_relu',
    optimizer='adam',
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_batch16_leakyRelu.csv',
)

In [None]:
# num_train_layers = 2, 4 (np.arange(2, 5, 2)) with dropout 0.1, batch size 16,
# relu + adam. An earlier variant of this cell swept np.arange(0, 9, 2).
# num_hidden_layer, num_hidden_units and learning_rate are not passed, so the
# function defaults apply.
#
# The results file is named after this run's settings. The cell used to write
# to 'perf_summary_regression_w_BERT_batch16_leakyRelu.csv', which is the
# output of the leaky_relu experiment, so running both cells overwrote that
# file with relu results.
# NOTE(review): confirm that relu (and not leaky_relu) is the intended
# activation for this run.
run_regression_experiment(
    num_train_layers=np.arange(2, 5, 2),
    dropout=0.1,
    activation='relu',
    optimizer='adam',
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_batch16_relu_dropout01.csv',
)

In [None]:
# num_train_layers = 0, 2, 4, 6 with dropout 0.2, batch size 16,
# leaky_relu + adam.
# num_hidden_layer, num_hidden_units and learning_rate are not passed, so the
# function defaults apply.
# NOTE(review): the filename ends in '_sgd' but optimizer='adam' is passed --
# confirm which of the two was intended.
run_regression_experiment(
    num_train_layers=np.arange(0, 7, 2),
    dropout=0.2,
    activation='leaky_relu',
    optimizer='adam',
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_batch16_leakyRelu_sgd.csv',
)

In [None]:
 # num_train_layers = 0, 2, 4, 6 with dropout 0.2, learning rate 0.0001,
 # batch size 16, leaky_relu + adam.
 # num_hidden_layer and num_hidden_units are not passed, so the function
 # defaults apply.
 # NOTE(review): the filename says 'lowerLR', yet 0.0001 is larger than the
 # 0.00005 default of run_regression_experiment_1 -- confirm the label.
 run_regression_experiment(
     num_train_layers=np.arange(0, 7, 2),
     dropout=0.2,
     activation='leaky_relu',
     optimizer='adam',
     learning_rate=0.0001,
     batch_size=16,
     csv_filename='perf_summary_regression_w_BERT_batch16_leakyRelu_lowerLR.csv',
 )

In [None]:
# num_train_layers = 0, 4, 8, 12 (np.arange(0, 13, 4)) with 128 hidden units,
# dropout 0.2, learning rate 0.0005, batch size 16, leaky_relu + adam.
# num_hidden_layer is not passed, so the function default applies.
# NOTE(review): the filename says 'lowerLR2' while learning_rate is 0.0005 --
# confirm the label matches the run.
run_regression_experiment(
    num_train_layers=np.arange(0, 13, 4),
    num_hidden_units=128,
    dropout=0.2,
    activation='leaky_relu',
    optimizer='adam',
    learning_rate=0.0005,
    batch_size=16,
    csv_filename='perf_summary_regression_w_BERT_batch16_leakyRelu_lowerLR2.csv',
)

In [None]:
 # num_train_layers = 0, 4, 8, 12 (np.arange(0, 13, 4)) with 64 hidden units,
 # dropout 0.2, learning rate 0.0005, batch size 16, leaky_relu + adam.
 # num_hidden_layer is not passed, so the function default applies.
 # NOTE(review): the filename ends in '128' but num_hidden_units=64 is passed
 # -- confirm which of the two was intended.
 run_regression_experiment(
     num_train_layers=np.arange(0, 13, 4),
     num_hidden_units=64,
     dropout=0.2,
     activation='leaky_relu',
     optimizer='adam',
     learning_rate=0.0005,
     batch_size=16,
     csv_filename='perf_summary_regression_w_BERT_batch16_leakyRelu_lowerLR128.csv',
 )

In [None]:
# Single run with num_train_layers=12 and every hyperparameter spelled out
# explicitly.
run_regression_experiment_1(
    num_train_layers=12,
    # regression head
    num_hidden_layer=2,
    num_hidden_units=64,
    dropout=0.1,
    activation='relu',
    # training
    optimizer='adam',
    learning_rate=0.00001,
    batch_size=16,
    epochs=10,
    # output
    csv_filename='perf_summary_regression_w_BERT_final1.csv',
)