<a href="https://colab.research.google.com/github/srilamaiti/spring_2023_w266_final_project_heesuk_iris_srila/blob/main/srila/ELL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install New Libraries**

In [8]:
!pip install nltk
!pip install wordcloud
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Importing Libraries**

In [9]:
#NLP related libraries
import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
from transformers import BertTokenizer, TFBertModel
hf_logging.set_verbosity_error()
import nltk
from nltk.tokenize import sent_tokenize
import spacy      
from spacy import displacy
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from wordcloud import ImageColorGenerator
from sklearn.model_selection import train_test_split
nltk.download('punkt')

# Other required libraries
import os
import pandas as pd
import numpy as np
import re
import copy
import sys
import warnings
warnings.filterwarnings("ignore")

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.layer_utils import count_params
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.losses import mae
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.losses import binary_crossentropy
from keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.regularizers import l1
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from keras.models import load_model

transformers version: 4.27.1


RuntimeError: ignored

# **General Functions**

# **Set Config Parameters**

In [None]:
def set_config_param(seed = 99):
    """Seed the random number generators and apply notebook-wide settings.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed``,
    clears the Keras backend session, exports ``PYTHONHASHSEED`` and
    ``KAGGLE_CONFIG_DIR``, and tells pandas to display every column.

    Args:
        seed: Integer seed shared by all random number generators.
    """
    import random  # local import: the notebook's import cell does not provide it

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.backend.clear_session()
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this value only reaches processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/Kaggle"
    pd.set_option('display.max_columns', None)
    
    
set_config_param(20230214)

# **Count Plot**

In [None]:
def plot_count(df, labels):
    """Draw one count plot per label column on a 2 x 3 grid of subplots.

    Args:
        df: DataFrame holding the columns named in ``labels``.
        labels: Column names to plot; the fixed 2 x 3 grid has room for six.
    """
    sns.set_style('whitegrid')
    plt.figure(figsize=(18,10))
    for position, column in enumerate(labels, start=1):
        plt.subplot(2, 3, position)
        sns.countplot(x = column, data = df)

# **Adding Feature Columns**

In [None]:
def add_feature(df):
    """Add text statistics and per-label summary columns to ``df`` in place.

    Columns added: ``word_count``, ``sentence_count``, ``total_score``,
    ``full_text_len`` and, for every name in the module-level ``label_cols``,
    ``<label>_avg_score``, ``<label>_above_or_below_avg_flag``,
    ``<label>_median_score``, ``<label>_above_or_below_median_flag`` and
    ``<label>_rounded_val``.

    Args:
        df: DataFrame with a ``full_text`` column and the six score columns
            (cohesion, syntax, vocabulary, phraseology, grammar, conventions).

    Returns:
        The same DataFrame object, with the new columns attached.
    """
    # Replace newline, carriage-return and tab characters with a space.
    df['full_text'] = df["full_text"].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)

    # Word count, sentence count, total score and full text length.
    df['word_count'] = df['full_text'].apply(lambda x: len(x.split()))
    df['sentence_count'] = df['full_text'].apply(lambda x: len(sent_tokenize(x)))
    # Every term must come from df itself: mixing in a column of another frame
    # aligns on the index and yields NaN for rows that frame does not contain.
    df['total_score'] = df['cohesion'] + df['syntax'] + df['vocabulary'] + df['phraseology'] + df['grammar'] + df['conventions']
    df['full_text_len'] = df['full_text'].apply(len)

    # Mean / median score per label, a 0/1 flag telling whether each row is
    # strictly above that value, and the score rounded to a whole number.
    for label in label_cols:
        avg_score = np.mean(df[label])
        median_score = np.median(df[label])
        df[label + '_avg_score'] = avg_score
        df[label + '_above_or_below_avg_flag'] = np.where(df[label] > avg_score, 1, 0)
        df[label + '_median_score'] = median_score
        df[label + '_above_or_below_median_flag'] = np.where(df[label] > median_score, 1, 0)
        # np.round rounds halves to the nearest even integer (2.5 -> 2, 3.5 -> 4).
        df[label + '_rounded_val'] = np.round(df[label])

    return df

# **Getting Unique Values for Each Label**

In [None]:
def get_unique_values_for_labels(df, col_list):
    """Print a header followed by the distinct values of each listed column.

    Args:
        df: DataFrame to inspect.
        col_list: Names of the columns whose unique values are printed.
    """
    print('Unique Values in Each Metric:\n==================================================')
    report_lines = (f'{name}: {df[name].unique()}' for name in col_list)
    for line in report_lines:
        print(line)

# **Getting Value Counts for Each Label**

In [None]:
def get_value_counts_for_labels(df, col_list):
    """Print, for each listed column, its value counts in ascending order.

    Args:
        df: DataFrame to inspect.
        col_list: Names of the columns to summarise.
    """
    print('Counts for Each Metric:\n==================================================')
    for name in col_list:
        print(f"Column: {name}")
        ascending_counts = df[name].value_counts().sort_values()
        print(ascending_counts)
        print("*****")

# **Getting Histogram for a Column**

In [None]:
def get_histogram_for_col(df, col_list, col_name):
    """Plot the distribution of ``col_name`` once per label, coloured by that label.

    Args:
        df: DataFrame holding ``col_name`` and every column in ``col_list``.
        col_list: Label columns used as ``hue``; the fixed 2 x 3 grid has room for six.
        col_name: Numeric column drawn on the x axis (e.g. ``'word_count'``).
    """
    sns.set_style('whitegrid')
    plt.figure(figsize=(18,10))
    # Label the axis after the plotted column ('word_count' -> 'Word Count')
    # so that plots of other columns are not captioned 'Word Count'.
    x_label = col_name.replace('_', ' ').title()
    for idx, label in enumerate(col_list):
        plt.subplot(2, 3, idx+1)
        sns.histplot(x=col_name, hue = label, data = df)
        plt.xlabel(x_label, fontsize=10)

# **Getting Count Plot for Indicator Columns**

In [None]:
def get_count_plot_for_ind_col(df, search_string, col_list):
    """Draw side-by-side count plots of the indicator column of each label.

    Args:
        df: DataFrame holding the columns ``<label><search_string>``.
        search_string: Suffix appended to each label to find its indicator column.
        col_list: Label names; one subplot is drawn per label.
    """
    # squeeze=False always returns a 2-D array of axes, so a single label
    # works the same way as several.
    fig, axes = plt.subplots(1, len(col_list), figsize=(50,10), squeeze=False)
    for axis, label in zip(axes[0], col_list):
        sns.countplot(x = df[label + search_string], ax = axis)
        axis.set_title(label)

# **Getting Bar Plot for Each Label**

In [None]:
def get_barplot_for_each_label(df, col_list, search_string):
    """Bar-plot one summary score per label.

    The bar heights are the first de-duplicated row of the columns
    ``<label><search_string>``.

    Args:
        df: DataFrame holding the summary columns.
        col_list: Label names shown on the x axis.
        search_string: Column suffix; ``'_avg_score'`` and ``'_median_score'``
            additionally set the y label and the title.
    """
    summary_cols = [name + search_string for name in col_list]
    plt.figure(figsize=(8,5))
    bar_heights = np.array(df[summary_cols].drop_duplicates())[0]
    sns.barplot(x=col_list, y=bar_heights, palette='rocket')
    plt.xlabel('Scoring Metric', fontsize=12)

    # (y label, title) for the suffixes that have a caption.
    captions = {
        '_avg_score': ('Average Score', 'Average Score in Each Metric'),
        '_median_score': ('Median Score', 'Median Score in Each Metric'),
    }
    if search_string in captions:
        y_caption, title = captions[search_string]
        plt.ylabel(y_caption, fontsize=12)
        plt.title(title, fontsize=16)

# **Label Processing**

In [None]:
def get_cat_label_cols(col_list):
    """Return the given column names, each prefixed with ``'cat_'``."""
    prefixed = []
    for name in col_list:
        prefixed.append('cat_' + name)
    return prefixed

In [None]:
def apply_label_map(df, label_map, col_list):
    """Add a ``<col>_map`` column for every listed column, in place.

    Args:
        df: DataFrame to extend.
        label_map: Mapping passed to ``Series.map``.
        col_list: Names of the columns to translate.

    Returns:
        The same DataFrame object, with the mapped columns attached.
    """
    for source_col in col_list:
        mapped_values = df[source_col].map(label_map)
        df[source_col + '_map'] = mapped_values
    return df

In [None]:
def get_label_values(df, col_list):
    """Return one NumPy array per listed column, in the order given."""
    arrays = []
    for name in col_list:
        arrays.append(np.array(df[name]))
    return arrays

In [None]:
def get_label_dict(df, col_list_1, col_list_2):
    """Build a dict of label arrays keyed by ``col_list_2``.

    Args:
        df: DataFrame holding the columns named in ``col_list_1``.
        col_list_1: Columns whose values become the dict values.
        col_list_2: Keys, paired positionally with ``col_list_1``.

    Returns:
        ``{key: array}`` with one entry per positional pair.
    """
    label_arrays = get_label_values(df, col_list_1)
    return {key: values for key, values in zip(col_list_2, label_arrays)}

# **Plot Loss and Accuracy**

In [None]:
def plot_loss_accuracy(history, col_list):
    """Plot training vs. validation loss and accuracy, one column per output.

    The top row shows ``<col>_loss`` / ``val_<col>_loss`` and the bottom row
    ``<col>_accuracy`` / ``val_<col>_accuracy``.

    Args:
        history: Mapping or DataFrame indexed by metric name.
        col_list: Output names; the grid gets one column per name.
    """
    n_cols = len(col_list)
    if n_cols == 0:
        return  # nothing to plot; plt.subplots rejects a zero-width grid

    # squeeze=False keeps ax two-dimensional even for a single output.
    fig, ax = plt.subplots(2, n_cols, figsize=(16, 6), sharex='col', sharey='row', squeeze=False)
    fig.tight_layout(pad=5.0)

    # (metric suffix, title prefix) for the top and bottom row respectively.
    panels = (('_loss', 'Loss: '), ('_accuracy', 'Accuracy: '))
    for idx, col in enumerate(col_list):
        for row, (suffix, title_prefix) in enumerate(panels):
            axis = ax[row, idx]
            axis.plot(history[col + suffix], lw=2, color='darkgoldenrod')
            axis.plot(history['val_' + col + suffix], lw=2, color='indianred')
            axis.legend(['Train', 'Validation'], fontsize=5)
            axis.set_xlabel('Epochs', size=10)
            axis.set_title(title_prefix + col)

# **Read Input Files**

In [None]:
# Load the raw train / test CSV files from the current working directory.
input_train_df = pd.read_csv('train.csv')
input_test_df = pd.read_csv('test.csv')

# Map each half-point score 1.0, 1.5, ..., 5.0 to an integer class id 0..8.
float_labels, int_map_labels = np.arange(1, 5.5, 0.5), np.arange(9)
label_map = dict(zip(float_labels, int_map_labels))

# Map the whole-number scores 1..5 to 0..4.  zip() stops at the shorter
# sequence, so the sixth value of np.arange(6) is never used.
float_scaled_labels, int_scaled_labels = np.arange(1, 6, 1), np.arange(6)
label_scaled_map = dict(zip(float_scaled_labels, int_scaled_labels))

# Work on a copy so the frame read from disk stays untouched.
orig_train_df = copy.deepcopy(input_train_df)
orig_train_df.head()

"Map" columns are the original metric values re-encoded as integer class labels. The original metric columns range from 1 to 5 in steps of 0.5, which gives 9 levels in total; the map columns encode these levels as the integers 0 to 8.

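"Scaled" columns round the half-point scores (1.5, 2.5, 3.5 and 4.5) to the nearest integer, so they range from 1 to 5. Because `np.round` rounds halves to the nearest even integer, 1.5 → 2, 2.5 → 2, 3.5 → 4 and 4.5 → 4.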

In [None]:
label_map

In [None]:
label_scaled_map

# **Splitting the Data**

The original test data is very limited: it has only 3 records and no labels to evaluate against. We therefore repurposed the given train data and split it into train, validation and test sets.

In [None]:
# Shuffle the rows, then cut the frame into train / validation / test parts.
shuffle = np.random.permutation(np.arange(orig_train_df.shape[0]))
orig_train_df = orig_train_df.iloc[shuffle]
split=(0.8,0.1,0.1)
splits = np.multiply(len(orig_train_df), split).astype(int)

# Row positions where the train and validation parts end; the test part takes
# whatever is left after integer truncation.
train_end, val_end = splits[0], splits[0] + splits[1]

# Positional slices give the same boundaries as np.split(frame, [train_end, val_end])
# without routing a DataFrame through NumPy's array-splitting code, and
# .copy() makes the three frames independent, so that columns added later are
# not written into a slice of orig_train_df.
df_train = orig_train_df.iloc[:train_end].copy()
df_val = orig_train_df.iloc[train_end:val_end].copy()
df_test = orig_train_df.iloc[val_end:].copy()

In [None]:
# Take the label names from orig_train_df rather than df_train: this cell adds
# '<label>_map' columns to df_train, so reading df_train.columns[2:] on a
# second run would also pick up those derived columns.
label_cols = orig_train_df.columns[2:]
label_rounded_cols = [col + '_rounded_val' for col in label_cols]
label_map_cols = [col + '_map' for col in label_cols]
cat_label_cols = get_cat_label_cols(label_cols)

df_train = apply_label_map(df_train, label_map, label_cols)
df_test = apply_label_map(df_test, label_map, label_cols)
df_val = apply_label_map(df_val, label_map, label_cols)

In [None]:
label_cols, label_rounded_cols, label_map_cols, cat_label_cols

# **Adding Other Feature Columns**

In [None]:
df_train = add_feature(df_train)
df_test = add_feature(df_test)
df_val = add_feature(df_val)

# **EDA**

In [None]:
df_train.head()

In [None]:
for col in label_cols:
    print(df_train[col + '_rounded_val'].unique())

In [None]:
df_train.iloc[0]

In [None]:
df_test.head()

In [None]:
df_val.head()

In [None]:
df_train.shape, df_test.shape, df_val.shape

In [None]:
df_train.describe()

In [None]:
df_train.columns

# **Unique Values for Each Label**

In [None]:
get_unique_values_for_labels(df_train, label_cols)

# **Unique Values for Each Rounded Label**

In [None]:
get_unique_values_for_labels(df_train, label_rounded_cols)

# **Value Counts for Each Label**

In [None]:
get_value_counts_for_labels(df_train, label_cols)

In [None]:
plot_count(df_train, label_cols)

# **Value Counts for Each Rounded Label**

In [None]:
get_value_counts_for_labels(df_train, label_rounded_cols)

In [None]:
plot_count(df_train, label_rounded_cols)

# **Full Text Length Stats**

In [None]:
df_train.full_text.str.len().describe()

# **Word Count Stats**

In [None]:
df_train.word_count.describe()

# **Sentence Count Stats**

In [None]:
df_train.sentence_count.describe()

In [None]:
df_train[df_train.sentence_count == 1][['full_text']]

# **Visualization**

# **Correlation Matrix of the Label Columns**

In [None]:
# Pairwise correlation between the label columns of the training split.
corr = df_train[label_cols].corr()

# Generate a mask for the upper triangle
# (np.triu keeps the diagonal, so only the cells below it are drawn).
mask = np.triu(np.ones_like(corr, dtype = bool))

# NOTE: sns.set changes settings globally, so figures created by later cells
# are affected as well.
sns.set(rc = {"figure.figsize": (10, 8)})

# Annotate every visible cell with its coefficient, five decimals.
sns.heatmap(corr, 
            annot = True, 
            cmap = "coolwarm", 
            mask = mask,
            fmt  = ".5f")
plt.show()

# **Essay Length, Word Count, Total Score, Sentence Count Distribution**

Essay length, word count and sentence count distributions have a roughly normal shape, though left skewed. The total score distribution looks bimodal.

In [None]:
# (column, caption) for each panel; the caption is used as both the x label
# and the title.
distribution_panels = [
    ('word_count', 'Word Count Distribution'),
    ('full_text_len', 'Full Text Length Distribution'),
    ('total_score', 'Total Score Distribution'),
    ('sentence_count', 'Sentence Count Distribution'),
]

plt.figure(figsize=(20,5))

for panel_position, (panel_column, panel_caption) in enumerate(distribution_panels, start=1):
    plt.subplot(1, 4, panel_position)
    sns.histplot(data=df_train, x=panel_column, kde=True)
    # Red line: mean.  Black line: median.
    plt.axvline(x=df_train[panel_column].mean(),color='red')
    plt.axvline(x=df_train[panel_column].median(),color='black')
    plt.xlabel(panel_caption,fontsize=12)
    plt.title(panel_caption,fontsize=16)

# **Word Count Vs Individual Label Scores**

In [None]:
get_histogram_for_col(df_train, label_cols, 'word_count')

# **Word Count Vs Individual Rounded Label Scores**

In [None]:
get_histogram_for_col(df_train, label_rounded_cols, 'word_count')

# **Sentence Count Vs Individual Label Scores**

In [None]:
get_histogram_for_col(df_train, label_cols, 'sentence_count')

# **Sentence Count Vs Individual Rounded Label Scores**

In [None]:
get_histogram_for_col(df_train, label_rounded_cols, 'sentence_count')

# **Distribution of Labels Above and Below Average in the Respective Category**

In [None]:
get_count_plot_for_ind_col(df_train, '_above_or_below_avg_flag', label_cols)

# **Distribution of Labels Above and Below Median in the Respective Category**

In [None]:
get_count_plot_for_ind_col(df_train, '_above_or_below_median_flag', label_cols)

For all the labels we see that most of the label values are below average and median values.

# **Distribution of Labels**

In [None]:
fig, ax = plt.subplots(1, len(label_cols), figsize=(40,10))

for idx, label in enumerate(label_cols):
    # sns.distplot is deprecated; a density-normalised histogram with a KDE
    # overlay is its documented replacement.
    sns.histplot(x = df_train[label],
                 kde = True,
                 stat = "density",
                 ax = ax[idx]
                )
    ax[idx].set_title(label)

# **Average Score Per Label**

In [None]:
get_barplot_for_each_label(df_train, label_cols, '_avg_score')

# **Median Score Per Label**

In [None]:
get_barplot_for_each_label(df_train, label_cols, '_median_score')

# **Most Frequent Words**

In [None]:
# Word cloud of the best-scored essay in the training split.  idxmax returns
# the first row holding the highest total, i.e. the first essay scoring 30
# when one exists, and still yields an essay when the split has none
# (indexing an empty selection with [0] would raise IndexError).
total_scores = df_train[label_cols].sum(axis=1)
text = df_train.loc[total_scores.idxmax(), 'full_text']
word_cloud = WordCloud(stopwords=STOPWORDS, colormap='Pastel1', collocations=False, width=1200, height=700, background_color = "black").generate(text)
plt.figure(figsize=(20,8))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
     

# **Model Building**

In [None]:
#make it easier to use a variety of BERT subword models
model_checkpoint = 'bert-base-cased'   # case sensitive (care about upper and lower case)
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
MAX_LENGTH = 512

# **Generate Input Embeddings - Train/Validation/Test Set**

In [None]:
# Essay texts and the raw (float) label matrices for each split.
X_train, X_val, X_test = df_train['full_text'], df_val['full_text'], df_test['full_text']
y_train, y_val, y_test = np.array(df_train[label_cols]), np.array(df_val[label_cols]), np.array(df_test[label_cols])

# "Map" labels: the '<label>_map' columns keep one class per original score level.
# The *_combined dicts hold one label array per output, keyed by the 'cat_<label>' names.
y_train_map, y_val_map, y_test_map = np.array(df_train[label_map_cols]), np.array(df_val[label_map_cols]), np.array(df_test[label_map_cols])
y_train_map_combined = get_label_dict(df_train, label_map_cols, cat_label_cols)
y_test_map_combined = get_label_dict(df_test, label_map_cols, cat_label_cols)
y_val_map_combined = get_label_dict(df_val, label_map_cols, cat_label_cols)

# "Scaled" labels: the '<label>_rounded_val' columns hold the scores rounded to whole numbers.
# NOTE(review): add_feature builds them with np.round, which rounds halves to the nearest
# even integer, so 1.5 -> 2, 2.5 -> 2, 3.5 -> 4 and 4.5 -> 4 (not 1.5 -> 2, 2.5 -> 3, ...).
y_train_scaled, y_val_scaled, y_test_scaled = np.array(df_train[label_rounded_cols]), np.array(df_val[label_rounded_cols]), np.array(df_test[label_rounded_cols])
y_train_scaled_combined = get_label_dict(df_train, label_rounded_cols, cat_label_cols)
y_test_scaled_combined = get_label_dict(df_test, label_rounded_cols, cat_label_cols)
y_val_scaled_combined = get_label_dict(df_val, label_rounded_cols, cat_label_cols)

In [None]:
y_train_scaled_combined

In [None]:
y_train_map_combined

In [None]:
# Tokenizer settings shared by the three splits.  padding='max_length' pads
# every essay to exactly MAX_LENGTH tokens; padding=True would only pad to the
# longest essay of each call, which need not equal MAX_LENGTH.
tokenizer_kwargs = dict(dtype="int32",
                        truncation=True,
                        padding='max_length',
                        max_length=MAX_LENGTH,
                        return_tensors='tf')

train_encodings = bert_tokenizer(X_train.tolist(), **tokenizer_kwargs)
val_encodings = bert_tokenizer(X_val.tolist(), **tokenizer_kwargs)
test_encodings = bert_tokenizer(X_test.tolist(), **tokenizer_kwargs)

In [None]:
classification_loss_dict = {label : tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False) for label in cat_label_cols}
classification_loss_dict

In [None]:
classification_metrics_dict = {label : 'accuracy' for label in cat_label_cols}
classification_metrics_dict

In [None]:
regression_loss_dict = {label : 'huber_loss' for label in label_cols}
regression_loss_dict

In [None]:
regression_metrics_dict = {label : tf.keras.metrics.RootMeanSquaredError() for label in label_cols}
regression_metrics_dict

In [None]:
def create_bert_model(checkpoint = model_checkpoint,
                      num_classes = 9,   # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
                      number_of_hidden_layer = 1,
                      hidden_layer_node_count = 256,
                      dropout = 0.3,
                      learning_rate = 0.00001,
                      trainable_flag = True,
                      retrain_layer_count = 999,#All layers are trainable
                      classification_regression_flag = 'C',
                      max_length = MAX_LENGTH):
    """
    Build and compile a six-output model on top of a pre-trained BERT encoder.

    The last hidden state of the first ([CLS]) token is passed through
    ``number_of_hidden_layer`` ReLU dense layers and one dropout layer, then
    into one output head per scoring metric (cohesion, syntax, vocabulary,
    phraseology, grammar, conventions).

    Args:
        checkpoint: Checkpoint name given to ``TFBertModel.from_pretrained``.
        num_classes: Units of each softmax head (classification only).
        number_of_hidden_layer: Dense layers between BERT and the heads; with 0
            the heads are attached directly to the [CLS] vector.
        hidden_layer_node_count: Units of each hidden dense layer.
        dropout: Rate of the dropout layer that follows the last hidden layer.
        learning_rate: Learning rate of the Adam optimizer.
        trainable_flag: ``False`` freezes the whole BERT model.
        retrain_layer_count: ``999`` fine-tunes all of BERT; any other value
            leaves only that many top encoder layers trainable.
        classification_regression_flag: ``'C'`` builds softmax heads named
            ``cat_<metric>`` compiled with the classification loss/metric
            dicts; ``'R'`` builds single-unit linear heads named ``<metric>``
            compiled with the regression loss/metric dicts.
        max_length: Length of the token sequences fed to the model.

    Returns:
        The compiled ``tf.keras.Model``.

    Raises:
        ValueError: If ``classification_regression_flag`` is not 'C' or 'R'.
    """
    if classification_regression_flag not in ('C', 'R'):
        raise ValueError("classification_regression_flag must be 'C' or 'R', got "
                         f"{classification_regression_flag!r}")

    set_config_param()
    bert_model = TFBertModel.from_pretrained(checkpoint, name = 'bert_model')
    if not trainable_flag:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False
    elif retrain_layer_count == 999:
        # Train all layers of the BERT model
        bert_model.trainable = True
    else:
        # Keep only the top `retrain_layer_count` encoder layers trainable.
        # NOTE(review): assumes the weight names of the encoder layers contain
        # 'layer_._<index>/' -- confirm with [w.name for w in bert_model.weights].
        # The trailing '/' stops 'layer_._1' from also matching 'layer_._11'.
        num_encoder_layers = bert_model.config.num_hidden_layers
        retrain_layers_list = ['layer_._' + str(num_encoder_layers - 1 - offset) + '/'
                               for offset in range(min(retrain_layer_count, num_encoder_layers))]
        print('retrain layers: ', retrain_layers_list)

        for weight in bert_model.weights:
            if not any(layer in weight.name for layer in retrain_layers_list):
                weight._trainable = False

    # Input layer
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # First element of the BERT output is the sequence of hidden states; take
    # the vector at position 0 (the [CLS] token) as the essay representation.
    bert_out = bert_model(bert_inputs)
    cls_token = bert_out[0][:, 0, :]

    # Hidden stack.  `features` starts as the [CLS] vector so the heads still
    # have an input when number_of_hidden_layer is 0.
    features = cls_token
    for hidden_layer_number in range(number_of_hidden_layer):
        features = tf.keras.layers.Dense(units = hidden_layer_node_count
                                       , activation = 'relu'
                                       , name = 'hidden_layer_' + str(hidden_layer_number + 1)
                                        )(features)

    if number_of_hidden_layer > 0:
        features = tf.keras.layers.Dropout(dropout, name = 'dropout_layer')(features)

    metric_names = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    if classification_regression_flag == 'C':
        # One softmax head per metric; the names match the keys of the
        # classification loss / metric dicts.
        outputs = [tf.keras.layers.Dense(num_classes, activation = 'softmax', name = 'cat_' + metric)(features)
                   for metric in metric_names]
        loss, metrics = classification_loss_dict, classification_metrics_dict
    else:
        # One linear unit per metric; a regression loss needs a scalar
        # prediction, and the names must match the keys of the regression
        # loss / metric dicts for compile() to pair them with the outputs.
        outputs = [tf.keras.layers.Dense(1, activation = 'linear', name = metric)(features)
                   for metric in metric_names]
        loss, metrics = regression_loss_dict, regression_metrics_dict

    model = tf.keras.Model(inputs = [input_ids,
                                     token_type_ids,
                                     attention_mask
                                    ],
                           outputs = outputs
                          )
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
                  loss = loss,
                  metrics = metrics
                 )
    return model

# **"bert_classification_model_1" is the baseline classification model with multilabel multi output(1..9). We are retraining the pre-trained model.**

In [None]:
bert_classification_model_1 = create_bert_model(checkpoint = model_checkpoint,
                                                num_classes = 9,   # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
                                                number_of_hidden_layer = 1,
                                                hidden_layer_node_count = 128,
                                                dropout = 0.3,
                                                learning_rate = 0.00001,
                                                trainable_flag = True,
                                                retrain_layer_count = 999,#All layers trainable
                                                classification_regression_flag = 'C',
                                                max_length = MAX_LENGTH
                                              )

# summary() prints the table itself and returns None, so it is not wrapped in print().
bert_classification_model_1.summary()

In [None]:
keras.utils.plot_model(bert_classification_model_1, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90)

In [None]:
# Token ids, segment ids and attention masks, in the order of the model inputs.
train_inputs_model_1 = [train_encodings.input_ids,
                        train_encodings.token_type_ids,
                        train_encodings.attention_mask]
val_inputs_model_1 = [val_encodings.input_ids,
                      val_encodings.token_type_ids,
                      val_encodings.attention_mask]

bert_classification_model_1_history = bert_classification_model_1.fit(
    train_inputs_model_1,
    y_train_map_combined,
    validation_data = (val_inputs_model_1, y_val_map_combined),
    batch_size = 8,
    epochs = 5
)

# One row per epoch; transposed for display so that each metric is a row.
bert_classification_model_1_history_df = pd.DataFrame(bert_classification_model_1_history.history)
bert_classification_model_1_history_df.T

In [None]:
score_classification_model_1 = bert_classification_model_1.evaluate([test_encodings.input_ids,
                                                                     test_encodings.token_type_ids,
                                                                     test_encodings.attention_mask
                                                                    ],
                                                                    y_test_map_combined
                                                                   )

# For a multi-output model evaluate() returns the total loss followed by the
# per-output losses and metrics, so position 1 is a loss, not an accuracy.
# Pair the values with their names instead of relying on positions.
named_scores_classification_model_1 = dict(zip(bert_classification_model_1.metrics_names,
                                               score_classification_model_1))

print('Test loss:', named_scores_classification_model_1['loss'])
for metric_name, metric_value in named_scores_classification_model_1.items():
    if metric_name.endswith('accuracy'):
        print(f'Test {metric_name}:', metric_value)

In [None]:
head_probabilities_model_1 = bert_classification_model_1.predict([test_encodings.input_ids,
                                                                  test_encodings.token_type_ids,
                                                                  test_encodings.attention_mask
                                                                 ]
                                                                )
# predict() returns one softmax array per output head; stacking them gives an
# array indexed as (head, sample, class).  The values are probabilities, so
# clipping them to [0, 8] changes nothing.
predictions_classification_model_1 = np.asarray(head_probabilities_model_1)

# The predicted class of every head / sample is the index of the largest probability.
predicted_classes_classification_model_1 = predictions_classification_model_1.argmax(axis=-1)
predicted_classes_classification_model_1

In [None]:
plot_loss_accuracy(bert_classification_model_1_history_df, cat_label_cols)

# **"bert_classification_model_2" is the baseline classification model with multilabel multi output. We are freezing the pre-trained model.**

In [None]:
bert_classification_model_2 = create_bert_model(checkpoint = model_checkpoint,
                                                num_classes = 9,   # [1, 1.5, 2, 2.5....4.5, 5]: 9 classes
                                                number_of_hidden_layer = 1,
                                                hidden_layer_node_count = 128,
                                                dropout = 0.3,
                                                learning_rate = 0.00001,
                                                trainable_flag = False,     # freeze the pre-trained BERT model
                                                retrain_layer_count = 999,  # not used when trainable_flag is False
                                                classification_regression_flag = 'C',
                                                max_length = MAX_LENGTH
                                              )

# summary() prints the table itself and returns None, so it is not wrapped in print().
bert_classification_model_2.summary()

In [None]:
keras.utils.plot_model(bert_classification_model_2, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90)

In [None]:
# Token ids, segment ids and attention masks, in the order of the model inputs.
train_inputs_model_2 = [train_encodings.input_ids,
                        train_encodings.token_type_ids,
                        train_encodings.attention_mask]
val_inputs_model_2 = [val_encodings.input_ids,
                      val_encodings.token_type_ids,
                      val_encodings.attention_mask]

bert_classification_model_2_history = bert_classification_model_2.fit(
    train_inputs_model_2,
    y_train_map_combined,
    validation_data = (val_inputs_model_2, y_val_map_combined),
    batch_size = 8,
    epochs = 5
)

# One row per epoch; transposed for display so that each metric is a row.
bert_classification_model_2_history_df = pd.DataFrame(bert_classification_model_2_history.history)
bert_classification_model_2_history_df.T

In [None]:
score_classification_model_2 = bert_classification_model_2.evaluate([test_encodings.input_ids,
                                                                     test_encodings.token_type_ids,
                                                                     test_encodings.attention_mask
                                                                    ],
                                                                    y_test_map_combined
                                                                   )

# For a multi-output model evaluate() returns the total loss followed by the
# per-output losses and metrics, so position 1 is a loss, not an accuracy.
# Pair the values with their names instead of relying on positions.
named_scores_classification_model_2 = dict(zip(bert_classification_model_2.metrics_names,
                                               score_classification_model_2))

print('Test loss:', named_scores_classification_model_2['loss'])
for metric_name, metric_value in named_scores_classification_model_2.items():
    if metric_name.endswith('accuracy'):
        print(f'Test {metric_name}:', metric_value)

In [None]:
# Run inference on the test encodings and bound every returned value to the
# closed interval [0, 8] in one step.
# NOTE(review): the clip only has an effect if the model emits values on the
# 0..8 label scale rather than per-class probabilities -- confirm against
# create_bert_model.
predictions_classification_model_2 = np.clip(
    bert_classification_model_2.predict(
        [
            test_encodings.input_ids,
            test_encodings.token_type_ids,
            test_encodings.attention_mask,
        ]
    ),
    0,
    8,
)
predictions_classification_model_2

In [None]:
# Draw the training curves for model 2 from its per-epoch history frame.
# plot_loss_accuracy is a helper defined elsewhere in this notebook; presumably
# cat_label_cols selects the label columns to plot -- confirm against its definition.
plot_loss_accuracy(
    bert_classification_model_2_history_df,
    cat_label_cols,
)

# **"bert_classification_model_3" is the baseline classification model with multi-label, multi-output targets (1..5). We unfreeze the pre-trained model and train with the scaled output levels.**

In [None]:
# Build model 3 through the notebook's create_bert_model helper. This variant
# passes trainable_flag = True, so the pre-trained checkpoint is fine-tuned
# instead of being kept frozen.
bert_classification_model_3 = create_bert_model(
    checkpoint = model_checkpoint,
    # NOTE(review): this value was previously annotated as nine half-point
    # levels ([1, 1.5, ..., 4.5, 5]), which does not match the 5 passed here.
    # The value is left unchanged -- confirm it agrees with the encoding of the
    # y_*_scaled_combined targets used for training.
    num_classes = 5,
    number_of_hidden_layer = 1,
    hidden_layer_node_count = 128,
    dropout = 0.3,
    learning_rate = 0.00001,
    trainable_flag = True,
    retrain_layer_count = 999,  # large sentinel, intended to make all layers trainable
    classification_regression_flag = 'C',
    max_length = MAX_LENGTH,
)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the cell output.
bert_classification_model_3.summary()

In [None]:
# Render the layer graph of model 3: layer names only, no shapes or dtypes.
keras.utils.plot_model(
    bert_classification_model_3,
    dpi = 90,
    show_layer_names = True,
    show_shapes = False,
    show_dtype = False,
)

In [None]:
# Train model 3 on the training encodings against the scaled targets, with the
# validation split supplied for per-epoch monitoring.
bert_classification_model_3_history = bert_classification_model_3.fit(
    [
        train_encodings.input_ids,
        train_encodings.token_type_ids,
        train_encodings.attention_mask,
    ],
    y_train_scaled_combined,
    epochs = 5,
    batch_size = 8,
    validation_data = (
        [
            val_encodings.input_ids,
            val_encodings.token_type_ids,
            val_encodings.attention_mask,
        ],
        y_val_scaled_combined,
    ),
)

# Keep the recorded history as a DataFrame (the plotting cell reads it) and
# display it transposed so each recorded metric becomes a row.
bert_classification_model_3_history_df = pd.DataFrame(bert_classification_model_3_history.history)
bert_classification_model_3_history_df.T

In [None]:
# Score model 3 on the held-out test split. The three encoder inputs are passed
# in the same order as in the training call: ids, token type ids, attention mask.
score_classification_model_3 = bert_classification_model_3.evaluate(
    [
        test_encodings.input_ids,
        test_encodings.token_type_ids,
        test_encodings.attention_mask,
    ],
    y_test_scaled_combined,
)

# evaluate() returns its values in the order given by `metrics_names`. Label
# every value with its own name instead of assuming that index 1 is accuracy:
# for a model with several outputs, the entries right after the total loss are
# per-output losses, and the accuracies only come after those.
# np.atleast_1d also covers the case where a single scalar loss is returned.
for metric_name, metric_value in zip(
    bert_classification_model_3.metrics_names,
    np.atleast_1d(score_classification_model_3),
):
    print(f'Test {metric_name}:', metric_value)

In [None]:
# Run inference on the test encodings and bound every returned value to the
# closed interval [1, 5] in one step.
# NOTE(review): if the model returns per-class probabilities in [0, 1], a lower
# bound of 1 would flatten every value to 1 -- confirm that the output is on
# the 1..5 score scale before relying on these predictions.
predictions_classification_model_3 = np.clip(
    bert_classification_model_3.predict(
        [
            test_encodings.input_ids,
            test_encodings.token_type_ids,
            test_encodings.attention_mask,
        ]
    ),
    1,
    5,
)
predictions_classification_model_3

In [None]:
# Draw the training curves for model 3 from its per-epoch history frame.
# plot_loss_accuracy is a helper defined elsewhere in this notebook; presumably
# cat_label_cols selects the label columns to plot -- confirm against its definition.
plot_loss_accuracy(
    bert_classification_model_3_history_df,
    cat_label_cols,
)