## Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import sklearn as sk
import matplotlib.pyplot as plt
from keras.preprocessing import text, sequence
import tensorflow_addons as tfa
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import datetime
from tqdm import tqdm
import gc 
import re
import time
np.set_printoptions(suppress=True)
from scipy.special import erfinv
import tensorflow_probability as tfp

 The versions of TensorFlow you are currently using is 2.8.2 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Load Data

In [3]:
data_df = pd.read_csv("DataSplit/train.tsv", delimiter="\t", dtype={0:'str',1:'str'})
test_df = pd.read_csv("DataSplit/test.tsv", delimiter="\t", dtype={0:'str',1:'str'})
val_df = pd.read_csv("DataSplit/test.tsv", delimiter="\t", dtype={0:'str',1:'str'})

# Tokenizing and generating Embeddings

In [None]:
EMBEDDING_FILE = 'GloVe/glove.840B.300d.txt'

In [None]:
def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

## Tokenizing data

In [4]:
max_features = 180000
maxlen = 400

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_df['Plot Summary'])

tokenized_train = tokenizer.texts_to_sequences(data_df['Plot Summary'])
train_dataset = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

tokenized_test = tokenizer.texts_to_sequences(test_df['Plot Summary'])
test_dataset = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

tokenized_val = tokenizer.texts_to_sequences(val_df['Plot Summary'])
val_dataset = sequence.pad_sequences(tokenized_val, maxlen=maxlen)

## Generating Embeds

In [None]:
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf8"))

In [None]:
embed_size = 300
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

#embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, 50))
embedding_matrix = np.zeros((nb_words, embed_size))

c = 0
for word, i in word_index.items():
    if i >= nb_words: continue
    c += 1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


## Cleaning up to save RAM

In [None]:
del all_embs
del embeddings_index
del tokenized_train
del tokenized_test
del tokenized_val
gc.collect()

191

# Utility Functions

In [5]:
def make_one_hot_separate_cols(no_classes, dataframe):
    """
    Given a dataframe with column containing one-hot-encoding of type 010010 
    (single column one hot vector). Splits it into multiple columns depending
    on number of classes specified (no_classes). Returns array containing
    no_classes number of cols and same number of rows as dataframe.
    """
    columns = [i for i in range(1,no_classes + 1)]
    genres = []
    for item in dataframe['Encoded Genres']:
        splits = []
        n = len(item)
        while n + len(splits) < no_classes:
            splits.append(0)
        for i in item:
            splits.append(int(i))
        genres.append(splits)
    separated = pd.DataFrame(genres, columns = columns)
    return separated.to_numpy()

def threshold_predictions(predictions, threshold):
    """
    Function to threshold predictions matrix and make values below threshold
    as 0 and values >= threshold as 1.
    """
    predictions[predictions >= threshold] = 1
    predictions[predictions < threshold] = 0
    return predictions

def Eval_MC_Dropouts(save_path, no_of_reps, X_test, Y_test, threshold_value, confidence_threshold, conf_perc_factor):
    """
    Runs over no_of_reps times to create different models using MC dropouts and
    predicts a different set of probabilites during each repetition.
    
    Returns average of all predictions taken over no_of_reps number of predict-
    ions, stanadard deviation across all predictions taken over no_of_reps num-
    ber of predictions and the F1 score for each model.
    """
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    f1_list = [] # array of shape no_of_reps x 1 (F1 score)
    all_prediction_array = np.zeros(shape = (no_of_reps, Y_test.shape[0], Y_test.shape[1])) # 3D array to store all no_of_reps number of different prediction values
    model = tf.keras.models.load_model(save_path)
    for i in tqdm(range(no_of_reps)):
        predictions = model.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        model_f1 = eval_model(thresholded_predictions, Y_test) # returns f1 of model
        f1_list.append(model_f1)
    avg_f1 = np.mean(np.array(f1_list), axis = 0)
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average stddev of all reps)
    now = datetime.datetime.now()
    name = 'MC_Values/'+str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)+"_"+str(no_of_reps)+"_" # file nomenclature
    pd.DataFrame(avg_prediction_array).to_csv(name+'average_preds.csv', index = False, header = None)
    pd.DataFrame(std_prediction_array).to_csv(name+'std_preds.csv', index = False, header = None) # saving prediction avg and std over no_of_reps
    # Fequentist calculation
    final_predictions_freq = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            avg = avg_prediction_array[i,j]
            std = std_prediction_array[i,j]
            lower_lim = avg - conf_level_factor*std
            upper_lim = avg + conf_level_factor*std
            if lower_lim > threshold_value and upper_lim > threshold_value:
                final_predictions_freq[i,j] = 1
            elif lower_lim < threshold_value and upper_lim < threshold_value:
                final_predictions_freq[i,j] = 0
            else: 
                flag = 1
        if flag == 0:
            thresholded_predictions_freq.append(final_predictions_freq[i])
            selected_Y_vals_freq.append(Y_test[i])
    selected_freq_preds_f1 = eval_model(np.array(thresholded_predictions_freq), np.array(selected_Y_vals_freq), printing = False)
    # Frequentist calculation ends
    avg_thresholded_predictions = threshold_predictions(avg_prediction_array, threshold_value)
    avg_pred_f1 = eval_model(avg_thresholded_predictions, Y_test, printing = False) # returns f1 of Averaged predictions from no_of_reps models
    avg_pred_precision = precision_score(avg_thresholded_predictions, Y_test, average='micro')
    avg_pred_recall = recall_score(avg_thresholded_predictions, Y_test, average='micro')
    all_prediction_array_thresholded = threshold_predictions(all_prediction_array, threshold_value)
    confidence_matrix = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    final_predictions_maj_vote = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    print(all_prediction_array_thresholded.shape)
    for i in range(Y_test.shape[0]):
        for j in range(Y_test.shape[1]):
            no_of_zeros = 0
            no_of_ones = 0
            for k in range(no_of_reps):
                if all_prediction_array_thresholded[k,i,j] == 0:
                    no_of_zeros += 1
                elif all_prediction_array_thresholded[k,i,j] == 1:
                    no_of_ones += 1
            if no_of_ones > no_of_zeros:
                confidence_matrix[i, j] = no_of_ones/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 1
            else:
                confidence_matrix[i, j] = no_of_zeros/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 0
    thresholded_confidence_predictions = []
    selected_ground_truth_values = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            if confidence_matrix[i, j] < confidence_threshold:
                flag = 1
        if flag == 0:
            thresholded_confidence_predictions.append(final_predictions_maj_vote[i])
            selected_ground_truth_values.append(Y_test[i])
    avg_f1_conf_thresholded = eval_model(np.array(thresholded_confidence_predictions), np.array(selected_ground_truth_values), printing = False)

    f1_maj_vote = eval_model(np.array(final_predictions_maj_vote), Y_test, printing = False)

    print("Max F1 score of all reps: "+str(max(f1_list)))
    print("Average F1 score of all reps: "+str(avg_f1))
    print("F1 Score from ALL Models' predictions AVERAGED: "+str(avg_pred_f1))
    print("Precision Score from ALL Models' predictions AVERAGED: "+str(avg_pred_precision))
    print("Recall Score from ALL Models' predictions AVERAGED: "+str(avg_pred_recall))
    print("-----\nApplying confidence thresholding - F1 score of selected predictions: "+str(avg_f1_conf_thresholded))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_ground_truth_values).shape[0]))
    print("-----\nApplying frequentist methods - F1 score of selected predictions: "+str(selected_freq_preds_f1))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_Y_vals_freq).shape[0]))
    print("Majority vote F1:"+str(f1_maj_vote))
    return avg_prediction_array, std_prediction_array, f1_list

def eval_model(predictions, truths, printing = True):
    """
    Function to evaluate a single models' predictions. Prints F1, precision and 
    recall score of model on the test set. returns model F1 score.
    """
    if type(truths) is not np.ndarray:
        truths = np.array(truths)
    if type(predictions) is not np.ndarray:
        predictions = np.array(predictions)
    model_f1 = f1_score(predictions, truths, average='micro')
    model_precision = precision_score(predictions, truths, average='micro')
    model_recall = recall_score(predictions, truths, average='micro')
    if printing:
        print("\nModel F1 on test data is: "+str(model_f1))
        print("Model Precision on test data is: "+str(model_precision))
        print("Model Recall on test data is: "+str(model_recall))
    return model_f1

def train_model_ensembling(X_train, Y_train, save_folder, no_of_reps = 10, epochs_per_model = 7):
    """
    Takes X_train, Y_train, Saved models folder, no_of_reps and epochs per model
    as arguments and trains no_of_reps models and saves them into the appropria-
    te folder. Numbered from 1 - no_of_reps. NOTE: get_model() function must be
    defined with the appropriate model to be ensembled.
    """
    for i in tqdm(range(no_of_reps)):
        model_itr = get_model()
        model_itr.fit(X_train, Y_train, batch_size = 128, validation_data = (val_dataset, Y_val), epochs = epochs_per_model, workers = 4, use_multiprocessing = True)
        path = str(save_folder)+str(i+1)+"_EN_BiLSTM.h5" # file nomenclature
        model_itr.save(path)
    return

def eval_model_ensembling(save_folder, no_of_reps, X_test, Y_test, threshold_value, confidence_threshold, conf_perc_factor):
    """
    Evaluates ensembled models stored in save_folder path. Loads each model num-
    bered from 1-no_of_reps and predicts no_of_reps number of different output 
    predictions.

    Returns average of all predictions taken over no_of_reps number of predict-
    ions, stanadard deviation across all predictions taken over no_of_reps num-
    ber of predictions and the F1 score for each model.
    """
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    f1_list = [] # array of shape no_of_reps x 1 (F1 score)
    all_prediction_array = np.zeros(shape = (no_of_reps, Y_test.shape[0], Y_test.shape[1])) # 3D array to store all no_of_reps number of different prediction values
    for i in tqdm(range(no_of_reps)):
        path = str(save_folder)+str(i+1)+"_EN_BiLSTM.h5"
        model_itr = tf.keras.models.load_model(path)
        predictions = model_itr.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        model_f1 = eval_model(thresholded_predictions, Y_test) # returns f1 of model
        f1_list.append(model_f1)
    avg_f1 = np.mean(np.array(f1_list), axis = 0)
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average stddev of all reps)
    now = datetime.datetime.now()
    name = 'EN_Values/'+str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)+"_"+str(no_of_reps)+"_" # file nomenclature
    pd.DataFrame(avg_prediction_array).to_csv(name+'average_preds.csv', index = False, header = None)
    pd.DataFrame(std_prediction_array).to_csv(name+'std_preds.csv', index = False, header = None) # saving prediction avg and std over no_of_reps
    # Fequentist calculation
    final_predictions_freq = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            avg = avg_prediction_array[i,j]
            std = std_prediction_array[i,j]
            lower_lim = avg - conf_level_factor*std
            upper_lim = avg + conf_level_factor*std
            if lower_lim > threshold_value and upper_lim > threshold_value:
                final_predictions_freq[i,j] = 1
            elif lower_lim < threshold_value and upper_lim < threshold_value:
                final_predictions_freq[i,j] = 0
            else: 
                flag = 1
        if flag == 0:
            thresholded_predictions_freq.append(final_predictions_freq[i])
            selected_Y_vals_freq.append(Y_test[i])
    selected_freq_preds_f1 = eval_model(np.array(thresholded_predictions_freq), np.array(selected_Y_vals_freq), printing = False)
    # Frequentist calculation ends
    avg_thresholded_predictions = threshold_predictions(avg_prediction_array, threshold_value)
    avg_pred_f1 = eval_model(avg_thresholded_predictions, Y_test) # returns f1 of Averaged predictions from no_of_reps models
    avg_pred_precision = precision_score(avg_thresholded_predictions, Y_test, average='micro')
    avg_pred_recall = recall_score(avg_thresholded_predictions, Y_test, average='micro')
    all_prediction_array_thresholded = threshold_predictions(all_prediction_array, threshold_value)
    confidence_matrix = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    final_predictions_maj_vote = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    print(all_prediction_array_thresholded.shape)
    for i in range(Y_test.shape[0]):
        for j in range(Y_test.shape[1]):
            no_of_zeros = 0
            no_of_ones = 0
            for k in range(no_of_reps):
                if all_prediction_array_thresholded[k,i,j] == 0:
                    no_of_zeros += 1
                elif all_prediction_array_thresholded[k,i,j] == 1:
                    no_of_ones += 1
            if no_of_ones > no_of_zeros:
                confidence_matrix[i, j] = no_of_ones/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 1
            else:
                confidence_matrix[i, j] = no_of_zeros/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 0
    thresholded_confidence_predictions = []
    selected_ground_truth_values = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            if confidence_matrix[i, j] < confidence_threshold:
                flag = 1
        if flag == 0:
            thresholded_confidence_predictions.append(final_predictions_maj_vote[i])
            selected_ground_truth_values.append(Y_test[i])
    avg_f1_conf_thresholded = eval_model(np.array(thresholded_confidence_predictions), np.array(selected_ground_truth_values), printing = False)

    f1_maj_vote = eval_model(np.array(final_predictions_maj_vote), Y_test, printing = False)

    print("Max F1 score of all reps: "+str(max(f1_list)))
    print("Average F1 score of all reps: "+str(avg_f1))
    print("F1 Score from ALL Models' predictions AVERAGED: "+str(avg_pred_f1))
    print("Precision Score from ALL Models' predictions AVERAGED: "+str(avg_pred_precision))
    print("Recall Score from ALL Models' predictions AVERAGED: "+str(avg_pred_recall))
    print("-----\nApplying confidence thresholding- F1 score of selected predictions: "+str(avg_f1_conf_thresholded))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_ground_truth_values).shape[0]))
    print("-----\nApplying frequentist methods - F1 score of selected predictions: "+str(selected_freq_preds_f1))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_Y_vals_freq).shape[0]))
    print("Majority vote F1:"+str(f1_maj_vote))
    return avg_prediction_array, std_prediction_array, f1_list

def single_EN_pred(save_folder, no_of_reps, X_test, Y_test, threshold_value, num_classes, maxlen_embedding, conf_perc_factor):
    """
    Takes a single training example as input along with no_of_reps to predict n-
    o_of_reps number of inferences 
    """
    if X_test.shape is not (1, maxlen_embedding):
        X_test = np.reshape(X_test, (1, maxlen_embedding))
    if Y_test.shape is not (1, num_classes):
        Y_test = np.reshape(Y_test, (1, num_classes))
    all_prediction_array = np.zeros(shape = (no_of_reps, 1, num_classes)) # 3D array to store all no_of_reps number of different prediction values
    confidence_count_zero = [0 for i in range(num_classes)]
    confidence_count_one = [0 for i in range(num_classes)]
    confidence_factor = [0 for i in range(num_classes)]
    final_prediction_MV = [0 for i in range(num_classes)]
    for i in tqdm(range(no_of_reps)):
        path = str(save_folder)+str(i+1)+"_EN_BiLSTM.h5"
        model_itr = tf.keras.models.load_model(path)
        predictions = model_itr.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        print(thresholded_predictions)
        for i in range(thresholded_predictions.shape[1]):
            if thresholded_predictions[0][i] == 0:
                confidence_count_zero[i] += 1
            else:
                confidence_count_one[i] += 1
    for i in range(num_classes):
        if confidence_count_zero[i] > confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_zero[i] / no_of_reps)*100
        elif confidence_count_zero[i] < confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_one[i] / no_of_reps)*100
            final_prediction_MV[i] = 1
        else:
            confidence_factor[i] =  50.0
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average stddev of all reps)
    # frequentist starts 
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    final_predictions_freq = np.zeros(shape = (1, num_classes))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    lower_lim_array = []
    upper_lim_array = []
    flag = 0
    for j in range(num_classes):
        avg = avg_prediction_array[0,j]
        std = std_prediction_array[0,j]
        lower_lim = avg - conf_level_factor*std
        upper_lim = avg + conf_level_factor*std
        lower_lim_array.append(lower_lim)
        upper_lim_array.append(upper_lim)
        if lower_lim > threshold_value and upper_lim > threshold_value:
            final_predictions_freq[0,j] = 1
        elif lower_lim < threshold_value and upper_lim < threshold_value:
            final_predictions_freq[0,j] = 0
        else: 
            flag = 1
    if flag == 0:
        thresholded_predictions_freq.append(final_predictions_freq[0])
    #frequentist ends
    avg_prediction_thresholded = threshold_predictions(avg_prediction_array, threshold_value)
    genres_list = ['Spy Fiction', 'Alternate History', 'Non Fiction', 'Adevnture Novel', 'Detective Fiction', 'Historical Fiction', 'Romance Novel', 'Horror', 'Thriller', 'Historical Novel', 'Crime Fiction', 'Suspense', 'Young Adult Literature', 'Mystery', 'Childrens Literature', 'Fantasy', 'Novel', 'Science Fiction', 'Speculative Fiction', 'Fiction']
    answer_list = []
    ground_truth_list = []
    for i in range(20):
        if avg_prediction_thresholded[0][i] == 1:
            answer_list.append(genres_list[i])
        if Y_test[0][i] == 1:
            ground_truth_list.append(genres_list[i])
    print("Majority Vote Prediction of all models: ")
    print(final_prediction_MV)
    print("Ground truth Value: ")
    print(Y_test)
    print("Ground truth class labels: ")
    print(ground_truth_list)
    print("Final prediction after confidence interval thresholding: ")
    print(final_predictions_freq)
    print("Confidence factors for each class: ")
    for i in range(num_classes):
        print("(",lower_lim_array[i],", ",upper_lim_array[i],")")
    if flag == 0:
        print("Prediction passes - CONSIDERED")
    else:
        print("prediction fails - IGNORED")
    return avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor

def single_MC_pred(save_path, no_of_reps, X_test, Y_test, threshold_value, num_classes, maxlen_embedding, conf_perc_factor):
    if X_test.shape is not (1, maxlen_embedding):
        X_test = np.reshape(X_test, (1, maxlen_embedding))
    if Y_test.shape is not (1, num_classes):
        Y_test = np.reshape(Y_test, (1, num_classes))
    all_prediction_array = np.zeros(shape = (no_of_reps, 1, num_classes)) # 3D array to store all no_of_reps number of different prediction values
    confidence_count_zero = [0 for i in range(num_classes)]
    confidence_count_one = [0 for i in range(num_classes)]
    confidence_factor = [0 for i in range(num_classes)]
    final_prediction_MV = [0 for i in range(num_classes)]
    loaded_model = tf.keras.models.load_model(save_path)
    for i in tqdm(range(no_of_reps)):
        predictions = loaded_model.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        print(thresholded_predictions)
        for i in range(thresholded_predictions.shape[1]):
            if thresholded_predictions[0][i] == 0:
                confidence_count_zero[i] += 1
            else:
                confidence_count_one[i] += 1
    for i in range(num_classes):
        if confidence_count_zero[i] > confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_zero[i] / no_of_reps)*100
        elif confidence_count_zero[i] < confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_one[i] / no_of_reps)*100
            final_prediction_MV[i] = 1
        else:
            confidence_factor[i] =  50.0
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average stddev of all reps)
    # frequentist starts 
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    final_predictions_freq = np.zeros(shape = (1, num_classes))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    lower_lim_array = []
    upper_lim_array = []
    flag = 0
    for j in range(num_classes):
        avg = avg_prediction_array[0,j]
        std = std_prediction_array[0,j]
        lower_lim = avg - conf_level_factor*std
        upper_lim = avg + conf_level_factor*std
        lower_lim_array.append(lower_lim)
        upper_lim_array.append(upper_lim)
        if lower_lim > threshold_value and upper_lim > threshold_value:
            final_predictions_freq[0,j] = 1
        elif lower_lim < threshold_value and upper_lim < threshold_value:
            final_predictions_freq[0,j] = 0
        else: 
            flag = 1
    if flag == 0:
        thresholded_predictions_freq.append(final_predictions_freq[0])
    #frequentist ends
    avg_prediction_thresholded = threshold_predictions(avg_prediction_array, threshold_value)
    genres_list = ['Spy Fiction', 'Alternate History', 'Non Fiction', 'Adevnture Novel', 'Detective Fiction', 'Historical Fiction', 'Romance Novel', 'Horror', 'Thriller', 'Historical Novel', 'Crime Fiction', 'Suspense', 'Young Adult Literature', 'Mystery', 'Childrens Literature', 'Fantasy', 'Novel', 'Science Fiction', 'Speculative Fiction', 'Fiction']
    answer_list = []
    ground_truth_list = []
    for i in range(20):
        if avg_prediction_thresholded[0][i] == 1:
            answer_list.append(genres_list[i])
        if Y_test[0][i] == 1:
            ground_truth_list.append(genres_list[i])
    print("Majority Vote Prediction of all models: ")
    print(final_prediction_MV)
    print("Ground truth Value: ")
    print(Y_test)
    print("Ground truth class labels: ")
    print(ground_truth_list)
    print("Final prediction after confidence interval thresholding: ")
    print(final_predictions_freq)
    print("Confidence factors for each class: ")
    for i in range(num_classes):
        print("(",lower_lim_array[i],", ",upper_lim_array[i],")")
    if flag == 0:
        print("Prediction passes - CONSIDERED")
    else:
        print("prediction fails - IGNORED")
    return avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor

# Preprocessing Labels to feed into network Training, val and test

In [6]:
Y_train = make_one_hot_separate_cols(20, data_df)
Y_val = make_one_hot_separate_cols(20, val_df)
Y_test = make_one_hot_separate_cols(20, test_df)

# Creating Models

## Model with NO Inference time dropouts

In [None]:
def get_model(lr=0.001):
    model = tf.keras.models.Sequential(name = "BiLSTM_Model")
    model.add(tf.keras.layers.Embedding(nb_words, output_dim=embed_size, weights=[embedding_matrix], input_length = maxlen, trainable=False))
    model.add(tf.keras.layers.Dropout(0.1)) # embedding dropouts
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, recurrent_dropout = 0.5, activation = 'tanh')))# weight drop on recurrent layers using recurrent_dropout
    model.add(tf.keras.layers.GlobalMaxPooling1D(data_format="channels_last", keepdims=False))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(512, activation = 'relu'))
    model.add(tf.keras.layers.Dense(20))
    model.add(tf.keras.layers.Activation('sigmoid'))

    model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=False) , optimizer = 'adam', metrics = ['accuracy', tfa.metrics.F1Score(num_classes = 20, average = 'micro')])
    return model

## Model With Inference Time dropouts Enabled (MC Model)

In [None]:
def get_model(lr=0.001):
    inputs = tf.keras.layers.Input(shape=(maxlen,))
    x = tf.keras.layers.Embedding(nb_words, output_dim=embed_size, weights=[embedding_matrix], input_length = maxlen, trainable=False)(inputs)
    x = tf.keras.layers.Dropout(0.1)(x, training = True) # embedding dropouts
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, recurrent_dropout = 0.5, activation = 'tanh'))(x)# weight drop on recurrent layers using recurrent_dropout
    x = tf.keras.layers.GlobalMaxPooling1D(data_format="channels_last", keepdims=False)(x)
    x = tf.keras.layers.Dropout(0.5)(x, training = True)
    x = tf.keras.layers.Dense(512)(x)
    x = tf.keras.layers.Activation('relu')(x)
    x = tf.keras.layers.Dense(20)(x)
    outputs = tf.keras.layers.Activation('sigmoid')(x)

    model = tf.keras.Model(inputs, outputs, name="BiLSTM_MC_Model")

    model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=False) , optimizer = 'adam', metrics = ['accuracy', tfa.metrics.F1Score(num_classes = 20, average = 'micro')])
    return model

In [None]:
model = get_model()
model.summary()

Model: "BiLSTM_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 300)          34723800  
                                                                 
 dropout (Dropout)           (None, 400, 300)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 400, 512)         1140736   
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 512)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense (Dense)               (None, 512)              

# Training and Saving Model (Training and saving SINGLE Model)

## Train MC Model

In [None]:
batchsize = 128
history = model.fit(train_dataset, Y_train, batch_size = batchsize, validation_data = (val_dataset, Y_val), initial_epoch = 0, epochs = 12, workers = 4, use_multiprocessing = True)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [None]:
preds = model.predict(test_dataset)
preds = threshold_predictions(preds, 0.5)
f1_score(preds, Y_test, average = 'micro') # MC (12 epochs)

0.4756687077022236

In [None]:
now = datetime.datetime.now()
name = 'Models/'+str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)+"_MC_BiLSTM.h5" # file nomenclature
model.save(name)

# Evaluating Model Uncertainty (MC dropout based eval)

## MC eval of entire dataset




SETTING frequentist confidence factor to 90%

In [None]:
path = 'Models/MC_BiLSTM.h5'
avg_prediction_array, std_prediction_array, avg_f1_array = Eval_MC_Dropouts(path, 10, test_dataset, Y_test, 0.5, 100, 0.90)



 10%|█         | 1/10 [00:10<01:35, 10.61s/it]


Model F1 on test data is: 0.4695484493009802
Model Precision on test data is: 0.36172319881158704
Model Recall on test data is: 0.6689560439560439


 20%|██        | 2/10 [00:20<01:23, 10.41s/it]


Model F1 on test data is: 0.46769527483124407
Model Precision on test data is: 0.3602376825947017
Model Recall on test data is: 0.6665139715987174


 30%|███       | 3/10 [00:31<01:12, 10.36s/it]


Model F1 on test data is: 0.47106109324758844
Model Precision on test data is: 0.36271354295617725
Model Recall on test data is: 0.6717102246675837


 40%|████      | 4/10 [00:41<01:01, 10.33s/it]


Model F1 on test data is: 0.4608960976393127
Model Precision on test data is: 0.35528596187175043
Model Recall on test data is: 0.6558500914076782


 50%|█████     | 5/10 [00:51<00:51, 10.23s/it]


Model F1 on test data is: 0.46771602955348535
Model Precision on test data is: 0.36048526863084923
Model Recall on test data is: 0.6657521719250115


 60%|██████    | 6/10 [01:01<00:40, 10.18s/it]


Model F1 on test data is: 0.4788553259141494
Model Precision on test data is: 0.3728645704382273
Model Recall on test data is: 0.669035984007108


 70%|███████   | 7/10 [01:11<00:30, 10.19s/it]


Model F1 on test data is: 0.474451385551818
Model Precision on test data is: 0.36667491953453824
Model Recall on test data is: 0.6719600725952813


 80%|████████  | 8/10 [01:22<00:20, 10.47s/it]


Model F1 on test data is: 0.4714584338318058
Model Precision on test data is: 0.36296112899232486
Model Recall on test data is: 0.6724770642201835


 90%|█████████ | 9/10 [01:32<00:10, 10.33s/it]


Model F1 on test data is: 0.47162270183852917
Model Precision on test data is: 0.36518940331765287
Model Recall on test data is: 0.6656137184115524


100%|██████████| 10/10 [01:42<00:00, 10.29s/it]


Model F1 on test data is: 0.4694337194337195
Model Precision on test data is: 0.3612280267392919
Model Recall on test data is: 0.6701883325677538





(10, 1926, 20)
Max F1 score of all reps: 0.4788553259141494
Average F1 score of all reps: 0.47027385111426323
F1 Score from ALL Models' predictions AVERAGED: 0.4700613700447836
Precision Score from ALL Models' predictions AVERAGED: 0.35082941322109434
Recall Score from ALL Models' predictions AVERAGED: 0.7120603015075377
-----
Applying confidence thresholding - F1 score of selected predictions: 0.5600739371534196
Predictions Refused: 1565
-----
Applying frequentist methods - F1 score of selected predictions: 0.539906103286385
Predictions Refused: 1497


## Single sample MC evaluation

In [7]:
test_df.iloc[35]

Encoded Genres                                 00000000000000110000
Plot Summary       After more than three years in exile on Grays...
Name: 35, dtype: object

In [8]:
%%time
saved_path = 'Models/MC_BiLSTM.h5'
avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor = single_MC_pred(saved_path, 10, test_dataset[35], Y_test[35], 0.5, 20, 400, 0.90)

 10%|██████████▊                                                                                                 | 1/10 [00:01<00:13,  1.51s/it]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 20%|█████████████████████▌                                                                                      | 2/10 [00:01<00:06,  1.17it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 30%|████████████████████████████████▍                                                                           | 3/10 [00:02<00:04,  1.56it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 40%|███████████████████████████████████████████▏                                                                | 4/10 [00:02<00:03,  1.78it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 50%|██████████████████████████████████████████████████████                                                      | 5/10 [00:03<00:02,  1.82it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 60%|████████████████████████████████████████████████████████████████▊                                           | 6/10 [00:03<00:02,  1.74it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 70%|███████████████████████████████████████████████████████████████████████████▌                                | 7/10 [00:04<00:01,  1.83it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 8/10 [00:04<00:01,  1.87it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 9/10 [00:05<00:00,  1.77it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.66it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]
Majority Vote Prediction of all models: 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
Ground truth Value: 
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0]]
Ground truth class labels: 
['Childrens Literature', 'Fantasy']
Final prediction after confidence interval thresholding: 
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]
Confidence factors for each class: 
( -0.0039299078358248155 ,  0.07093572095762046 )
( 0.05140874564437163 ,  0.2975949209949564 )
( -0.007082209357269033 ,  0.04443129635716222 )
( -2.689394992610994e-06 ,  0.0002655769675275754 )
( -3.3273947112933483e-06 ,  1.7535114193622034e-05 )
( 0.10851591439960695 ,  0.19333998946429992 )
( 0.07337166911806742 ,  0.27780889623914085 )
( -0.00043164103904225074 ,  0.007084204849390761 )
( -0.004665702171336057 ,  0.02046665508271301 )
( -4.128841811507005e-05 ,  0.0003164840549379703 )
( -0.0006335571226355164 ,  0.003027524035858783 




# Model Ensembling

## Training models 

Training 10 models and saving them into folder

In [None]:
save_folder = "Models/EnsemblingModels/"
train_model_ensembling(train_dataset, Y_train, save_folder, no_of_reps = 10, epochs_per_model = 12)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 10%|█         | 1/10 [16:25<2:27:49, 985.45s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 20%|██        | 2/10 [33:50<2:16:05, 1020.70s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 30%|███       | 3/10 [50:09<1:56:49, 1001.33s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 40%|████      | 4/10 [1:06:34<1:39:29, 994.98s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 50%|█████     | 5/10 [1:22:55<1:22:29, 989.98s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 60%|██████    | 6/10 [1:39:30<1:06:06, 991.69s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 70%|███████   | 7/10 [1:56:56<50:28, 1009.35s/it] 

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 80%|████████  | 8/10 [2:13:31<33:29, 1004.74s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


 90%|█████████ | 9/10 [2:31:56<17:16, 1036.34s/it]

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


100%|██████████| 10/10 [2:49:22<00:00, 1016.22s/it]


## Loading trained models for testing

In [None]:
path = 'Models/EnsemblingModels/'
avg_prediction_array, std_prediction_array, f1_list = eval_model_ensembling(path, 10, test_dataset,Y_test, 0.5, 100, 0.90)

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:12<01:48, 12.10s/it]


Model F1 on test data is: 0.5218424962852897
Model Precision on test data is: 0.4347610794751176
Model Recall on test data is: 0.6525455221107395


 20%|██        | 2/10 [00:23<01:33, 11.70s/it]


Model F1 on test data is: 0.5059981544140265
Model Precision on test data is: 0.4072790294627383
Model Recall on test data is: 0.6678846934632562


 30%|███       | 3/10 [00:34<01:20, 11.45s/it]


Model F1 on test data is: 0.5266676355180933
Model Precision on test data is: 0.44862589749938103
Model Recall on test data is: 0.6375791695988741


 40%|████      | 4/10 [00:45<01:08, 11.38s/it]


Model F1 on test data is: 0.5526827912149014
Model Precision on test data is: 0.49220103986135183
Model Recall on test data is: 0.6301109350237718


 50%|█████     | 5/10 [00:57<00:57, 11.43s/it]


Model F1 on test data is: 0.5414084507042254
Model Precision on test data is: 0.4758603614756128
Model Recall on test data is: 0.6278993792878145


 60%|██████    | 6/10 [01:08<00:45, 11.42s/it]


Model F1 on test data is: 0.5164375373580395
Model Precision on test data is: 0.4278286704629859
Model Recall on test data is: 0.6513381078024878


 70%|███████   | 7/10 [01:20<00:34, 11.54s/it]


Model F1 on test data is: 0.5249630723781389
Model Precision on test data is: 0.4399603862342164
Model Recall on test data is: 0.6506774075430245


 80%|████████  | 8/10 [01:31<00:22, 11.40s/it]


Model F1 on test data is: 0.5483196206944637
Model Precision on test data is: 0.4867541470661055
Model Recall on test data is: 0.6277139208173691


 90%|█████████ | 9/10 [01:43<00:11, 11.37s/it]


Model F1 on test data is: 0.5334291876347951
Model Precision on test data is: 0.4592720970537262
Model Recall on test data is: 0.6361454046639232


100%|██████████| 10/10 [01:54<00:00, 11.49s/it]


Model F1 on test data is: 0.537841726618705
Model Precision on test data is: 0.462738301559792
Model Recall on test data is: 0.6420474063895568






Model F1 on test data is: 0.5334917570176742
Model Precision on test data is: 0.44466452092102005
Model Recall on test data is: 0.6666666666666666
(10, 1926, 20)
Max F1 score of all reps: 0.5526827912149014
Average F1 score of all reps: 0.5309590672820678
F1 Score from ALL Models' predictions AVERAGED: 0.5334917570176742
Precision Score from ALL Models' predictions AVERAGED: 0.44466452092102005
Recall Score from ALL Models' predictions AVERAGED: 0.6666666666666666
-----
Applying confidence thresholding- F1 score of selected predictions: 0.6385964912280702
Predictions Refused: 1658
-----
Applying frequentist methods - F1 score of selected predictions: 0.6356756756756756
Predictions Refused: 1638


## Single Sample Evaluation

In [None]:
%%time
models_folder = 'Models/EnsemblingModels/'
avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor = single_EN_pred(models_folder, 10, test_dataset[245], Y_test[245], 0.5, 20, 400, 0.90)

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:02<00:19,  2.11s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 20%|██        | 2/10 [00:04<00:16,  2.11s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 30%|███       | 3/10 [00:07<00:17,  2.44s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 40%|████      | 4/10 [00:09<00:13,  2.31s/it]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 50%|█████     | 5/10 [00:11<00:11,  2.26s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 60%|██████    | 6/10 [00:14<00:09,  2.46s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 70%|███████   | 7/10 [00:19<00:09,  3.29s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 80%|████████  | 8/10 [00:22<00:06,  3.28s/it]

[[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


 90%|█████████ | 9/10 [00:25<00:03,  3.16s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


100%|██████████| 10/10 [00:27<00:00,  2.74s/it]

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Majority Vote Prediction of all models: 
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Ground truth Value: 
[[0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]]
Ground truth class labels: 
['Adevnture Novel', 'Romance Novel', 'Crime Fiction']
Final prediction after confidence interval thresholding: 
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Confidence factors for each class: 
( 0.00010382275734871537 ,  0.001975175855356903 )
( -1.968875906742539e-05 ,  0.00015530112426050835 )
( 0.010755567629868889 ,  0.03686081785238276 )
( 0.45438969949598346 ,  0.6549543656933781 )
( -0.028337194125005696 ,  0.2170137197774143 )
( 0.0005557563757182232 ,  0.00616859464015503 )
( 0.3135220321615584 ,  0.5193205503026598 )
( 0.0002768824963020689 ,  0.006433783153432369 )
( 0.0010490995757527935 ,  0.006153985585992755 )
( 0.0015170950254814542 ,  0.01408595626257194 )
( 0.4058630784447494 ,  0.61822833219215


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BNN

## Prior posterior defenitions

In [None]:
# Define the prior weight distribution as Normal of mean=0 and stddev=1.
# Note that, in this example, the we prior distribution is not trainable,
# as we fix its parameters.
def prior(kernel_size, bias_size, dtype=None):
    n = kernel_size + bias_size
    prior_model = keras.Sequential(
        [
            tfp.layers.DistributionLambda(
                lambda t: tfp.distributions.MultivariateNormalDiag(
                     loc=tf.fill(dims = (n,), value = 0.5), scale_diag=tf.fill(dims = (n,), value = 0.5)#loc=tf.zeros(n), scale_diag=tf.ones(n)
                )
            )
        ]
    )
    return prior_model


# Define variational posterior weight distribution as multivariate Gaussian.
# Note that the learnable parameters for this distribution are the means,
# variances, and covariances.
def posterior(kernel_size, bias_size, dtype=None):
    n = kernel_size + bias_size
    posterior_model = keras.Sequential(
        [
            tfp.layers.VariableLayer(
                tfp.layers.MultivariateNormalTriL.params_size(n), dtype=dtype
            ),
            tfp.layers.MultivariateNormalTriL(n),
        ]
    )
    return posterior_model

## Embedding matrix generation with different dimensions

In [None]:
embed_size = 300
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

  if self.run_code(code, result):


In [None]:
embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

## BNN Model

In [None]:
def get_bnn_model(train_size):
    initializer = tf.keras.initializers.GlorotUniform()

    model = tf.keras.models.Sequential(name = "BNN_Model_with_priors")
    model.add(tf.keras.layers.Embedding(nb_words+1, output_dim=300, weights=[embedding_matrix], input_length = maxlen, trainable=False))
    model.add(tf.keras.layers.Dropout(0.1)) # embedding dropouts
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, recurrent_dropout = 0.5, activation = 'tanh', kernel_initializer=initializer)))# weight drop on recurrent layers using recurrent_dropout
    # model.add(tf.keras.layers.LSTM(128, return_sequences=True, recurrent_dropout = 0.5, activation = 'tanh', kernel_initializer=initializer)) 
    # model.add(tf.keras.layers.LSTM(256, return_sequences=True, recurrent_dropout = 0.5, activation = 'tanh', kernel_initializer=initializer))
    model.add(tf.keras.layers.GlobalMaxPooling1D(data_format="channels_last", keepdims=False))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(512, kernel_initializer=initializer, activation='relu'))
    # model.add(tf.keras.layers.Activation('relu'))
    model.add(tfp.layers.DenseVariational(units = 16, make_prior_fn = prior, make_posterior_fn = posterior, kl_weight = 1 / train_size, activation="relu"))
    model.add(tf.keras.layers.Dense(20, kernel_initializer=initializer))
    model.add(tf.keras.layers.Activation('sigmoid'))
    # model.add(tfp.layers.IndependentLogistic(event_shape=(20, )))

    model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=False) , optimizer = 'adam', metrics = ['accuracy', tfa.metrics.F1Score(num_classes = 20, average = 'micro')])

    return model

## BNN Utility

In [None]:
def Eval_BNN_Model(model, no_of_reps, X_test, Y_test, threshold_value, confidence_threshold, conf_perc_factor):
    """
    Runs over no_of_reps times to create different models using BNN and
    predicts a different set of probabilites during each repetition.
    
    Returns average of all predictions taken over no_of_reps number of predict-
    ions, stanadard deviation across all predictions taken over no_of_reps num-
    ber of predictions and the F1 score for each model.
    """
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    f1_list = [] # array of shape no_of_reps x 1 (F1 score)
    all_prediction_array = np.zeros(shape = (no_of_reps, Y_test.shape[0], Y_test.shape[1])) # 3D array to store all no_of_reps number of different prediction values
    # model = tf.keras.models.load_model(save_path)
    
    for i in tqdm(range(no_of_reps)):
        predictions = model.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        model_f1 = eval_model(thresholded_predictions, Y_test) # returns f1 of model
        f1_list.append(model_f1)
    avg_f1 = np.mean(np.array(f1_list), axis = 0)
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape no_of_samples x no_of_classes (Average stddev of all reps)
    now = datetime.datetime.now()
    name = '/content/drive/MyDrive/GenreClassificationBayes/'+str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)+"_"+str(no_of_reps)+"_" # file nomenclature
    pd.DataFrame(avg_prediction_array).to_csv(name+'average_preds.csv', index = False, header = None)
    pd.DataFrame(std_prediction_array).to_csv(name+'std_preds.csv', index = False, header = None) # saving prediction avg and std over no_of_reps
    # Fequentist calculation
    final_predictions_freq = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            avg = avg_prediction_array[i,j]
            std = std_prediction_array[i,j]
            lower_lim = avg - conf_level_factor*std
            upper_lim = avg + conf_level_factor*std
            if lower_lim > threshold_value and upper_lim > threshold_value:
                final_predictions_freq[i,j] = 1
            elif lower_lim < threshold_value and upper_lim < threshold_value:
                final_predictions_freq[i,j] = 0
            else: 
                flag = 1
        if flag == 0:
            thresholded_predictions_freq.append(final_predictions_freq[i])
            selected_Y_vals_freq.append(Y_test[i])
    # Frequentist calculation ends
    avg_thresholded_predictions = threshold_predictions(avg_prediction_array, threshold_value)
    avg_pred_f1 = eval_model(avg_thresholded_predictions, Y_test, printing = False) # returns f1 of Averaged predictions from no_of_reps models
    avg_pred_precision = precision_score(avg_thresholded_predictions, Y_test, average='micro')
    avg_pred_recall = recall_score(avg_thresholded_predictions, Y_test, average='micro')
    all_prediction_array_thresholded = threshold_predictions(all_prediction_array, threshold_value)
    confidence_matrix = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    final_predictions_maj_vote = np.zeros(shape = (Y_test.shape[0], Y_test.shape[1]))
    print(all_prediction_array_thresholded.shape)
    for i in range(Y_test.shape[0]):
        for j in range(Y_test.shape[1]):
            no_of_zeros = 0
            no_of_ones = 0
            for k in range(no_of_reps):
                if all_prediction_array_thresholded[k,i,j] == 0:
                    no_of_zeros += 1
                elif all_prediction_array_thresholded[k,i,j] == 1:
                    no_of_ones += 1
            if no_of_ones > no_of_zeros:
                confidence_matrix[i, j] = no_of_ones/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 1
            else:
                confidence_matrix[i, j] = no_of_zeros/no_of_reps * 100
                final_predictions_maj_vote[i, j] = 0
    thresholded_confidence_predictions = []
    selected_ground_truth_values = []
    for i in range(Y_test.shape[0]):
        flag = 0
        for j in range(Y_test.shape[1]):
            if confidence_matrix[i, j] < confidence_threshold:
                flag = 1
        if flag == 0:
            thresholded_confidence_predictions.append(final_predictions_maj_vote[i])
            selected_ground_truth_values.append(Y_test[i])
    avg_f1_conf_thresholded = eval_model(np.array(thresholded_confidence_predictions), np.array(selected_ground_truth_values), printing = False)
    f1_maj_vote = eval_model(np.array(final_predictions_maj_vote), Y_test, printing = False)

    print("Max F1 score of all reps: "+str(max(f1_list)))
    print("Average F1 score of all reps: "+str(avg_f1))
    print("F1 Score from ALL Models' predictions AVERAGED: "+str(avg_pred_f1))
    print("Precision Score from ALL Models' predictions AVERAGED: "+str(avg_pred_precision))
    print("Recall Score from ALL Models' predictions AVERAGED: "+str(avg_pred_recall))
    print("-----\nApplying confidence thresholding - F1 score of selected predictions: "+str(avg_f1_conf_thresholded))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_ground_truth_values).shape[0]))
    print("-----\nApplying frequentist methods - F1 score of selected predictions: "+str(selected_freq_preds_f1))
    print("Predictions Refused: "+str(Y_test.shape[0] - np.array(selected_Y_vals_freq).shape[0]))
    print("Majority vote F1:"+str(f1_maj_vote))
    return avg_prediction_array, std_prediction_array, f1_list

In [None]:
def single_BNN_pred(cur_model, no_of_reps, X_test, Y_test, threshold_value, num_classes, maxlen_embedding, conf_perc_factor):
    if X_test.shape is not (1, maxlen_embedding):
        X_test = np.reshape(X_test, (1, maxlen_embedding))
    if Y_test.shape is not (1, num_classes):
        Y_test = np.reshape(Y_test, (1, num_classes))
    all_prediction_array = np.zeros(shape = (no_of_reps, 1, num_classes)) # 3D array to store all no_of_reps number of different prediction values
    confidence_count_zero = [0 for i in range(num_classes)]
    confidence_count_one = [0 for i in range(num_classes)]
    confidence_factor = [0 for i in range(num_classes)]
    final_prediction_MV = [0 for i in range(num_classes)]
    loaded_model = cur_model
    # loaded_model = tf.keras.models.load_model(save_path)
    for i in tqdm(range(no_of_reps)):
        predictions = loaded_model.predict(X_test)
        all_prediction_array[i] = predictions
        thresholded_predictions = threshold_predictions(predictions, threshold_value)
        print(thresholded_predictions)
        for i in range(thresholded_predictions.shape[1]):
            if thresholded_predictions[0][i] == 0:
                confidence_count_zero[i] += 1
            else:
                confidence_count_one[i] += 1
    for i in range(num_classes):
        if confidence_count_zero[i] > confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_zero[i] / no_of_reps)*100
        elif confidence_count_zero[i] < confidence_count_one[i]:
            confidence_factor[i] = (confidence_count_one[i] / no_of_reps)*100
            final_prediction_MV[i] = 1
        else:
            confidence_factor[i] =  50.0
    avg_prediction_array = np.mean(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average predictions of all reps)
    std_prediction_array = np.std(all_prediction_array, axis = 0) # shape 1 x no_of_classes (Average stddev of all reps)
    # frequentist starts 
    conf_level_factor = np.sqrt(2) * erfinv(conf_perc_factor)
    final_predictions_freq = np.zeros(shape = (1, num_classes))
    thresholded_predictions_freq = []
    selected_Y_vals_freq = []
    lower_lim_array = []
    upper_lim_array = []
    flag = 0
    for j in range(num_classes):
        avg = avg_prediction_array[0,j]
        std = std_prediction_array[0,j]
        lower_lim = avg - conf_level_factor*std
        upper_lim = avg + conf_level_factor*std
        lower_lim_array.append(lower_lim)
        upper_lim_array.append(upper_lim)
        if lower_lim > threshold_value and upper_lim > threshold_value:
            final_predictions_freq[0,j] = 1
        elif lower_lim < threshold_value and upper_lim < threshold_value:
            final_predictions_freq[0,j] = 0
        else: 
            flag = 1
    if flag == 0:
        thresholded_predictions_freq.append(final_predictions_freq[0])
    #frequentist ends
    avg_prediction_thresholded = threshold_predictions(avg_prediction_array, threshold_value)
    genres_list = ['Spy Fiction', 'Alternate History', 'Non Fiction', 'Adevnture Novel', 'Detective Fiction', 'Historical Fiction', 'Romance Novel', 'Horror', 'Thriller', 'Historical Novel', 'Crime Fiction', 'Suspense', 'Young Adult Literature', 'Mystery', 'Childrens Literature', 'Fantasy', 'Novel', 'Science Fiction', 'Speculative Fiction', 'Fiction']
    answer_list = []
    ground_truth_list = []
    for i in range(20):
        if avg_prediction_thresholded[0][i] == 1:
            answer_list.append(genres_list[i])
        if Y_test[0][i] == 1:
            ground_truth_list.append(genres_list[i])
    # print("Average Prediction of all models Thresholded: ")
    # print(avg_prediction_thresholded)
    # print("Classes Predicted: ")
    # print(answer_list)
    print("Majority Vote Prediction of all models: ")
    print(final_prediction_MV)
    print("Ground truth Value: ")
    print(Y_test)
    print("Ground truth class labels: ")
    print(ground_truth_list)
    print("Final prediction after confidence interval thresholding: ")
    print(final_predictions_freq)
    print("Confidence factors for each class: ")
    for i in range(num_classes):
        print("(",lower_lim_array[i],", ",upper_lim_array[i],")")
    if flag == 0:
        print("Prediction passes - CONSIDERED")
    else:
        print("prediction fails - IGNORED")
    # print("Confidence Factors for each class in %: ")
    # print(confidence_factor)
    # print("Overall average confidence factor for prediction: "+str(np.mean(np.array(confidence_factor))))
    return avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor

## Training BNN Model

In [None]:
def run_expt(model, train_dataset, Y_train, val_dataset, Y_val, test_dataset):

    print("Starting model training.")
    model.fit(train_dataset, Y_train, epochs=num_epochs, validation_data=(val_dataset, Y_val))
    print("Model training finished.")
    loss, acc, f1  = model.evaluate(train_dataset, Y_train, verbose=0)
    print(f"Train results: {round(loss, 3)}, {round(acc, 3)}, {round(f1, 3)}")

    print("Evaluating model performance...")
    loss, acc, f1 = model.evaluate(test_dataset, Y_test, verbose=0)
    print(f"Test results: {round(loss, 3)}, {round(acc, 3)}, {round(f1, 3)}")

In [None]:
new_model = get_bnn_model(len(train_dataset))
run_expt(new_model, train_dataset, Y_train, val_dataset, Y_val, test_dataset)

In [None]:
bnn_model.save_weights("Models/bnn_model_weights.h5")

## Loading Saved Weights 

In [None]:
new_model = get_bnn_model(len(train_dataset))
new_model.load_weights("Models/bnn_model_weights.h5")



## Evaluation over test dataset

In [None]:
apa, spa, f = Eval_BNN_Model(new_model, 10, test_dataset, Y_test, 0.08693312, 100, 0.90 )

## Single Sample BNN Evaluation

In [None]:
avg_prediction_array, std_prediction_array, avg_prediction_thresholded, final_prediction_MV, confidence_factor = single_BNN_pred(new_model, 10, test_dataset[245], Y_test[245], 0.08693312, 20, 400, 0.90)

 10%|█         | 1/10 [00:00<00:03,  2.32it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


 20%|██        | 2/10 [00:00<00:03,  2.61it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


 30%|███       | 3/10 [00:01<00:02,  2.71it/s]

[[0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.]]


 40%|████      | 4/10 [00:01<00:02,  2.78it/s]

[[1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1.]]


 50%|█████     | 5/10 [00:01<00:01,  2.80it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1.]]


 60%|██████    | 6/10 [00:02<00:01,  2.79it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


 70%|███████   | 7/10 [00:02<00:01,  2.80it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.]]


 80%|████████  | 8/10 [00:02<00:00,  2.78it/s]

[[1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


 90%|█████████ | 9/10 [00:03<00:00,  2.73it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


100%|██████████| 10/10 [00:03<00:00,  2.73it/s]

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
Majority Vote Prediction of all models: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Ground truth Value: 
[[0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]]
Ground truth class labels: 
['Adevnture Novel', 'Romance Novel', 'Crime Fiction']
Final prediction after confidence interval thresholding: 
[[0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1.]]
Confidence factors for each class: 
( 0.04796163661857207 ,  0.2555953383586828 )
( 0.03526183239457961 ,  0.28384800233604557 )
( 0.20363885848753274 ,  0.3215742877221841 )
( 0.08495674161401581 ,  0.4101674486866872 )
( 0.023676623703732635 ,  0.22751546025107844 )
( 0.19241470676570413 ,  0.3098147155651522 )
( 0.38981662613218765 ,  0.4356504001014854 )
( 0.04452851300933042 ,  0.23780866304538567 )
( 0.09769014022615531 ,  0.2941817323181333 )
( 0.05538221818615238 ,  0.2596905686218616 )
( 0.1795514446420427 ,  0.4112981456594709 )
( 0.0108707558465696


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
