# Imports

In [100]:
import os
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# tf and keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    GlobalAveragePooling1D,
    Dropout,
    TextVectorization,
    Input,
    Conv1D,
    LSTM,
    MaxPooling1D,
    Bidirectional,
)
from tensorflow.keras.models import Model
# import tensorflow_datasets as tfds

# Let pandas show up to 100 characters per cell so long text columns stay readable.
pd.set_option('display.max_colwidth', 100) 

# UTILS

In [101]:
def display_model(model, embedding_layer_index=1):
    """Show a model's layers, its summary and the weights of one layer.

    Args:
        model: Object exposing ``layers`` (a sequence of layers) and
            ``summary()``, e.g. a Keras model.
        embedding_layer_index: Position in ``model.layers`` of the layer whose
            first weight array is shown as the embedding matrix. Defaults to 1,
            the position that used to be hard-coded.

    Returns:
        None. Everything is written to the cell output.
    """
    display(model.layers)
    # summary() prints the table itself and returns None, so wrapping it in
    # display() only added a stray "None" to the output.
    model.summary()

    # get_weights() returns a list of arrays; the first entry is taken as the
    # embedding matrix.
    embeddings = model.layers[embedding_layer_index].get_weights()[0]
    print('-'*100)
    display("Embeddings layer - shape: ", embeddings.shape)
    print('-'*100)
    display("Embeddings layer - parameter matrix (before training): ", embeddings)

In [102]:
def read_files(data_dir='./data/final'):
    """Load the train/val/test splits and drop rows that have no text.

    For each split, ``X_<split>.csv`` and ``y_<split>.csv`` are read from
    ``data_dir``. Rows whose ``fulltext`` value is missing are removed from the
    feature frame, and the same boolean mask is applied to the target frame so
    the two stay row-aligned. The original index labels are kept.

    Args:
        data_dir: Directory holding the six CSV files. Defaults to the
            location that used to be hard-coded.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of
        DataFrames.
    """
    frames = []
    for split in ('train', 'val', 'test'):
        features = pd.read_csv(os.path.join(data_dir, f'X_{split}.csv'))
        targets = pd.read_csv(os.path.join(data_dir, f'y_{split}.csv'))

        # One mask per split, built from the features and reused for the
        # targets so both frames lose exactly the same rows.
        has_text = features['fulltext'].notna()
        frames.append(features[has_text])
        frames.append(targets[has_text])

    return tuple(frames)

In [103]:
def get_vectorization_layer(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    """Build a TextVectorization layer and adapt it to one text column.

    Args:
        df: DataFrame holding the text. It is not modified.
        column: Name of the column whose values are used to adapt the layer.
        max_tokens: Forwarded to TextVectorization as ``max_tokens``.
        output_sequence_length: Forwarded to TextVectorization as
            ``output_sequence_length``.
        embedding_dim: Unused by this function; kept so existing calls that
            pass it keep working.

    Returns:
        The adapted TextVectorization layer (``output_mode='int'``).
    """
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    # Cast a copy of the column instead of assigning it back into df: the old
    # in-place write silently changed the caller's frame and can raise pandas'
    # SettingWithCopyWarning when df is a filtered slice.
    texts = df[column].astype(str).values
    vectorize_layer.adapt(texts)

    return vectorize_layer

In [104]:
def get_vectorization_layer_ngrams(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16, ngrams=3):
    """Build an n-gram TextVectorization layer and adapt it to one text column.

    Args:
        df: DataFrame holding the text. It is not modified.
        column: Name of the column whose values are used to adapt the layer.
        max_tokens: Forwarded to TextVectorization as ``max_tokens``.
        output_sequence_length: Forwarded to TextVectorization as
            ``output_sequence_length``.
        embedding_dim: Unused by this function; kept so existing calls that
            pass it keep working.
        ngrams: Forwarded to TextVectorization as ``ngrams``.

    Returns:
        The adapted TextVectorization layer (``output_mode='int'``).
    """
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        ngrams=ngrams,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    # Cast a copy of the column instead of assigning it back into df: the old
    # in-place write silently changed the caller's frame and can raise pandas'
    # SettingWithCopyWarning when df is a filtered slice.
    texts = df[column].astype(str).values
    vectorize_layer.adapt(texts)

    return vectorize_layer

# Goal

Evaluate all the losses (mean squared error) of the models:
 - Model 2 - Basic Embedding Model - 2 Dense Layers (64, 32)
 - Model 2 (n-grams) - Basic Embedding Model with n-grams (ngrams=3) - 2 Dense Layers (64, 32)
 - Model 3 - Conv1d Model, Two Hidden Dense Layers (64, 32)
 - Model 3b - Conv1d Model, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3c: Single Simple RNN, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3d: Double Simple RNN, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3e: Conv1d Model, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3 + Dropout: Conv1d Model, Two Hidden Dense Layers (64, 32), 50% Dropout
 - Model 4: LSTM RNN Model, Two Hidden Dense Layers (64, 32)
 - Model 5: BiDirectional LSTM RNN Model, Two Hidden Dense Layers (64, 32)

# Load Model Performance CSVs and Models

In [105]:
def _load_history(stem):
    """Read ./model_performance/<stem>.csv and drop its saved index column."""
    history = pd.read_csv(f"./model_performance/{stem}.csv")
    return history.drop(columns=["Unnamed: 0"])


# One history table per model; the variable name matches the CSV file name.
hist_2 = _load_history("hist_2")
hist_2_ngrams = _load_history("hist_2_ngrams")
hist_2_mf = _load_history("hist_2_mf")
hist_3 = _load_history("hist_3")
hist_3_b = _load_history("hist_3_b")
hist_3_c = _load_history("hist_3_c")
hist_3_d = _load_history("hist_3_d")
hist_3_e = _load_history("hist_3_e")
hist_3_dp = _load_history("hist_3_dp")
hist_4_tanh = _load_history("hist_4_tanh")
hist_5_tanh = _load_history("hist_5_tanh")


In [106]:
# Load every saved model from ./models so it can be evaluated further down.
model_2 = tf.keras.models.load_model('./models/model_2.h5')
# This used to be bound to the misspelled name `model_2_multifeatur`, so the
# later validate_model(model_2_multifeature, ...) call raised NameError on a
# fresh kernel.
model_2_multifeature = tf.keras.models.load_model('./models/model_2_multifeature.h5')
model_2_ngrams = tf.keras.models.load_model('./models/model_2_ngrams.h5')

model_3 = tf.keras.models.load_model('./models/model_3.h5')
model_3_b = tf.keras.models.load_model('./models/model_3_b.h5')
model_3_c = tf.keras.models.load_model('./models/model_3_c.h5')
model_3_d = tf.keras.models.load_model('./models/model_3_d.h5')
model_3_e = tf.keras.models.load_model('./models/model_3_e.h5')
model_3_dropout = tf.keras.models.load_model('./models/model_3_dp.h5')

model_4_tanh = tf.keras.models.load_model('./models/model_4_tanh.h5')

model_5_tanh = tf.keras.models.load_model('./models/model_5_tanh.h5')

2024-08-02 14:15:11.090705: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-02 14:15:11.093961: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-02 14:15:11.096672: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [107]:
# Load the train/val/test splits; read_files drops rows whose 'fulltext' is missing.
X_train, y_train, X_val, y_val, X_test, y_test = read_files()

In [108]:
# Wrap the raw 'fulltext' values of each split in a tensor, in train/val/test order.
text_data, text_data_val, text_data_test = (
    tf.constant(split_frame['fulltext'].values)
    for split_frame in (X_train, X_val, X_test)
)

In [109]:
# Both vectorizers are adapted on the training text only: one with the default
# tokenisation, one with the n-gram helper's default of ngrams=3.
vectorize_layer = get_vectorization_layer(df=X_train, column='fulltext')
vectorize_layer_3_ngrams = get_vectorization_layer_ngrams(df=X_train, column='fulltext')

In [110]:
# Run each split through both vectorizers, keeping the train/val/test order.
_text_splits = (text_data, text_data_val, text_data_test)

vectorized_text, vectorized_text_val, vectorized_text_test = map(
    vectorize_layer, _text_splits
)

(
    vectorized_text_3_ngrams,
    vectorized_text_val_3_ngrams,
    vectorized_text_test_3_ngrams,
) = map(vectorize_layer_3_ngrams, _text_splits)

In [111]:
# Predict on the vectorized training text with model_5_tanh; the next cell summarises the outputs.
predicted_scores = model_5_tanh.predict(vectorized_text)

2024-08-02 14:17:52.530854: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-02 14:17:52.534797: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-02 14:17:52.540547: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [115]:
# Summary statistics (count, mean, std, quartiles, min/max) of the predictions.
pd.DataFrame(predicted_scores).describe()

Unnamed: 0,0
count,29574.0
mean,-0.068331
std,0.16373
min,-0.390075
25%,-0.121994
50%,-0.086435
75%,-0.045658
max,3.313236


# Evaluations

In [54]:
# History table loaded from hist_2.csv (train/validation loss, MAE and accuracy per row).
hist_2

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.941614,0.112412,0.0,0.609355,0.09801,0.0
1,0.861312,0.114036,0.0,0.634518,0.110827,0.0


In [28]:
# History table loaded from hist_2_mf.csv (presumably the multifeature variant of model 2 — confirm).
hist_2_mf

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.555413,0.204005,0.0,0.714234,0.211522,0.0
1,1.492719,0.14094,0.0,0.705502,0.143369,0.0
2,1.489709,0.135813,0.0,0.701544,0.129748,0.0
3,1.486173,0.138126,0.0,0.69243,0.100725,0.0
4,1.441503,0.127889,0.0,0.628241,0.094379,0.0


In [29]:
# History table loaded from hist_2_ngrams.csv.
hist_2_ngrams

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.923758,0.111134,0.0,0.71009,0.112743,0.0
1,0.860586,0.111333,0.0,0.727123,0.123201,0.0
2,0.841891,0.112423,0.0,0.723785,0.119927,0.0
3,0.817931,0.109606,0.0,0.725325,0.121725,0.0
4,0.756845,0.104392,0.0,0.722778,0.124703,0.0


In [20]:
# History table loaded from hist_3.csv.
hist_3

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968016,0.118088,0.0,0.705901,0.122682,0.0
1,0.96225,0.125279,0.0,0.68157,0.09618,0.0
2,0.950995,0.122447,0.0,0.707882,0.099532,0.0


In [21]:
# History table loaded from hist_3_b.csv.
# NOTE(review): the recorded output below is identical to hist_3's, row for row —
# confirm hist_3_b.csv was not saved from the same training run.
hist_3_b

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968016,0.118088,0.0,0.705901,0.122682,0.0
1,0.96225,0.125279,0.0,0.68157,0.09618,0.0
2,0.950995,0.122447,0.0,0.707882,0.099532,0.0


In [22]:
# History table loaded from hist_3_c.csv.
hist_3_c

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.493012,0.124013,0.0,0.712301,0.191536,0.0
1,1.492829,0.139525,0.0,0.713138,0.10189,0.0


In [23]:
# History table loaded from hist_3_d.csv.
hist_3_d

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.493058,0.124635,0.0,0.714317,0.200483,0.0
1,1.493849,0.137843,0.0,0.710901,0.092969,0.0
2,1.493515,0.147939,0.0,0.705894,0.131465,0.0
3,1.491341,0.153276,0.0,0.707012,0.117426,0.0


In [36]:
# History table loaded from hist_3_e.csv.
hist_3_e

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.490502,0.133164,0.0,0.706304,0.142727,0.0
1,1.490278,0.137152,0.0,0.706345,0.131975,0.0


In [25]:
# History table loaded from hist_3_dp.csv (presumably the dropout variant of model 3 — confirm).
hist_3_dp

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.960744,0.123068,0.0,0.694064,0.126233,0.0
1,0.947665,0.119645,0.0,0.679076,0.138082,0.0
2,0.939346,0.121279,0.0,0.687265,0.114197,0.0


In [26]:
# History table loaded from hist_4_tanh.csv.
hist_4_tanh

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968242,0.12155,0.0,0.70656,0.119283,0.0
1,0.968232,0.124366,0.0,0.708794,0.100467,0.0


In [27]:
# History table loaded from hist_5_tanh.csv.
hist_5_tanh

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.967287,0.158545,0.0,0.706388,0.126739,0.0
1,0.929697,0.133679,0.0,0.621943,0.113237,0.0
2,0.88564,0.148274,0.0,0.640811,0.106668,0.0


# Model Losses (Train and Validation)

In [69]:
def validate_model(model, x_train, y_train, x_val, y_val, x_test=None, y_test=None, name="model"):
    """Evaluate a model on the train, validation and (optionally) test split.

    ``model.evaluate`` must return exactly three values, which are stored under
    the keys ``mse``, ``mae`` and ``acc`` in that order.

    Args:
        model: Object with an ``evaluate(x, y, verbose=...)`` method.
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets.
        x_test, y_test: Optional test inputs and targets. When ``x_test`` is
            None the test metrics are the string ``"NA"``.
        name: Label printed while evaluating and stored as ``model_name``.

    Returns:
        Dict with keys ``train``, ``val``, ``test`` (each a metrics dict) and
        ``model_name``.
    """
    def _metrics(features, targets):
        # Unpack explicitly so a model with a different number of metrics
        # still fails loudly here.
        mse, mae, acc = model.evaluate(features, targets, verbose=0)
        return {"mse": mse, "mae": mae, "acc": acc}

    print(f"Evaluating {name}")

    report = {
        "train": _metrics(x_train, y_train),
        "val": _metrics(x_val, y_val),
    }

    if x_test is None:
        report["test"] = {"mse": "NA", "mae": "NA", "acc": "NA"}
    else:
        report["test"] = _metrics(x_test, y_test)

    report["model_name"] = name
    return report

In [70]:
# Inputs produced by the default vectorizer; used by every model except the
# n-gram one.
_unigram_inputs = (vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
# The n-gram vectorizer is adapted separately, so its token ids are not
# interchangeable with the default vectorizer's. model_2_ngrams was previously
# scored on the default inputs, leaving the *_3_ngrams tensors unused.
_ngram_inputs = (
    vectorized_text_3_ngrams, y_train,
    vectorized_text_val_3_ngrams, y_val,
    vectorized_text_test_3_ngrams, y_test,
)

model_2_eval = validate_model(model_2, *_unigram_inputs, name="model_2")
# NOTE(review): model_2_multifeature may expect more inputs than the text alone — confirm.
model_2_multifeature_eval = validate_model(model_2_multifeature, *_unigram_inputs, name="model_2_multifeature")
model_2_ngrams_eval = validate_model(model_2_ngrams, *_ngram_inputs, name="model_2_ngrams")

# Labels now follow the variable names (were "model_c", "model_d", "model_e").
model_3_eval = validate_model(model_3, *_unigram_inputs, name="model_3")
model_3_b_eval = validate_model(model_3_b, *_unigram_inputs, name="model_3_b")
model_3_c_eval = validate_model(model_3_c, *_unigram_inputs, name="model_3_c")
model_3_d_eval = validate_model(model_3_d, *_unigram_inputs, name="model_3_d")
model_3_e_eval = validate_model(model_3_e, *_unigram_inputs, name="model_3_e")
model_3_dropout_eval = validate_model(model_3_dropout, *_unigram_inputs, name="model_3_dropout")

model_4_tanh_eval = validate_model(model_4_tanh, *_unigram_inputs, name="model_4")

model_5_tanh_eval = validate_model(model_5_tanh, *_unigram_inputs, name="model_5")

Evaluating model_2
Evaluating model_3
Evaluating model_3_b
Evaluating model_c
Evaluating model_d
Evaluating model_e
Evaluating model_3_dropout
Evaluating model_4


2024-08-02 09:36:51.794255: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-02 09:36:51.795953: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-02 09:36:51.798041: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Evaluating model_5


2024-08-02 09:40:48.245313: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-02 09:40:48.247393: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-02 09:40:48.249911: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [76]:
# Metrics for model_2 as a table: one row per metric (mse, mae, acc), one column per split.
pd.DataFrame(model_2_eval)

Unnamed: 0,train,val,test,model_name
mse,0.859088,0.609356,0.284379,model_2
mae,0.090757,0.09801,0.076583,model_2
acc,0.0,0.0,0.0,model_2


In [77]:
# Metrics for model_3 as a table: one row per metric, one column per split.
pd.DataFrame(model_3_eval)

Unnamed: 0,train,val,test,model_name
mse,0.936278,0.681571,0.351257,model_3
mae,0.088931,0.09618,0.070521,model_3
acc,0.0,0.0,0.0,model_3


In [79]:
# Metrics for model_3_b as a table: one row per metric, one column per split.
# NOTE(review): the recorded values below are identical to model_3's — confirm
# model_3_b.h5 is not a copy of model_3.h5.
pd.DataFrame(model_3_b_eval)

Unnamed: 0,train,val,test,model_name
mse,0.936278,0.681571,0.351257,model_3_b
mae,0.088931,0.09618,0.070521,model_3_b
acc,0.0,0.0,0.0,model_3_b


In [83]:
# Metrics for model_3_c as a table: one row per metric, one column per split.
pd.DataFrame(model_3_c_eval)

Unnamed: 0,train,val,test,model_name
mse,0.974347,0.712302,0.388859,model_c
mae,0.185198,0.191535,0.166179,model_c
acc,0.0,0.0,0.0,model_c


In [84]:
# Metrics for model_3_d as a table: one row per metric, one column per split.
pd.DataFrame(model_3_d_eval)

Unnamed: 0,train,val,test,model_name
mse,0.967555,0.705893,0.378834,model_d
mae,0.123531,0.131465,0.104127,model_d
acc,0.0,0.0,0.0,model_d


In [86]:
# Metrics for model_3_e as a table: one row per metric, one column per split.
pd.DataFrame(model_3_e_eval)

Unnamed: 0,train,val,test,model_name
mse,0.967551,0.706304,0.379708,model_e
mae,0.135746,0.142727,0.116717,model_e
acc,0.0,0.0,0.0,model_e


In [97]:
# Only the mse row for model_3_dropout (unlike the full tables shown for the other models).
pd.DataFrame(model_3_dropout_eval).loc['mse']

train                0.930402
val                  0.679076
test                 0.350863
model_name    model_3_dropout
Name: mse, dtype: object

In [98]:
# Metrics for model_4_tanh, transposed: one row per split, one column per metric.
pd.DataFrame(model_4_tanh_eval).T

Unnamed: 0,mse,mae,acc
train,0.967334,0.111757,0.0
val,0.706559,0.119283,0.0
test,0.378461,0.092595,0.0
model_name,model_4,model_4,model_4


In [89]:
# Metrics for model_5_tanh as a table: one row per metric, one column per split.
pd.DataFrame(model_5_tanh_eval)

Unnamed: 0,train,val,test,model_name
mse,0.870323,0.621942,0.320861,model_5
mae,0.103918,0.113237,0.09116,model_5
acc,0.0,0.0,0.0,model_5


Model 2 performed best in terms of validation loss, which is calculated as the mean squared error.

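- Model 2 - Validation Loss after 5 Epochs - 2145500.25
- Model 3 - Validation Loss after 3 Epochs - 2365950.75
- Model 4 - Validation Loss after 4 Epochs - 2463587.00
- Model 5 - Validation Loss after 5 Epochs - 2465304.75

**Note:** these figures do not match the evaluation tables above, where the validation MSE is roughly 0.61 for Model 2, 0.68 for Model 3, 0.71 for Model 4 and 0.62 for Model 5. They appear to come from an earlier run. TODO: confirm and update.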

# End of file

Sources:
* https://stackoverflow.com/questions/73878049/how-do-you-convert-the-pandas-dataframe-to-tensorflow-python-data-ops-dataset-op