# Imports

In [22]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# tf and keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    GlobalAveragePooling1D,
    Dropout,
    TextVectorization,
    Input,
    Conv1D,
    LSTM,
    MaxPooling1D,
    Bidirectional,
)
from tensorflow.keras.models import Model
# import tensorflow_datasets as tfds

pd.set_option('display.max_colwidth', 100) 

# UTILS

In [3]:
def display_model(model):
    display(model.layers)
    display(model.summary())

    # Retrieve the embeddings layer, which itself is wrapped in a list.
    embeddings = model.layers[1].get_weights()[0]
    print('-'*100)
    display("Embeddings layer - shape: ", embeddings.shape)
    print('-'*100)
    display("Embeddings layer - parameter matrix (before training): ", embeddings)

In [4]:
def read_files():
    X_train = pd.read_csv('./data/final/X_train.csv')
    y_train = pd.read_csv('./data/final/y_train.csv')
    X_val = pd.read_csv('./data/final/X_val.csv')
    y_val = pd.read_csv('./data/final/y_val.csv')
    X_test = pd.read_csv('./data/final/X_test.csv')
    y_test = pd.read_csv('./data/final/y_test.csv')
    
    train_not_na_indices = (X_train['fulltext'].notna())
    val_not_na_indices = (X_val['fulltext'].notna())
    test_not_na_indices = (X_test['fulltext'].notna())
    
    X_train = X_train[train_not_na_indices]
    X_val = X_val[val_not_na_indices]
    X_test = X_test[test_not_na_indices]
    
    y_train = y_train[train_not_na_indices]
    y_val = y_val[val_not_na_indices]
    y_test = y_test[test_not_na_indices]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [5]:
def get_vectorization_layer(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    df[column] = df[column].astype(str)
    vectorize_layer.adapt(df[column].values)

    return vectorize_layer

In [6]:
def get_vectorization_layer_ngrams(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16, ngrams=3):
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        ngrams=ngrams,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    df[column] = df[column].astype(str)
    vectorize_layer.adapt(df[column].values)

    return vectorize_layer

In [7]:
X_train, y_train, X_val, y_val, X_test, y_test = read_files()

### Convert the fulltext into tensors

In [8]:
text_data = tf.constant(X_train['fulltext'].values)
text_data_val = tf.constant(X_val['fulltext'].values)
text_data_test = tf.constant(X_test['fulltext'].values)

2024-08-01 16:03:37.799689: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


### Get vectorization layer

In [9]:
vectorize_layer = get_vectorization_layer(X_train, 'fulltext')
vectorize_layer_3_ngrams = get_vectorization_layer_ngrams(X_train, 'fulltext')

### Vectorize Text

In [10]:
vectorized_text = vectorize_layer(text_data)
vectorized_text_val = vectorize_layer(text_data_val)
vectorized_text_test = vectorize_layer(text_data_test)

vectorized_text_3_ngrams = vectorize_layer_3_ngrams(text_data)
vectorized_text_val_3_ngrams = vectorize_layer_3_ngrams(text_data_val)
vectorized_text_test_3_ngrams = vectorize_layer_3_ngrams(text_data_test)

# Model 1 - Removed

# Model 2: TextVectorization Layer, Basic Embedding Model, Average Pooling, Two Hidden Dense Layers (64, 32)

In [11]:
def build_model_2(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = GlobalAveragePooling1D()(x) 

    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [12]:
model_2 = build_model_2()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_2 = model_2.fit(vectorized_text, y_train, epochs=5, batch_size=2,verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist2 = pd.DataFrame(history_2.history)
hist2

Epoch 1/5
Epoch 2/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.941614,0.112412,0.0,0.609355,0.09801,0.0
1,0.861312,0.114036,0.0,0.634518,0.110827,0.0


In [13]:
model_2.save("./models/model_2.h5")

In [14]:
hist2.to_csv('/model_performance/hist2.csv')

# Model 2 with 3 Ngrams: TextVectorization Layer, Basic Embedding Model, Average Pooling, Two Hidden Dense Layers (64, 32)

In [15]:
model_2_ngrams_3 = build_model_2()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
# history_2_ngrams = model_2_ngrams.fit(vectorized_text_3_ngrams, y_train, epochs=5, batch_size=2,verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
history_2_ngrams_3 = model_2_ngrams_3.fit(vectorized_text_3_ngrams, y_train, epochs=5, batch_size=2,verbose=1, validation_data=(vectorized_text_val, y_val))
hist_2_ngrams_3 = pd.DataFrame(history_2_ngrams_3.history)
hist_2_ngrams_3

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.923758,0.111134,0.0,0.71009,0.112743,0.0
1,0.860586,0.111333,0.0,0.727123,0.123201,0.0
2,0.841891,0.112423,0.0,0.723785,0.119927,0.0
3,0.817931,0.109606,0.0,0.725325,0.121725,0.0
4,0.756845,0.104392,0.0,0.722778,0.124703,0.0


In [17]:
model_2_ngrams_3.save("./models/model_2_ngrams.h5")
hist_2_ngrams_3.to_csv('/model_performance/hist_2_ngrams.csv')

# Model 3: TextVectorization Layer, Convolutional NN Embedding Model, Two Hidden Dense Layers (64, 32)

In [16]:
def build_model_3(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = layers.Conv1D(32, 4, activation='relu')(x)
    x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [20]:
model_3 = build_model_3()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3 = model_3.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3 = pd.DataFrame(history_3.history)
hist_3

Epoch 1/5
Epoch 2/5
Epoch 3/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.959536,0.11765,0.0,0.68698,0.106175,0.0
1,0.942791,0.118814,0.0,0.679683,0.101125,0.0
2,0.932319,0.123929,0.0,0.69005,0.101403,0.0


In [21]:
model_3.save("./models/model_3.h5")

In [22]:
hist_3.to_csv('/model_performance/hist_3.csv')

# Model 3 B: TextVectorization Layer, Convolutional NN Embedding Model, Four Hidden Dense Layers (128, 64, 64, 32)

In [27]:
def build_model_3_b(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = layers.Conv1D(32, 4, activation='relu')(x)
    x = layers.MaxPooling1D()(x)
    
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [28]:
model_3_b = build_model_3_b()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3_b = model_3_b.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3_b = pd.DataFrame(history_3_b.history)
hist_3_b

Epoch 1/5
Epoch 2/5
Epoch 3/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968016,0.118088,0.0,0.705901,0.122682,0.0
1,0.96225,0.125279,0.0,0.68157,0.09618,0.0
2,0.950995,0.122447,0.0,0.707882,0.099532,0.0


In [30]:
model_3_b.save("./models/model_3_b.h5")
hist_3_b.to_csv('/model_performance/hist_3_b.csv')

# Model 3 C: TextVectorization Layer, 1 Simple RNN Embedding Model, Four Hidden Dense Layers (128, 64, 64, 32)

In [18]:
def build_model_3_c(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    # x = layers.SimpleRNN(32, return_sequences=True)(x)
    x = layers.SimpleRNN(32)(x)
    
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [19]:
model_3_c = build_model_3_c()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3_c = model_3_c.fit(vectorized_text[:10000], y_train[:10000], epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3_c = pd.DataFrame(history_3_c.history)
hist_3_c

Epoch 1/5
Epoch 2/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.493012,0.124013,0.0,0.712301,0.191536,0.0
1,1.492829,0.139525,0.0,0.713138,0.10189,0.0


In [20]:
model_3_c.save("./models/model_3_c.h5")
hist_3_c.to_csv('/model_performance/hist_3_c.csv')

# Model 3 D: TextVectorization Layer, Double Simple RNN Embedding Model, Four Hidden Dense Layers (128, 64, 64, 32)

In [69]:
def build_model_3_d(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim*2, input_length=output_sequence_length)(inputs)

    x = layers.SimpleRNN(32, return_sequences=True)(x)
    x = layers.SimpleRNN(32)(x)
    
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model
model_3_d = build_model_3_d()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3_d = model_3_d.fit(vectorized_text[:10000], y_train[:10000], epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3_d = pd.DataFrame(history_3_d.history)
hist_3_d

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.493058,0.124635,0.0,0.714317,0.200483,0.0
1,1.493849,0.137843,0.0,0.710901,0.092969,0.0
2,1.493515,0.147939,0.0,0.705894,0.131465,0.0
3,1.491341,0.153276,0.0,0.707012,0.117426,0.0


In [71]:
model_3_d.save("./models/model_3_d.h5")
hist_3_d.to_csv('/model_performance/hist_3_d.csv')

# Model 3 E: TextVectorization Layer, 1D Convolutional Embedding Model, Four Hidden Dense Layers (128, 64, 64, 32)

In [73]:
def build_model_3_e(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = layers.Conv1D(32, 4, activation='relu')(x)
    x = layers.MaxPooling1D()(x)
    
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    x = layers.Flatten(name='flatten_1')(x)
    
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

model_3_e = build_model_3_e()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3_e = model_3_e.fit(vectorized_text[:10000], y_train[:10000], epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3_e = pd.DataFrame(history_3_e.history)
hist_3_e

Epoch 1/5
Epoch 2/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.490502,0.133164,0.0,0.706304,0.142727,0.0
1,1.490278,0.137152,0.0,0.706345,0.131975,0.0


In [74]:
model_3_e.save("./models/model_3_e.h5")
hist_3_e.to_csv('/model_performance/hist_3_e.csv')

# Model 3 with Dropout: TextVectorization Layer, 1D Convolutional Embedding Model, Two Hidden Dense Layers (64, 32), 50% Dropout

In [23]:
def build_model_3_dropout(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)

    x = layers.Conv1D(32, 4, activation='relu')(x)
    x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [24]:
model_3_dp = build_model_3_dropout()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_3_dp = model_3_dp.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_3_dp = pd.DataFrame(history_3_dp.history)
hist_3_dp

Epoch 1/5
Epoch 2/5
Epoch 3/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.960744,0.123068,0.0,0.694064,0.126233,0.0
1,0.947665,0.119645,0.0,0.679076,0.138082,0.0
2,0.939346,0.121279,0.0,0.687265,0.114197,0.0


In [25]:
model_3_dp.save("./models/model_3_dp.h5")

In [26]:
hist_3_dp.to_csv('/model_performance/hist_3_dp.csv')

# Model 4: TextVectorization Layer, LSTM RNN Embedding Model, Two Hidden Dense Layers (64, 32)

In [31]:
def build_model_4_tanh(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)
    
    x = layers.LSTM(32, activation='tanh')(x)
    # x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='tanh')(x)
    x = Dense(32, activation='tanh')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])

    return model

In [32]:
model_4_tanh = build_model_4_tanh()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_4_tanh = model_4_tanh.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_4_tanh = pd.DataFrame(history_4_tanh.history)

2024-07-31 07:58:52.330468: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 07:58:52.336520: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 07:58:52.340800: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/5


2024-07-31 07:58:55.313395: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 07:58:55.321563: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 07:58:55.329862: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-07-31 09:04:56.422212: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 09:04:56.428072: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 09:04:56.432751: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5


In [35]:
hist_4_tanh

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968242,0.12155,0.0,0.70656,0.119283,0.0
1,0.968232,0.124366,0.0,0.708794,0.100467,0.0


In [33]:
model_4_tanh.save("./models/model_4_tanh.h5")

In [34]:
hist_4_tanh.to_csv('/model_performance/hist_4_tanh.csv')

In [67]:
def build_model_4_tanh_b(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)
    
    x = layers.LSTM(64, return_sequences=True)(x)
    x = layers.LSTM(64)(x)
    
    x = Dense(64, activation='tanh')(x)
    x = Dense(32, activation='tanh')(x)

    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])

    return model

In [None]:
model_4_tanh_b = build_model_4_tanh_b()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_4_tanh_b = model_4_tanh_b.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_4_tanh_b = pd.DataFrame(history_4_tanh_b.history)
hist_4_tanh_b

In [None]:
model_4_tanh_b.save("./models/model_4_tanh_b.h5")
hist_4_tanh_b.to_csv('/model_performance/hist_4_tanh_b.csv')

# Model 5: TextVectorization Layer, Bi-Directional LSTM RNN Embedding Model, Two Hidden Dense Layers (64, 32)

In [36]:
def build_model_5_tanh(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)
    inputs = Input(shape=(output_sequence_length,))
    x = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(inputs)
    
    x = Bidirectional(LSTM(32, activation='tanh'))(x)
    # x = layers.MaxPooling1D()(x)
    
    x = Dense(64, activation='tanh')(x)
    x = Dense(32, activation='tanh')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [37]:
model_5_tanh = build_model_5_tanh()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_5_tanh = model_5_tanh.fit(vectorized_text, y_train, epochs=5, batch_size=2, verbose=1, callbacks=[early_stopping], validation_data=(vectorized_text_val, y_val))
hist_5_tanh = pd.DataFrame(history_5_tanh.history)
hist_5_tanh

2024-07-31 10:07:41.764336: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 10:07:41.766802: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 10:07:41.768389: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/5


2024-07-31 10:07:41.981980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 10:07:41.986243: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 10:07:41.988697: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-07-31 11:33:56.485077: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 11:33:56.488362: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 11:33:56.491713: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.967287,0.158545,0.0,0.706388,0.126739,0.0
1,0.929697,0.133679,0.0,0.621943,0.113237,0.0
2,0.88564,0.148274,0.0,0.640811,0.106668,0.0


In [39]:
model_5_tanh.save("./models/model_5_tanh.h5")

In [38]:
hist_5_tanh.to_csv('/model_performance/hist_5_tanh.csv')

# Model Evaluations

Evaluate all the losses (mean squared error) of the models:
 - Model 2 - Basic Embedding Model - 2 Dense Layers (64, 32)
 - Model 2 - 3 Ngram Basic Embedding Model - 2 Dense Layers (64, 32)
 - Model 3 - Conv1d Model, Two Hidden Dense Layers (64, 32)
 - Model 3b- Conv1d Model, Two Hidden Dense Layers (128, 64, 64, 32)
 - Model 3c: Single Simple RNN, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3d: Double Simple RNN, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3e: Conv1d Model, Four Hidden Dense Layers (128, 64, 64, 32)
 - Model 3 + Dropout: Conv1d Model, Two Hidden Dense Layers (64, 32), 50% Dropout
 - Model 4: LSTM RNN Model, Two Hidden Dense Layers (64, 32)
 - Model 5: BiDirectional LSTM RNN Model, Two Hidden Dense Layers (64, 32)

In [41]:
hist2

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.940697,0.111765,0.0,0.609747,0.104846,0.0
1,0.860433,0.116496,0.0,0.627505,0.117921,0.0


In [42]:
hist_3

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.959536,0.11765,0.0,0.68698,0.106175,0.0
1,0.942791,0.118814,0.0,0.679683,0.101125,0.0
2,0.932319,0.123929,0.0,0.69005,0.101403,0.0


In [43]:
hist_3_b

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968016,0.118088,0.0,0.705901,0.122682,0.0
1,0.96225,0.125279,0.0,0.68157,0.09618,0.0
2,0.950995,0.122447,0.0,0.707882,0.099532,0.0


In [85]:
hist_3_c

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.491987,0.125689,0.0,0.712539,0.205601,0.0
1,1.49409,0.1322,0.0,0.708502,0.102327,0.0
2,1.49337,0.16472,0.0,0.706946,0.118365,0.0
3,1.490325,0.135241,0.0,0.707049,0.116929,0.0


In [86]:
hist_3_d

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.493058,0.124635,0.0,0.714317,0.200483,0.0
1,1.493849,0.137843,0.0,0.710901,0.092969,0.0
2,1.493515,0.147939,0.0,0.705894,0.131465,0.0
3,1.491341,0.153276,0.0,0.707012,0.117426,0.0


In [87]:
hist_3_e

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.490502,0.133164,0.0,0.706304,0.142727,0.0
1,1.490278,0.137152,0.0,0.706345,0.131975,0.0


In [44]:
hist_3_dp

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.960744,0.123068,0.0,0.694064,0.126233,0.0
1,0.947665,0.119645,0.0,0.679076,0.138082,0.0
2,0.939346,0.121279,0.0,0.687265,0.114197,0.0


In [45]:
hist_4_tanh

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.968242,0.12155,0.0,0.70656,0.119283,0.0
1,0.968232,0.124366,0.0,0.708794,0.100467,0.0


In [46]:
hist_5_tanh

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.967287,0.158545,0.0,0.706388,0.126739,0.0
1,0.929697,0.133679,0.0,0.621943,0.113237,0.0
2,0.88564,0.148274,0.0,0.640811,0.106668,0.0


# Model Losses (Train and Validation)

In [49]:
def validate_model(model, x_train, y_train, x_val, y_val, x_test, y_test):
    train_loss_mse, train_mae, train_acc = model.evaluate(x_train, y_train, verbose=0)
    val_loss_mse, val_mae, val_acc = model.evaluate(x_val, y_val, verbose=0)
    test_loss_mse, test_mae, test_acc = model.evaluate(x_test, y_test, verbose=0)
    
    return {
        "train": {"mse":train_loss_mse,"mae":train_mae,"acc":train_acc},
        "val": {"mse":val_loss_mse,"mae":val_mae,"acc":val_acc},
        "test": {"mse":test_loss_mse,"mae":test_mae,"acc":test_acc},
    }

In [50]:
model_2_saved = tf.keras.models.load_model('model_2.h5')

model_3_saved = tf.keras.models.load_model('model_3.h5')
model_3_b_saved = tf.keras.models.load_model('model_3_b.h5')
model_3_c_saved = tf.keras.models.load_model('model_3_c.h5')
model_3_d_saved = tf.keras.models.load_model('model_3_d.h5')
model_3_e_saved = tf.keras.models.load_model('model_3_e.h5')
model_3_dropout_saved = tf.keras.models.load_model('model_3_dp.h5')

model_4_tanh_saved = tf.keras.models.load_model('model_4_tanh.h5')

model_5_tanh_saved = tf.keras.models.load_model('model_5_tanh.h5')

2024-07-31 17:25:32.804751: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 17:25:32.807242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 17:25:32.811232: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [79]:
model_2_eval = validate_model(model_2_saved ,vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)

model_3_eval = validate_model(model_3_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
model_3_b_val = validate_model(model_3_b_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
model_3_c_val = validate_model(model_3_c_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
model_3_d_val = validate_model(model_3_d_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
model_3_e_val = validate_model(model_3_e_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)
model_3_dropout_eval = validate_model(model_3_dropout_saved,vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)

model_4_tanh_eval = validate_model(model_4_tanh_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)

model_5_tanh_eval = validate_model(model_5_tanh_saved, vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)

2024-08-01 12:37:01.444014: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-01 12:37:01.446195: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-01 12:37:01.448980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [52]:
pd.DataFrame(model_2_eval).T

Unnamed: 0,mse,mae,acc
train,0.860489,0.098758,0.0
val,0.609747,0.104846,0.0
test,0.284968,0.083762,0.0


In [53]:
pd.DataFrame(model_3_eval).T

Unnamed: 0,mse,mae,acc
train,0.936278,0.088931,0.0
val,0.681571,0.09618,0.0
test,0.351257,0.070521,0.0


In [54]:
pd.DataFrame(model_3_b_val).T

Unnamed: 0,mse,mae,acc
train,0.936278,0.088931,0.0
val,0.681571,0.09618,0.0
test,0.351257,0.070521,0.0


In [80]:
pd.DataFrame(model_3_c_val).T

Unnamed: 0,mse,mae,acc
train,0.967674,0.110951,0.0
val,0.706946,0.118365,0.0
test,0.378608,0.091737,0.0


In [81]:
pd.DataFrame(model_3_d_val).T

Unnamed: 0,mse,mae,acc
train,0.967555,0.123531,0.0
val,0.705893,0.131465,0.0
test,0.378834,0.104127,0.0


In [84]:
pd.DataFrame(model_3_e_val).T

Unnamed: 0,mse,mae,acc
train,0.967551,0.135746,0.0
val,0.706304,0.142727,0.0
test,0.379708,0.116717,0.0


In [55]:
pd.DataFrame(model_3_dropout_eval).T

Unnamed: 0,mse,mae,acc
train,0.930402,0.131928,0.0
val,0.679076,0.138082,0.0
test,0.350863,0.113639,0.0


In [56]:
pd.DataFrame(model_4_tanh_eval).T

Unnamed: 0,mse,mae,acc
train,0.967334,0.111757,0.0
val,0.706559,0.119283,0.0
test,0.378461,0.092595,0.0


In [57]:
pd.DataFrame(model_5_tanh_eval).T

Unnamed: 0,mse,mae,acc
train,0.870323,0.103918,0.0
val,0.621942,0.113237,0.0
test,0.320861,0.09116,0.0


Model 2 performed the best in terms of validation dataset loss, which calculated as the mean squared error.

Model 2 - Validation Loss after 5 Epochs - 2145500.25
Model 3 - Validation Loss after 3 Epochs - 2365950.75
Model 4 - Validation Loss after 4 Epochs - 2463587.00
Model 5 - Validation Loss after 5 Epochs - 2465304.75

# End of file

Sources:
* https://stackoverflow.com/questions/73878049/how-do-you-convert-the-pandas-dataframe-to-tensorflow-python-data-ops-dataset-op