In [24]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# tf and keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    GlobalAveragePooling1D,
    Dropout,
    TextVectorization,
    Input,
    Conv1D,
    LSTM,
    MaxPooling1D,
    Bidirectional,
    Concatenate,
)
from tensorflow.keras.models import Model
# import tensorflow_datasets as tfds

# Allow up to 100 characters per cell when pandas renders a DataFrame column.
pd.set_option('display.max_colwidth', 100) 

# UTILS

In [4]:
def display_model(model):
    """Show a model's layers, its summary and the weights of its Embedding layer.

    Args:
        model: A built Keras model that contains at least one ``Embedding`` layer.

    Raises:
        ValueError: If ``model`` has no ``Embedding`` layer.
    """
    display(model.layers)
    # summary() prints the table itself; wrapping it in display() only adds
    # a stray "None" to the cell output.
    model.summary()

    # Find the embedding layer by type. A fixed position such as layers[1]
    # only works when the Embedding layer happens to sit at that index.
    embedding_layer = next(
        (layer for layer in model.layers if isinstance(layer, Embedding)),
        None,
    )
    if embedding_layer is None:
        raise ValueError("display_model expects a model with an Embedding layer")

    # get_weights() returns a list; its first entry is the embedding matrix.
    embeddings = embedding_layer.get_weights()[0]
    print('-'*100)
    display("Embeddings layer - shape: ", embeddings.shape)
    print('-'*100)
    display("Embeddings layer - parameter matrix (before training): ", embeddings)

In [5]:
def read_files(data_dir='./data/final'):
    """Load the train/val/test splits and drop rows whose ``fulltext`` is missing.

    Args:
        data_dir: Directory containing ``X_<split>.csv`` and ``y_<split>.csv``
            for the splits ``train``, ``val`` and ``test``. Defaults to the
            location this notebook has always read from.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of
        DataFrames. Each feature/target pair is filtered with the same mask,
        so the pair keeps matching rows.
    """
    frames = []
    for split in ('train', 'val', 'test'):
        features = pd.read_csv(os.path.join(data_dir, f'X_{split}.csv'))
        targets = pd.read_csv(os.path.join(data_dir, f'y_{split}.csv'))

        # The mask is computed on the features and reused for the targets.
        has_text = features['fulltext'].notna()

        # .copy() gives callers independent frames, so adding columns later
        # does not trigger pandas' SettingWithCopyWarning.
        frames.append(features[has_text].copy())
        frames.append(targets[has_text].copy())

    return tuple(frames)

In [6]:
def get_vectorization_layer(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    """Build a ``TextVectorization`` layer and adapt it to one text column.

    Args:
        df: DataFrame holding the text to learn the vocabulary from.
        column: Name of the text column in ``df``.
        max_tokens: Passed to ``TextVectorization`` as ``max_tokens``.
        output_sequence_length: Passed to ``TextVectorization`` as
            ``output_sequence_length``.
        embedding_dim: Unused here; kept so existing calls keep working.

    Returns:
        The adapted ``TextVectorization`` layer (``output_mode='int'``).
    """
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    # Cast a copy of the column to str instead of overwriting it in ``df``;
    # building a layer should not modify the caller's DataFrame.
    corpus = df[column].astype(str).to_numpy()
    vectorize_layer.adapt(corpus)

    return vectorize_layer

In [7]:
def get_vectorization_layer_ngrams(df, column, max_tokens=10000, output_sequence_length=250, embedding_dim=16, ngrams=3):
    """Build an n-gram ``TextVectorization`` layer and adapt it to one text column.

    Args:
        df: DataFrame holding the text to learn the vocabulary from.
        column: Name of the text column in ``df``.
        max_tokens: Passed to ``TextVectorization`` as ``max_tokens``.
        output_sequence_length: Passed to ``TextVectorization`` as
            ``output_sequence_length``.
        embedding_dim: Unused here; kept so existing calls keep working.
        ngrams: Passed to ``TextVectorization`` as ``ngrams``.

    Returns:
        The adapted ``TextVectorization`` layer (``output_mode='int'``).
    """
    vectorize_layer = layers.TextVectorization(
        max_tokens=max_tokens,
        ngrams=ngrams,
        output_mode='int',
        output_sequence_length=output_sequence_length)

    # Cast a copy of the column to str instead of overwriting it in ``df``;
    # building a layer should not modify the caller's DataFrame.
    corpus = df[column].astype(str).to_numpy()
    vectorize_layer.adapt(corpus)

    return vectorize_layer

In [30]:
# Load the six split frames; read_files() has already dropped rows whose 'fulltext' is missing.
X_train, y_train, X_val, y_val, X_test, y_test = read_files()

### Convert the fulltext into tensors

In [9]:
# One string tensor per split, built from the raw 'fulltext' column values.
text_data, text_data_val, text_data_test = (
    tf.constant(split_frame['fulltext'].values)
    for split_frame in (X_train, X_val, X_test)
)

2024-08-01 13:43:53.060966: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


### Get vectorization layer

In [10]:
# Both vocabularies are learned from the training split only.
# The second helper also passes ngrams (its default is 3) to TextVectorization.
vectorize_layer = get_vectorization_layer(X_train, 'fulltext')
vectorize_layer_3_ngrams = get_vectorization_layer_ngrams(X_train, 'fulltext')

### Vectorize Text

In [11]:
# Apply each adapted layer to the train, validation and test tensors, in that order.
_text_splits = (text_data, text_data_val, text_data_test)

vectorized_text, vectorized_text_val, vectorized_text_test = (
    vectorize_layer(split_tensor) for split_tensor in _text_splits
)

vectorized_text_3_ngrams, vectorized_text_val_3_ngrams, vectorized_text_test_3_ngrams = (
    vectorize_layer_3_ngrams(split_tensor) for split_tensor in _text_splits
)

# Data Manipulation

In [59]:
def create_time_fields(X_train, X_val, X_test):
    """Return copies of the three frames with time-of-day features added.

    For each frame the ``created`` column is parsed with ``pd.to_datetime``
    and two columns are derived from it: ``hour_of_day`` (``.dt.hour``) and
    ``day_of_week`` (``.dt.dayofweek``).

    The input frames are not modified. New frames are returned, so the cell
    can be re-run safely and pandas' SettingWithCopyWarning is avoided when
    the inputs are filtered slices.

    Args:
        X_train: Training features; must contain a ``created`` column.
        X_val: Validation features; must contain a ``created`` column.
        X_test: Test features; must contain a ``created`` column.

    Returns:
        A tuple ``(X_train, X_val, X_test)`` of the augmented frames.
    """
    def _with_time_fields(frame):
        # Parse once and derive both features from the same parsed series.
        created = pd.to_datetime(frame['created'])
        return frame.assign(
            created=created,
            hour_of_day=created.dt.hour,
            day_of_week=created.dt.dayofweek,
        )

    return tuple(_with_time_fields(frame) for frame in (X_train, X_val, X_test))

# Rebind the three feature frames to the versions that carry 'hour_of_day' and 'day_of_week'.
X_train, X_val, X_test = create_time_fields(X_train, X_val, X_test)

# Model 1 

In [40]:
def build_multifeature_model(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    """Build and compile a two-input model: token ids plus ``hour_of_day``.

    Text branch: ``Embedding`` followed by ``GlobalAveragePooling1D``.
    Numeric branch: ``Dense(32, relu)`` followed by ``Dropout(0.5)``.
    The branches are concatenated and passed through ``Dense(64, relu)``,
    ``Dropout(0.5)``, ``Dense(32, relu)`` and a single-unit output layer
    with no activation.

    Args:
        max_tokens: ``input_dim`` of the Embedding layer.
        output_sequence_length: Length of the ``fulltext`` input and the
            Embedding ``input_length``.
        embedding_dim: ``output_dim`` of the Embedding layer.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, mean-squared-error loss).
    """
    # Reset Keras global state and fix the TensorFlow seed before any layer is created.
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    # Input names must match the dict keys passed to fit()/evaluate().
    text_input = Input(shape=(output_sequence_length,), name="fulltext")
    dense_input = Input(shape=(1,), dtype=tf.float32, name='hour_of_day')
    
    embeddings = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(text_input)
    dense_hidden = Dense(32, activation='relu')(dense_input)
    dense_hidden = Dropout(0.5)(dense_hidden)
    
    # Average the token embeddings over the sequence axis to get one vector per example.
    flattened_text = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
    combined = Concatenate()([flattened_text, dense_hidden])
    
    x = Dense(64, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    
    # Single linear output unit.
    outputs = Dense(1)(x)
    
    model = Model(inputs=[text_input, dense_input], outputs=outputs)

    # NOTE(review): 'accuracy' is 0.0 in every epoch of the recorded history,
    # so it is probably not meaningful for this target. Removing it would
    # change the number of values evaluate() returns — confirm before changing.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [45]:
model_1_multifeature = build_multifeature_model()
# Stop as soon as val_loss fails to improve for one epoch and keep the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_1_mf = model_1_multifeature.fit(
    {
        # Only the first 10,000 training rows are used; the dict keys match the model's Input names.
        'fulltext': vectorized_text[:10000],
        'hour_of_day': X_train['hour_of_day'][:10000],
    },
    y_train[:10000],
    epochs=5,
    batch_size=2,
    verbose=1,
    callbacks=[early_stopping],
    # Validation uses the full validation split.
    validation_data=({
        'fulltext': vectorized_text_val,
        'hour_of_day': X_val['hour_of_day']
    }, y_val)
)

# Per-epoch metrics as a table; the bare name on the last line renders it.
hist_1_mf = pd.DataFrame(history_1_mf.history)
hist_1_mf

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.590415,0.241543,0.0,0.710831,0.194981,0.0
1,1.494863,0.145407,0.0,0.706158,0.151563,0.0
2,1.492243,0.136103,0.0,0.703952,0.127958,0.0
3,1.484957,0.133357,0.0,0.695268,0.117376,0.0
4,1.469626,0.130431,0.0,0.656998,0.1023,0.0


In [47]:
# Persist the trained model to 'model_1_multifeature.h5' (path relative to the working directory).
model_1_multifeature.save("model_1_multifeature.h5")

In [48]:
# Write the per-epoch history table next to the saved model.
hist_1_mf.to_csv('hist_1_mf.csv')

In [60]:
def build_multifeature_model_2(max_tokens=10000, output_sequence_length=250, embedding_dim=16):
    """Build and compile a three-input model: token ids, ``hour_of_day`` and ``day_of_week``.

    Each numeric input has its own ``Dense(32, relu)`` + ``Dropout(0.5)``
    branch. The text branch is ``Embedding`` followed by
    ``GlobalAveragePooling1D``. The three branches are concatenated and
    passed through ``Dense(64, relu)``, ``Dropout(0.5)``, ``Dense(32, relu)``
    and a single-unit output layer with no activation.

    Args:
        max_tokens: ``input_dim`` of the Embedding layer.
        output_sequence_length: Length of the ``fulltext`` input and the
            Embedding ``input_length``.
        embedding_dim: ``output_dim`` of the Embedding layer.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, mean-squared-error loss).
    """
    # Reset Keras global state and fix the TensorFlow seed before any layer is created.
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    # Input names must match the dict keys passed to fit()/evaluate().
    text_input = Input(shape=(output_sequence_length,), name="fulltext")
    dense_input_1 = Input(shape=(1,), dtype=tf.float32, name='hour_of_day')
    dense_input_2 = Input(shape=(1,), dtype=tf.float32, name='day_of_week')
    
    # Branch for hour_of_day.
    dense_hidden_1 = Dense(32, activation='relu')(dense_input_1)
    dense_hidden_1 = Dropout(0.5)(dense_hidden_1)

    # Branch for day_of_week.
    dense_hidden_2 = Dense(32, activation='relu')(dense_input_2)
    dense_hidden_2 = Dropout(0.5)(dense_hidden_2)

    # Text branch: embed the token ids, then average over the sequence axis.
    embeddings = Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=output_sequence_length)(text_input)
    flattened_text = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
    combined = Concatenate()([flattened_text, dense_hidden_1, dense_hidden_2])
    
    x = Dense(64, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    
    # Single linear output unit.
    outputs = Dense(1)(x)
    
    model = Model(inputs=[text_input, dense_input_1, dense_input_2], outputs=outputs)

    # NOTE(review): 'accuracy' is 0.0 in every epoch of the recorded history,
    # so it is probably not meaningful for this target. Removing it would
    # change the number of values evaluate() returns — confirm before changing.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error','accuracy'])
    
    return model

In [61]:
model_2_multifeature = build_multifeature_model_2()
# Stop as soon as val_loss fails to improve for one epoch and keep the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history_2_mf = model_2_multifeature.fit(
    {
        # Only the first 10,000 training rows are used; the dict keys match the model's Input names.
        'fulltext': vectorized_text[:10000],
        'hour_of_day': X_train['hour_of_day'][:10000],
        'day_of_week': X_train['day_of_week'][:10000],
    },
    y_train[:10000],
    epochs=5,
    batch_size=2,
    verbose=1,
    callbacks=[early_stopping],
    # Validation uses the full validation split.
    validation_data=({
        'fulltext': vectorized_text_val,
        'hour_of_day': X_val['hour_of_day'],
        'day_of_week': X_val['day_of_week']
    }, y_val)
)

# Per-epoch metrics as a table; the bare name on the last line renders it.
hist_2_mf = pd.DataFrame(history_2_mf.history)
hist_2_mf

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.555413,0.204005,0.0,0.714234,0.211522,0.0
1,1.492719,0.14094,0.0,0.705502,0.143369,0.0
2,1.489709,0.135813,0.0,0.701544,0.129748,0.0
3,1.486173,0.138126,0.0,0.69243,0.100725,0.0
4,1.441503,0.127889,0.0,0.628241,0.094379,0.0


In [62]:
# Persist the trained model and its per-epoch history (paths relative to the working directory).
model_2_multifeature.save("model_2_multifeature.h5")
hist_2_mf.to_csv('hist_2_mf.csv')

# Model Evaluations

In [41]:
# NOTE(review): this cell's recorded execution count (41) is lower than the
# training cell's (45), so the stored output may come from an earlier run — re-run to confirm.
hist_1_mf

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,0.940697,0.111765,0.0,0.609747,0.104846,0.0
1,0.860433,0.116496,0.0,0.627505,0.117921,0.0


In [63]:
# Per-epoch training/validation metrics for the three-input model.
hist_2_mf

Unnamed: 0,loss,mean_absolute_error,accuracy,val_loss,val_mean_absolute_error,val_accuracy
0,1.555413,0.204005,0.0,0.714234,0.211522,0.0
1,1.492719,0.14094,0.0,0.705502,0.143369,0.0
2,1.489709,0.135813,0.0,0.701544,0.129748,0.0
3,1.486173,0.138126,0.0,0.69243,0.100725,0.0
4,1.441503,0.127889,0.0,0.628241,0.094379,0.0


# Model Losses (Train and Validation)

In [49]:
def validate_model(model, x_train, y_train, x_val, y_val, x_test, y_test):
    """Evaluate ``model`` on the train, validation and test splits.

    ``model.evaluate`` is expected to yield exactly three values per split,
    which are stored as ``mse``, ``mae`` and ``acc`` in that order.

    Returns:
        A dict keyed by ``"train"``, ``"val"`` and ``"test"``; each value is
        a dict with the keys ``"mse"``, ``"mae"`` and ``"acc"``.
    """
    splits = (
        ("train", x_train, y_train),
        ("val", x_val, y_val),
        ("test", x_test, y_test),
    )

    report = {}
    for split_name, inputs, targets in splits:
        # Unpacking enforces the three-value contract (loss plus two metrics).
        loss_mse, mae, acc = model.evaluate(inputs, targets, verbose=0)
        report[split_name] = {"mse": loss_mse, "mae": mae, "acc": acc}

    return report

In [50]:
# NOTE(review): no cell visible in this notebook writes 'model_2.h5' (the cells above save
# 'model_1_multifeature.h5' and 'model_2_multifeature.h5') — confirm where this file comes from.
model_2_saved = tf.keras.models.load_model('model_2.h5')

2024-07-31 17:25:32.804751: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 17:25:32.807242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 17:25:32.811232: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [51]:
# The loaded model is evaluated on the vectorized text alone (no hour_of_day/day_of_week inputs),
# using the full train, validation and test splits.
model_2_eval = validate_model(model_2_saved ,vectorized_text, y_train, vectorized_text_val, y_val, vectorized_text_test, y_test)

2024-07-31 17:27:26.126366: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-31 17:27:26.128360: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-31 17:27:26.131305: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [52]:
# Transpose so each split (train/val/test) is a row and each metric a column.
pd.DataFrame(model_2_eval).T

Unnamed: 0,mse,mae,acc
train,0.860489,0.098758,0.0
val,0.609747,0.104846,0.0
test,0.284968,0.083762,0.0


Model 2 performed the best in terms of validation dataset loss, which is calculated as the mean squared error.

Model 2 - Validation Loss after 5 Epochs - 2145500.25
Model 3 - Validation Loss after 3 Epochs - 2365950.75
Model 4 - Validation Loss after 4 Epochs - 2463587.00
Model 5 - Validation Loss after 5 Epochs - 2465304.75

# End of file

Sources:
* https://stackoverflow.com/questions/73878049/how-do-you-convert-the-pandas-dataframe-to-tensorflow-python-data-ops-dataset-op