In [1]:
import numpy as np
import os
import warnings
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler


# Working directory the notebook was started from; data paths are built on it.
cwd = os.getcwd()

# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Set the TensorFlow Python logger to INFO level.
tf.get_logger().setLevel('INFO')

2024-08-01 15:56:04.008594: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-01 15:56:04.319835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 15:56:04.422866: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 15:56:04.454035: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-01 15:56:04.669731: I tensorflow/core/platform/cpu_feature_guar

In [7]:
# Read the pre-split feature (X_*) and target (y_*) CSV files.
# The directory is assembled with os.path.join instead of string
# concatenation so the separator is right on every platform.
data_dir = os.path.join(cwd, 'data', 'final')


def _read_split(file_name):
    """Load one CSV file from ``data_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_dir, file_name))


X_test = _read_split('X_test.csv')
X_train = _read_split('X_train.csv')
X_val = _read_split('X_val.csv')
Y_test = _read_split('y_test.csv')
Y_train = _read_split('y_train.csv')
Y_val = _read_split('y_val.csv')

# Every feature frame must have exactly one target row per feature row.
for features, target in ((X_test, Y_test), (X_train, Y_train), (X_val, Y_val)):
    assert len(features) == len(target), "feature/target row counts differ"

for frame in (X_test, Y_test, X_train, Y_train, X_val, Y_val):
    print(frame.shape)

(6338, 32)
(6338, 1)
(29574, 32)
(29574, 1)
(6337, 32)
(6337, 1)


In [8]:
# Derive posting-time features from 'created' and keep only the columns used
# further down: title and selftext (turned into sentiment labels later),
# hour of day and day of week.
columns_to_retain = ['day_of_week', 'title', 'selftext', 'hour_of_day']
target_columns = ['engagement_score_std']


def _select_features(frame):
    """Return a new frame holding only ``columns_to_retain``.

    'hour_of_day' and 'day_of_week' are computed from the 'created' column
    after parsing it with ``pd.to_datetime``.
    """
    created = pd.to_datetime(frame['created'])
    with_time = frame.assign(
        hour_of_day=created.dt.hour,
        day_of_week=created.dt.day_name(),
    )
    # Explicit copy: later cells assign into these frames column by column,
    # which should operate on an independent object, not on a slice.
    return with_time[columns_to_retain].copy()


X_test = _select_features(X_test)
X_train = _select_features(X_train)
X_val = _select_features(X_val)

# Same reasoning for the targets: they are rescaled in place later on.
Y_test = Y_test[target_columns].copy()
Y_train = Y_train[target_columns].copy()
Y_val = Y_val[target_columns].copy()

for frame in (X_test, Y_test, X_train, Y_train, X_val, Y_val):
    print(frame.shape)



(6338, 4)
(6338, 1)
(29574, 4)
(29574, 1)
(6337, 4)
(6337, 1)


In [10]:

# Columns to standardize
cols_to_standardize_Y = ['engagement_score_std']
target_name = cols_to_standardize_Y[0]

# Fit the scaler on the training target only, then apply the same fitted
# transform to validation and test.
scaler = StandardScaler()
Y_train[target_name] = scaler.fit_transform(Y_train[cols_to_standardize_Y]).ravel()
Y_val[target_name] = scaler.transform(Y_val[cols_to_standardize_Y]).ravel()
Y_test[target_name] = scaler.transform(Y_test[cols_to_standardize_Y]).ravel()

# Cast the text columns to str. Missing values are replaced with '' first:
# astype(str) on its own turns NaN into the literal text 'nan', which would
# bypass the empty-text check in classify_sentiment and be scored as if it
# were real post content.
text_columns = ['title', 'selftext']
for frame in (X_train, X_val, X_test):
    for column in text_columns:
        frame[column] = frame[column].fillna('').astype(str)

In [11]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' resource through NLTK's downloader.
nltk.download('vader_lexicon')

# Single analyzer instance, used by classify_sentiment further down.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/cliftonh/nltk_data...


In [12]:
def classify_sentiment(text):
    """Label a piece of text as 'positive', 'negative' or 'neutral'.

    The label comes from the ``compound`` score returned by the module-level
    VADER analyzer ``sid``:

    * ``compound >= 0.05``  -> 'positive'
    * ``compound <= -0.05`` -> 'negative'
    * otherwise             -> 'neutral'

    Args:
        text: The text to score. Anything that is not a non-blank string
            (None, NaN, other non-str values, '' or whitespace only) is
            labelled 'neutral' without calling the analyzer.

    Returns:
        One of the strings 'positive', 'negative' or 'neutral'.
    """
    # Guard against missing values as well as empty text: NaN is truthy, so a
    # plain `if not text` would let it through to the analyzer.
    if not isinstance(text, str) or not text.strip():
        return 'neutral'

    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

In [13]:
# Swap the raw text in 'title' and 'selftext' for its sentiment label,
# split by split (train, then validation, then test).
for frame in (X_train, X_val, X_test):
    for text_column in ('title', 'selftext'):
        frame[text_column] = frame[text_column].apply(classify_sentiment)

In [14]:
# Show the first five training rows (head() returns five rows by default).
print(X_train.head())

  day_of_week     title  selftext  hour_of_day
0    Thursday  negative  positive           20
1    Thursday  positive  positive           12
2      Friday   neutral  positive           16
3   Wednesday   neutral  negative            7
4    Thursday   neutral  positive           17


In [15]:
# One-hot encode the sentiment labels in 'title' and 'selftext'.
sentiment_columns = ['title', 'selftext']
X_train = pd.get_dummies(X_train, columns=sentiment_columns)
X_val = pd.get_dummies(X_val, columns=sentiment_columns)
X_test = pd.get_dummies(X_test, columns=sentiment_columns)

# Each split is encoded on its own, so a label that never occurs in a split
# would leave that split without the matching indicator column. Align
# validation and test to the training layout: indicators missing from a split
# are added as all-zero columns (cast to the training dtype), columns unseen in
# training are dropped, and the column order follows the training frame.
train_columns = X_train.columns
aligned_frames = []
for frame in (X_val, X_test):
    added_dtypes = {
        name: X_train[name].dtype
        for name in train_columns.difference(frame.columns)
    }
    frame = frame.reindex(columns=train_columns, fill_value=0)
    aligned_frames.append(frame.astype(added_dtypes) if added_dtypes else frame)
X_val, X_test = aligned_frames

In [16]:
# One-hot encode the day name.
X_train = pd.get_dummies(X_train, columns=['day_of_week'])
X_val = pd.get_dummies(X_val, columns=['day_of_week'])
X_test = pd.get_dummies(X_test, columns=['day_of_week'])

# Each split is encoded on its own, so a weekday that never occurs in a split
# would leave it without that indicator column. Align validation and test to
# the training layout: missing indicators become all-zero columns (cast to the
# training dtype), columns unseen in training are dropped, and the column
# order follows the training frame.
train_columns = X_train.columns
aligned_frames = []
for frame in (X_val, X_test):
    added_dtypes = {
        name: X_train[name].dtype
        for name in train_columns.difference(frame.columns)
    }
    frame = frame.reindex(columns=train_columns, fill_value=0)
    aligned_frames.append(frame.astype(added_dtypes) if added_dtypes else frame)
X_val, X_test = aligned_frames

In [17]:
# One-hot encode the hour of day.
X_train = pd.get_dummies(X_train, columns=['hour_of_day'])
X_val = pd.get_dummies(X_val, columns=['hour_of_day'])
X_test = pd.get_dummies(X_test, columns=['hour_of_day'])

# Each split is encoded on its own, so an hour that never occurs in a split
# would leave it without that indicator column and with a different width
# than the model input. Align validation and test to the training layout:
# missing indicators become all-zero columns (cast to the training dtype),
# columns unseen in training are dropped, and the column order follows the
# training frame.
train_columns = X_train.columns
aligned_frames = []
for frame in (X_val, X_test):
    added_dtypes = {
        name: X_train[name].dtype
        for name in train_columns.difference(frame.columns)
    }
    frame = frame.reindex(columns=train_columns, fill_value=0)
    aligned_frames.append(frame.astype(added_dtypes) if added_dtypes else frame)
X_val, X_test = aligned_frames

In [18]:
# Place the training features and the target side by side, compute the
# pairwise correlation matrix, and list every column's correlation with the
# target from highest to lowest.
data_std = pd.concat([X_train, Y_train], axis=1)
corr = data_std.corr()

target_corr = corr['engagement_score_std']
print(target_corr.sort_values(ascending=False))

engagement_score_std     1.000000
hour_of_day_11           0.029030
hour_of_day_10           0.018544
hour_of_day_21           0.013102
selftext_negative        0.010098
title_neutral            0.006938
title_negative           0.006783
hour_of_day_12           0.006141
day_of_week_Thursday     0.005996
day_of_week_Tuesday      0.005474
hour_of_day_5            0.003964
hour_of_day_3            0.003640
hour_of_day_4            0.003152
hour_of_day_20           0.002023
day_of_week_Saturday     0.001806
hour_of_day_18           0.000483
day_of_week_Wednesday   -0.000324
day_of_week_Friday      -0.001358
hour_of_day_17          -0.001476
hour_of_day_13          -0.001530
hour_of_day_2           -0.002578
selftext_neutral        -0.002709
hour_of_day_15          -0.003206
hour_of_day_6           -0.003385
hour_of_day_9           -0.003852
day_of_week_Sunday      -0.004415
hour_of_day_1           -0.004612
hour_of_day_8           -0.004746
hour_of_day_22          -0.004839
hour_of_day_7 

In [19]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
def build_model(num_features, learning_rate, l2_strength=0.001):
    """Build and compile a single-unit linear regression model with Keras.

    Args:
        num_features: Number of input columns fed to the model.
        learning_rate: Learning rate passed to the SGD optimizer.
        l2_strength: Factor of the L2 penalty on the Dense kernel. Defaults
            to 0.001.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one Dense unit, 'mse'
        loss and an 'mse' metric. Because of the L2 penalty the reported
        loss is slightly higher than the reported mse metric.
    """
    # Clear Keras' global state and reseed so repeated calls start from the
    # same initial weights.
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_shape=` to the first layer is deprecated and makes Keras
        # emit a warning when the layer is constructed.
        tf.keras.Input(shape=(num_features,)),
        Dense(
            units=1,
            use_bias=True,
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            bias_initializer=tf.keras.initializers.Zeros(),
            kernel_regularizer=l2(l2_strength),
        ),
    ])

    model.compile(
        loss='mse',
        optimizer=SGD(learning_rate=learning_rate),
        metrics=['mse'],
    )

    return model

In [20]:
# Seed TensorFlow before the model is created.
tf.random.set_seed(0)

# Hyperparameters
learning_rate = 0.001
num_epochs = 25
batch_size = 16

# One model input per feature column
num_features = X_train.shape[1]
model_tf = build_model(num_features, learning_rate)

# Train, passing the validation split so it is scored alongside training
history = model_tf.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    batch_size=batch_size,
    epochs=num_epochs,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 42ms/step - loss: 1.1982 - mse: 1.1965 - val_loss: 0.7947 - val_mse: 0.7936
Epoch 2/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 18ms/step - loss: 1.1051 - mse: 1.1040 - val_loss: 0.7626 - val_mse: 0.7618
Epoch 3/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 18ms/step - loss: 1.0797 - mse: 1.0789 - val_loss: 0.7488 - val_mse: 0.7482
Epoch 4/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 20ms/step - loss: 1.0684 - mse: 1.0678 - val_loss: 0.7417 - val_mse: 0.7412
Epoch 5/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - loss: 1.0624 - mse: 1.0620 - val_loss: 0.7376 - val_mse: 0.7372
Epoch 6/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - loss: 1.0589 - mse: 1.0585 - val_loss: 0.7350 - val_mse: 0.7346
Epoch 7/25
[1m1849/1849[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [21]:
# Score the trained model on every split. Each evaluate() call yields a
# (loss, mse) pair, which is unpacked into the per-split names below.
(train_loss, train_MSE), (val_loss, val_MSE), (test_loss, test_MSE) = (
    model_tf.evaluate(features, target, verbose=0)
    for features, target in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
)

print(f"MSE of train: {train_MSE:.4f}")
print(f"MSE of validation: {val_MSE:.4f}")
print(f"MSE of test: {test_MSE:.4f}")

MSE of train: 0.9980
MSE of validation: 0.7283
MSE of test: 0.3926
