In [4]:
import numpy as np
import os
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  
sns.set(style="darkgrid")  

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow import keras
from keras import metrics
cwd = os.getcwd()
tf.get_logger().setLevel('INFO')

In [5]:
#read data
X_test = pd.read_csv(cwd+'\data\X_test.csv')
X_train = pd.read_csv(cwd+'\data\X_train.csv')
X_val = pd.read_csv(cwd+'\data\X_val.csv')
Y_test = pd.read_csv(cwd+'\data\y_test.csv')
Y_train = pd.read_csv(cwd+'\data\y_train.csv')
Y_val = pd.read_csv(cwd+'\data\y_val.csv')

print(X_test.shape)
print(Y_test.shape)
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)

(5483, 25)
(5483, 2)
(25585, 25)
(25585, 2)
(5483, 25)
(5483, 2)


In [6]:
# total_awards_received	num_comments title (positve, negative, neutral), selftext (positve, negative, neutral), created (hour of day)
X_test['created'] = pd.to_datetime(X_test['created'])
X_train['created'] = pd.to_datetime(X_train['created'])
X_val['created'] = pd.to_datetime(X_val['created'])

# Create the new columns
X_test['hour_of_day'] = X_test['created'].dt.hour
X_test['day_of_week'] = X_test['created'].dt.day_name()

X_train['hour_of_day'] = X_train['created'].dt.hour
X_train['day_of_week'] = X_train['created'].dt.day_name()

X_val['hour_of_day'] = X_val['created'].dt.hour
X_val['day_of_week'] = X_val['created'].dt.day_name()

columns_to_retain = ['day_of_week', 'title', 'selftext', 'hour_of_day']

X_test = X_test[columns_to_retain]
X_train = X_train[columns_to_retain]
X_val = X_val[columns_to_retain]
Y_test = Y_test[['score']]
Y_train = Y_train[['score']]
Y_val = Y_val[['score']]

print(X_test.shape)
print(Y_test.shape)
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)


(5483, 4)
(5483, 1)
(25585, 4)
(25585, 1)
(5483, 4)
(5483, 1)


In [7]:
from sklearn.preprocessing import StandardScaler
# Columns to standardize
cols_to_standardize_Y = ['score']

# Initialize the scaler
scaler = StandardScaler()

# Standardize the score
Y_train['score'] = scaler.fit_transform(Y_train[['score']])
Y_val['score'] = scaler.transform(Y_val[['score']])
Y_test['score'] = scaler.transform(Y_test[['score']])

# Convert specific columns to string
X_train['title'] = X_train['title'].astype(str)
X_train['selftext'] = X_train['selftext'].astype(str)

X_val['title'] = X_val['title'].astype(str)
X_val['selftext'] = X_val['selftext'].astype(str)

X_test['title'] = X_test['title'].astype(str)
X_test['selftext'] = X_test['selftext'].astype(str)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\clift\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
def classify_sentiment(text):
    if not text:
        return 'neutral'
    sentiment_scores = sid.polarity_scores(text)
    compound = sentiment_scores['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [10]:
# Apply the function to classify sentiment
X_train['title'] = X_train['title'].apply(classify_sentiment)
X_train['selftext'] = X_train['selftext'].apply(classify_sentiment)

X_val['title'] = X_val['title'].apply(classify_sentiment)
X_val['selftext'] = X_val['selftext'].apply(classify_sentiment)

X_test['title'] = X_test['title'].apply(classify_sentiment)
X_test['selftext'] = X_test['selftext'].apply(classify_sentiment)

In [11]:
print(X_train.head(5))

  day_of_week     title  selftext  hour_of_day
0    Thursday   neutral  positive           14
1    Saturday   neutral  positive            0
2      Friday  negative  positive            5
3      Monday   neutral  positive            3
4    Thursday  positive  positive           15


In [12]:
# One-hot encode 'title'
X_train = pd.get_dummies(X_train, columns=['title'])
X_val = pd.get_dummies(X_val, columns=['title'])
X_test = pd.get_dummies(X_test, columns=['title'])
X_train = pd.get_dummies(X_train, columns=['selftext'])
X_val = pd.get_dummies(X_val, columns=['selftext'])
X_test = pd.get_dummies(X_test, columns=['selftext'])

In [13]:
X_train = pd.get_dummies(X_train, columns=['day_of_week'])
X_val = pd.get_dummies(X_val, columns=['day_of_week'])
X_test = pd.get_dummies(X_test, columns=['day_of_week'])

In [14]:
X_train = pd.get_dummies(X_train, columns=['hour_of_day'])
X_val = pd.get_dummies(X_val, columns=['hour_of_day'])
X_test = pd.get_dummies(X_test, columns=['hour_of_day'])

In [15]:
data_std = pd.concat([X_train, Y_train], axis=1)

corr = data_std.corr()
print(corr['score'].sort_values(ascending=False))

score                    1.000000
title_negative           0.021747
day_of_week_Thursday     0.013777
hour_of_day_11           0.011735
selftext_negative        0.011528
hour_of_day_16           0.010687
title_positive           0.009451
hour_of_day_14           0.008827
hour_of_day_10           0.006786
hour_of_day_4            0.006407
hour_of_day_12           0.005917
hour_of_day_19           0.005571
hour_of_day_13           0.003938
hour_of_day_20           0.003674
hour_of_day_22           0.001713
hour_of_day_21           0.001503
day_of_week_Saturday     0.000839
day_of_week_Wednesday    0.000410
hour_of_day_6            0.000395
day_of_week_Tuesday     -0.000651
selftext_positive       -0.002018
day_of_week_Monday      -0.002970
hour_of_day_0           -0.003188
hour_of_day_15          -0.003509
hour_of_day_7           -0.004124
hour_of_day_5           -0.004402
hour_of_day_8           -0.004693
hour_of_day_2           -0.004770
hour_of_day_1           -0.005212
day_of_week_Su

In [16]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
def build_model(num_features, learning_rate):
    """Build a TF linear regression model using Keras."""
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential()
    model.add(Dense(
        units=1,        # output dim
        input_shape=[num_features],  # input dim
        use_bias=True,
        kernel_initializer=tf.keras.initializers.GlorotUniform(),  # Better weight initializer
        bias_initializer=tf.keras.initializers.Zeros(),  # Initialize bias to zero
        kernel_regularizer=l2(0.01)  # L2 regularization to prevent overfitting
    ))

    optimizer = SGD(learning_rate=learning_rate)

    model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])

    return model

In [17]:
tf.random.set_seed(0)

# 2. Build and compile model
num_features = X_train.shape[1]
learning_rate = 0.001
model_tf = build_model(num_features, learning_rate)

# 3. Fit the model
num_epochs = 50
batch_size = 16

history = model_tf.fit(
    X_train, Y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_val, Y_val)
)

# Evaluate the model
mse = model_tf.evaluate(X_test, Y_test)
print(f"Mean Squared Error on test data: {mse[0]}")




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 1.7051 - mse: 1.6861 - val_loss: 0.6314 - val_mse: 0.6168
Epoch 2/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.6433 - mse: 1.6297 - val_loss: 0.6071 - val_mse: 0.5960
Epoch 3/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.6209 - mse: 1.6104 - val_loss: 0.5948 - val_mse: 0.5860
Epoch 4/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.6088 - mse: 1.6004 - val_loss: 0.5872 - val_mse: 0.5800
Epoch 5/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.6011 - mse: 1.5942 - val_loss: 0.5822 - val_mse: 0.5762
Epoch 6/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.5959 - mse: 1.5901 - val_loss: 0.5788 - val_mse: 0.5737
Epoch 7/50
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

In [18]:
train_loss, train_MSE = model_tf.evaluate(X_train, Y_train, verbose=0)
val_loss, val_MSE = model_tf.evaluate(X_val, Y_val, verbose=0)
test_loss, test_MSE = model_tf.evaluate(X_test, Y_test, verbose=0)

print(f"MSE of train: {train_MSE:.4f}")
print(f"MSE of validation: {val_MSE:.4f}")
print(f"MSE of test: {test_MSE:.4f}")

MSE of train: 0.9983
MSE of validation: 0.5682
MSE of test: 0.8114


In [69]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, RegressorMixin

# Custom KerasRegressor class
class KerasRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, build_fn=None, learning_rate=0.01, l2_reg=0.01, num_features=10, batch_size=32, epochs=100, verbose=0):
        self.build_fn = build_fn
        self.learning_rate = learning_rate
        self.l2_reg = l2_reg
        self.num_features = num_features
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose
        self.model_ = None

    def fit(self, X, y, **fit_kwargs):
        self.model_ = self.build_fn(learning_rate=self.learning_rate, l2_reg=self.l2_reg, num_features=self.num_features)
        self.model_.fit(X, y, batch_size=self.batch_size, epochs=self.epochs, verbose=self.verbose, **fit_kwargs)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def score(self, X, y):
        return -self.model_.evaluate(X, y, verbose=0)[0]

# Define the build function for the model
def build_model_for_grid(learning_rate, l2_reg, num_features):
    model = tf.keras.Sequential()
    model.add(Dense(
        units=1,
        input_shape=[num_features],
        use_bias=True,
        kernel_initializer=tf.keras.initializers.GlorotUniform(),
        bias_initializer=tf.keras.initializers.Zeros(),
        kernel_regularizer=l2(l2_reg)
    ))

    optimizer = SGD(learning_rate=learning_rate)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
    return model

# Sample data creation for demonstration purposes
# X, Y should be your actual dataset
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Ensure data is scaled
scaler_X = StandardScaler()

# Extract the number of features from the training data
num_features = X_train.shape[1]

# Wrap the Keras model for use in scikit-learn
model = KerasRegressor(build_fn=build_model_for_grid, num_features=num_features)

# Define the grid search parameters
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'l2_reg': [0.001, 0.01, 0.1],
    'batch_size': [16, 32, 64],
    'epochs': [50, 100, 200]
}

# Perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

# Display the best results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 