In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
#pip install transformers tensorflow

In [6]:
# Step 1: Load data
data_path = "./data/final/"
X_train = pd.read_csv(data_path + "X_train.csv")
y_train = pd.read_csv(data_path + "y_train.csv")
X_val = pd.read_csv(data_path + "X_val.csv")
y_val = pd.read_csv(data_path + "y_val.csv")
X_test = pd.read_csv(data_path + "X_test.csv")
y_test = pd.read_csv(data_path + "y_test.csv")

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (29574, 32)
y_train shape: (29574, 1)
X_val shape: (6337, 32)
y_val shape: (6337, 1)
X_test shape: (6338, 32)
y_test shape: (6338, 1)


In [31]:
# Step 2: Preprocess Data
# Tokenization
max_vocab_size = 10000
maxlen = 256
tokenizer = Tokenizer(num_words=max_vocab_size)

# Fit the tokenizer on the training text
tokenizer.fit_on_texts(X_train['fulltext'])

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train['fulltext'])
X_val_sequences = tokenizer.texts_to_sequences(X_val['fulltext'])
X_test_sequences = tokenizer.texts_to_sequences(X_test['fulltext'])

# Pad sequences to ensure uniform length
X_train_padded = pad_sequences(X_train_sequences, maxlen=maxlen, padding='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=maxlen, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=maxlen, padding='post')

# Ensure labels have the correct shape for regression
y_train = y_train['engagement_score_std'].values if isinstance(y_train, pd.DataFrame) else y_train
y_val = y_val['engagement_score_std'].values if isinstance(y_val, pd.DataFrame) else y_val
y_test = y_test['engagement_score_std'].values if isinstance(y_test, pd.DataFrame) else y_test

print(f"X_train_padded shape: {X_train_padded.shape}")
print(f"X_val_padded shape: {X_val_padded.shape}")
print(f"X_test_padded shape: {X_test_padded.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_padded shape: (29574, 256)
X_val_padded shape: (6337, 256)
X_test_padded shape: (6338, 256)
y_train shape: (29574, 1, 1, 1)
y_val shape: (6337, 1, 1, 1)
y_test shape: (6338, 1, 1, 1)


In [40]:
y_val.std()

1568.9514684737458

In [32]:
# Create TensorFlow Datasets
batch_size = 32
train_ds = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train)).batch(batch_size)
val_ds = tf.data.Dataset.from_tensor_slices((X_val_padded, y_val)).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test)).batch(batch_size)

In [35]:
# Step 3: Define the model
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim)])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

# Define constants
embed_dim = 256
dense_dim = 2048
num_heads = 8
vocab_size = 10000
sequence_length = 256

# Define model inputs
encoder_inputs = tf.keras.Input(shape=(sequence_length,), dtype="int64", name="encoder_inputs")

# Define positional embeddings
encoder_embeddings = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

# Define transformer encoder
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(encoder_embeddings)

# Define model outputs for regression
outputs = layers.Dense(1)(encoder_outputs[:, 0, :])

# Define the model
transformer = Model(encoder_inputs, outputs)

# Compile the model
transformer.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])

# Print the model summary
transformer.summary()

In [36]:
# Step 4: Train Model
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_mae',
    patience=3,
    restore_best_weights=True
)

# Train the model
history = transformer.fit(train_ds, epochs=20, validation_data=val_ds, callbacks=[early_stopping])

Epoch 1/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 631ms/step - loss: 5053831.5000 - mae: 172.9015 - val_loss: 2464688.0000 - val_mae: 215.6037
Epoch 2/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 632ms/step - loss: 5041266.0000 - mae: 226.9090 - val_loss: 2462288.7500 - val_mae: 237.9604
Epoch 3/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 632ms/step - loss: 5039407.0000 - mae: 244.1018 - val_loss: 2462076.0000 - val_mae: 241.4503
Epoch 4/20
[1m281/925[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m6:24[0m 596ms/step - loss: 6457628.0000 - mae: 254.1452

KeyboardInterrupt: 

In [37]:
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_mae',
    patience=3,
    restore_best_weights=True
)

# Adjust learning rate
transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mean_squared_error", metrics=["mae"])

# Train the model
history = transformer.fit(train_ds, epochs=20, validation_data=val_ds, callbacks=[early_stopping])

Epoch 1/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 621ms/step - loss: 5039619.0000 - mae: 242.6380 - val_loss: 2462103.5000 - val_mae: 241.6290
Epoch 2/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 620ms/step - loss: 5039355.5000 - mae: 245.9875 - val_loss: 2462141.5000 - val_mae: 241.7082
Epoch 3/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 619ms/step - loss: 5039140.0000 - mae: 247.1489 - val_loss: 2462109.0000 - val_mae: 242.3424
Epoch 4/20
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 620ms/step - loss: 5039098.0000 - mae: 246.9682 - val_loss: 2462176.0000 - val_mae: 242.0878


In [38]:
# Step 5: Evaluate the model on the test data
test_loss, test_mae = transformer.evaluate(test_ds)
print(f"Test MAE: {test_mae}")

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 216ms/step - loss: 500481.5312 - mae: 181.4793
Test MAE: 192.59097290039062
