## Recurrent Neural Network

In [4]:
# %% ================================
# Recurrent model for user activity (LSTM, one-hot cats, no IDs)
# ================================
# Config
# Input CSV; it is read once with pd.read_csv in the "Load data" step.
DATA_PATH = "../data/cleaned_data.csv"

# Sequence construction
MAX_STEPS_PER_USER = 60       # sequence length T: keep each user's first N rows; shorter users are zero-padded to N
MIN_STEPS_REQUIRED  = 5       # users with fewer rows than this are skipped when sequences are built
ORDER_BY = ["day_dt", "impressPosition_log"]  # within-user sort keys; day_dt is the datetime parsed from 'day'

# Train/val/test split (by user)
# Fractions of users held out for validation and test; the remainder is the training set.
VAL_SIZE   = 0.15
TEST_SIZE  = 0.15
RANDOM_SEED = 42              # shared by the NumPy / TensorFlow seeding and both train_test_split calls

# Model / training
LSTM_UNITS   = 64             # hidden size of the single LSTM layer
DROPOUT      = 0.2            # rate used by both Dropout layers
BATCH_SIZE   = 128            # used for fit() and predict()
EPOCHS       = 15             # upper bound; early stopping on val_AUC may end training sooner
LEARNING_RATE = 1e-3          # initial Adam learning rate (ReduceLROnPlateau may lower it)
POS_CLASS_WEIGHT = 2.0   # loss weight for class 1 (class 0 uses 1.0); upweight positives if imbalanced

# %% ----------------
# Imports
# -------------------
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import sklearn

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

# Reproducibility
# Seed NumPy and Keras/TensorFlow with the same value before any data or model work.
np.random.seed(RANDOM_SEED)
tf.keras.utils.set_random_seed(RANDOM_SEED)

# %% ----------------
# Load data
# -------------------
# One row per impression is assumed (several rows per userId) — TODO confirm against the CSV.
df = pd.read_csv(DATA_PATH)

# Optional: handle common column typos (uncomment if needed)
# The rest of this script expects the misspelled name "PushlishMlogCnt".
# if "PublishMlogCnt" in df.columns and "PushlishMlogCnt" not in df.columns:
#     df = df.rename(columns={"PublishMlogCnt": "PushlishMlogCnt"})

# %% ----------------
# Column specification
# -------------------
# The four feature groups below are concatenated into each timestep's feature vector:
# cat_feats are one-hot encoded, the other three groups are used as numeric columns.

# Binary early actions
binary_actions = [
    "isClick","isComment","isIntoPersonalHomepage","isShare","isViewComment","isLike"
]

# Continuous/log features (ensure numeric)
cont_feats = ["mlogViewTime_log","impressPosition_log","followCnt_log","pop_index_pca_lag1"]

# Categorical (one-hot)
cat_feats = ["province","type","creatorType"]

# Numeric context (we'll use a numeric day_index, not raw day string)
# "day_index" is not a CSV column; it is derived from 'day' in the next step.
# NOTE(review): "PushlishMlogCnt" is presumably the spelling used in the CSV header — confirm.
num_feats = ["creator_level","PushlishMlogCnt","age_gender_missing","day_index"]

# Target + grouping key
TARGET   = "y_active"   # reduced to one label per user (max over the user's rows)
USER_KEY = "userId"     # sequences and the train/val/test split are built per value of this column

# %% ----------------
# Parse day as datetime and build numeric day_index
# -------------------
# Fail fast when the raw date column is absent; everything below depends on it.
if "day" not in df.columns:
    raise KeyError("Expected column 'day' not found in the CSV.")

# Strings such as '2019-11-23' become timestamps; unparseable values become NaT.
df["day_dt"] = pd.to_datetime(df["day"], errors="coerce")

# day_index = whole days elapsed since the earliest parsed date (float32 for the model)
min_day = df["day_dt"].min()
elapsed = df["day_dt"] - min_day
df["day_index"] = elapsed.dt.days.astype("float32")

# %% ----------------
# Validate required columns, then coerce dtypes
# -------------------
# Validate up front: a single KeyError lists every absent column (in sorted,
# reproducible order) before any column is touched, instead of silently
# skipping missing numeric columns and failing later.
required = set(binary_actions + cont_feats + cat_feats + num_feats + [TARGET, USER_KEY, "day_dt"])
missing = sorted(c for c in required if c not in df.columns)
if missing:
    raise KeyError(f"Missing expected columns: {missing}")

# Numeric columns: coerce strings to numbers; unparseable values become NaN.
# The declared feature lists are used directly so a newly added numeric
# feature is coerced too (day_index is already float32, so this is a no-op for it).
for c in binary_actions + cont_feats + num_feats:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Categorical columns: make strings and add explicit missing token so that
# missing values get their own one-hot category.
for c in cat_feats:
    df[c] = df[c].astype("string").fillna("__MISSING__")

# Columns to retain: all features, the user key, the target and the sort keys.
# Several names occur more than once (e.g. sort keys that are also features),
# so keep only the first occurrence of each, in order.
keep_cols = []
for col in binary_actions + cont_feats + cat_feats + num_feats + [USER_KEY, TARGET] + ORDER_BY:
    if col not in keep_cols:
        keep_cols.append(col)
df = df.loc[:, keep_cols].copy()

# Order rows by user, then by the temporal keys, so each user's rows are in sequence order
df = df.sort_values(by=[USER_KEY, *ORDER_BY], kind="mergesort")

# One label per user: the maximum of the target over the user's rows (ever-active)
per_user_max = df.groupby(USER_KEY)[TARGET].max()
user_target = per_user_max.astype(int)

# %% ----------------
# One-hot encoder
# -------------------
# Sparse output is OneHotEncoder's default, so neither the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4) nor its replacement
# `sparse_output` needs to be passed. Relying on the default removes the
# version parsing and avoids the deprecation warning that the old `>= (1, 4)`
# gate triggered on 1.2 / 1.3.
# handle_unknown="ignore": categories not seen during fit encode as all zeros.
ohe = OneHotEncoder(handle_unknown="ignore")

# Fit encoder on all rows (categoricals only)
ohe.fit(df[cat_feats])

# Transform helper: OHE cats + stack numeric features
def transform_rows(frame: pd.DataFrame) -> np.ndarray:
    """Turn rows of ``frame`` into a dense float32 feature matrix.

    Columns of the result are the one-hot encoding of ``cat_feats`` (via the
    fitted ``ohe``) followed by ``binary_actions + cont_feats + num_feats``.

    Missing numeric values are replaced by 0.0: the numeric columns are
    coerced with ``errors="coerce"`` upstream and may therefore contain NaN,
    and a single NaN in the network input makes the loss NaN.

    Args:
        frame: rows containing every categorical and numeric feature column.

    Returns:
        Array of shape ``(len(frame), n_onehot + n_numeric)``, dtype float32.
    """
    X_cat = ohe.transform(frame[cat_feats]).toarray().astype(np.float32)  # sparse -> dense
    X_num = frame[binary_actions + cont_feats + num_feats].to_numpy(dtype=np.float32)
    # np.where builds a new array, so the caller's frame is never modified.
    X_num = np.where(np.isnan(X_num), np.float32(0.0), X_num)
    # Both halves are already float32, so hstack needs no further cast.
    return np.hstack([X_cat, X_num])

# Row feature dimension: push one row through the transform and read its width
_row_sample = transform_rows(df.iloc[[0]])
ROW_FEAT_DIM = _row_sample.shape[1]

# %% ----------------
# Build user-level sequences
# -------------------
# For every user with at least MIN_STEPS_REQUIRED rows, build one
# [MAX_STEPS_PER_USER, ROW_FEAT_DIM] matrix from the user's first rows
# (df is already sorted within user) and zero-pad the tail.
Xs, ys, user_list, lengths = [], [], [], []
for uid, g in df.groupby(USER_KEY, sort=False):
    if len(g) < MIN_STEPS_REQUIRED:
        continue
    # Slice BEFORE transforming: rows beyond MAX_STEPS_PER_USER are discarded
    # anyway, so encoding them is wasted work for heavy users.
    g_feat = transform_rows(g.iloc[:MAX_STEPS_PER_USER])
    n_steps = g_feat.shape[0]
    # Start from an all-zero matrix; rows past n_steps stay zero (padding).
    seq = np.zeros((MAX_STEPS_PER_USER, ROW_FEAT_DIM), dtype=np.float32)
    seq[:n_steps] = g_feat
    Xs.append(seq)
    ys.append(int(user_target.loc[uid]))
    user_list.append(uid)
    lengths.append(n_steps)  # number of real (unpadded) steps

# np.stack on an empty list fails with an unhelpful message; say what happened.
if not Xs:
    raise ValueError(
        f"No user has at least {MIN_STEPS_REQUIRED} rows; cannot build any sequence."
    )

X = np.stack(Xs, axis=0)  # [N_users, T, D]
y = np.array(ys, dtype=np.int32)
lengths = np.array(lengths, dtype=np.int32)
print(f"Users kept: {len(user_list)} | Sequence tensor: {X.shape} (N, T, D)")

# %% ----------------
# Split by user (no leakage)
# -------------------
# Each user is exactly one row of X, so partitioning row indices partitions users.
# Only the index array is split; user ids and labels are then looked up by index.
user_arr = np.array(user_list)
all_idx = np.arange(len(user_arr))
holdout_frac = VAL_SIZE + TEST_SIZE

# Stage 1: train vs. (val + test), stratified on the label
idx_train, idx_tmp = train_test_split(
    all_idx, test_size=holdout_frac, random_state=RANDOM_SEED, stratify=y
)
u_train, u_tmp = user_arr[idx_train], user_arr[idx_tmp]
y_train, y_tmp = y[idx_train], y[idx_tmp]

# Stage 2: divide the held-out part into val and test, again stratified
rel_test = TEST_SIZE / holdout_frac
idx_val, idx_test = train_test_split(
    idx_tmp, test_size=rel_test, random_state=RANDOM_SEED, stratify=y_tmp
)
u_val, u_test = user_arr[idx_val], user_arr[idx_test]
y_val, y_test = y[idx_val], y[idx_test]

X_train, len_train = X[idx_train], lengths[idx_train]
X_val, len_val = X[idx_val], lengths[idx_val]
X_test, len_test = X[idx_test], lengths[idx_test]

print("Split shapes:", X_train.shape, X_val.shape, X_test.shape)

# %% ----------------
# Build LSTM model with masking
# -------------------
# Reset Keras global state before building, so re-running the cell starts clean.
tf.keras.backend.clear_session()

# Input: one padded sequence per user, MAX_STEPS_PER_USER steps x ROW_FEAT_DIM features.
inp = layers.Input(shape=(MAX_STEPS_PER_USER, ROW_FEAT_DIM), name="seq")
# mask_value=0.0 is meant to make the LSTM skip the all-zero padding rows
# appended during sequence construction.
x = layers.Masking(mask_value=0.0)(inp)
# return_sequences=False: only the final LSTM output (one vector per user) is passed on.
x = layers.LSTM(LSTM_UNITS, return_sequences=False)(x)
x = layers.Dropout(DROPOUT)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(DROPOUT)(x)
# Single sigmoid unit: a score in [0, 1] for the binary target.
out = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inp, out)
model.compile(
    optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
    loss="binary_crossentropy",
    # The metric name "AUC" is what the callbacks monitor as "val_AUC".
    metrics=[tf.keras.metrics.AUC(name="AUC")]
)
model.summary()

# %% ----------------
# Callbacks & class weights
# -------------------
# Stop once validation AUC has not improved for 3 epochs and roll back to the best weights.
early_stop = callbacks.EarlyStopping(
    monitor="val_AUC", mode="max", patience=3, restore_best_weights=True
)
# Halve the learning rate after 2 epochs without validation-AUC improvement (floor 1e-5).
lr_decay = callbacks.ReduceLROnPlateau(
    monitor="val_AUC", mode="max", factor=0.5, patience=2, min_lr=1e-5
)
cbs = [early_stop, lr_decay]

# Per-class loss weights: negatives count once, positives POS_CLASS_WEIGHT times.
class_weight = {0: 1.0, 1: POS_CLASS_WEIGHT}

# %% ----------------
# Train
# -------------------
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    callbacks=cbs,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# %% ----------------
# Evaluate
# -------------------
# Predicted probabilities, flattened from (N, 1) to (N,)
pred_val  = model.predict(X_val, batch_size=BATCH_SIZE).ravel()
pred_test = model.predict(X_test, batch_size=BATCH_SIZE).ravel()

print(f"Validation AUC: {roc_auc_score(y_val, pred_val):.4f}")
print(f"Test AUC:       {roc_auc_score(y_test, pred_test):.4f}")

# Choose threshold by maximizing Youden's J (sensitivity + specificity - 1) on validation
ths = np.linspace(0.05, 0.95, 19)  # 0.05, 0.10, ..., 0.95
j_scores = []
for t in ths:
    yhat = (pred_val >= t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class is present,
    # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_val, yhat, labels=[0, 1]).ravel()
    # The small epsilon guards against division by zero when a class is absent.
    sens = tp / (tp + fn + 1e-9)
    spec = tn / (tn + fp + 1e-9)
    j_scores.append(sens + spec - 1)
best_thr = ths[int(np.argmax(j_scores))]
print(f"Chosen threshold (Youden J on val): {best_thr:.2f}")

# Apply the validation-selected threshold to the test set once and reuse it.
yhat_test = (pred_test >= best_thr).astype(int)
print("\nTest classification report:")
print(classification_report(y_test, yhat_test))
print("Confusion matrix (test):")
print(confusion_matrix(y_test, yhat_test, labels=[0, 1]))

# %% ----------------
# Save model & encoder (optional)
# -------------------
# model.save("user_activity_lstm.keras")
# import pickle
# with open("ohe_province_type_creatorType.pkl", "wb") as f:
#     pickle.dump(ohe, f)


Users kept: 14421 | Sequence tensor: (14421, 60, 62) (N, T, D)
Split shapes: (10094, 60, 62) (2163, 60, 62) (2164, 60, 62)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 seq (InputLayer)            [(None, 60, 62)]          0         
                                                                 
 masking (Masking)           (None, 60, 62)            0         
                                                                 
 lstm (LSTM)                 (None, 64)                32512     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
    