In [11]:
import os
from typing import Tuple, Callable
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import root_mean_squared_error

In [4]:
# Directory that holds the ratings CSV files.
DATA_DIR = "./data"

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load ``train_ratings.csv`` and return a ``(train_df, valid_df)`` pair.

    The ``sid_pid`` key column is split on ``"_"`` into integer ``sid`` and
    ``pid`` columns and then dropped; ``rating`` is stored as float32. Rows
    are divided 75/25 into training and validation frames using a fixed
    ``random_state``, so repeated calls produce the same split.
    """
    csv_path = os.path.join(DATA_DIR, "train_ratings.csv")
    ratings = pd.read_csv(csv_path)

    # "<sid>_<pid>" -> two columns; the assignment fails loudly if a key does
    # not split into exactly two parts.
    ratings[["sid", "pid"]] = ratings["sid_pid"].str.split("_", expand=True)
    ratings = ratings.drop(columns="sid_pid")

    # Fix all dtypes in one pass: integer ids, float32 ratings.
    ratings = ratings.astype({"sid": int, "pid": int, "rating": "float32"})

    train_part, valid_part = train_test_split(ratings, test_size=0.25, random_state=42)
    return train_part, valid_part

def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """Score a prediction function on held-out ratings.

    Args:
        valid_df: Frame with ``sid``, ``pid`` and ``rating`` columns, e.g. the
            validation frame returned by ``read_data_df``.
        pred_fn: Callable that receives the ``sid`` array and the ``pid``
            array and returns one predicted rating per row.

    Returns:
        Root-mean-squared error between ``valid_df["rating"]`` and the
        predictions.
    """
    user_ids = valid_df["sid"].values
    item_ids = valid_df["pid"].values
    predicted = pred_fn(user_ids, item_ids)

    actual = valid_df["rating"].values
    return root_mean_squared_error(actual, predicted)

def df_to_tf_dataset(df: pd.DataFrame,
                     batch_size: int = 256,
                     shuffle: bool = True) -> tf.data.Dataset:
    """Wrap a ratings DataFrame in a batched, prefetched ``tf.data.Dataset``.

    Args:
        df: Frame with ``sid``, ``pid`` and ``rating`` columns.
        batch_size: Number of rows per emitted batch.
        shuffle: When True, rows are shuffled with a buffer as large as the
            frame itself (a full shuffle). Ignored for an empty frame.

    Returns:
        Dataset yielding ``({"sid": ..., "pid": ...}, rating)`` batches.
    """
    # Slice the three columns row-wise; the feature dict keys match the names
    # of the model's Input layers.
    features = {
        "sid": df["sid"].values,
        "pid": df["pid"].values,
    }
    ratings = df["rating"].values
    ds = tf.data.Dataset.from_tensor_slices((features, ratings))

    # tf.data rejects a zero-sized shuffle buffer, so an empty frame skips the
    # shuffle step instead of raising.
    n_rows = len(df)
    if shuffle and n_rows > 0:
        ds = ds.shuffle(buffer_size=n_rows)

    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

def get_tf_datasets(batch_size: int = 256) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
    """Load the ratings CSV and return ``(train_ds, val_ds)``.

    The split comes from ``read_data_df``. The training dataset is shuffled,
    the validation dataset keeps its row order; both use ``batch_size``.
    """
    train_frame, valid_frame = read_data_df()
    return (
        df_to_tf_dataset(train_frame, batch_size=batch_size, shuffle=True),
        df_to_tf_dataset(valid_frame, batch_size=batch_size, shuffle=False),
    )

In [5]:
# Seed Python's `random`, NumPy and TensorFlow before any dataset or model is
# created, so shuffling order and weight initialisation repeat across runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

BATCH_SIZE = 1024

# Read and split the CSV once, then build both datasets from those same
# frames. Calling get_tf_datasets() and read_data_df() separately would parse
# and split the file twice.
train_df, valid_df = read_data_df()
train_ds = df_to_tf_dataset(train_df, batch_size=BATCH_SIZE, shuffle=True)
val_ds = df_to_tf_dataset(valid_df, batch_size=BATCH_SIZE, shuffle=False)

In [6]:
def construct_neumf_base_model(user_input, item_input, params):
    """Assemble the NeuMF network: a GMF branch fused with an MLP branch.

    Every user and every item owns one embedding row. The leading ``mf_dim``
    entries of a row feed the GMF (element-wise product) branch; the
    remaining ``model_layers[0] // 2`` entries feed the MLP branch.

    Args:
      user_input: Keras input tensor holding user ids. Axis 1 must have
        length 1 because the embedding output is squeezed on that axis.
      item_input: Keras input tensor holding item ids, same layout.
      params: dict with the keys
        - num_users / num_items: row counts of the two embedding tables
        - mf_dim: width of the GMF part of each embedding row
        - model_layers: MLP widths; entry 0 only sizes the MLP part of the
          embeddings, entries 1.. each become a ReLU Dense layer
        - mf_regularization: L2 factor applied to both embedding tables
        - mlp_reg_layers: L2 kernel factors indexed like ``model_layers``
          (entry 0 is never read)

    Returns:
      A tf.keras.Model mapping ``[user_input, item_input]`` to a single
      linear score per row (the final Dense layer has no activation).
    """
    layers = tf.keras.layers
    l2 = tf.keras.regularizers.l2

    n_users = params["num_users"]
    n_items = params["num_items"]
    mf_dim = params["mf_dim"]
    mlp_widths = params["model_layers"]
    embedding_l2 = params["mf_regularization"]
    mlp_l2 = params["mlp_reg_layers"]

    # One table row = GMF part followed by the MLP part.
    row_width = mf_dim + mlp_widths[0] // 2

    def embed(ids, table_rows, layer_name):
        # Both tables share width, initializer and regularization strength.
        return layers.Embedding(
            input_dim=table_rows,
            output_dim=row_width,
            embeddings_initializer="glorot_uniform",
            embeddings_regularizer=l2(embedding_l2),
            input_length=1,
            name=layer_name,
        )(ids)

    def mf_slice(x):
        # Drop the length-1 axis, keep the GMF part of the row.
        return tf.squeeze(x, axis=1)[:, :mf_dim]

    def mlp_slice(x):
        # Drop the length-1 axis, keep everything after the GMF part.
        return tf.squeeze(x, axis=1)[:, mf_dim:]

    user_row = embed(user_input, n_users, "embedding_user")
    item_row = embed(item_input, n_items, "embedding_item")

    # GMF branch: element-wise product of the two GMF slices.
    gmf_user = layers.Lambda(mf_slice, name="emb_user_mf")(user_row)
    gmf_item = layers.Lambda(mf_slice, name="emb_item_mf")(item_row)
    gmf_out = layers.Multiply()([gmf_user, gmf_item])

    # MLP branch: concatenated MLP slices pushed through the Dense stack.
    mlp_user = layers.Lambda(mlp_slice, name="emb_user_mlp")(user_row)
    mlp_item = layers.Lambda(mlp_slice, name="emb_item_mlp")(item_row)
    hidden = layers.Concatenate()([mlp_user, mlp_item])

    for depth, width in enumerate(mlp_widths[1:], start=1):
        hidden = layers.Dense(
            units=width,
            activation="relu",
            kernel_regularizer=l2(mlp_l2[depth]),
        )(hidden)

    # Fuse both branches and project to one score.
    fused = layers.Concatenate()([gmf_out, hidden])
    score = layers.Dense(
        units=1,
        activation=None,
        kernel_initializer="lecun_uniform",
        name="logits",
    )(fused)

    return tf.keras.Model(inputs=[user_input, item_input], outputs=score)

In [17]:
class MetricLayer(tf.keras.layers.Layer):
    """Pass-through stand-in for the sampled-softmax metric layer.

    Called with a sequence of tensors, it returns the first one untouched and
    computes no metrics.
    """

    def __init__(self, params=None, **kwargs):
        # `params` is accepted but never read.
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return the first element of ``inputs`` unchanged."""
        first_tensor = inputs[0]
        return first_tensor

class LossLayer(tf.keras.layers.Layer):
    """Attach a masked mean-squared-error loss and return the predictions.

    ``call`` receives ``[scores, labels, valid_mask]``. The per-row squared
    error between ``labels`` and the score is averaged over the rows whose
    mask is non-zero and registered through ``add_loss``.
    """

    def __init__(self, batch_size=None, **kwargs):
        # `batch_size` is accepted but never read.
        super().__init__(**kwargs)

    def call(self, inputs):
        """Register the masked MSE and return the scores as a column.

        Args:
            inputs: Sequence ``(scores, labels, valid_mask)``. ``scores`` has
                either two columns (column 1 is used) or one column.
                ``labels`` and ``valid_mask`` hold one value per row.

        Returns:
            The selected scores with a trailing axis of length 1.
        """
        softmax_logits, labels, valid_mask = inputs

        # Two columns: take column 1. Otherwise the last axis is squeezed away.
        if softmax_logits.shape[-1] == 2:
            preds = softmax_logits[:, 1]
        else:
            preds = tf.squeeze(softmax_logits, axis=-1)

        # Flatten labels and mask so they line up element-wise with the 1-D
        # `preds`. Left as (batch, 1) columns they would broadcast against
        # (batch,) into a (batch, batch) matrix and corrupt the loss.
        labels = tf.reshape(tf.cast(labels, tf.float32), [-1])
        mask = tf.reshape(tf.cast(valid_mask, tf.float32), [-1])

        squared_error = tf.square(labels - preds)
        # divide_no_nan returns 0 instead of NaN when every row is masked out.
        loss = tf.math.divide_no_nan(
            tf.reduce_sum(mask * squared_error),
            tf.reduce_sum(mask),
        )
        self.add_loss(loss)

        # Hand the predictions back as a column so Keras sees a y_pred.
        return tf.expand_dims(preds, axis=-1)


def build_ncf_keras_model(params):
    """Build the five-input NeuMF model whose loss is attached by LossLayer.

    Args:
        params: dict passed on to ``construct_neumf_base_model``; additionally
            - keras_use_ctl: when falsy, ``MetricLayer`` is inserted in front
              of the loss layer
            - batch_size: forwarded to ``LossLayer``

    Returns:
        An uncompiled tf.keras.Model taking a dict with the keys ``sid``,
        ``pid``, ``valid_pt_mask``, ``duplicate_mask`` and ``train_label``.
        Its output is whatever ``LossLayer`` returns.
    """
    keras_input = tf.keras.layers.Input

    # All inputs carry one value per row.
    model_inputs = {
        "sid": keras_input(shape=(1,), name="sid", dtype=tf.int32),
        "pid": keras_input(shape=(1,), name="pid", dtype=tf.int32),
        "valid_pt_mask": keras_input(shape=(1,), name="valid_pt_mask", dtype=tf.bool),
        "duplicate_mask": keras_input(shape=(1,), name="duplicate_mask", dtype=tf.int32),
        "train_label": keras_input(shape=(1,), name="train_label", dtype=tf.bool),
    }

    # NeuMF body produces one raw score per row.
    neumf = construct_neumf_base_model(model_inputs["sid"], model_inputs["pid"], params)
    raw_score = neumf.output

    # Put an all-zero column in front so the score sits in column 1 of a
    # two-column tensor, which is the column LossLayer reads.
    zero_column = tf.keras.layers.Lambda(lambda x: x * 0)(raw_score)
    two_column = tf.keras.layers.Concatenate(axis=-1)([zero_column, raw_score])

    # MetricLayer as defined in this file only forwards its first input.
    if not params["keras_use_ctl"]:
        two_column = MetricLayer(params)([two_column, model_inputs["duplicate_mask"]])

    loss_out = LossLayer(params["batch_size"])(
        [two_column, model_inputs["train_label"], model_inputs["valid_pt_mask"]]
    )

    return tf.keras.Model(inputs=model_inputs, outputs=loss_out)

def build_ncf_regression_model(params):
    """Build and compile a two-input NeuMF model that regresses ratings.

    Args:
        params: dict passed on to ``construct_neumf_base_model``.

    Returns:
        A tf.keras.Model with inputs named ``sid`` and ``pid``, compiled with
        Adam (learning rate 1e-3), MSE loss and an RMSE metric.
    """
    user_ids = tf.keras.layers.Input(shape=(1,), name="sid", dtype=tf.int32)
    item_ids = tf.keras.layers.Input(shape=(1,), name="pid", dtype=tf.int32)

    # The NeuMF body's raw score is used directly as the rating prediction.
    neumf = construct_neumf_base_model(user_ids, item_ids, params)

    regressor = tf.keras.Model(inputs=[user_ids, item_ids], outputs=neumf.output)
    regressor.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return regressor

In [8]:
# 1) Compute basic dataset sizes
# Embedding tables are indexed directly by id, so each one needs
# (largest id seen in either split) + 1 rows.
num_users = 1 + int(max(train_df["sid"].max(), valid_df["sid"].max()))
num_items = 1 + int(max(train_df["pid"].max(), valid_df["pid"].max()))

# NCF hyper-parameters
params = dict(
    num_users=num_users,
    num_items=num_items,
    mf_dim=64,                              # width of the GMF part of each embedding
    model_layers=[64, 32, 16, 8],           # MLP widths; entry 0 sizes the MLP embedding part
    mf_regularization=0.0,                  # L2 factor on the embedding tables
    mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],    # L2 factor per MLP layer
    batch_size=1024,
    # Only read by build_ncf_keras_model: when False, MetricLayer is inserted.
    # The name comes from the TF official NCF code, where CTL means
    # "custom training loop".
    keras_use_ctl=False,
)

In [None]:
# build_ncf_regression_model() already returns a model compiled with
# Adam(1e-3), MSE loss and an RMSE metric. Compiling it again here would
# silently discard that metric, so the model is used as returned.
model = build_ncf_regression_model(params)

# Train
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

# Prediction function handed to evaluate()
def pred_fn(sids: np.ndarray, pids: np.ndarray) -> np.ndarray:
    """Predict ratings for paired user and item ids.

    Args:
        sids: User ids, one per requested prediction.
        pids: Item ids, aligned with ``sids``.

    Returns:
        Flat array with one predicted rating per (sid, pid) pair.
    """
    # Keys must match the names of the model's Input layers.
    inputs = {
        "sid": np.asarray(sids, dtype=np.int32),
        "pid": np.asarray(pids, dtype=np.int32),
    }
    # predict() returns one column; flatten it to a 1-D array.
    preds = model.predict(inputs, batch_size=1024)
    return preds.flatten()

# Validation RMSE
rmse = evaluate(valid_df, pred_fn)
print(f"Validation RMSE: {rmse:.4f}")

Epoch 1/5

2025-05-03 18:15:24.274492: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype float and shape [282047]
	 [[{{node Placeholder/_2}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE: 0.9674


: 