
# 🎯 Deep Dive: Should You Use `customerId` as a Feature in Neural Network-Based Machine Learning Models?

Author: *Your Name*  
Date: *Today*  

---

## Introduction

In recommendation systems and personalization models, **customerId** often seems like a natural input feature.

But **should you use it directly** in a neural network model?

The short answer: **It depends**.  
Using customer IDs without care can lead to **overfitting**, **poor generalization**, and a **fragile model**. In this notebook, we'll explore why and walk through a full **PyTorch mini-project** demonstrating the pitfalls — plus how we can "rescue" a model using **dropout** and **L2 regularization**.


## 1. Data Simulation

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

# Build a reproducible synthetic interaction log.
np.random.seed(42)
torch.manual_seed(42)

num_customers = 1000
num_products = 100
interactions_per_customer = 5

data = []
for cust in range(num_customers):
    # Each customer gets three distinct "preferred" products.
    favourites = set(np.random.choice(num_products, 3, replace=False).tolist())
    for _ in range(interactions_per_customer):
        prod = np.random.choice(num_products)
        is_liked = 1 if int(prod) in favourites else 0
        # Uniform noise in [0, 1) scaled by 1.1 for liked products, 0.1 otherwise.
        score = np.random.rand() * (is_liked + 0.1)
        data.append([cust, prod, score, is_liked])

df = pd.DataFrame(
    data,
    columns=["customerId", "productId", "behaviorFeature", "label"],
)
# Row-level 70/30 split: rows are shuffled individually, not grouped by customer.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)


## 2. Dataset Class

In [None]:

class SimpleDataset(torch.utils.data.Dataset):
    """In-memory dataset over the interaction DataFrame.

    The ``customerId``, ``productId``, ``behaviorFeature`` and ``label``
    columns are converted to tensors once, at construction time.  Each sample
    is a tuple ``(customer_id, product_id, behavior, label)``; when
    ``use_customer_id`` is false the customer id is left out and the tuple is
    ``(product_id, behavior, label)``.
    """

    def __init__(self, df, use_customer_id=True):
        def column(name, dtype):
            # Copy one DataFrame column into a tensor of the requested dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.use_customer_id = use_customer_id
        self.customer_ids = column("customerId", torch.long)
        self.product_ids = column("productId", torch.long)
        # Float columns get a trailing axis of size 1, so one sample has shape (1,).
        self.behavior = column("behaviorFeature", torch.float32)[:, None]
        self.labels = column("label", torch.float32)[:, None]

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        sample = (self.product_ids[idx], self.behavior[idx], self.labels[idx])
        if not self.use_customer_id:
            return sample
        return (self.customer_ids[idx], *sample)


## 3. Model Definitions

In [None]:

# Model A: With CustomerId
class WithCustomerId(nn.Module):
    """Model A: score a (customer, product) pair using a per-customer embedding.

    The customer embedding, the product embedding and one scalar behavior
    feature are concatenated and passed through a one-hidden-layer MLP that
    ends in a sigmoid, so the output lies in [0, 1].

    Args:
        num_customers: Number of rows in the customer embedding table.
        num_products: Number of rows in the product embedding table.
        embedding_dim: Width of both embeddings (default 8).
        hidden_dim: Width of the hidden layer (default 16).
    """

    def __init__(self, num_customers, num_products, embedding_dim=8, hidden_dim=16):
        super().__init__()
        self.customer_embed = nn.Embedding(num_customers, embedding_dim)
        self.product_embed = nn.Embedding(num_products, embedding_dim)
        self.fc = nn.Sequential(
            # Input = two embeddings + the single scalar behavior feature.
            nn.Linear(2 * embedding_dim + 1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, customer_id, product_id, behavior_feat):
        """Return one score per row.

        ``customer_id`` and ``product_id`` are integer index tensors and
        ``behavior_feat`` is concatenated along ``dim=1`` as a single column,
        i.e. it is expected to be shaped ``(batch, 1)``.
        """
        ce = self.customer_embed(customer_id)
        pe = self.product_embed(product_id)
        x = torch.cat([ce, pe, behavior_feat], dim=1)
        return self.fc(x)

# Model B: Without CustomerId
class WithoutCustomerId(nn.Module):
    """Model B: score a product interaction without any customer identity.

    The product embedding and one scalar behavior feature are concatenated
    and passed through a one-hidden-layer MLP that ends in a sigmoid, so the
    output lies in [0, 1].

    Args:
        num_products: Number of rows in the product embedding table.
        embedding_dim: Width of the product embedding (default 8).
        hidden_dim: Width of the hidden layer (default 16).
    """

    def __init__(self, num_products, embedding_dim=8, hidden_dim=16):
        super().__init__()
        self.product_embed = nn.Embedding(num_products, embedding_dim)
        self.fc = nn.Sequential(
            # Input = product embedding + the single scalar behavior feature.
            nn.Linear(embedding_dim + 1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, product_id, behavior_feat):
        """Return one score per row.

        ``product_id`` is an integer index tensor and ``behavior_feat`` is
        concatenated along ``dim=1`` as a single column, i.e. it is expected
        to be shaped ``(batch, 1)``.
        """
        pe = self.product_embed(product_id)
        x = torch.cat([pe, behavior_feat], dim=1)
        return self.fc(x)

# Rescue Model C: With Dropout + L2 Regularization
class WithCustomerIdDropout(nn.Module):
    """Model C: same inputs as the customer-id model, with dropout on the MLP input.

    Dropout is applied to the whole concatenated vector (customer embedding,
    product embedding and behavior feature) before the MLP.  L2
    regularization is not part of this module; it has to be supplied through
    the optimizer (e.g. ``weight_decay``).

    Args:
        num_customers: Number of rows in the customer embedding table.
        num_products: Number of rows in the product embedding table.
        embedding_dim: Width of both embeddings (default 8).
        hidden_dim: Width of the hidden layer (default 16).
        dropout_p: Dropout probability (default 0.5).
    """

    def __init__(self, num_customers, num_products, embedding_dim=8,
                 hidden_dim=16, dropout_p=0.5):
        super().__init__()
        self.customer_embed = nn.Embedding(num_customers, embedding_dim)
        self.product_embed = nn.Embedding(num_products, embedding_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Sequential(
            # Input = two embeddings + the single scalar behavior feature.
            nn.Linear(2 * embedding_dim + 1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, customer_id, product_id, behavior_feat):
        """Return one score per row.

        ``customer_id`` and ``product_id`` are integer index tensors and
        ``behavior_feat`` is concatenated along ``dim=1`` as a single column,
        i.e. it is expected to be shaped ``(batch, 1)``.
        """
        ce = self.customer_embed(customer_id)
        pe = self.product_embed(product_id)
        x = torch.cat([ce, pe, behavior_feat], dim=1)
        # Only active in train() mode; a no-op after model.eval().
        x = self.dropout(x)
        return self.fc(x)


## 4. Training and Evaluation Functions

In [None]:

def train_epoch(model, dataloader, optimizer, criterion, use_customer_id):
    """Run one training pass over ``dataloader`` and return the mean loss.

    Batches are unpacked as ``(customer_id, product_id, behavior, label)``
    when ``use_customer_id`` is true, otherwise as
    ``(product_id, behavior, label)``.  Each batch loss is weighted by its
    row count and the sum is divided by ``len(dataloader.dataset)``.
    """
    model.train()
    weighted_sum = 0
    for batch in dataloader:
        optimizer.zero_grad()
        if use_customer_id:
            customer_id, product_id, behavior, label = batch
            features = (customer_id, product_id, behavior)
        else:
            product_id, behavior, label = batch
            features = (product_id, behavior)
        loss = criterion(model(*features), label)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a short final batch does not skew the mean.
        weighted_sum += loss.item() * label.shape[0]
    n_samples = len(dataloader.dataset)
    return weighted_sum / n_samples

def evaluate_model(model, dataloader, use_customer_id):
    """Return the ROC AUC of ``model`` over every row in ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are unpacked as ``(customer_id, product_id, behavior, label)``
    when ``use_customer_id`` is true, otherwise as
    ``(product_id, behavior, label)``.

    Raises:
        ValueError: Propagated from ``roc_auc_score`` when the collected
            labels contain a single class only.
    """
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            if use_customer_id:
                customer_id, product_id, behavior, label = batch
                pred = model(customer_id, product_id, behavior)
            else:
                product_id, behavior, label = batch
                pred = model(product_id, behavior)
            # Flatten with reshape(-1), not squeeze(): squeeze() collapses a
            # one-row batch to a 0-d tensor, whose tolist() is a bare float
            # that list.extend() cannot iterate.
            preds.extend(pred.reshape(-1).tolist())
            labels.extend(label.reshape(-1).tolist())
    return roc_auc_score(labels, preds)


## 5. Train Models and Visualize Results

In [None]:

# Prepare datasets and loaders.  The "a" variants include customerId in each
# sample; the "b" variants omit it.
train_set_a = SimpleDataset(train_df, use_customer_id=True)
test_set_a = SimpleDataset(test_df, use_customer_id=True)
train_set_b = SimpleDataset(train_df, use_customer_id=False)
test_set_b = SimpleDataset(test_df, use_customer_id=False)

train_loader_a = torch.utils.data.DataLoader(train_set_a, batch_size=64, shuffle=True)
test_loader_a = torch.utils.data.DataLoader(test_set_a, batch_size=64)
train_loader_b = torch.utils.data.DataLoader(train_set_b, batch_size=64, shuffle=True)
test_loader_b = torch.utils.data.DataLoader(test_set_b, batch_size=64)

# Initialize models.  Embedding table sizes come from the simulation settings
# (num_customers / num_products) instead of repeated literals, so changing the
# simulation cannot leave the tables too small for the ids in the data.
model_a = WithCustomerId(num_customers=num_customers, num_products=num_products)
optimizer_a = optim.Adam(model_a.parameters(), lr=0.01)

model_b = WithoutCustomerId(num_products=num_products)
optimizer_b = optim.Adam(model_b.parameters(), lr=0.01)

model_c = WithCustomerIdDropout(num_customers=num_customers, num_products=num_products)
optimizer_c = optim.Adam(model_c.parameters(), lr=0.01, weight_decay=1e-4)  # L2 regularization

criterion = nn.BCELoss()

losses_a, aucs_a = [], []
losses_b, aucs_b = [], []
losses_c, aucs_c = [], []

num_epochs = 20
for epoch in range(num_epochs):
    # Train all three models for one epoch each (order a, b, c), then score
    # them on the held-out rows.
    loss_a = train_epoch(model_a, train_loader_a, optimizer_a, criterion, use_customer_id=True)
    loss_b = train_epoch(model_b, train_loader_b, optimizer_b, criterion, use_customer_id=False)
    loss_c = train_epoch(model_c, train_loader_a, optimizer_c, criterion, use_customer_id=True)

    auc_a = evaluate_model(model_a, test_loader_a, use_customer_id=True)
    auc_b = evaluate_model(model_b, test_loader_b, use_customer_id=False)
    auc_c = evaluate_model(model_c, test_loader_a, use_customer_id=True)

    losses_a.append(loss_a)
    losses_b.append(loss_b)
    losses_c.append(loss_c)
    aucs_a.append(auc_a)
    aucs_b.append(auc_b)
    aucs_c.append(auc_c)

# Plot losses and AUCs side by side, one curve per model.
curve_labels = (
    "With customerId (no reg)",
    "Without customerId",
    "With customerId (dropout + L2)",
)

plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
for series, curve_label in zip((losses_a, losses_b, losses_c), curve_labels):
    plt.plot(series, label=curve_label)
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.subplot(1, 2, 2)
for series, curve_label in zip((aucs_a, aucs_b, aucs_c), curve_labels):
    plt.plot(series, label=curve_label)
plt.title("Test ROC AUC")
plt.xlabel("Epoch")
plt.ylabel("AUC")
plt.legend()

plt.tight_layout()
plt.show()



# 📚 Appendix: Theoretical Analysis

## Why Does Using CustomerId Cause Overfitting?

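- Each customer \(i\) has its own embedding \(e_i\), which is updated only by that customer's training rows.
- To a first approximation, the prediction's logit is linear in \(e_i\) (this is exact for a linear or logistic head; the ReLU MLP used above is piecewise linear).
- Statistical estimation theory then says that, with \(n_i\) the number of training examples for customer \(i\):

\[
\text{Variance}(\hat{e}_i) \propto \frac{1}{n_i}
\]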

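Thus:

- Fewer examples per customer \(\Rightarrow\) larger variance in \(\hat{e}_i\) \(\Rightarrow\) overfitting
- More examples per customer \(\Rightarrow\) lower variance in \(\hat{e}_i\) \(\Rightarrow\) better generalization

In the simulation above, each customer has only 5 interactions, of which about 3.5 on average land in the 70% training split, so every customer embedding is estimated from very little data.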
