# KumoRFM vs. LightGBM on the Titanic dataset

This notebook demonstrates single-table usage of `KumoRFM` and compares it to a strong tree-based baseline (`LightGBM`). We:
- Load Titanic from `seaborn`
- Create a single table with primary key `id`
- Use a single, shared train/test split for both methods
- Evaluate AUROC and compare results

In [1]:
# Optional: ensure seaborn is available in this kernel
# %pip install -q seaborn

In [2]:
# Optional: keep `kumoai` current in this environment
# %pip install -q --pre --upgrade kumoai

In [3]:
# Import the KumoRFM SDK
from kumoai.experimental import rfm

In [4]:
# Authentication: prefer env var `KUMO_API_KEY`; fall back to interactive auth
import os

if not os.environ.get("KUMO_API_KEY"):
    rfm.authenticate()

Opening browser page to automatically generate an API key...
If the page does not open, manually create a new API key at https://kumorfm.ai/api-keys and set it using os.environ["KUMO_API_KEY"] = "YOUR_API_KEY"


[2025-09-29 22:05:17 - kumoai:301 - INFO] Generated token "sdk-m4max-2025-09-29-22-05-02-Z" and saved to KUMO_API_KEY env variable


In [5]:
# Initialize the SDK (prints deployment and logging level)
rfm.init()

[2025-09-29 22:05:18 - kumoai:203 - INFO] Successfully initialized the Kumo SDK against deployment https://kumorfm.ai/api, with log level INFO.


In [6]:
# Load Titanic and create a single-table dataset
# - Keep a simple set of numeric + categorical columns
# - Create a single primary key `id`
# - Create ONE shared stratified train/test split used by both KumoRFM and LightGBM
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load and pick a few useful columns
df = sns.load_dataset("titanic").copy()
cols = ["pclass", "sex", "age", "sibsp", "parch", "fare", "embarked", "alone"]
df = df[["survived"] + cols].dropna(subset=["survived"]).reset_index(drop=True)

# Stable integer IDs
df["id"] = df.index

# Shared train/test split (stratified)
y = df["survived"]
train_ids, test_ids = train_test_split(
    df["id"].values, test_size=0.2, random_state=42, stratify=y
)

# KumoRFM view: mask test labels to simulate missing-value imputation
table = df[["id"] + cols + ["survived"]].copy()
table.loc[table["id"].isin(test_ids), "survived"] = pd.NA

display(table.head())

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked,alone,survived
0,0,3,male,22.0,1,0,7.25,S,False,0.0
1,1,1,female,38.0,1,0,71.2833,C,False,1.0
2,2,3,female,26.0,0,0,7.925,S,True,1.0
3,3,1,female,35.0,1,0,53.1,S,False,1.0
4,4,3,male,35.0,0,0,8.05,S,True,


In [7]:
# Build a local graph with a single table and instantiate KumoRFM
from kumoai.experimental import rfm

graph = rfm.LocalGraph.from_data({"table": table})
model = rfm.KumoRFM(graph)

graph["table"].print_metadata()

### 🗂️ Graph Metadata

name,primary_key,time_column
table,id,-


### 🕸️ Graph Links (FK ↔️ PK)

*No links registered*

Output()

### 🏷️ Metadata of Table `table` (891 rows)

name,dtype,stype,is_primary_key,is_time_column
id,int,ID,True,False
pclass,int,categorical,False,False
sex,string,categorical,False,False
age,float,numerical,False,False
sibsp,int,categorical,False,False
parch,int,categorical,False,False
fare,float,numerical,False,False
embarked,string,categorical,False,False
alone,bool,categorical,False,False
survived,float,categorical,False,False


In [8]:
# Predict on the test IDs using the KumoRFM query language
id_list = ", ".join(str(i) for i in test_ids)
query = f"PREDICT table.survived=1 FOR table.id IN ({id_list})"
result = model.predict(query)
result.head()

Output()

Unnamed: 0,ENTITY,TARGET_PRED,False_PROB,True_PROB
0,565,False,0.830244,0.169756
1,160,False,0.900935,0.099065
2,553,False,0.818403,0.181597
3,860,False,0.886015,0.113985
4,241,True,0.474409,0.525591


Let's read this data frame into `KumoRFM`:

In [9]:
# WARNING: This naive evaluation assumes result rows are ordered like `test_ids`.
# In practice, always align by ID. The robust evaluation is in the next cell.
from sklearn.metrics import roc_auc_score

y_test = df.set_index("id").loc[list(test_ids), "survived"].to_numpy()
prob_col = [c for c in result.columns if c.endswith("_PROB")][0]
y_pred = result[prob_col].to_numpy()
print(f"AUROC (naive): {roc_auc_score(y_test, y_pred):.4f}")

AUROC (naive): 0.1568


In [10]:
# Robust evaluation: align predictions to ground-truth by ID, then compute AUROC
# - KumoRFM returns an `ENTITY` column; we parse integer IDs from it
# - We then gather `y_true` in exactly that row order
# - Finally, choose the positive-class probability and compute AUROC
import re
from sklearn.metrics import roc_auc_score

# Parse integer IDs from the ENTITY column
def parse_entity_id(x):
    if isinstance(x, (int, np.integer)):
        return int(x)
    m = re.search(r"(\d+)$", str(x))
    return int(m.group(1)) if m else None

result_ids = result["ENTITY"].map(parse_entity_id)
assert result_ids.notna().all(), f"Unparsed ENTITY values: {result['ENTITY'].head()}"

# Align y_true to the prediction rows
y_true = df.set_index("id").loc[result_ids, "survived"].to_numpy()

# Pick positive-class probability
prob_col = "1_PROB" if "1_PROB" in result.columns else ("True_PROB" if "True_PROB" in result.columns else result.filter(like="_PROB").columns[0])
y_pred = result[prob_col].to_numpy()

auc = roc_auc_score(y_true, y_pred)
auc_flip = roc_auc_score(y_true, 1 - y_pred)
print(f"AUC: {auc:.4f}  |  AUC(1-prob): {auc_flip:.4f}  |  prob_col: {prob_col}")

AUC: 0.8432  |  AUC(1-prob): 0.1568  |  prob_col: True_PROB


In [11]:
from sklearn.metrics import roc_curve, classification_report, confusion_matrix

fpr, tpr, thresholds = roc_curve(y_true, y_pred)
j = tpr - fpr
best_idx = j.argmax()
best_thresh = thresholds[best_idx]

y_hat = (y_pred >= best_thresh).astype(int)
print("Best threshold:", best_thresh)
print(classification_report(y_true, y_hat, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_hat))

Best threshold: 0.3497413397
              precision    recall  f1-score   support

           0     0.8763    0.7727    0.8213       110
           1     0.6951    0.8261    0.7550        69

    accuracy                         0.7933       179
   macro avg     0.7857    0.7994    0.7881       179
weighted avg     0.8065    0.7933    0.7957       179

Confusion matrix:
 [[85 25]
 [12 57]]


In [12]:
pred_df = pd.DataFrame({"id": result_ids, "survived_true": y_true, "survived_prob": y_pred, "survived_pred": y_hat})
display(pred_df.head())

Unnamed: 0,id,survived_true,survived_prob,survived_pred
0,565,0,0.169756,0
1,160,0,0.099065,0
2,553,1,0.181597,0
3,860,0,0.113985,0
4,241,1,0.525591,1


In [13]:
# Install LightGBM (one-time)
# If already installed in the venv, this is a no-op
%pip install -q lightgbm


Note: you may need to restart the kernel to use updated packages.


In [14]:
# LightGBM baseline using the SAME split and native categoricals
# - Uses the same `train_ids`/`test_ids`
# - Casts categorical columns to pandas `category` dtype
# - Early stopping via callbacks (LightGBM v4+ sklearn API)
import pandas as pd
import logging
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

logging.getLogger("lightgbm").setLevel(logging.ERROR)

cat_cols = ["sex", "embarked", "alone", "pclass"]
num_cols = ["age", "sibsp", "parch", "fare"]

df_lgb = df[["id", "survived"] + cat_cols + num_cols].copy()
for c in cat_cols:
    df_lgb[c] = df_lgb[c].astype("category")

train_mask = df_lgb["id"].isin(train_ids)
test_mask  = df_lgb["id"].isin(test_ids)

X_train = df_lgb.loc[train_mask, cat_cols + num_cols]
y_train = df_lgb.loc[train_mask, "survived"].astype(int)
X_test  = df_lgb.loc[test_mask,  cat_cols + num_cols]
y_test  = df_lgb.loc[test_mask,  "survived"].astype(int)

lgbm = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=10,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=0.5,
    random_state=42,
    verbosity=-1,
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    categorical_feature=cat_cols,
    callbacks=[
        early_stopping(stopping_rounds=100, verbose=False),
        log_evaluation(period=0),
    ],
)

y_pred_lgb = lgbm.predict_proba(X_test)[:, 1]
auc_lgb = roc_auc_score(y_test, y_pred_lgb)
print("LightGBM AUROC:", auc_lgb)

LightGBM AUROC: 0.8177206851119895


In [15]:
# Compare KumoRFM vs LightGBM on the SAME test split
import pandas as pd
from sklearn.metrics import roc_auc_score

auc_kumo = roc_auc_score(y_true, y_pred)
comparison = pd.DataFrame({"Model": ["KumoRFM", "LightGBM"], "AUROC": [auc_kumo, auc_lgb]}).sort_values("AUROC", ascending=False)
display(comparison)


Unnamed: 0,Model,AUROC
0,KumoRFM,0.843215
1,LightGBM,0.817721
