# Feature 3.4 — Batch Scoring & Integration

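This notebook loads trained models, scores churn probability and customer lifetime value (CLV) for each customer, runs a simple train/serve skew check, and writes the scored output table. The data, models, and CSV outputs used here are a synthetic fallback; in production, replace the synthetic blocks with Delta/Spark reads and MERGE writes.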

In [None]:
import os, pathlib, numpy as np, pandas as pd, random, time, warnings
# Runtime configuration; every value can be overridden through an environment variable.
SEED = int(os.environ.get('SEED', '42'))
# Seed both the stdlib generator and NumPy's legacy global generator.
random.seed(SEED)
np.random.seed(SEED)
AS_OF_DATE = os.environ.get('AS_OF_DATE', '2024-06-30')
OUT_DIR = os.environ.get('OUT_DIR', 'artifacts/feature_3_4')
# Create the artifact directory (and any missing parents) before later cells write into it.
pathlib.Path(OUT_DIR).mkdir(exist_ok=True, parents=True)
print('SEED', SEED, 'AS_OF_DATE', AS_OF_DATE)

In [None]:
# Synthetic features and toy models (fallback). Replace with Delta reads and MLflow model loading.
def make_features(n=5000, p=20, seed=SEED):
    """Build a synthetic customer feature frame.

    Args:
        n: Number of customer rows to generate.
        p: Number of feature columns, named ``f0`` .. ``f{p-1}``.
        seed: Seed for the NumPy generator used to draw the features.

    Returns:
        DataFrame with ``p`` standard-normal feature columns followed by a
        ``customer_id`` column holding ``0 .. n-1``.
    """
    generator = np.random.default_rng(seed)
    column_names = [f'f{i}' for i in range(p)]
    frame = pd.DataFrame(generator.normal(size=(n, p)), columns=column_names)
    # Sequential ids act as the customer key.
    frame['customer_id'] = np.arange(n)
    return frame

# Generate the synthetic feature frame with the default size and seed.
features_df = make_features()
# Model inputs are the columns whose name starts with 'f' (customer_id is excluded).
features = features_df.columns[features_df.columns.str.startswith('f')].tolist()
features_df.head()

In [None]:
# Load models (here: synthetic, deterministic stub scorers). In production, load from MLflow registry or serialized artifacts.
def score_churn(X):
    """Stub churn scorer: a deterministic logistic function of the first two columns.

    Args:
        X: 2-D array; column 0 raises the score and column 1 lowers it.

    Returns:
        1-D array of scores, one per row of ``X``. The sigmoid output is
        rescaled as ``0.9 * p + 0.02``, so scores stay within [0.02, 0.92].
    """
    # Exponent of the logistic: sigmoid(0.2*x0 - 0.1*x1) == 1 / (1 + exp(-0.2*x0 + 0.1*x1)).
    exponent = -0.2*X[:,0] + 0.1*X[:,1]
    sigmoid = 1/(1+np.exp(exponent))
    # Shrink toward 0.2 so the stub never emits probabilities at the extremes.
    return 0.9*sigmoid + 0.1*0.2
def score_clv(X):
    """Stub CLV scorer: a linear function of the first two columns, floored at zero.

    Args:
        X: 2-D array; only columns 0 and 1 are read.

    Returns:
        1-D array of non-negative predictions, one per row of ``X``.
    """
    linear_value = 200 + 30*X[:,0] + 20*X[:,1]
    # Floor at zero so the prediction is never negative.
    return np.maximum(0, linear_value)

# Dense matrix of model inputs, in the column order given by `features`.
X = features_df.loc[:, features].to_numpy()
# Score both targets from the same matrix.
churn_score, clv_pred = score_churn(X), score_clv(X)
# Quick sanity print: min and max of each score vector.
print(*(bound for scores in (churn_score, clv_pred) for bound in (scores.min(), scores.max())))

In [None]:
# Bucketing + basic QA
def decile_bucket(arr):
    q = np.nanpercentile(arr, np.arange(0,110,10))
    return np.clip(np.digitize(arr, q[1:-1], right=True)+1, 1, 10)
# Attach the model outputs to the feature frame.
features_df['churn_score'] = churn_score
features_df['churn_bucket'] = decile_bucket(churn_score)
features_df['clv_pred'] = clv_pred
# Lineage metadata so each scored row can be traced back to its model and feature versions.
features_df['model_version'] = os.getenv('MODEL_VERSION','v1')
features_df['feature_version'] = os.getenv('FEATURE_VERSION','v1')
features_df['as_of_date'] = AS_OF_DATE
# Timestamp.utcnow() is deprecated since pandas 2.2; now(tz='UTC') returns the
# same timezone-aware UTC timestamp without the deprecation warning.
features_df['scored_ts'] = pd.Timestamp.now(tz='UTC')
# QA checks: fail loudly, with a readable message, before anything is persisted.
# between() is False for NaN, so missing scores also trip the first check.
assert features_df['churn_score'].between(0,1).all(), 'churn_score must be within [0, 1] with no NaN'
assert features_df['churn_bucket'].between(1,10).all(), 'churn_bucket must be within [1, 10]'
assert (features_df['clv_pred']>=0).all(), 'clv_pred must be non-negative with no NaN'
assert features_df['customer_id'].is_unique, 'customer_id must be unique'
features_df.head()

In [None]:
# Train/serve skew check (toy): compare to a stored reference mean/std (synthetic).
# Reference (training-time) statistics per model input; synthetic standard-normal values here.
ref = pd.DataFrame({'feature':features, 'mean_ref':0.0, 'std_ref':1.0})
# Current (serving-time) statistics. mean()/std() return one value per column,
# in the same order as `features`, so the arrays line up with the feature names.
cur = pd.DataFrame({
    'feature': features,
    'mean_cur': features_df[features].mean().to_numpy(),
    'std_cur': features_df[features].std().to_numpy(),
})
skew = ref.merge(cur, on='feature', how='left')
skew['delta_mean'] = (skew['mean_cur']-skew['mean_ref']).abs()
# std_ref used to be carried along but never compared; report its drift as well.
skew['delta_std'] = (skew['std_cur']-skew['std_ref']).abs()
# Keep the original report columns first and in their original order; new columns are appended.
skew = skew[['feature','mean_ref','std_ref','mean_cur','delta_mean','std_cur','delta_std']]
skew.to_csv(f'{OUT_DIR}/skew_report.csv', index=False)
skew.head()

In [None]:
# Persist scored output (CSV fallback). In Databricks, write Delta and MERGE/overwrite-by-partition.
# Columns that make up the published score table, in output order.
output_columns = [
    'customer_id',
    'churn_score',
    'churn_bucket',
    'clv_pred',
    'model_version',
    'feature_version',
    'as_of_date',
    'scored_ts',
]
out = features_df.loc[:, output_columns]
# CSV fallback sink; the row index is not written.
out.to_csv(f'{OUT_DIR}/customer_scores_gold.csv', index=False)
print('Saved', len(out))

Join the scored output to `customer_360_gold` on `customer_id` in your warehouse/lakehouse. Verify that row counts and keys match after the join, and publish the schema contract to analysts.