### Imports

In [0]:
import pandas as pd
import numpy as np


from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

import mlflow
import mlflow.sklearn

### Read Table

In [0]:
# Load the gold-layer RFM table through Spark, then collect it to the driver
# as a pandas DataFrame so scikit-learn can consume it.
rfm_sdf = spark.table("olist_ecommerce.gold.customer_rfm")
rfm_df = rfm_sdf.toPandas()

### Check for Nulls

In [0]:
# Per-column count of missing values (displayed as the cell output).
rfm_df.isnull().sum()

customer_unique_id      0
recency                 0
frequency               0
monetary              106
dtype: int64

### Fill Nulls with Zero

In [0]:
# Treat a missing monetary value as zero spend; every other column is left as-is.
rfm_df = rfm_df.assign(monetary=rfm_df["monetary"].fillna(0))

# Re-check the per-column null counts to confirm nothing is missing any more.
rfm_df.isnull().sum()

customer_unique_id    0
recency               0
frequency             0
monetary              0
dtype: int64

### Scale features

In [0]:
# The three RFM columns are the only clustering inputs.
feature_cols = ["recency", "frequency", "monetary"]
X = rfm_df[feature_cols]

# Standardise each feature (zero mean, unit variance) so that no single
# column dominates the Euclidean distances KMeans relies on.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### Train KMeans

In [0]:
# Four clusters, ten centroid initialisations, fixed seed for repeatable runs.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit on the standardised features, then attach each row's cluster index.
kmeans.fit(X_scaled)
rfm_df["cluster_id"] = kmeans.labels_

### Map business segments

In [0]:
# KMeans cluster ids are arbitrary indices: the same customer group can come
# back as a different id whenever the input data changes. The id -> label
# mapping is therefore derived from the cluster centroids on every run
# instead of being hard-coded against specific ids.

# Business labels ordered from the most valuable cluster to the least valuable.
SEGMENT_NAMES = [
    "Champions",
    "Loyal Customers",
    "Potential Loyalists",
    "Lost Customers",
]

# Mean R/F/M per cluster, in original (unscaled) units.
cluster_profile = rfm_df.groupby("cluster_id")[
    ["recency", "frequency", "monetary"]
].mean()

# Fail loudly rather than silently producing NaN segments when the number of
# clusters and the number of labels disagree.
if len(cluster_profile) != len(SEGMENT_NAMES):
    raise ValueError(
        f"Found {len(cluster_profile)} clusters but {len(SEGMENT_NAMES)} "
        "segment names are defined; update SEGMENT_NAMES to match n_clusters."
    )

# Composite value score: sum of per-feature ranks across clusters.
# NOTE(review): assumes `recency` means time since the last purchase, i.e.
# lower is better -- confirm against the definition of gold.customer_rfm.
value_score = (
    cluster_profile["recency"].rank(ascending=False)
    + cluster_profile["frequency"].rank()
    + cluster_profile["monetary"].rank()
)

# Best cluster first; ties on the score are broken by higher mean monetary.
clusters_best_first = (
    cluster_profile.assign(value_score=value_score)
    .sort_values(["value_score", "monetary"], ascending=False)
    .index
)

SEGMENT_MAP = {
    int(cluster_id): segment_name
    for cluster_id, segment_name in zip(clusters_best_first, SEGMENT_NAMES)
}

rfm_df["customer_segment"] = rfm_df["cluster_id"].map(SEGMENT_MAP)

# Segment sizes (displayed as the cell output).
rfm_df["customer_segment"].value_counts()

customer_segment
Champions              51828
Lost Customers         38239
Loyal Customers         2913
Potential Loyalists     2242
Name: count, dtype: int64

### Log model to MLflow

In [0]:
# Record the clustering run in MLflow: hyper-parameters, quality metrics and
# both fitted artefacts (the scaler is needed to score new data consistently).
with mlflow.start_run(run_name="rfm_kmeans"):
    # Read hyper-parameters off the fitted estimator so the logged values can
    # never drift from what was actually trained.
    mlflow.log_params(
        {
            "n_clusters": kmeans.n_clusters,
            "n_init": kmeans.n_init,
            "random_state": kmeans.random_state,
        }
    )

    # Within-cluster sum of squared distances on the scaled features.
    mlflow.log_metric("inertia", float(kmeans.inertia_))

    # Silhouette needs pairwise distances, so score a fixed-seed sample
    # instead of the full data set to keep the cell fast.
    mlflow.log_metric(
        "silhouette_sampled",
        float(
            silhouette_score(
                X_scaled,
                kmeans.labels_,
                sample_size=10_000,
                random_state=42,
            )
        ),
    )

    mlflow.sklearn.log_model(kmeans, "kmeans_model")
    mlflow.sklearn.log_model(scaler, "scaler")



### Save predictions to Gold

In [0]:
# Convert the labelled pandas frame back to Spark and replace the contents of
# the gold segments table (Delta format, overwrite mode).
segments_sdf = spark.createDataFrame(rfm_df)
(
    segments_sdf.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("olist_ecommerce.gold.customer_rfm_segments")
)