In [None]:
# --- Build the per-customer RFM table (Task 4) ---
# Recency   : whole days between snapshot_date and the customer's most recent
#             TransactionStartTime.
# Frequency : number of non-null TransactionStartTime values for the customer.
# Monetary  : sum of the customer's Amount values.
# NOTE(review): assumes TransactionStartTime is already a datetime column and
# snapshot_date a compatible timestamp (both are set up outside this cell).
def _days_since_last(timestamps):
    """Return whole days from the group's latest timestamp to snapshot_date."""
    return (snapshot_date - timestamps.max()).days


rfm_aggregations = {
    "Recency": ("TransactionStartTime", _days_since_last),
    "Frequency": ("TransactionStartTime", "count"),
    "Monetary": ("Amount", "sum"),
}
rfm = df_raw.groupby("CustomerId").agg(**rfm_aggregations).reset_index()

# --- Step 8: Cluster customers ---
# Standardise the three RFM columns before clustering so that each one is on
# a comparable scale.
rfm_features = ["Recency", "Frequency", "Monetary"]
rfm_matrix = rfm[rfm_features]

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_matrix)

# Three clusters; random_state is fixed so the assignment is reproducible.
kmeans = KMeans(n_clusters=3, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(rfm_scaled)
rfm["Cluster"] = cluster_labels

# Per-cluster means in the original (unscaled) units, for inspection.
cluster_summary = rfm.groupby("Cluster")[rfm_features].mean()
print("Cluster summary:\n", cluster_summary)

# Identify the high-risk cluster: the one whose average customer combines
# high Recency with low Frequency and low Monetary value.
#
# Each cluster mean is ranked per metric so that a larger rank means more
# risk (Recency ascending; Frequency and Monetary descending), and the three
# ranks are summed. Ranking puts the metrics on a common footing regardless
# of their units. Sorting by Frequency/Monetary alone would ignore Recency
# entirely and would consult Monetary only to break exact Frequency ties.
risk_score = (
    cluster_summary["Recency"].rank(ascending=True)
    + cluster_summary["Frequency"].rank(ascending=False)
    + cluster_summary["Monetary"].rank(ascending=False)
)

# Highest combined score wins; equal scores fall back to the lowest
# Frequency, then the lowest Monetary.
risk_ranking = cluster_summary.assign(risk_score=risk_score).sort_values(
    ["risk_score", "Frequency", "Monetary"],
    ascending=[False, True, True],
)
high_risk_cluster = risk_ranking.index[0]

# Binary target: 1 for customers in the high-risk cluster, 0 otherwise.
rfm["is_high_risk"] = (rfm["Cluster"] == high_risk_cluster).astype(int)

print("Target distribution:", rfm["is_high_risk"].value_counts())

# --- Step 9: Merge target into Task 3 features ---
# Attach the is_high_risk label to the Task 3 feature table by CustomerId.
# The inner join keeps only rows whose CustomerId appears in both tables.
target_labels = rfm[["CustomerId", "is_high_risk"]]
final_df = processed_df.merge(target_labels, how="inner", on="CustomerId")

final_counts = final_df["is_high_risk"].value_counts()
print("Final target distribution:", final_counts)

# Save Task 4 dataset (the row index is not written to the file).
task4_path = "/content/processed_task4.csv"
final_df.to_csv(task4_path, index=False)
print("Task 4 saved:", final_df.shape)