In [0]:
from pyspark.sql.functions import col, when

# Source: churn feature table that the encodings below are derived from.
df_gold = spark.table("churn_catalog.analytics.churn_features")

# MANUAL LABEL ENCODING
# Each entry: output column -> (source column, ordered (label, index) pairs, fallback index).
# The fallback is applied through .otherwise(), so every value that matches none of the
# listed labels -- including NULL -- receives the fallback index.
# NOTE(review): confirm that sending NULL / unexpected labels to the fallback is intended.
_ENCODINGS = {
    "gender_index": ("gender", [("Male", 1)], 0),
    "age_group_index": ("age_group", [("Young", 0), ("Adult", 1), ("Middle Age", 2)], 3),
    "balance_category_index": ("balance_category", [("Low", 0), ("Medium", 1)], 2),
    "tenure_bucket_index": ("tenure_bucket", [("New", 0), ("Medium", 1)], 2),
}


def _index_expr(source_col, label_index_pairs, fallback):
    """Build the when(...).when(...).otherwise(fallback) chain for one column.

    source_col        -- name of the categorical column to compare against.
    label_index_pairs -- non-empty list of (label, index) tuples, checked in order.
    fallback          -- index assigned when no listed label matches.
    """
    (first_label, first_index), *remaining = label_index_pairs
    expr = when(col(source_col) == first_label, first_index)
    for label, index in remaining:
        expr = expr.when(col(source_col) == label, index)
    return expr.otherwise(fallback)


# Add the index columns one at a time, in the order declared above.
df_ml = df_gold
for _target, (_source, _pairs, _fallback) in _ENCODINGS.items():
    df_ml = df_ml.withColumn(_target, _index_expr(_source, _pairs, _fallback))

# Preview a handful of encoded rows.
display(df_ml.limit(10))

# SAVE ML READY TABLE
(
    df_ml.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("churn_catalog.analytics.churn_ml_ready")
)


# ML preparation layer summary

In [0]:
from pyspark.sql.functions import col, count

# Banner for the summary output.
print("ML READY LAYER SUMMARY")


# Read the saved table back instead of reusing an in-memory frame,
# so the figures below describe what is actually stored.
df_ml_summary = spark.table("churn_catalog.analytics.churn_ml_ready")

# Total rows
total_rows = df_ml_summary.count()
print("Total rows in ML table:", total_rows)

# Value counts per encoded column: (printed heading, index column), shown in this order.
_summary_sections = [
    ("\nGender Encoding Summary:", "gender_index"),
    ("\nAge Group Encoding Summary:", "age_group_index"),
    ("\nBalance Category Encoding Summary:", "balance_category_index"),
    ("\nTenure Bucket Encoding Summary:", "tenure_bucket_index"),
]

for _heading, _index_col in _summary_sections:
    print(_heading)
    # Count rows per index value and sort by the index for a stable reading order.
    _counts = df_ml_summary.groupBy(_index_col).count()
    display(_counts.orderBy(_index_col))

# Sample ML rows
print("\nSample ML Records:")
display(df_ml_summary.limit(10))
