In [0]:
# Column names for the toy customer-churn dataset, listed in the same order as
# the values inside each row tuple below.
columns = [
    "age",
    "gender",
    "city",
    "tenure_months",
    "avg_session_time",
    "total_orders",
    "avg_order_value",
    "last_month_spend",
    "discount_used",
    "churn",
]

# Six hand-written sample customers; `churn` (last field) is the label that the
# later cells train on.
data = [
    (23, "Female", "Hyderabad", 6, 12.5, 8, 420.0, 2100.0, 1, 0),
    (35, "Male", "Bangalore", 24, 18.2, 45, 650.0, 8200.0, 0, 0),
    (29, "Female", "Chennai", 10, 9.8, 12, 390.0, 1800.0, 1, 1),
    (41, "Male", "Delhi", 36, 21.4, 78, 720.0, 11500.0, 0, 0),
    (26, "Female", "Pune", 4, 7.5, 5, 310.0, 900.0, 1, 1),
    (33, "Male", "Mumbai", 18, 15.1, 32, 580.0, 6400.0, 0, 0),
]

# Build the Spark DataFrame from the local rows. Only column names are given,
# so the column types are inferred from the Python values.
# NOTE(review): `spark` is not defined in this notebook; presumably the
# session object is injected by the runtime - confirm.
df = spark.createDataFrame(data, schema=columns)
df.display()


age,gender,city,tenure_months,avg_session_time,total_orders,avg_order_value,last_month_spend,discount_used,churn
23,Female,Hyderabad,6,12.5,8,420.0,2100.0,1,0
35,Male,Bangalore,24,18.2,45,650.0,8200.0,0,0
29,Female,Chennai,10,9.8,12,390.0,1800.0,1,1
41,Male,Delhi,36,21.4,78,720.0,11500.0,0,0
26,Female,Pune,4,7.5,5,310.0,900.0,1,1
33,Male,Mumbai,18,15.1,32,580.0,6400.0,0,0


# Basic EDA

In [0]:
# Column names, inferred types and nullability.
df.printSchema()

# Per-column summary statistics (count, mean, stddev, min, max).
summary_stats = df.describe()
summary_stats.display()

# Class balance of the target label.
churn_counts = df.groupBy("churn").count()
churn_counts.display()

# Mean last-month spend per city.
spend_by_city = df.groupBy("city").avg("last_month_spend")
spend_by_city.display()


root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- city: string (nullable = true)
 |-- tenure_months: long (nullable = true)
 |-- avg_session_time: double (nullable = true)
 |-- total_orders: long (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- last_month_spend: double (nullable = true)
 |-- discount_used: long (nullable = true)
 |-- churn: long (nullable = true)



summary,age,gender,city,tenure_months,avg_session_time,total_orders,avg_order_value,last_month_spend,discount_used,churn
count,6.0,6,6,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,31.166666666666668,,,16.333333333333332,14.083333333333334,30.0,511.6666666666667,5150.0,0.5,0.3333333333333333
stddev,6.5243135015621885,,,12.225656083281038,5.209766469494257,28.16380656090366,161.9156158826772,4237.33406754766,0.5477225575051661,0.5163977794943222
min,23.0,Female,Bangalore,4.0,7.5,5.0,310.0,900.0,0.0,0.0
max,41.0,Male,Pune,36.0,21.4,78.0,720.0,11500.0,1.0,1.0


churn,count
0,4
1,2


city,avg(last_month_spend)
Hyderabad,2100.0
Bangalore,8200.0
Chennai,1800.0
Delhi,11500.0
Pune,900.0
Mumbai,6400.0


# Feature Engineering

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [0]:
# Encode the two string columns as numeric label indices so they can be passed
# to VectorAssembler later. handleInvalid="keep" is set on both indexers (per
# the Spark API, labels not seen at fit time then get an extra index instead
# of raising at transform time).
gender_indexer = StringIndexer(
    inputCol="gender",
    outputCol="gender_idx",
    handleInvalid="keep"
)

city_indexer = StringIndexer(
    inputCol="city",
    outputCol="city_idx",
    handleInvalid="keep"
)

# This cell rebinds `df`, so on a second run `df` would already carry the index
# columns from the first run and StringIndexer would refuse to write to an
# existing output column. Dropping them first (a no-op when they are absent)
# keeps the cell safely re-runnable.
df = df.drop(gender_indexer.getOutputCol(), city_indexer.getOutputCol())

# Fit each indexer on the current data and append its index column.
df = gender_indexer.fit(df).transform(df)
df = city_indexer.fit(df).transform(df)


# Assemble Features

In [0]:
# Numeric inputs taken directly from the raw columns.
numeric_cols = [
    "age",
    "tenure_months",
    "avg_session_time",
    "total_orders",
    "avg_order_value",
    "last_month_spend",
    "discount_used",
]

# Label-index columns produced by the StringIndexer step.
# NOTE(review): these indices enter the model as plain numbers, which imposes
# an ordering on cities - consider one-hot encoding; confirm intent.
indexed_cols = ["gender_idx", "city_idx"]

# Order matters: it fixes the position of each value in the feature vector.
feature_cols = numeric_cols + indexed_cols

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Pack the inputs into a single vector column and keep only what the model
# needs: the vector and the label.
assembled_df = assembler.transform(df)
final_df = assembled_df.select("features", "churn")
final_df.display()


features,churn
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""23.0"",""6.0"",""12.5"",""8.0"",""420.0"",""2100.0"",""1.0"",""0.0"",""3.0""]}",0
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""35.0"",""24.0"",""18.2"",""45.0"",""650.0"",""8200.0"",""0.0"",""1.0"",""0.0""]}",0
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""29.0"",""10.0"",""9.8"",""12.0"",""390.0"",""1800.0"",""1.0"",""0.0"",""1.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""41.0"",""36.0"",""21.4"",""78.0"",""720.0"",""11500.0"",""0.0"",""1.0"",""2.0""]}",0
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""26.0"",""4.0"",""7.5"",""5.0"",""310.0"",""900.0"",""1.0"",""0.0"",""5.0""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""33.0"",""18.0"",""15.1"",""32.0"",""580.0"",""6400.0"",""0.0"",""1.0"",""4.0""]}",0


# Train-Test Split

In [0]:
# Seeded random split with 0.8 / 0.2 weights into training and held-out rows.
# NOTE(review): with only six input rows the held-out set can end up as a
# single row of one class, which makes downstream metrics unreliable - confirm
# whether a larger dataset is intended here.
train_df, test_df = final_df.randomSplit(weights=[0.8, 0.2], seed=42)


# Train Model (Logistic Regression)

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector with `churn` as label;
# no other hyper-parameters are set here.
lr = LogisticRegression(featuresCol="features", labelCol="churn")

# Fit on the training split, then score the held-out rows.
model = lr.fit(train_df)
predictions = model.transform(test_df)

# Show the true label next to the predicted label and class probabilities.
scored_cols = ("churn", "prediction", "probability")
predictions.select(*scored_cols).display()


churn,prediction,probability
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.999999999999287"",""7.129852264142755E-13""]}"


# Evaluation

In [0]:
import warnings

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, computed against the `churn` label.
evaluator = BinaryClassificationEvaluator(
    labelCol="churn",
    metricName="areaUnderROC"
)

# A ROC curve needs both positive and negative examples. If the scored set
# contains a single class, the number returned below carries no information,
# so flag that loudly instead of letting a bare 0.0 be read as a real score.
distinct_labels = predictions.select("churn").distinct().count()
if distinct_labels < 2:
    warnings.warn(
        "Scored set contains fewer than two churn classes "
        f"({distinct_labels} found); areaUnderROC is not meaningful for this split."
    )

# Still computed unconditionally so `auc` is always defined for later cells.
auc = evaluator.evaluate(predictions)
auc


0.0

In [0]:
import mlflow
import mlflow.spark
