# Predict Customer Churn with PySpark

This notebook demonstrates how to:
- Create a list of 5 unlabelled customer records.
- Load a previously stored Decision Tree model.
- Use the model to predict churn for these customers.

## Step 1: Initialize PySpark and Define Sample Customers
Create a DataFrame with 5 unlabelled customer records.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.feature import VectorAssembler

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("PredictChurn_WithoutLabels")
    .getOrCreate()
)

# Schema of an unlabelled customer record: every column is nullable, and the
# (name, type) pairs below are listed in the same order as the tuple values
# in `sample_customers_data`.
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ("account_length", IntegerType),
        ("number_vmail_messages", IntegerType),
        ("total_day_minutes", DoubleType),
        ("total_day_calls", IntegerType),
        ("total_day_charge", DoubleType),
        ("total_eve_minutes", DoubleType),
        ("total_eve_calls", IntegerType),
        ("total_eve_charge", DoubleType),
        ("total_night_minutes", DoubleType),
        ("total_night_calls", IntegerType),
        ("total_night_charge", DoubleType),
        ("total_intl_minutes", DoubleType),
        ("total_intl_calls", IntegerType),
        ("total_intl_charge", DoubleType),
        ("number_customer_service_calls", IntegerType),
    ]
])

# Five hand-written customer records without a churn label; one tuple per customer.
sample_customers_data = [
    (128, 25, 265.1, 110, 45.07, 197.4, 99, 16.78, 244.7, 91, 11.01, 10.0, 3, 2.7, 1),
    (107, 26, 161.6, 123, 27.47, 195.5, 103, 16.62, 254.4, 103, 11.45, 13.7, 3, 3.7, 1),
    (137, 0, 243.4, 114, 41.38, 121.2, 110, 10.3, 162.6, 104, 7.32, 12.2, 5, 3.29, 0),
    (84, 0, 299.4, 71, 50.9, 61.9, 88, 5.26, 196.9, 89, 8.86, 6.6, 7, 1.78, 2),
    (75, 0, 166.7, 113, 28.34, 148.3, 122, 12.61, 186.9, 121, 8.41, 10.1, 3, 2.73, 3),
]

# Materialise the records as a Spark DataFrame with the explicit schema and preview it.
sample_customers_df = spark.createDataFrame(data=sample_customers_data, schema=schema)
sample_customers_df.show()

## Step 2: Assemble Features for the Sample Customers
Combine the 15 numeric customer columns into a single `features` vector column, which is the input the model requires.

In [None]:
# Assemble the 15 numeric columns into a single `features` vector column.
# NOTE(review): the column order presumably has to match the order used when
# the model was trained — confirm against the training notebook.
assembler = VectorAssembler(
    inputCols=[
        "account_length", "number_vmail_messages", "total_day_minutes",
        "total_day_calls", "total_day_charge", "total_eve_minutes",
        "total_eve_calls", "total_eve_charge", "total_night_minutes",
        "total_night_calls", "total_night_charge", "total_intl_minutes",
        "total_intl_calls", "total_intl_charge", "number_customer_service_calls"
    ],
    outputCol="features"
)

# This cell rebinds `sample_customers_df`, so running it a second time would
# hand the assembler a frame that already contains `features`, and the
# transform would fail because the output column exists. Dropping the column
# first makes the cell safe to re-run; `drop` is a no-op when the column is
# absent, so the first run behaves exactly as before.
sample_customers_df = assembler.transform(sample_customers_df.drop("features"))
sample_customers_df.select("features").show(truncate=False)

## Step 3: Load the Stored Model
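Load the Decision Tree model that was trained and saved previously. The cell below expects it at `./trained_model_decision_tree`, relative to the notebook's working directory.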

In [None]:
from pyspark.ml.classification import DecisionTreeClassificationModel

# Restore the persisted Decision Tree model from disk.
# `read().load(path)` is the long form of the `load(path)` shortcut.
model_path = "./trained_model_decision_tree"
loaded_model = DecisionTreeClassificationModel.read().load(model_path)
print("Model loaded successfully.")

## Step 4: Generate Predictions for the Sample Customers
Use the loaded model to predict churn for the unlabelled customers.

In [None]:
# Score the assembled sample customers with the loaded model.
predictions = loaded_model.transform(sample_customers_df)

# Display the input vector next to the predicted label and the class
# probabilities, without truncating the vector columns.
(
    predictions
    .select("features", "prediction", "probability")
    .show(truncate=False)
)

## Step 5: Analyze Predictions
Review the predictions to understand which customers are likely to churn.

In [None]:
# Bring the scored rows back to the driver and print one
# newline-separated summary per customer, followed by a "---" divider.
for row in predictions.select("features", "prediction", "probability").collect():
    print(
        f"Features: {row['features']}",
        f"Predicted Label (Churn=1, No Churn=0): {row['prediction']}",
        f"Probability: {row['probability']}",
        "---",
        sep="\n",
    )