# 📊 Churn Prediction Pipeline (Free Edition)
Pipeline completo de previsão de churn em PySpark, adaptado para rodar na versão gratuita do Databricks.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Reuse the active Spark session if there is one; otherwise create it under this app name.
session_builder = SparkSession.builder.appName("ChurnPredictionPipeline")
spark = session_builder.getOrCreate()

# Input CSV: the first row is treated as the header and column types are inferred by Spark.
silver_path = "/Volumes/workspace/voc/churn/churn_silver_2025.csv"
df_silver = spark.read.csv(silver_path, header=True, inferSchema=True)

# Quick visual check of the first rows.
df_silver.show(n=5)

In [None]:
# Derived feature: app log count (30d) per unit of monthly fee.
# The denominator is guarded because a zero (or NULL) fee would otherwise produce
# NULL — or a divide-by-zero error when ANSI mode is on — and a NULL feature makes
# VectorAssembler fail downstream. Rows without a usable fee get a ratio of 0.0.
df_gold = df_silver.withColumn(
    "taxa_uso_valor",
    when(
        col("valor_mensalidade") != 0,
        col("total_logs_app_30d") / col("valor_mensalidade"),
    ).otherwise(0.0),
)

# Training label: the churn flag cast to integer.
df_gold = df_gold.withColumn("label", col("churn").cast("integer"))

# Manual encoding of the main VoC category; anything not listed
# (including NULL) falls into bucket 4.
df_gold = df_gold.withColumn(
    "categoria_index",
    when(col("categoria_principal_voc") == "API", 0)
    .when(col("categoria_principal_voc") == "BUGS", 1)
    .when(col("categoria_principal_voc") == "SUPORTE", 2)
    .when(col("categoria_principal_voc") == "FINANCEIRO", 3)
    .otherwise(4),
)
df_gold.show(5)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the model input vector; list order is the vector order.
feature_cols = [
    "valor_mensalidade",
    "total_logs_app_30d",
    "tickets_suporte_abertos",
    "score_sentimento_voc",
    "taxa_uso_valor",
    "categoria_index",
]

# Assemble the individual numeric columns into a single "features" vector column.
assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol("features")
df_ml = assembler.transform(df_gold)

# Preview the assembled vectors alongside the customer id and label.
df_ml.select("id_cliente", "features", "label").show(n=5)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# 70/30 train/test split with a fixed seed.
train, test = df_ml.randomSplit([0.7, 0.3], seed=42)

# The forest is seeded as well: without it, bootstrap sampling and feature
# subsetting differ between runs, so the model and its metrics would not be
# reproducible even though the split is.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    probabilityCol="probability",
    predictionCol="prediction",
    seed=42,
)
model = rf.fit(train)

# Score the held-out rows; adds rawPrediction, probability and prediction columns.
predictions = model.transform(test)
predictions.select("id_cliente", "probability", "prediction").show(5)

In [None]:
# Accuracy: share of test rows whose predicted class equals the label.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_acc.evaluate(predictions)

# Area under the ROC curve, computed from the model's raw prediction scores.
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)

# The accuracy label previously printed as mis-decoded text ("AcurÃ¡cia").
print(f"Acurácia: {accuracy}")
print(f"AUC-ROC: {auc}")

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Imported here because this is the only cell that needs it.
from pyspark.ml.functions import vector_to_array


def extract_prob(probability_col):
    """Return a double column holding element 1 of an ML vector column.

    Uses the built-in vector_to_array conversion instead of a Python UDF, so
    rows are not serialised through a Python worker just to read one element.

    Args:
        probability_col: Column (or column name) containing the ML probability vector.

    Returns:
        Column with the vector's element at index 1 (the score for class 1).
    """
    return vector_to_array(probability_col).getItem(1)


# Final output: one row per customer with the class-1 (churn) probability and the predicted class.
churn_predictions_gold = predictions.withColumn("prob_churn", extract_prob(col("probability"))) \
    .select("id_cliente", "prob_churn", "prediction")

churn_predictions_gold.show(10)