In [201]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [202]:
from google.colab import files
uploaded=files.upload()

Saving churn.csv to churn (4).csv


In [203]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [204]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [205]:
df = spark.read.csv("churn.csv", header=True)
df.show(5,True)

+---+----------------+----+--------------+---------------+-----+---------+-----+
|_c0|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+---+----------------+----+--------------+---------------+-----+---------+-----+
|  0|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|  Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
+---+----------------+----+--------------+---------------+-----+---------+-----+
only showing top 5 rows



In [206]:
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [207]:
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [208]:
from pyspark.sql.functions import col
df = df.drop("Names")

In [209]:
transformed = transData(df)
transformed.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,42.0,11066.8...|    1|
|[1.0,41.0,11916.2...|    1|
|[2.0,38.0,12884.7...|    1|
|[3.0,42.0,8010.76...|    1|
|[4.0,37.0,9191.58...|    1|
+--------------------+-----+
only showing top 5 rows



In [210]:
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='numLabel').fit(transformed)
labelIndexer.transform(transformed).show(5, True)

+--------------------+-----+--------+
|            features|label|numLabel|
+--------------------+-----+--------+
|[0.0,42.0,11066.8...|    1|     1.0|
|[1.0,41.0,11916.2...|    1|     1.0|
|[2.0,38.0,12884.7...|    1|     1.0|
|[3.0,42.0,8010.76...|    1|     1.0|
|[4.0,37.0,9191.58...|    1|     1.0|
+--------------------+-----+--------+
only showing top 5 rows



In [211]:
(train_data, test_data) = transformed.randomSplit([0.5, 0.5])

train_data.show(5)
test_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,42.0,11066.8...|    1|
|[3.0,42.0,8010.76...|    1|
|[7.0,32.0,9885.12...|    1|
|[10.0,30.0,11575....|    1|
|[11.0,45.0,8771.0...|    1|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,41.0,11916.2...|    1|
|[2.0,38.0,12884.7...|    1|
|[4.0,37.0,9191.58...|    1|
|[5.0,48.0,10356.0...|    1|
|[6.0,44.0,11331.5...|    1|
+--------------------+-----+
only showing top 5 rows



In [212]:
train_data

DataFrame[features: vector, label: string]

In [213]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol='numLabel', featuresCol='features',maxIter=10)

In [214]:
from pyspark.ml.feature import IndexToString

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [215]:
pipeline = Pipeline(stages=[labelIndexer,gbt,labelConverter])

In [216]:
model = pipeline.fit(train_data)

In [217]:
predictions.show(5)

+--------------------+-----+--------+--------------------+--------------------+----------+--------------+
|            features|label|numLabel|       rawPrediction|         probability|prediction|predictedLabel|
+--------------------+-----+--------+--------------------+--------------------+----------+--------------+
|[0.0,42.0,11066.8...|    1|     1.0|[-1.3259026792203...|[0.06587782434721...|       1.0|             1|
|[1.0,41.0,11916.2...|    1|     1.0|[-1.3259026792203...|[0.06587782434721...|       1.0|             1|
|[2.0,38.0,12884.7...|    1|     1.0|[-1.3259026792203...|[0.06587782434721...|       1.0|             1|
|[3.0,42.0,8010.76...|    1|     1.0|[-1.3259026792203...|[0.06587782434721...|       1.0|             1|
|[8.0,43.0,14062.6...|    1|     1.0|[-1.3259026792203...|[0.06587782434721...|       1.0|             1|
+--------------------+-----+--------+--------------------+--------------------+----------+--------------+
only showing top 5 rows



In [218]:
predictions = model.transform(test_data)
predictions.select("features","label","prediction").show(5)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.0,41.0,11916.2...|    1|       1.0|
|[2.0,38.0,12884.7...|    1|       1.0|
|[4.0,37.0,9191.58...|    1|       1.0|
|[5.0,48.0,10356.0...|    1|       1.0|
|[6.0,44.0,11331.5...|    1|       1.0|
+--------------------+-----+----------+
only showing top 5 rows



In [222]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="numLabel", predictionCol="prediction")

predictionAndTarget = model.transform(test_data).select("numLabel", "prediction")

acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})

In [224]:
acc,f1,weightedPrecision,weightedRecall

(0.9978070175438597, 0.997813371909781, 0.9978379046207067, 0.9978070175438597)