In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('logreg').getOrCreate()

In [2]:
from pyspark.ml.classification import LogisticRegression

In [3]:
df = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)

In [4]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [5]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [6]:
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [7]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [8]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
                            outputCol='features')

In [9]:
output = assembler.transform(df)

In [10]:
final_data = output.select('features', 'churn')

In [11]:
train_data, test_data = final_data.randomSplit([.7, .3])

In [12]:
log_rig = LogisticRegression(labelCol='churn')

In [13]:
fit_model = log_rig.fit(train_data)

In [14]:
train_summary = fit_model.summary

In [15]:
train_summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                651|                651|
|   mean|0.16589861751152074| 0.1336405529953917|
| stddev|0.37227564086668546|0.34052735547947927|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [17]:
pred_and_label = fit_model.evaluate(test_data)

In [18]:
pred_and_label.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.79710937546362...|[0.99181399321113...|       0.0|
|[26.0,8787.39,1.0...|    1|[0.76557584834550...|[0.68256308478351...|       0.0|
|[28.0,8670.98,0.0...|    0|[8.21627169401448...|[0.99972985234502...|       0.0|
|[28.0,11245.38,0....|    0|[3.90517281480963...|[0.98026003882089...|       0.0|
|[29.0,11274.46,1....|    0|[4.65205650720706...|[0.99054822976807...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.86257191848128...|[0.97941861095530...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.39931883509897...|[0.96768324043413...|       0.0|
|[30.0,8677.28,1.0...|    0|[4.38670809183184...|[0.98771127310517...|       0.0|
|[30.0,10744.14,1....|    1|[1.80088648074483...|[0.85825681155876...|       0.0|
|[30.0,12788.37,

In [19]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [20]:
AUC = my_eval.evaluate(pred_and_label.predictions)

In [21]:
AUC

0.7236024844720497