In [1]:
#spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName ("Customer Churn Logistic Regression").getOrCreate ()

In [12]:
#loading the dataset
data = spark.read.csv ("customer_churn.csv", inferSchema=True, header=True)
data.printSchema ()
data.count ()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



900

In [16]:
#looking for any missing data
data.toPandas ().describe ()
#we found we have no missing data

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
count,900.0,900.0,900.0,900.0,900.0,900.0
mean,41.816667,10062.824033,0.481111,5.273156,8.587778,0.166667
std,6.12756,2408.644532,0.499921,1.274449,1.764836,0.372885
min,22.0,100.0,0.0,1.0,3.0,0.0
25%,38.0,8497.1225,0.0,4.45,7.0,0.0
50%,42.0,10045.87,0.0,5.215,8.0,0.0
75%,46.0,11760.105,1.0,6.11,10.0,0.0
max,65.0,18026.01,1.0,9.15,14.0,1.0


In [20]:
#assembler for machine learning
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler (inputCols=["Age", "Total_Purchase", "Account_Manager", "Years", "Num_Sites"], outputCol="features")

df = assembler.transform (data)
df = df.select ("features", "churn")
df.printSchema ()

root
 |-- features: vector (nullable = true)
 |-- churn: integer (nullable = true)



In [21]:
#train-test splitting
(train, test) = df.randomSplit ([0.7, 0.3])

In [22]:
#fitting model to the data
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression (featuresCol="features", labelCol="churn")
model = lr.fit (train)

In [29]:
summary = model.summary 
summary.predictions.describe ().toPandas ()

Unnamed: 0,summary,churn,prediction
0,count,626.0,626.0
1,mean,0.1709265175718849,0.1261980830670926
2,stddev,0.3767457766076395,0.3323380241566099
3,min,0.0,0.0
4,max,1.0,1.0


In [36]:
#evaluating the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

labeled_pred = model.evaluate (test)
evaluator = BinaryClassificationEvaluator (rawPredictionCol="prediction", labelCol="churn")

auc = evaluator.evaluate (labeled_pred.predictions)
print (auc)

0.7320044296788483


### making predictions


In [39]:
#fitting to the whole dataset
model = lr.fit (df)

In [42]:
#new data to predict
new_customers = spark.read.csv ("new_customers.csv", inferSchema=True, header=True)
new_customers.printSchema ()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [44]:
#test set
new_test = assembler.transform (new_customers)
new_test.printSchema ()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [49]:
#predicting results
prediction = model.transform (new_test)
prediction.select ("Company", "prediction").show ()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

