###  Customer Churn -- Here are the fields and their definitions

        Name : Name of the latest contact at Company,
        Age: Customer Age,
        Total_Purchase: Total Ads Purchased,
        Account_Manager: Binary 0=No manager, 1= Account manager assigned,
        Years: Totaly Years as a customer,
        Num_sites: Number of websites that use the service.,
        Onboard_date: Date that the name of the latest contact was onboarded,
        Location: Client HQ Address,
        Company: Name of Client Company,
        

In [34]:
import findspark

In [35]:
findspark.init('/home/ubuntu/spark-2.4.5-bin-hadoop2.7')

In [36]:
import pyspark

In [37]:
from pyspark.sql import SparkSession

In [38]:
spark= SparkSession.builder.appName('logreg').getOrCreate()

In [39]:
data = spark.read.csv('customer_churn.csv',inferSchema=True,header=True)

In [40]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [41]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [42]:
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [43]:
from pyspark.ml.feature import VectorAssembler

In [44]:
assembler = VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites'],outputCol='features')

In [45]:
output = assembler.transform(data)

In [46]:
final_data = output.select('features','churn')

In [47]:
train_churn,test_churn = final_data.randomSplit([0.7,0.3])

In [48]:
from pyspark.ml.classification import LogisticRegression

In [49]:
lr = LogisticRegression(labelCol='churn')

In [50]:
lr_fit = lr.fit(train_churn)

In [51]:
training_sum = lr_fit.summary

In [52]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                639|                639|
|   mean|  0.162754303599374|0.13615023474178403|
| stddev|0.36943053785154334|0.34321668849647147|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [54]:
pred_and_labels = lr_fit.evaluate(test_churn)

In [55]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.89728112578236...|[0.71039045284174...|       0.0|
|[28.0,11204.23,0....|    0|[2.64116845291722...|[0.93346457200804...|       0.0|
|[29.0,9378.24,0.0...|    0|[6.14922758234127...|[0.99786941872309...|       0.0|
|[29.0,9617.59,0.0...|    0|[5.72955997313142...|[0.99676201227444...|       0.0|
|[29.0,12711.15,0....|    0|[6.87845099345395...|[0.99897132138819...|       0.0|
|[29.0,13255.05,1....|    0|[5.20785917010504...|[0.99455641606975...|       0.0|
|[30.0,6744.87,0.0...|    0|[4.54291862428497...|[0.98946976561090...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.98443093821479...|[0.98173672463215...|       0.0|
|[30.0,8874.83,0.0...|    0|[4.13453982895819...|[0.98424225125504...|       0.0|
|[30.0,10744.14,

In [56]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [57]:
auc = churn_eval.evaluate(pred_and_labels.predictions)

In [58]:
auc

0.7500505561172901

### -------------xxxxxxxx----------------
