In [1]:
import findspark
findspark.init('/home/raj/spark-2.1.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('logreg').getOrCreate()

In [3]:
churnData=spark.read.csv('/home/raj/Documents/Udemy-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv',
                       header=True,inferSchema=True)

In [5]:
churnData.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [6]:
churnData.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [7]:
churnData.show(5)

+----------------+----+--------------+---------------+-----+---------+--------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|        Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+--------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:...|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:...|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:...|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:...|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
churnData.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [12]:
assembler =VectorAssembler(inputCols=['Age','Total_Purchase','Years','Num_Sites'],outputCol='churnFeatures' )

In [13]:
churnFeatures=assembler.transform(churnData)

In [15]:
cleanChurnData=churnFeatures.select('churnFeatures','Churn')

In [17]:
cleanChurnData.show(5)

+--------------------+-----+
|       churnFeatures|Churn|
+--------------------+-----+
|[42.0,11066.8,7.2...|    1|
|[41.0,11916.22,6....|    1|
|[38.0,12884.75,6....|    1|
|[42.0,8010.76,6.7...|    1|
|[37.0,9191.58,5.5...|    1|
+--------------------+-----+
only showing top 5 rows



In [18]:
from pyspark.ml.classification import LogisticRegression

In [20]:
trainData,testData=cleanChurnData.randomSplit([0.75,0.25])

In [21]:
trainData.describe().show()

+-------+------------------+
|summary|             Churn|
+-------+------------------+
|  count|               668|
|   mean|0.1751497005988024|
| stddev|0.3803799194270887|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [24]:
lr=LogisticRegression(maxIter=50,regParam=0.3,elasticNetParam=0.8,labelCol='Churn',featuresCol='churnFeatures')

In [25]:
lrModel=lr.fit(trainData)

In [26]:
testResults=lrModel.evaluate(testData)

In [27]:
testResults.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|       churnFeatures|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,4....|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[28.0,8670.98,3.9...|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[30.0,8677.28,7.3...|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[30.0,10183.98,5....|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[30.0,11575.37,5....|    1|[1.54956087435515...|[0.82485029940119...|       0.0|
|[31.0,8829.83,4.5...|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[31.0,9574.89,7.3...|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[31.0,11297.57,6....|    1|[1.54956087435515...|[0.82485029940119...|       0.0|
|[31.0,12264.68,5....|    0|[1.54956087435515...|[0.82485029940119...|       0.0|
|[32.0,5756.12,5

In [28]:
testResults.areaUnderROC

0.5