In [1]:
import findspark
findspark.init('/home/raj/spark-2.1.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('logreg').getOrCreate()

In [3]:
churnData=spark.read.csv('/home/raj/Documents/Udemy-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv',
                       header=True,inferSchema=True)

In [5]:
churnData.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [6]:
churnData.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [7]:
churnData.show(5)

+----------------+----+--------------+---------------+-----+---------+--------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|        Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+--------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:...|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:...|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:...|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:...|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
churnData.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [12]:
assembler =VectorAssembler(inputCols=['Age','Total_Purchase','Years','Num_Sites'],outputCol='churnFeatures' )

In [13]:
churnFeatures=assembler.transform(churnData)

In [15]:
cleanChurnData=churnFeatures.select('churnFeatures','Churn')

In [17]:
cleanChurnData.show(5)

+--------------------+-----+
|       churnFeatures|Churn|
+--------------------+-----+
|[42.0,11066.8,7.2...|    1|
|[41.0,11916.22,6....|    1|
|[38.0,12884.75,6....|    1|
|[42.0,8010.76,6.7...|    1|
|[37.0,9191.58,5.5...|    1|
+--------------------+-----+
only showing top 5 rows



In [18]:
from pyspark.ml.classification import LogisticRegression

In [20]:
trainData,testData=cleanChurnData.randomSplit([0.75,0.25])

In [21]:
trainData.describe().show()

+-------+------------------+
|summary|             Churn|
+-------+------------------+
|  count|               668|
|   mean|0.1751497005988024|
| stddev|0.3803799194270887|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [38]:
lr=LogisticRegression(labelCol='Churn',featuresCol='churnFeatures')

In [39]:
lrModel=lr.fit(trainData)

In [40]:
trainingSum=lrModel.summary

In [42]:
trainingSum.predictions.describe().show()

+-------+------------------+-------------------+
|summary|             Churn|         prediction|
+-------+------------------+-------------------+
|  count|               668|                668|
|   mean|0.1751497005988024|0.12874251497005987|
| stddev|0.3803799194270887| 0.3351657016239725|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+



In [43]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [44]:
predAndLabels=lrModel.evaluate(testData)

In [46]:
predAndLabels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|       churnFeatures|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,4....|    0|[4.76120649542652...|[0.99151729070327...|       0.0|
|[28.0,8670.98,3.9...|    0|[7.44377149263583...|[0.99941526768460...|       0.0|
|[30.0,8677.28,7.3...|    0|[4.21051533702674...|[0.98537824864012...|       0.0|
|[30.0,10183.98,5....|    0|[2.96832240339203...|[0.95112234683217...|       0.0|
|[30.0,11575.37,5....|    1|[4.15978046198945...|[0.98462897213325...|       0.0|
|[31.0,8829.83,4.5...|    0|[4.49966857379936...|[0.98900945543243...|       0.0|
|[31.0,9574.89,7.3...|    0|[2.90025395553135...|[0.94785898942214...|       0.0|
|[31.0,11297.57,6....|    1|[1.05685311642654...|[0.74208871262897...|       0.0|
|[31.0,12264.68,5....|    0|[3.72877578530628...|[0.97654130389894...|       0.0|
|[32.0,5756.12,5

In [47]:
churnEval=BinaryClassificationEvaluator(labelCol='Churn',rawPredictionCol='prediction')

In [48]:
auc=churnEval.evaluate(predAndLabels.predictions)

In [49]:
auc

0.8082077051926297

#### Deploy and predict 

In [50]:
newCustomers=spark.read.csv('/home/raj/Documents/Udemy-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv',
                           header=True,inferSchema=True)

In [51]:
newCustomers.show(5)

+--------------+----+--------------+---------------+-----+---------+--------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|        Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+--------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:...|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:...|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:...|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:...|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:...|Unit 0789 Box 0

In [52]:
newChurnFeatures =assembler.transform(newCustomers)

In [56]:
newCleanChurnData=newChurnFeatures.select('churnFeatures')

#newChurnFeatures.show(5)

In [58]:
newCleanChurnData.show()

+--------------------+
|       churnFeatures|
+--------------------+
|[37.0,9935.53,7.7...|
|[23.0,7526.94,9.2...|
|[65.0,100.0,1.0,1...|
|[32.0,6487.5,9.4,...|
|[32.0,13147.71,10...|
|[22.0,8445.26,3.4...|
+--------------------+



In [61]:
finalResulta=lrModel.transform(newChurnFeatures)

In [64]:
finalResulta.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

