In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# A few commonly used packages are loaded below.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Listing that directory shows which files this notebook can read.

import os
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

**Customer Churn Prediction with MLlib**

Churn prediction is big business. It minimizes customer defection by predicting which customers are likely to cancel a subscription to a service. Though originally used within the telecommunications industry, it has become common practice across banks, ISPs, insurance firms, and other verticals.

The prediction process is heavily data-driven and often utilizes advanced machine learning techniques. Here, we’ll take a look at what types of customer data are typically used, do some preliminary analysis of the data, and generate churn prediction models – all with PySpark and its machine learning frameworks.

In [None]:
# Optional setup: uncomment the next line to install PySpark if the environment does not already provide it.
#!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName('churnanalysis')
spark = session_builder.getOrCreate()

In [None]:
from pyspark.ml.classification import LogisticRegression


In [None]:
# Load the churn CSV: the first row is the header and column types are inferred from the data.
input_data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('../input/Churn_Modelling.csv')
)

In [None]:
# Print the column names and inferred types of the full loaded dataset
# (the train/test split happens later in the notebook).
input_data.printSchema()

**Using VectorAssembler**


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the single 'features' vector column.
feature_cols = ['Age', 'NumOfProducts', 'IsActiveMember', 'Tenure', 'CreditScore']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Assemble the features on the full loaded dataset. `train` is only created by the
# later train/test split, so referencing it here fails on a fresh kernel (NameError).
output_data = assembler.transform(input_data)

**Train Test Split**

In [None]:
# Keep only the assembled feature vector and the 'Exited' column used as the label.
final_data = output_data.select('features', 'Exited')

# 70/30 train/test split. A fixed seed keeps the split (and therefore the counts and
# every downstream metric) repeatable across re-runs of the notebook.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))
 

**Using Logistic Regression on the data**


In [None]:
# Build a logistic regression estimator with 'Exited' as the label column and fit it
# on the training split in one step; `model` holds the fitted model.
model = LogisticRegression(labelCol='Exited').fit(train)

# Training summary exposed by the fitted model.
summary = model.summary

# Descriptive statistics of the predictions made on the training data.
training_predictions = summary.predictions
training_predictions.describe().show()

We can obtain the fitted coefficients from the `coefficients` attribute of the trained `LogisticRegressionModel`.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sort the fitted coefficients in ascending order and plot them against their rank.
beta = np.sort(model.coefficients)

fig, ax = plt.subplots()
ax.plot(beta)
ax.set_ylabel('Beta Coefficients')
plt.show()

By summarizing the model over the training set, we can also obtain the receiver operating characteristic (ROC) curve and the area under it (`areaUnderROC`).

In [None]:
# Training summary of the fitted model; it exposes the ROC curve and the area under it.
trainingSummary = model.summary

# Convert the ROC points to pandas for plotting with matplotlib.
roc = trainingSummary.roc.toPandas()

# FPR is plotted on the x-axis and TPR on the y-axis, so the labels must match:
# x = False Positive Rate, y = True Positive Rate (the original labels were swapped).
plt.plot(roc['FPR'],roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

**Precision and Recall**

In [None]:
# Precision-recall points from the training summary, converted to pandas for plotting.
pr = trainingSummary.pr.toPandas()

# Recall on the x-axis, precision on the y-axis.
fig, ax = plt.subplots()
ax.plot(pr['recall'], pr['precision'])
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.show()

**Make predictions on the test set**

In [None]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(test)

# Show the label next to the model outputs for the first 10 rows.
preview = predictions.select('Exited', 'rawPrediction', 'prediction', 'probability')
preview.show(10)

# Descriptive statistics of the predictions on the test data.
prediction_stats = predictions.describe()
prediction_stats.show()