1. objective

To predict whether a bank customer will churn (leave the bank) based on various customer attributes.

2. Data Source

We'll use the "Bank Customer Churn" dataset, which can be found on platforms like Kaggle or UCI Machine Learning Repository.



In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=8d94a2f3742f4d79b3e408e0dcc495a1d5de1850263c77d3f6d15f5389af8e55
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pyspark

In [None]:
pyspark.__version__

'3.5.1'

3. import library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


4.import data

In [None]:
data = pd.read_csv('https://github.com/YBI-Foundation/Dataset/raw/main/Bank%20Churn%20Modelling.csv')

5. descibe data

In [None]:
data.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,Num Of Products,Has Credit Card,Is Active Member,Estimated Salary,Churn
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
data.describe()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,Num Of Products,Has Credit Card,Is Active Member,Estimated Salary,Churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerId        10000 non-null  int64  
 1   Surname           10000 non-null  object 
 2   CreditScore       10000 non-null  int64  
 3   Geography         10000 non-null  object 
 4   Gender            10000 non-null  object 
 5   Age               10000 non-null  int64  
 6   Tenure            10000 non-null  int64  
 7   Balance           10000 non-null  float64
 8   Num Of Products   10000 non-null  int64  
 9   Has Credit Card   10000 non-null  int64  
 10  Is Active Member  10000 non-null  int64  
 11  Estimated Salary  10000 non-null  float64
 12  Churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


6. data visualization

In [None]:

data.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'Num Of Products', 'Has Credit Card',
       'Is Active Member', 'Estimated Salary', 'Churn'],
      dtype='object')

In [None]:
# Import necessary PySpark modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Create a SparkSession if one doesn't exist
spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()

# Assuming 'data' is your Pandas DataFrame
spark_df = spark.createDataFrame(data) # Convert Pandas DataFrame to PySpark DataFrame

# Handle string columns using StringIndexer
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index")
            for column in ['Surname', 'Geography', 'Gender']]
for indexer in indexers:
    spark_df = indexer.fit(spark_df).transform(spark_df)

# Proceed with VectorAssembler using indexed columns
featureassembler = VectorAssembler(inputCols=[
 'CustomerId', 'Surname_index', 'CreditScore', 'Geography_index', 'Gender_index', 'Age',
       'Tenure', 'Balance', 'Num Of Products', 'Has Credit Card',
       'Is Active Member', 'Estimated Salary'], outputCol='Features')

output = featureassembler.transform(spark_df) # Use PySpark DataFrame here

In [None]:
spark_df.show()

+----------+---------+-----------+---------+------+---+------+---------+---------------+---------------+----------------+----------------+-----+-------------+---------------+------------+
|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|Num Of Products|Has Credit Card|Is Active Member|Estimated Salary|Churn|Surname_index|Geography_index|Gender_index|
+----------+---------+-----------+---------+------+---+------+---------+---------------+---------------+----------------+----------------+-----+-------------+---------------+------------+
|  15634602| Hargrave|        619|   France|Female| 42|     2|      0.0|              1|              1|               1|       101348.88|    1|       1958.0|            0.0|         1.0|
|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|              1|              0|               1|       112542.58|    0|         79.0|            2.0|         1.0|
|  15619304|     Onio|        502|   France|Female| 42|     

7. Modeling

In [None]:
modeldata = output.select('Features','Balance')

In [None]:
modeldata.show()

+--------------------+---------+
|            Features|  Balance|
+--------------------+---------+
|[1.5634602E7,1958...|      0.0|
|[1.5647311E7,79.0...| 83807.86|
|[1.5619304E7,336....| 159660.8|
|[1.5701354E7,128....|      0.0|
|[1.5737888E7,32.0...|125510.82|
|[1.5574012E7,14.0...|113755.78|
|[1.5592531E7,631....|      0.0|
|[1.5656148E7,1269...|115046.74|
|[1.5792365E7,57.0...|142051.07|
|[1.5592389E7,44.0...|134603.88|
|[1.5767821E7,1468...|102016.72|
|[1.5737173E7,417....|      0.0|
|[1.5632264E7,556....|      0.0|
|[1.5691483E7,92.0...|      0.0|
|[1.5600882E7,2.0,...|      0.0|
|[1.5643966E7,1909...|143129.41|
|[1.5737452E7,2591...|132602.88|
|[1.5788218E7,97.0...|      0.0|
|[1.5661507E7,2381...|      0.0|
|[1.5568982E7,197....|      0.0|
+--------------------+---------+
only showing top 20 rows



8. Train Test Split

In [None]:
train_data, test_data = modeldata.randomSplit([0.8,0.2])

In [None]:
from pyspark.ml.regression import LinearRegression
reg = LinearRegression(featuresCol='Features',labelCol='Balance')
reg = reg.fit(train_data)

In [None]:
reg.coefficients

DenseVector([-0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 1.0, 0.0, -0.0, 0.0, 0.0])

In [None]:
reg.intercept

6.560637707029068e-07

9. prediction

In [None]:
reg.transform(test_data).show()


+--------------------+---------+--------------------+
|            Features|  Balance|          prediction|
+--------------------+---------+--------------------+
|[1.5565806E7,2784...|      0.0|5.185503672734214...|
|[1.5566091E7,2770...|      0.0|5.334164389745931E-9|
|[1.5566251E7,290....| 96652.86|   96652.86000000515|
|[1.5566292E7,583....|      0.0|5.362737011334272E-9|
|[1.5566633E7,321....| 155059.1|  155059.10000000513|
|[1.5566708E7,1618...|      0.0|5.239890323932541E-9|
|[1.5566958E7,242....|167557.12|   167557.1200000051|
|[1.5567063E7,83.0...|106434.94|  106434.94000000521|
|[1.5567333E7,1422...|      0.0|5.247948138078485E-9|
|[1.5567367E7,256....|133636.16|  133636.16000000507|
|[1.5567486E7,205....|      0.0|5.308762966563373E-9|
|[1.5568044E7,509....|      0.0|5.099802952285032E-9|
|[1.5568106E7,140....|119278.01|  119278.01000000504|
|[1.5568595E7,56.0...|113829.45|  113829.45000000489|
|[1.5569098E7,2889...|153548.12|  153548.12000000497|
|[1.5569248E7,211....|      

In [None]:
y_pred = reg.evaluate(test_data)

In [None]:
y_pred.predictions.show()

+--------------------+---------+--------------------+
|            Features|  Balance|          prediction|
+--------------------+---------+--------------------+
|[1.5565806E7,2784...|      0.0|5.185503672734214...|
|[1.5566091E7,2770...|      0.0|5.334164389745931E-9|
|[1.5566251E7,290....| 96652.86|   96652.86000000515|
|[1.5566292E7,583....|      0.0|5.362737011334272E-9|
|[1.5566633E7,321....| 155059.1|  155059.10000000513|
|[1.5566708E7,1618...|      0.0|5.239890323932541E-9|
|[1.5566958E7,242....|167557.12|   167557.1200000051|
|[1.5567063E7,83.0...|106434.94|  106434.94000000521|
|[1.5567333E7,1422...|      0.0|5.247948138078485E-9|
|[1.5567367E7,256....|133636.16|  133636.16000000507|
|[1.5567486E7,205....|      0.0|5.308762966563373E-9|
|[1.5568044E7,509....|      0.0|5.099802952285032E-9|
|[1.5568106E7,140....|119278.01|  119278.01000000504|
|[1.5568595E7,56.0...|113829.45|  113829.45000000489|
|[1.5569098E7,2889...|153548.12|  153548.12000000497|
|[1.5569248E7,211....|      

10. Explaination

In [None]:
y_pred.meanAbsoluteError

2.5628224972423788e-09

In [None]:
y_pred.rootMeanSquaredError

2.974852472316454e-09

In [None]:
y_pred.meanSquaredError

8.849747232047318e-18

In [None]:
y_pred.r2

1.0

In [None]:
y_pred.r2adj


1.0