In [2]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/8e/b0/bf9020b56492281b9c9d8aae8f44ff51e1bc91b3ef5a884385cb4e389a40/pyspark-3.0.0.tar.gz (204.7MB)
[K     |████████████████████████████████| 204.7MB 66kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 19.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044182 sha256=2dc319bad8ba62c99f61e7488d00a41a8c4ceae159067e2c865b4429035559a0
  Stored in directory: /root/.cache/pip/wheels/57/27/4d/ddacf7143f8d5b76c45c61ee2e43d9f8492fc5a8e78ebd7d37
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.0


In [3]:
#Import the SparkSession and SparkContext classes
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [4]:
#Create a Spark Session
#(local master using all available cores; getOrCreate() hands back an
# existing session if one is already running)
SpSession = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [5]:
#Get the Spark Context from the Spark Session.
#The context is the entry point for the RDD API (e.g. reading text files).
SpContext = SpSession.sparkContext

In [6]:
#Load Data
#Read the CSV as an RDD of raw text lines and mark it for caching,
#since it is scanned several times in this cell
bankData = SpContext.textFile("/content/drive/My Drive/bank.csv").cache()
bankData.count()

#The first line holds the column headers: keep every line that differs from it
firstLine = bankData.first()
dataLines = bankData.filter(lambda line: line != firstLine)
dataLines.count()

541

In [7]:
#Cleanup Data
import math
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

def transformToNumeric(inputStr):
    """Convert one raw line of the bank CSV into a Row of float columns.

    Double quotes are stripped and the line is split on ";". Numeric fields
    are parsed with float(); yes/no style fields become 0.0 when the text is
    exactly "no" and 1.0 for anything else; the categorical fields at
    positions 2 and 3 are expanded into 0.0/1.0 indicator columns.

    Args:
        inputStr: a single semicolon-delimited record as text.

    Returns:
        A pyspark.sql.Row with the fields OUTCOME, AGE, SINGLE, MARRIED,
        DIVORCED, PRIMARY, SECONDARY, TERTIARY, DEFAULT, BALANCE and LOAN.

    Raises:
        ValueError: if field 0 or field 5 is not a valid float.
        IndexError: if the record has fewer fields than the positions read.
    """
    fields = inputStr.replace('"', "").split(";")

    # Pull out the raw fields that are used (positions are fixed by the file
    # layout), converting the simple ones on the spot.
    age = float(fields[0])
    outcome = float(fields[16] != "no")
    marital = fields[2]
    education = fields[3]
    default = float(fields[4] != "no")
    balance = float(fields[5])
    loan = float(fields[7] != "no")

    # One indicator column per marital status / education level; a value
    # outside the listed categories yields 0.0 in all of its indicators.
    return Row(
        OUTCOME=outcome,
        AGE=age,
        SINGLE=float(marital == "single"),
        MARRIED=float(marital == "married"),
        DIVORCED=float(marital == "divorced"),
        PRIMARY=float(education == "primary"),
        SECONDARY=float(education == "secondary"),
        TERTIARY=float(education == "tertiary"),
        DEFAULT=default,
        BALANCE=balance,
        LOAN=loan,
    )
    
#Convert every data line into a Row of numeric columns
bankRows = dataLines.map(transformToNumeric)
#Sanity-check a small sample. take(15) fetches only the rows requested,
#whereas collect()[:15] would first ship the entire RDD to the driver.
bankRows.take(15)

#Build a DataFrame from the Rows; column names come from the Row fields
bankData = SpSession.createDataFrame(bankRows)

In [8]:
#Perform Data Analytics

#Print summary statistics (count, mean, ...) for each column of the
#cleaned-up DataFrame as a quick sanity check of the conversion.
bankData.describe().show()

+-------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+-------------------+
|summary|            OUTCOME|               AGE|            SINGLE|           MARRIED|           DIVORCED|           PRIMARY|         SECONDARY|          TERTIARY|             DEFAULT|           BALANCE|               LOAN|
+-------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+-------------------+
|  count|                541|               541|               541|               541|                541|               541|               541|               541|                 541|               541|                541|
|   mean| 0.3974121996303142| 41.26987060998152|0.2754158964879852|0.6155268022181146|0.1090573012939001

In [9]:
#Prepare data for ML
#Transform to a Data Frame for input to Machine Learning

def transformToLabeledPoint(row):
    """Turn one cleaned-up record into a (label, features) pair.

    Args:
        row: a record that can be indexed by column name and provides
            OUTCOME plus the ten feature columns listed below.

    Returns:
        A 2-tuple whose first item is the OUTCOME value and whose second
        item is a dense vector holding the feature columns in the order
        AGE, BALANCE, DEFAULT, DIVORCED, LOAN, MARRIED, PRIMARY, SECONDARY,
        SINGLE, TERTIARY.
    """
    label = row["OUTCOME"]
    # The position of each feature inside the vector is fixed by this tuple.
    feature_names = (
        "AGE",
        "BALANCE",
        "DEFAULT",
        "DIVORCED",
        "LOAN",
        "MARRIED",
        "PRIMARY",
        "SECONDARY",
        "SINGLE",
        "TERTIARY",
    )
    features = Vectors.dense([row[name] for name in feature_names])
    return (label, features)
    
#Map every DataFrame row to a (label, features) pair.
#No collect() here: pulling the whole dataset to the driver only to discard
#it adds a full Spark job without contributing anything.
bankLp = bankData.rdd.map(transformToLabeledPoint)
#Wrap the pairs in a two-column DataFrame for the ML stages below
bankDF = SpSession.createDataFrame(bankLp,["label", "features"])
bankDF.select("label","features").show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[30.0,1787.0,0.0,...|
|  1.0|[33.0,4789.0,0.0,...|
|  1.0|[35.0,1350.0,0.0,...|
|  1.0|[30.0,1476.0,0.0,...|
|  0.0|[59.0,0.0,0.0,0.0...|
|  1.0|[35.0,747.0,0.0,0...|
|  1.0|[36.0,307.0,0.0,0...|
|  0.0|[39.0,147.0,0.0,0...|
|  0.0|[41.0,221.0,0.0,0...|
|  1.0|[43.0,-88.0,0.0,0...|
+-----+--------------------+
only showing top 10 rows



In [10]:
#Perform Machine Learning

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StringIndexer

#Perform PCA: project the feature vector onto 3 principal components
#NOTE(review): the features are not standardised before PCA, so columns with
#a large raw scale will dominate the components -- confirm this is intended.
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label","pcaFeatures")
pcaResult.show(truncate=False)

#Indexing needed as pre-req for Decision Trees
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)

#Split into training and testing data.
#A fixed seed makes the split (and therefore the reported metrics)
#repeatable across Restart & Run All.
(trainingData, testData) = td.randomSplit([0.7, 0.3], seed=42)
#Print the sizes: a bare expression in the middle of a cell is not displayed
print("Training rows:", trainingData.count())
print("Test rows:", testData.count())

#Create the model (seeded for the same reproducibility reason as the split)
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="pcaFeatures", seed=42)
rmModel = rmClassifer.fit(trainingData)

#Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction","indexed","label","pcaFeatures").show()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="accuracy")
#Print the accuracy explicitly; otherwise the value is computed and dropped
print("Accuracy:", evaluator.evaluate(predictions))

#Draw a confusion matrix: row counts per (actual, predicted) combination
predictions.groupBy("indexed","prediction").count().show()

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[-1787.018897197381,28.86209683775529,-0.06459982604876241] |
|1.0  |[-4789.020177138492,29.922562636341947,-0.9830243513096373] |
|1.0  |[-1350.022213163262,34.10110809796688,0.8951427168301704]   |
|1.0  |[-1476.0189517184556,29.051333993596703,0.3952723868021948] |
|0.0  |[-0.037889185366442445,58.9897182000177,-0.7290792383661886]|
|1.0  |[-747.0223377634923,34.48829198181773,0.9045654956970108]   |
|1.0  |[-307.0230691022592,35.799850539655225,0.5170631523785976]  |
|0.0  |[-147.02501216176339,38.90107856650329,-0.8069627548799397] |
|0.0  |[-221.0262985348787,40.853633675694944,0.5373036365803221]  |
|1.0  |[87.9723868768871,43.062659441151055,-0.0670164287117152]   |
|0.0  |[-9374.023105550941,32.97645883799288,-0.9511484606914431]  |
|0.0  |[-264.02755731528384,42.824