<a href="https://colab.research.google.com/github/roitraining/SparkforDataScientists/blob/Development/Ch06_ClassificationAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys

# Root folder of the course material. The datasets folder lives underneath it,
# and it is added to sys.path so that pyspark_helpers can be imported
# (presumably the module sits in this folder - confirm on your machine).
rootpath = '/home/student/ROI/Spark/'
datapath = f'{rootpath}datasets/'
# Must run before the pyspark_helpers imports below.
sys.path.append(rootpath)
import pyspark_helpers as pyh
from pyspark_helpers import *
# initspark is not defined in this notebook, so it arrives via the star import
# above; its result is unpacked into the three names used by later cells.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# Bind display explicitly to the helper version; later cells call it on DataFrames.
from pyspark_helpers import display

### Let's read in a bank data set to try to predict if a potential borrower will default on their loan before lending to them.

In [0]:
# Read the bank CSV from the finance datasets folder, taking column names from
# the header row and letting Spark infer the column types.
# datapath already ends with '/', so no extra separator goes before 'finance'.
filename = 'bank.csv'
df = spark.read.csv(f'{datapath}finance/{filename}', header = True, inferSchema = True)
display(df)

# Keep a second reference to the unmodified DataFrame so later cells can
# re-select columns from the raw data.
dfRawFile = df


### Clean up the dataset by identifying the numeric and categorical features and target.

In [0]:
# Start with a hand-picked subset of columns to keep things simple.
# (A smaller alternative to experiment with: numeric ['balance', 'duration', 'age']
# together with categorical ['marital', 'education'].)
target_label = 'default'
numeric_features = ['age', 'balance', 'duration', 'pdays']
categorical_features = [
    'job', 'marital', 'education', 'housing', 'loan',
    'contact', 'campaign', 'poutcome', 'deposit',
]

# Narrow the raw DataFrame down to the chosen features plus the target column.
selected_columns = [*numeric_features, *categorical_features, target_label]
df = dfRawFile.select(selected_columns)
display(df)
print(df.take(1))

### Explore numeric features, to see if there is any correlation between values.

In [0]:
%matplotlib inline

display(pyh.describe_numeric_features(df, numeric_features))
pyh.scatter_matrix(df, numeric_features)


### Use the helper function to reshape it for ML training.

In [0]:
%matplotlib inline
# import imp
# imp.reload(pyh)

dfML = pyh.MakeMLDataFrame(df, categorical_features, numeric_features, target_label = 'default', target_is_categorical=True)
display(dfML)
dfML.printSchema()
labelCnt = dfML.groupBy('label').count()
display(labelCnt)



In [0]:
# Bar chart of the row count for each label value.
# Naming x and y keeps the 'label' column from being drawn as a second bar series
# against the row index, and sorting makes the bar order stable across runs.
label_counts_pd = labelCnt.toPandas().sort_values('label')
ax = label_counts_pd.plot(kind = 'bar', x = 'label', y = 'count', legend = False)
ax.set(title = 'Rows per label', xlabel = 'label', ylabel = 'count');


### Save the Vectorized file in case we want to use it again. 

In [0]:
# Optional round trip: write dfML to parquet under 'testsave' and read it back
# as dfML0. Uncomment the lines below to use it.
# dfML.write.format('parquet').mode('overwrite').save('testsave')
# dfML0 = spark.read.format('parquet').load('testsave')
# dfML0.printSchema()
# display(dfML0)

### Split it into training and testing sets.

In [0]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
# (To work from the parquet copy instead, assign dfML = dfML0 first.)
split_weights = [.7, .3]
train, test = dfML.randomSplit(split_weights, seed = 100)

train_rows = train.count()
test_rows = test.count()
print (f'Training set row count {train_rows}')
print (f'Testing set row count {test_rows}')

# Label distribution within each split, training first.
for subset in (train, test):
    display(subset.groupBy('label').count())



### Import the Decision Tree classifier and train it.

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a shallow (depth 3) decision tree on the training split.
dt = DecisionTreeClassifier(maxDepth = 3, labelCol = 'label', featuresCol = 'features')
dtModel = dt.fit(train)
print('DT Trained')

# Persist the fitted model, replacing any earlier save at the same path.
filename1 = f"{filename.replace('.', '_')}_DT_trainedModel"
dtModel.write().overwrite().save(filename1)
print('DT Saved')


### Normally there are a lot of steps to predict and test. We have built a helper function to bundle all that up.
Take a look at the source code for it to see those individual steps.

In [0]:
# Run the decision tree on the test split through the helper; its result is a
# pair, unpacked here as the predictions and a log.
dt_results = pyh.predict_and_evaluate(dtModel, test)
dtPredictions, dtLog = dt_results


### Now let's try Logistic Regression.

In [0]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression (capped at 10 iterations) on the training split.
lr = LogisticRegression(maxIter = 10, labelCol = 'label', featuresCol = 'features')
lrModel = lr.fit(train)
print('LR Trained')

# Persist the fitted model, replacing any earlier save at the same path.
filename1 = f"{filename.replace('.', '_')}_LR_trainedModel"
lrModel.write().overwrite().save(filename1)
print('LR Saved')


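### Normally you should be able to load the trained model back. If loading fails, check the class: a fitted model is loaded with its model class (`LogisticRegressionModel.load`), not the estimator class (`LogisticRegression.load`).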

In [0]:
# Reload the logistic regression model saved earlier in the notebook.
# What was written to filename1 is a fitted LogisticRegressionModel, so it has to
# be read back through the model class; LogisticRegression.load() expects a saved
# (unfitted) estimator and rejects this path.
from pyspark.ml.classification import LogisticRegressionModel
lrModel2 = LogisticRegressionModel.load(filename1)

### See the test results as before, but LR has some extra options.

In [0]:
# Same helper as for the decision tree, this time with showModel = False.
lr_results = pyh.predict_and_evaluate(lrModel, test, showModel = False)
lrPredictions, lrLog = lr_results

### Let's try different thresholds to see if we can tweak the false positive/negative balance or improve the overall accuracy.

In [0]:
# Refit logistic regression with the decision threshold set to 0.1, then run it
# through the same evaluation helper to compare against the earlier fit.
lr_low_threshold = LogisticRegression(threshold = .1, maxIter = 10, labelCol = 'label', featuresCol = 'features')
lr2 = lr_low_threshold.fit(train)

lr2_results = pyh.predict_and_evaluate(lr2, test, showModel = False)
lr2Predictions, lr2Log = lr2_results

### After a while it's the same thing over and over, but try out as many models as possible to see which works best.


In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Random forests are randomized learners; pin the seed (same value as the
# train/test split) so repeated runs of the notebook train the same model.
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', seed = 100)
rfModel = rf.fit(train)
print('RF Trained')

# Persist the fitted model, replacing any earlier save at the same path.
filename1 = filename.replace('.','_') + '_RF_trainedModel'
rfModel.write().overwrite().save(filename1)
print('RF Saved')



In [0]:
# Evaluate the random forest on the test split with the shared helper.
rf_results = pyh.predict_and_evaluate(rfModel, test)
rfPredictions, rfLog = rf_results



### Try Gradient Boost.

In [0]:
from pyspark.ml.classification import GBTClassifier

# Name the feature and label columns explicitly, like the other classifiers in
# this notebook, and cap boosting at 10 iterations.
gbt = GBTClassifier(featuresCol = 'features', labelCol = 'label', maxIter = 10)
gbtModel = gbt.fit(train)
print ('GBT Trained')

# Save the gradient-boosted model itself. The earlier version of this cell wrote
# rfModel here, so the '_GBT_trainedModel' path ended up holding a random forest.
filename1 = filename.replace('.','_') + '_GBT_trainedModel'
gbtModel.write().overwrite().save(filename1)
print ('GBT Saved')



In [0]:
# Evaluate the gradient-boosted trees on the test split with the shared helper.
gbt_results = pyh.predict_and_evaluate(gbtModel, test)
gbtPredictions, gbtLog = gbt_results



### Try Neural Networks.

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Layer sizes for the neural network:
#   input  - has to equal the length of the 'features' vector, so it is read from
#            the first training row instead of being hard-coded (it was 13)
#   hidden - two intermediate layers of size 5 and 4
#   output - 2 (one unit per class)
num_features = len(train.select('features').first()['features'])
layers = [num_features, 5, 4, 2]

nn = MultilayerPerceptronClassifier(featuresCol = 'features', labelCol = 'label',
                                    maxIter = 100, layers = layers, blockSize = 128, seed = 1234)
nnModel = nn.fit(train)
print ('NN Trained')

# Persist the fitted model, replacing any earlier save at the same path.
filename1 = filename.replace('.','_') + '_NN_trainedModel'
nnModel.write().overwrite().save(filename1)
print ('NN Saved')



In [0]:
# Score the test split with the neural network and take a quick look at the result.
nnPredictions = nnModel.transform(test)
nnPredictions.printSchema()
print(nnPredictions.take(1))
print (nnPredictions.count())

# NOTE(review): the pyh.predict_and_evaluate call was commented out in this cell,
# presumably because the helper does not handle this model - confirm before
# switching back to it. Until then, report plain accuracy with the evaluator
# imported in the training cell above.
predictionAndLabels = nnPredictions.select('prediction', 'label')
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction',
                                              metricName = 'accuracy')
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))