<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch06_ClassificationAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys

# Root of the course material; the helper module and the datasets are both
# located relative to this directory.
rootpath = '/home/student/ROI/Spark/'
# Ends with '/', so relative paths can be appended to it directly.
datapath = f'{rootpath}datasets/'
# Must run before the pyspark_helpers imports below so the module can be found.
sys.path.append(rootpath)
import pyspark_helpers as pyh
from pyspark_helpers import *
# initspark is not defined in this notebook; presumably it arrives via the star
# import above. Its result is unpacked into the three handles used later.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# NOTE(review): explicit re-import after the other imports, presumably so that
# the helper's display is the one bound to the name — confirm.
from pyspark_helpers import display

### Let's read in a bank data set to try to predict if a potential borrower will default on their loan before lending to them

In [None]:
# Read the bank CSV into a Spark DataFrame, taking column names from the first
# row and letting Spark infer the column types.
filename = 'bank.csv'
# datapath already ends with '/', so nothing is inserted before 'finance'.
df = spark.read.csv(f'{datapath}finance/{filename}', header = True, inferSchema = True)
display(df)

# Save a pointer to the raw data
dfRawFile = df
print(df.dtypes)

### Clean up the dataset by identifying the numeric and categorical features and target

In [None]:
# Let's just keep a few fields to start with for simplicity
import importlib
# Re-execute the helper module so edits made since the first import are picked up.
# importlib.reload is the supported replacement for imp.reload (imp was removed
# in Python 3.12).
importlib.reload(pyh)

# exceptlist needs a trailing comma to be a tuple: ('day') is just the string 'day'.
num = pyh.auto_numeric_features(df, exceptlist = ('day',))
print (num)
cat = pyh.auto_categorical_features(df, suffix = None, exceptlist = ('default',))
#cat = pyh.auto_categorical_features(df, suffix = ('ing', 'ion'))
print (cat)

# Hand-picked feature lists used for the rest of the notebook.
numeric_features = ['age','balance', 'duration', 'pdays']
categorical_features = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign', 'poutcome', 'deposit']

# numeric_features = ['balance', 'duration', 'age']
# categorical_features = ['marital', 'education']
target_label = 'default'


# Start again from the raw frame so this cell can be re-run safely.
df = dfRawFile.select(numeric_features + categorical_features + [target_label])
display(df)
print(df.take(1))

### Explore the numeric features to see if there is any correlation between their values

In [None]:
%matplotlib inline
#display(df.describe())
display(pyh.describe_numeric_features(df, numeric_features))
pyh.scatter_matrix(df, numeric_features)


### Use the helper function to reshape it for ML training

In [None]:
%matplotlib inline
# import imp
# imp.reload(pyh)

dfML, keydict = pyh.MakeMLDataFrame(df, categorical_features, numeric_features, target_label = 'default', target_is_categorical=True, return_key_dict = True)
display(dfML)
dfML.printSchema()
labelCnt = dfML.groupBy('label').count()
display(labelCnt)
print('keydict ' , keydict)
print(dfML.take(1))
print(df.take(1))

In [None]:
%matplotlib inline
labelCnt.toPandas().plot(kind = 'bar')
df.groupBy('job').count().toPandas().plot(kind='bar')


### Save the Vectorized file in case we want to use it again 

In [None]:
# Disabled example: write dfML out as Parquet (overwriting 'testsave'), read it
# back as dfML0, and inspect the result. Uncomment to use.
# dfML.write.format('parquet').mode('overwrite').save('testsave')
# dfML0 = spark.read.format('parquet').load('testsave')
# dfML0.printSchema()
# display(dfML0)

### Split it into training and testing sets

In [None]:
#dfML = dfML0
# 70/30 split using a fixed seed.
split_weights = [.7, .3]
train, test = dfML.randomSplit(split_weights, seed = 100)
print (f'Training set row count {train.count()}')
print (f'Testing set row count {test.count()}')

# Label distribution inside each split.
train_label_counts = train.groupBy('label').count()
test_label_counts = test.groupBy('label').count()
display(train_label_counts)
display(test_label_counts)
#print(test.take(1))


### Import the Decision Tree classifier and train it

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the estimator, then fit it on the training split.
dt = DecisionTreeClassifier(
    featuresCol = 'features',
    labelCol = 'label',
    maxDepth = 6,
)
dtModel = dt.fit(train)
print('DT Trained')

# Save location is derived from the data file name with '.' turned into '_'.
model_stem = filename.replace('.','_')
filename1 = model_stem + '_DT_trainedModel'
dtModel.write().overwrite().save(filename1)
print('DT Saved')


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Score the held-out split with the fitted tree.
pred = dtModel.transform(test)
display(pred)

# Columns are selected as (label, prediction); each row is flipped to
# (prediction, label) before being handed to MulticlassMetrics.
label_and_prediction = pred.select(['label', 'prediction'])
prediction_label_pairs = label_and_prediction.rdd.map(lambda row: (row[1], row[0]))
metrics = MulticlassMetrics(prediction_label_pairs)
print(metrics.confusionMatrix().toArray())


### Normally there are a lot of steps to predict and test. We have built a helper function to bundle all that up.
Take a look at the source code for it to see those individual steps.

In [None]:
# Run the helper against the test split; it returns two values, kept here as
# the decision-tree predictions and an evaluation log.
dtPredictions, dtLog = pyh.predict_and_evaluate(dtModel, test)


In [None]:
# Look at the DataFrame next to the RDD it exposes through .rdd.
display(df)
rdd = df.rdd
print(type(df), type(rdd))
for dataset in (df, rdd):
    print(dataset.take(1))

# Attribute listings of each object, separated by a row of asterisks.
print(dir(df))
print ('*' * 50)
print(dir(rdd))

In [None]:
# Display the log returned by predict_and_evaluate for the decision tree.
dtLog

### Now let's try Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator, then fit it on the training split.
lr = LogisticRegression(
    featuresCol = 'features',
    labelCol = 'label',
    maxIter = 10,
)
lrModel = lr.fit(train)
print('LR Trained')

# Save location is derived from the data file name with '.' turned into '_'.
model_stem = filename.replace('.','_')
filename1 = model_stem + '_LR_trainedModel'
lrModel.write().overwrite().save(filename1)
print('LR Saved')

#evaluate_model(lr)
# Helper diagnostics for the fitted logistic regression model.
pyh.beta_coefficients(lrModel)
pyh.roc_curve(lrModel)
pyh.precision_recall(lrModel)


### Normally you should be able to load the trained model, but for some reason it's not working correctly on this VM

In [None]:
# NOTE(review): filename1 holds a fitted model, which PySpark reloads with
# LogisticRegressionModel.load(filename1); LogisticRegression.load is for a saved
# (unfitted) estimator. That is the likely reason the line below fails — confirm.
#lrModel2 = LogisticRegression.load(filename1)

### See the test results as before, but LR has some extra options

In [None]:
# ROC attribute of the model's summary, printed as-is.
roc_frame = lrModel.summary.roc
print(roc_frame)
# Evaluate on the test split with the helper (model display switched off).
lrPredictions, lrLog = pyh.predict_and_evaluate(lrModel, test, showModel = False)

### Let's try different thresholds to see if we can tweak the false positive/negative balance or improve the overall accuracy

In [None]:
# Logistic regression again, this time with the decision threshold set to .1.
low_threshold_lr = LogisticRegression(
    featuresCol = 'features',
    labelCol = 'label',
    maxIter = 10,
    threshold = .1,
)
lr2 = low_threshold_lr.fit(train)

lr2Predictions, lr2Log = pyh.predict_and_evaluate(lr2, test, showModel = False)

In [None]:
from pyspark.sql.functions import expr, udf
from pyspark.sql.types import *
from pyspark.sql.types import FloatType

print(lr2Predictions.select('probability').take(2))
#print(lr2Predictions.where('probability[0] >= .2 and probability <= .8').select('probability').take(2))

# SQL route: register a UDF that returns element 0 of the probability vector as
# a float, then filter on it from a temp view.
spark.udf.register('firstelement', lambda v:float(v[0]), FloatType())

lr2Predictions.createOrReplaceTempView('predictions')
in_band_query = 'select probability from predictions where firstelement(probability) between .2 and .8'
display(spark.sql(in_band_query))

# DataFrame route: the same extraction wrapped as a column UDF.
firstelement=udf(lambda v:float(v[0]),FloatType())
#lr2Predictions.select(firstelement('probability')).show()
first_probability = firstelement('probability')
in_band = (first_probability >= .2) & (first_probability <= .8)
lr2Predictions.where(in_band).select('probability').show()

### After a while it's the same thing over and over, but try out as many models as possible to see which works best


In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Configure the estimator, then fit it on the training split.
rf = RandomForestClassifier(
    featuresCol = 'features',
    labelCol = 'label',
    numTrees = 10,
    maxDepth = 3,
)
rfModel = rf.fit(train)
print('RF Trained')

# Save location is derived from the data file name with '.' turned into '_'.
model_stem = filename.replace('.','_')
filename1 = model_stem + '_RF_trainedModel'
rfModel.write().overwrite().save(filename1)
print('RF Saved')



In [None]:
# Run the helper against the test split; it returns two values, kept here as
# the random-forest predictions and an evaluation log.
rfPredictions, rfLog = pyh.predict_and_evaluate(rfModel, test)



### Try Gradient Boost

In [None]:
from pyspark.ml.classification import GBTClassifier

# Only maxIter is set; the feature and label column names are left at the
# classifier's defaults.
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
print ('GBT Trained')

filename1 = filename.replace('.','_') + '_GBT_trainedModel'
# Persist the gradient-boosted model itself under the GBT name (not the random
# forest model fitted in an earlier cell).
gbtModel.write().overwrite().save(filename1)
print ('GBT Saved')



In [None]:
# Run the helper against the test split; it returns two values, kept here as
# the gradient-boosted-tree predictions and an evaluation log.
gbtPredictions, gbtLog = pyh.predict_and_evaluate(gbtModel, test)



### Try Neural Networks

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Network shape: an input layer of 13 (features), hidden layers of 5 and 4,
# and an output layer of 2 (classes).
# NOTE(review): 13 equals the 4 numeric + 9 categorical columns chosen earlier,
# which only matches the 'features' vector if each column occupies a single
# slot — confirm against dfML.
layers = [13, 5, 4, 2]

nn = MultilayerPerceptronClassifier(
    maxIter = 100,
    layers = layers,
    blockSize = 128,
    seed = 1234,
)
nnModel = nn.fit(train)
print ('NN Trained')

# Save location is derived from the data file name with '.' turned into '_'.
model_stem = filename.replace('.','_')
filename1 = model_stem + '_NN_trainedModel'
nnModel.write().overwrite().save(filename1)
print ('NN Saved')



In [None]:
# Score the test split with the fitted network; the ROC evaluation call below
# is left disabled.
nnPredictions = nnModel.transform(test)
#pyh.evaluate_ROC(nnPredictions)

In [None]:
# Score the test split and inspect what comes back: type, schema, row count.
nnPredictions = nnModel.transform(test)
print(type(nnPredictions))
#print(nnPredictions.take(1))
nnPredictions.printSchema()
nn_row_count = nnPredictions.count()
print (nn_row_count)

# The helper's predictions replace the frame produced above.
nnPredictions, nnLog = pyh.predict_and_evaluate(nnModel, test)
##nnPredictions.take(1)
# predictionAndLabels = nnPredictions.select("prediction", "label")
# #display(predictionAndLabels)
# print(predictionAndLabels.collect())
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Imported here as well so this cell does not depend on the earlier GBT cell
# having been run.
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Candidates to compare, as (estimator class, constructor keyword arguments).
models = [
    (DecisionTreeClassifier, dict(featuresCol = 'features', labelCol = 'label', maxDepth = 6))
    , (DecisionTreeClassifier, dict(featuresCol = 'features', labelCol = 'label', maxDepth = 3))
    , (MultilayerPerceptronClassifier, dict(maxIter=100, layers=[13, 5, 4, 2], blockSize=128, seed=1234))
    , (MultilayerPerceptronClassifier, dict(maxIter=100, layers=[13, 3, 2], blockSize=128, seed=1234))
    , (GBTClassifier, {})
]

# Build, fit and evaluate each candidate in turn, printing the log the helper
# returns for it.
for modelclass, params in models:
    model = modelclass(**params)
    trained = model.fit(train)
    #pred = trained.transform(test)
    pred, log = pyh.predict_and_evaluate(trained, test, showModel = False, show = False)
    print (log)
    
    