In [None]:
# sys is needed only to extend the module search path below.
import sys

# Root directory of the class environment and the dataset folder beneath it.
# Note datapath already ends with '/', so later joins produce a double slash.
rootpath = '/class'
datapath = f'{rootpath}/datasets/'
# Make modules under rootpath importable -- presumably where pyspark_helpers lives; TODO confirm.
sys.path.append(rootpath)
import pyspark_helpers as pyh
from pyspark_helpers import *
# initspark is not defined in this notebook; presumably it comes from the star import above.
# It yields the three handles (sc, spark, conf) used by the rest of the notebook.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# Explicit re-import after the star import -- presumably to make sure the helper's
# `display` is the one bound in this namespace; TODO confirm.
from pyspark_helpers import display

### Let's read in a bank dataset to try to predict if a potential borrower will default on their loan before lending to them.

In [None]:
# Read the bank CSV with a header row and let Spark infer the column types.
filename = 'bank.csv'
df = spark.read.csv(
    f'{datapath}/finance/{filename}',
    header=True,
    inferSchema=True,
)
display(df)

# Keep a second reference to the untouched frame; later cells rebind `df`.
dfRawFile = df
print(df.dtypes)

### Explore numeric features to see if there is any correlation between values.

In [None]:
%matplotlib inline
# Column groups used for exploration here; later cells rebind both names.
numeric_features = ['balance', 'duration', 'age']
categorical_features = ['marital', 'education']
# Both helpers come from pyspark_helpers (pyh); their exact output is not visible
# in this notebook -- presumably summary statistics and a pairwise scatter plot.
display(pyh.describe_numeric_features(df, numeric_features))
pyh.scatter_matrix(df, numeric_features)


### Exploring categorical columns is also useful

In [None]:
%matplotlib inline
df.groupBy('default').count().toPandas().plot(kind = 'bar')
df.groupBy('job').count().toPandas().plot(kind='bar')


### Let's make a pipeline that combines several stages into one. First we apply a StringIndexer, then a OneHotEncoder, and finally a VectorAssembler.

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    OneHotEncoderEstimator,
    StringIndexer,
    StringIndexerModel,
    VectorAssembler,
)
# help(StringIndexer)
# help(OneHotEncoderEstimator)

# Stage 1: map each categorical string column to a numeric index column.
maritalStringIndexer = StringIndexer(inputCol='marital', outputCol='marital_index')
jobStringIndexer = StringIndexer(inputCol='job', outputCol='job_index')

# Stage 2: one-hot encode each index column into a vector column.
maritalOneHotEncoder = OneHotEncoderEstimator(
    inputCols=['marital_index'],
    outputCols=['marital_vector'],
)
jobOneHotEncoder = OneHotEncoderEstimator(
    inputCols=['job_index'],
    outputCols=['job_vector'],
)

# Stage 3: concatenate the numeric columns and the encoded vectors into 'features'.
assembler = VectorAssembler(
    inputCols=['balance', 'duration', 'age', 'marital_vector', 'job_vector'],
    outputCol='features',
)

pipeline = Pipeline(stages=[
    maritalStringIndexer,
    jobStringIndexer,
    maritalOneHotEncoder,
    jobOneHotEncoder,
    assembler,
])
pipeline_trained = pipeline.fit(df)
df2 = pipeline_trained.transform(df)
display(df2.limit(3))

### Doing a lot of OneHotEncoding is pretty common so a helper function makes this a lot easier. This function will take a list of categorical columns and OneHotEncode them and vector assemble them with the numerical columns.

In [None]:
def MakeMLPipeline(df, categorical_features, numeric_features, target_label = None, target_is_categorical = True):
    """Build an unfitted Spark ML Pipeline that vectorizes a mixed-type dataset.

    For every categorical column ``c`` the pipeline adds a StringIndexer
    (``c`` -> ``c_index``) followed by a one-hot encoder (``c_index`` ->
    ``c_vector``). When a categorical target is supplied it is string-indexed
    into a column named ``label``. Finally a VectorAssembler concatenates the
    numeric columns and all ``*_vector`` columns into ``features``.

    Parameters
    ----------
    df : unused; kept so existing callers do not need to change.
    categorical_features : iterable of str
        Names of the columns to index and one-hot encode.
    numeric_features : iterable of str
        Names of the columns passed to the assembler unchanged.
    target_label : str or None
        Name of the target column. If None, no ``label`` stage is added, so a
        features-only pipeline can be built with the default arguments.
    target_is_categorical : bool
        When True (and ``target_label`` is given) the target is string-indexed
        into ``label``; when False the target column is left untouched.

    Returns
    -------
    pyspark.ml.Pipeline
        The unfitted pipeline; call ``.fit(df)`` to train it.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    try:
        # Name used by Spark 2.3 / 2.4.
        from pyspark.ml.feature import OneHotEncoderEstimator
    except ImportError:
        # Spark 3.0 removed the *Estimator name; OneHotEncoder takes the same
        # inputCols/outputCols arguments there.
        from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

    # Accept any iterable (tuple, generator, ...) and avoid the tuple + list
    # TypeError when building the assembler inputs below.
    categorical_features = list(categorical_features)
    numeric_features = list(numeric_features)

    stages = []

    for c in categorical_features:
        stringIndexer = StringIndexer(inputCol = c, outputCol = c + '_index')
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[c + '_vector'])
        stages += [stringIndexer, encoder]

    # Only index the target when one was actually named; the previous code
    # built StringIndexer(inputCol=None) when called with the defaults.
    if target_is_categorical and target_label is not None:
        label_stringIdx = StringIndexer(inputCol = target_label, outputCol = 'label')
        stages += [label_stringIdx]

    assemblerInputs = numeric_features + [c + '_vector' for c in categorical_features]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
    stages += [assembler]

    return Pipeline(stages = stages)
    


### Clean up the dataset by identifying the numeric and categorical features and target.

In [None]:
# Start with a small set of columns to keep the example easy to follow.
numeric_features = ['balance', 'duration', 'age']
categorical_features = ['marital', 'education']

# Wider alternative, left disabled:
# numeric_features = ['age','balance', 'duration', 'pdays']
# categorical_features = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign', 'poutcome', 'deposit']

target_label = 'default'

# Project the raw frame down to the chosen columns, then build, fit and apply
# the vectorizing pipeline.
df = dfRawFile.select(*numeric_features, *categorical_features, target_label)
pipeline = MakeMLPipeline(
    df,
    categorical_features,
    numeric_features,
    target_label,
    target_is_categorical=True,
)
pipeline_trained = pipeline.fit(df)
df = pipeline_trained.transform(df)
display(df.limit(3))
print(df.take(1))

### You can even insert extra steps into the stages of the returned pipeline before fitting it.

In [None]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import PCA

df = dfRawFile.select(numeric_features + categorical_features + [target_label])
pipeline = MakeMLPipeline(df, categorical_features, numeric_features, target_label, target_is_categorical = True) 

# Work on our own copy of the stage list and hand it back with setStages() at the
# end, instead of relying on getStages() exposing the pipeline's internal list.
stages = list(pipeline.getStages())

# insert(-1, ...) places a stage just before the last one, i.e. ahead of the
# VectorAssembler that MakeMLPipeline appends as its final stage.
sql = "SELECT *, balance/1000 as newbalance FROM __THIS__"
sqlTransformer = SQLTransformer(statement = sql)
stages.insert(-1, sqlTransformer)

quantile = QuantileDiscretizer(inputCol='age', outputCol='age_buckers', numBuckets = 3)
stages.insert(-1, quantile)

# The remaining stages consume the assembled 'features' column, so they are
# appended after the assembler.
minMaxScaler = MinMaxScaler(inputCol='features', outputCol='minmax_features')
stages.append(minMaxScaler)

standardScaler = StandardScaler(inputCol='features', outputCol='scaled_features')
stages.append(standardScaler)

pca = PCA(k=4, inputCol='features', outputCol='pcaFeatures')
stages.append(pca)

# Explicitly install the extended stage list on the pipeline before fitting.
pipeline.setStages(stages)
print(stages)


pipeline_trained = pipeline.fit(df)
df = pipeline_trained.transform(df)
display(df.limit(3))
print(df.take(1))

In [None]:
# Feature set used for all of the classification models that follow.
numeric_features = ['age', 'balance', 'duration', 'pdays']
categorical_features = [
    'job', 'marital', 'education', 'housing', 'loan',
    'contact', 'campaign', 'poutcome', 'deposit',
]
target_label = 'default'

df = dfRawFile.select(*numeric_features, *categorical_features, target_label)
pipeline = MakeMLPipeline(
    df,
    categorical_features,
    numeric_features,
    target_label,
    target_is_categorical=True,
)
pipeline_trained = pipeline.fit(df)
# Persist the fitted transformer so the same encoding can be reapplied later.
pipeline_trained.write().overwrite().save('bank_pipeline')

dfML = pipeline_trained.transform(df)
display(dfML.limit(3))
print(dfML.take(1))

# Class balance of the indexed target.
labelCnt = dfML.groupBy('label').count()
display(labelCnt)


### Once a trained pipeline model is saved, it can be reloaded to do the exact same transformation on new datasets.

In [None]:
from pyspark.ml import PipelineModel
# Reload the fitted pipeline written to 'bank_pipeline' by an earlier cell.
pipeline3 = PipelineModel.load('bank_pipeline')
# Re-apply it to the current `df` (whichever frame `df` is bound to when this cell runs).
dfML3 = pipeline3.transform(df)
display(dfML3)

### Save the vectorized file in case we want to use it again. This saves the transformed data as opposed to the model that does the transformation in the first place. Both are worth saving for different reasons.

In [None]:
# Persist only the model inputs (feature vector and label) as parquet,
# replacing any previous copy at the same path.
(
    dfML.select('features', 'label')
    .write
    .format('parquet')
    .mode('overwrite')
    .save('testsave')
)


### Load the saved file to see it worked.

In [None]:
# Read the parquet copy back and inspect its schema and contents.
dfML0 = spark.read.load('testsave', format='parquet')
dfML0.printSchema()
display(dfML0)


### Split it into training and testing sets.

In [None]:
#dfML = dfML0
# 70/30 split with a fixed seed.
train, test = dfML.randomSplit([0.7, 0.3], seed=100)

print(f'Training set row count {train.count()}')
print(f'Testing set row count {test.count()}')

# Label distribution in each partition.
display(train.groupBy('label').count())
display(test.groupBy('label').count())
#print(test.take(1))
#print(test.take(1))


### Import the Decision Tree classifier and train it.

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a depth-limited decision tree on the training partition.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
    maxDepth=6,
)
dtModel = dt.fit(train)
print('DT Trained')

# Save under a name derived from the data file, e.g. bank_csv_DT_trainedModel.
filename1 = f"{filename.replace('.','_')}_DT_trainedModel"
dtModel.write().overwrite().save(filename1)
print('DT Saved')


### Now make predictions from the trained model and see how good of a job it did.

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Score the held-out partition with the trained decision tree.
predDT = dtModel.transform(test)
#display(pred.limit(3))

# Compare how often each class was predicted with how often it actually occurs.
print('Predicted')
predDT.groupBy('prediction').count().show()
print('Actual')
predDT.groupBy('label').count().show()

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
prediction_and_label = predDT.select('prediction', 'label')
metrics = MulticlassMetrics(prediction_and_label.rdd)
print(metrics.confusionMatrix().toArray())


### That scientific notation is annoying so try the following to make it more readable.

In [None]:
import numpy as np
# suppress=True makes numpy print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
# Same confusion matrix as before, now printed with the new formatting.
print(metrics.confusionMatrix().toArray())


### This function just makes the output a bit easier to read

In [None]:
# pretty_confusion is a pyspark_helpers function; here it is given the raw confusion
# matrix array and returns two printable objects -- presumably counts and percentages.
cm, cmp = pyh.pretty_confusion(metrics.confusionMatrix().toArray(), include_percent = True)
print(cm)
print(cmp)

### You can also try reloading the saved model and seeing that it works. Note you have to import a different class name than before; this one ends with Model.

In [None]:
from pyspark.ml.classification import DecisionTreeClassificationModel
# Load from whatever path filename1 currently holds; this is the decision-tree path
# only if no later model cell has rebound filename1 since the tree was saved.
dtModel2 = DecisionTreeClassificationModel.load(filename1)
# Score the test partition with the reloaded model to confirm it is usable.
pred2 = dtModel2.transform(test)
display(pred2.limit(3))


### Now let's try Logistic Regression.

In [None]:
from pyspark.ml.classification import LogisticRegression

# Fit logistic regression on the training partition, capped at 10 iterations.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
)
lrModel = lr.fit(train)
print('LR Trained')

# Save under a name derived from the data file.
filename1 = f"{filename.replace('.','_')}_LR_trainedModel"
lrModel.write().overwrite().save(filename1)
print('LR Saved')


### Note again how to load a saved model.

In [None]:
from pyspark.ml.classification import LogisticRegressionModel
# Load from whatever path filename1 currently holds (the LR path if the LR cell ran last).
# lrModel2 is not used by any later cell visible here.
lrModel2 = LogisticRegressionModel.load(filename1)

### See the test results as before, but LR has some extra options.

In [None]:
# Score the test partition with the in-memory logistic regression model.
predLR = lrModel.transform(test)
display(predLR.limit(3))

# Here pretty_confusion receives the prediction DataFrame itself rather than a
# confusion-matrix array -- presumably the helper accepts both; TODO confirm.
cm, cmp = pyh.pretty_confusion(predLR, include_percent = True)
print(cm)
print(cmp)


### Let's try different thresholds to see if we can tweak the false positive/negative balance or improve the overall accuracy.

In [None]:
# Sweep the decision threshold from 0.10 to 0.90 in steps of 0.10, refitting and
# re-scoring at each value to see how the confusion matrix shifts.
for t in range(10, 91, 10):
    lr2 = LogisticRegression(
        featuresCol='features',
        labelCol='label',
        maxIter=10,
        threshold=t/100,
    ).fit(train)
    predLR2 = lr2.transform(test)
    cm, cmp = pyh.pretty_confusion(predLR2, include_percent=True)
    print(f'Threshold {t}')
    print(cm)
    print(cmp)


### After a while it's the same thing over and over, but try out as many models as possible to see which works best.


In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Fit a small random forest: 10 trees, each limited to depth 6.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=10,
    maxDepth=6,
)
rfModel = rf.fit(train)
print('RF Trained')

# Save under a name derived from the data file.
filename1 = f"{filename.replace('.','_')}_RF_trainedModel"
rfModel.write().overwrite().save(filename1)
print('RF Saved')



In [None]:
# Score the test partition with the random forest and summarize it via the pyh helper.
predRF = rfModel.transform(test)
cm, cmp = pyh.pretty_confusion(predRF, include_percent = True)
print(cm)
print(cmp)


### Try Gradient-Boosted Trees.

In [None]:
from pyspark.ml.classification import GBTClassifier
# Gradient-boosted trees with 10 boosting iterations; the feature and label
# column names are left at the classifier's defaults.
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
print ('GBT Trained')

filename1 = filename.replace('.','_') + '_GBT_trainedModel'
# Save the GBT model itself. The previous code wrote rfModel here, so the
# "_GBT_trainedModel" directory actually contained the random forest.
gbtModel.write().overwrite().save(filename1)
print ('GBT Saved')


In [None]:
# Score the test partition with the gradient-boosted model. The previous code
# used rfModel, which simply repeated the random-forest results.
predGB = gbtModel.transform(test)
cm, cmp = pyh.pretty_confusion(predGB, include_percent = True)
print(cm)
print(cmp)


### Try Neural Networks.

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Specify layers for the neural network:
# - the input layer must equal the length of the assembled 'features' vector.
#   That is NOT the number of source columns (13): every one-hot encoded
#   categorical contributes several slots, so the size is read from the data.
# - two hidden layers of size 5 and 4
# - an output layer of size 2 (the two classes of the label)
num_features = len(train.select('features').first()['features'])
layers = [num_features, 5, 4, 2]

nn = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
nnModel = nn.fit(train)
print ('NN Trained')

filename1 = filename.replace('.','_') + '_NN_trainedModel'
nnModel.write().overwrite().save(filename1)
print ('NN Saved')


In [None]:
# Score the test partition with the neural network. The previous code used
# rfModel, so the NN model was never actually evaluated.
predNN = nnModel.transform(test)
cm, cmp = pyh.pretty_confusion(predNN, include_percent = True)
print(cm)
print(cmp)


### Let's make a function that can do a single real time prediction

In [None]:
def predict_bankdefault(transformModel, predictionModel, d): 
    """Return the predicted class for a single record.

    Parameters
    ----------
    transformModel : fitted transformer (e.g. a loaded PipelineModel)
        Turns the raw columns into the 'features' vector.
    predictionModel : fitted classifier
        Adds the 'prediction' column to the transformed frame.
    d : dict
        One record as column name -> value. The callers below supply: age,
        balance, duration, pdays, job, marital, education, housing, loan,
        contact, campaign, poutcome, deposit.

    Returns
    -------
    The value of the 'prediction' column for that record.

    Notes
    -----
    Uses the notebook-global ``spark`` session.
    NOTE(review): the pipeline saved earlier also contains the label indexer on
    'default', which ``d`` does not supply -- confirm your Spark version
    tolerates the missing column at transform time.
    """
    from pyspark.sql import Row

    # Build a one-row DataFrame from a Row. Inferring a schema from an RDD of
    # plain dicts (the previous approach) is deprecated in PySpark, and no RDD
    # round-trip is needed for a single local record.
    newDF = spark.createDataFrame([Row(**d)])
    predictML = transformModel.transform(newDF)
    prediction = predictionModel.transform(predictML)
    return prediction.select('prediction').first()[0]

# Two hand-written records, scored with the reloaded pipeline and the decision tree.
predict1 = {
    'age': 19,
    'balance': 2343,
    'duration': 1042,
    'pdays': -1,
    'job': 'admin.',
    'marital': 'married',
    'education': 'secondary',
    'housing': 'yes',
    'loan': 'no',
    'contact': 'unknown',
    'campaign': 1,
    'deposit': 'yes',
    'poutcome': 'unknown',
}
print(predict_bankdefault(pipeline3, dtModel, predict1))

predict2 = {
    'age': 31,
    'balance': -825,
    'duration': 179,
    'pdays': -1,
    'job': 'unemployed',
    'marital': 'married',
    'education': 'secondary',
    'housing': 'yes',
    'loan': 'no',
    'contact': 'unknown',
    'campaign': 1,
    'deposit': 'yes',
    'poutcome': 'unknown',
}
print(predict_bankdefault(pipeline3, dtModel, predict2))
