In [None]:
import sys

# Put the class root on sys.path so the course helper module can be imported.
rootpath = '/class'
datapath = f'{rootpath}/datasets/'
sys.path.append(rootpath)
import pyspark_helpers as pyh
# NOTE(review): the star import is what makes unqualified helper names (e.g. initspark)
# available below; the full set of names it exports is not visible from this notebook.
from pyspark_helpers import *
# The three handles returned here (sc, spark, conf) are used by later cells.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# Imported explicitly even though the star import above may already provide it.
from pyspark_helpers import display


### The following helper function builds the Pipeline stages that convert categorical and numeric columns into vectorized versions, instead of building the steps as a series of DataFrames.

In [None]:
def MakeMLPipeline(df, categorical_features, numeric_features, target_label = None, target_is_categorical = True):
    """Build an unfitted Pipeline that assembles raw columns into a 'features' vector.

    Stages, in order:
      * for each categorical column ``c``: a StringIndexer (``c`` -> ``c_Index``)
        followed by a one-hot encoder (``c_Index`` -> ``c_classVec``);
      * optionally a StringIndexer that turns ``target_label`` into ``label``;
      * a VectorAssembler that combines every ``*_classVec`` column with the
        numeric columns into ``features``.

    Parameters:
        df: accepted for interface compatibility; it is not used because the
            returned pipeline is unfitted.
        categorical_features: iterable of column names to index and one-hot encode.
        numeric_features: iterable of column names passed to the assembler as-is.
        target_label: name of the label column, or None when there is no label.
        target_is_categorical: when True (and a target_label is given) the label
            column is string-indexed into 'label'.

    Returns:
        pyspark.ml.Pipeline: the unfitted pipeline; call ``.fit(df)`` on it.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    try:
        from pyspark.ml.feature import OneHotEncoderEstimator
    except ImportError:
        # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder (same
        # inputCols/outputCols interface), so fall back to the new name.
        from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

    stages = []

    for c in categorical_features:
        indexer = StringIndexer(inputCol = c, outputCol = c + '_Index')
        encoder = OneHotEncoderEstimator(inputCols=[indexer.getOutputCol()], outputCols=[c + "_classVec"])
        stages += [indexer, encoder]

    # Only index the label when one was actually supplied; with the default
    # target_label=None the old code added a StringIndexer with inputCol=None.
    if target_is_categorical and target_label is not None:
        stages.append(StringIndexer(inputCol = target_label, outputCol = 'label'))

    # list(...) lets callers pass a tuple (or any iterable) of numeric columns.
    assembler_inputs = [c + "_classVec" for c in categorical_features] + list(numeric_features)
    stages.append(VectorAssembler(inputCols=assembler_inputs, outputCol="features"))

    return Pipeline(stages = stages)
    


### Read the bank dataset.

In [None]:
# Load the bank CSV with a header row and let Spark infer the column types.
filename = 'bank.csv'
# NOTE(review): datapath already ends with '/', so this path contains a doubled
# slash before 'finance' -- presumably harmless, but confirm on the target filesystem.
df = spark.read.csv(f'{datapath}/finance/{filename}', header = True, inferSchema = True)
display(df)

# Save a pointer to the raw data
dfRawFile = df


In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from timeit import default_timer as timer

# --- Approach 1: fit and apply each stage by hand, one DataFrame per step ---
start = timer()

col = 'marital'
m_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
x1 = m_indexer.fit(df).transform(df)

m_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
x2 = m_encoder.fit(x1).transform(x1).orderBy(col + '_Index')

col = 'job'
j_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
x3 = j_indexer.fit(x2).transform(x2)
j_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
x4 = j_encoder.fit(x3).transform(x3)

end = timer()
print('time to run', end - start)
# Show the end of the manual chain (x4 carries both the marital and the job
# encodings); the previous display(x2) left the job stages built but never shown.
display(x4)

# --- Approach 2: declare the same stages once and let a Pipeline run them ---
start = timer()
col = 'marital'
m_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
m_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])

col = 'job'
j_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
j_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])

# The assembler merges the numeric 'age' column with both one-hot vectors.
v_encoder = VectorAssembler(inputCols = ['age','marital_Vector', 'job_Vector'], outputCol = 'features')
pipeline = Pipeline(stages = [m_indexer, j_indexer, m_encoder, j_encoder, v_encoder])
# fit() returns a PipelineModel that can be reused on new data (see later cells).
dfModel = pipeline.fit(df)
dfML = dfModel.transform(df)
end = timer()
print('time to run', end - start)
display(dfML)

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer

# Index the 'marital' strings, one-hot encode the index, then show the three
# columns side by side, ordered by the index value.
col = 'marital'
indexer = StringIndexer(inputCol=col, outputCol=f'{col}_Index')
x1 = indexer.fit(df).transform(df)

encoder = OneHotEncoderEstimator(
    inputCols=[f'{col}_Index'],
    outputCols=[f'{col}_Vector'],
)
x2 = encoder.fit(x1).transform(x1).orderBy(f'{col}_Index')
display(x2.select(col, f'{col}_Index', f'{col}_Vector'))


### Use the same categorical and numeric features.

In [None]:
# Let's just keep a few fields to start with for simplicity
numeric_features = ['age','balance', 'duration', 'pdays']
# Note that 'campaign' and 'deposit' are listed with the categorical columns here.
categorical_features = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign', 'poutcome', 'deposit']

# numeric_features = ['balance', 'duration', 'age']
# categorical_features = ['marital', 'education']
# Column the model will be trained to predict.
target_label = 'default'


# Rebinds df to only the selected columns; dfRawFile still refers to the full raw read.
df = dfRawFile.select(numeric_features + categorical_features + [target_label])
display(df)
print(df.take(1))

### Try this using the original helper vs. the Pipeline version to see if there is a time difference.

In [None]:
from timeit import default_timer as timer

start = timer()

# NOTE(review): MakeMLDataFramePipeline is not defined in this notebook (the helper
# defined above is MakeMLPipeline); presumably it arrives via
# `from pyspark_helpers import *` -- confirm. Its result is unpacked here as a pair:
# dfModel is later used via .transform(), and dfML is the DataFrame used for training.
dfModel, dfML = MakeMLDataFramePipeline(df, categorical_features, numeric_features, target_label)
#dfML = pyh.MakeMLDataFramePipeline(df, categorical_features, numeric_features, target_label)
#dfML = pyh.MakeMLDataFrame(df, categorical_features, numeric_features, target_label)

display(dfML)
dfML.printSchema()
# Count rows per label value.
labelCnt = dfML.groupBy('label').count()
display(labelCnt)

# The measured time covers the helper call plus the displays and the count above.
end = timer()
print('time to run', end - start)


### Train and test as normal.

In [None]:
# 70/30 random split; the seed is fixed at 10.
train, test = dfML.randomSplit([.7,.3], seed = 10)
print (f'Training set row count {train.count()}')
print (f'Testing set row count {test.count()}')
      

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree limited to depth 3, trained on the 'features' / 'label' columns of the training split.
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)
print('DT Trained')


In [None]:
# Score the held-out split with the course helper.
# NOTE(review): predict_and_evaluate lives in pyspark_helpers; what the two returned values contain is not visible here -- confirm.
dtPredictions, dtLog = pyh.predict_and_evaluate(dtModel, test)


In [None]:
from pyspark.sql import Row

# One hand-built customer record: every feature column, but no 'default' value.
predict = dict(age=59, balance=2343, duration=1042, pdays=-1, job='admin.', marital='married', education='secondary', housing='yes', loan='no', contact='unknown', campaign=1, poutcome='unknown', deposit='yes')
print(predict)
# Build the one-row DataFrame from a Row: inferring a schema from a plain dict
# is deprecated in PySpark. A separate name keeps `predict` bound to the dict
# instead of silently turning it into a DataFrame.
predictDF = spark.createDataFrame([Row(**predict)])
print(predictDF)
# Apply the fitted feature pipeline, then the trained classifier.
predictML = dfModel.transform(predictDF)

print(predictML.take(1))

prediction = dtModel.transform(predictML).select('prediction')
# Single row, single column: the predicted label index.
print(prediction.first()[0])

In [None]:
def predict_bankdefault(transformModel, predictionModel, d):
    """Predict the label for a single record given as a dict.

    Relies on the notebook-level ``spark`` session and ``sc`` context.

    Parameters:
        transformModel: fitted feature pipeline; its ``transform`` is applied to the new row.
        predictionModel: fitted classifier; its ``transform`` is applied to the
            feature-transformed row and must add a 'prediction' column.
        d: dict mapping column name -> value for one record (age, balance,
            duration, pdays, job, marital, education, housing, loan, contact,
            campaign, poutcome, deposit).

    Returns:
        The value of the 'prediction' column for that record.
    """
    newDF = spark.createDataFrame(sc.parallelize([d]))
    predictML = transformModel.transform(newDF)
    # Select 'prediction' explicitly: without it, [0][0] returned the first
    # *input* column of the row (e.g. age) rather than the model's prediction.
    prediction = predictionModel.transform(predictML).select('prediction')
    return prediction.collect()[0][0]

# Run one hand-built customer record through the fitted feature pipeline (dfModel)
# and the trained decision tree (dtModel), and print what the helper returns.
predict = dict(age=19, balance=2343, duration=1042, pdays=-1, job='admin.', marital='married', education='secondary', housing='yes', loan='no', contact='unknown', campaign=1, poutcome='unknown', deposit='yes')
print (predict_bankdefault(dfModel, dtModel, predict))

### Pipelines and writing your own models

In [None]:
from pyspark.ml.pipeline import Estimator, Model, Pipeline, Transformer
from pyspark.ml.param.shared import *
from pyspark.sql.functions import avg, stddev_samp


class myTransformer(Transformer):
    """Empty Transformer subclass; a placeholder that adds no behavior of its own."""

class HasMean(Params):
    """Params mixin that carries a float-valued ``mean`` parameter."""

    mean = Param(
        Params._dummy(),
        "mean",
        "mean",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def getMean(self):
        """Return the current value of ``mean`` (set value or default)."""
        return self.getOrDefault(self.mean)

    def setMean(self, value):
        """Set ``mean`` and return whatever ``_set`` returns, allowing chaining."""
        return self._set(mean=value)
    
class HasStandardDeviation(Params):
    """Params mixin that carries a float-valued ``stddev`` parameter."""

    stddev = Param(
        Params._dummy(),
        "stddev",
        "stddev",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def getStddev(self):
        """Return the current value of ``stddev`` (set value or default)."""
        return self.getOrDefault(self.stddev)

    def setStddev(self, value):
        """Set ``stddev`` and return whatever ``_set`` returns, allowing chaining."""
        return self._set(stddev=value)

class HasCenteredThreshold(Params):
    """Params mixin that carries a float-valued ``centered_threshold`` parameter."""

    centered_threshold = Param(
        Params._dummy(),
        "centered_threshold",
        "centered_threshold",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def getCenteredThreshold(self):
        """Return the current value of ``centered_threshold`` (set value or default)."""
        return self.getOrDefault(self.centered_threshold)

    def setCenteredThreshold(self, value):
        """Set ``centered_threshold`` and return whatever ``_set`` returns, allowing chaining."""
        return self._set(centered_threshold=value)
    
class NormalDeviation(Estimator, HasInputCol, HasPredictionCol, HasCenteredThreshold):
    """Estimator that learns the mean and sample standard deviation of the input column.

    Fitting produces a NormalDeviationModel configured with those two
    statistics plus this estimator's input column, prediction column and
    centered threshold.
    """

    def _fit(self, dataset):
        input_col = self.getInputCol()
        # A single aggregation yields both statistics as one row.
        stats_row = dataset.agg(avg(input_col), stddev_samp(input_col)).first()
        mu, sigma = stats_row

        fitted = NormalDeviationModel()
        fitted.setInputCol(input_col)
        fitted.setMean(mu)
        fitted.setStddev(sigma)
        fitted.setCenteredThreshold(self.getCenteredThreshold())
        fitted.setPredictionCol(self.getPredictionCol())
        return fitted

class NormalDeviationModel(Model, HasInputCol, HasPredictionCol,
                           HasMean, HasStandardDeviation, HasCenteredThreshold):
    """Fitted model that flags rows lying far above the learned mean.

    The prediction column is a boolean: True when (value - mean) is greater
    than centered_threshold * stddev. Only deviations above the mean can
    satisfy that comparison.
    """

    def _transform(self, dataset):
        in_col = self.getInputCol()
        out_col = self.getPredictionCol()
        cutoff = self.getCenteredThreshold() * self.getStddev()
        centered = dataset[in_col] - self.getMean()
        return dataset.withColumn(out_col, centered > cutoff)

# Tiny demo frame (this rebinds df). With these four x values the mean is 26 and
# the sample standard deviation is about 48.7, so only x = 99.0 exceeds
# mean + 1.0 * stddev.
df = sc.parallelize([(1, 2.0), (2, 3.0), (3, 0.0), (4, 99.0)]).toDF(["id", "x"])

# The custom estimator is used as the only stage of a Pipeline.
normal_deviation = NormalDeviation().setInputCol("x").setCenteredThreshold(1.0)
model  = Pipeline(stages=[normal_deviation]).fit(df)
#print(normal_deviation.getMean())
model.transform(df).show()


### PandasUDF

In [None]:
# Build a small Spark DataFrame from a pandas frame made from range(11), with the
# column list ["x"]; this rebinds df for the UDF examples that follow.
df = spark.createDataFrame(pd.DataFrame(pd.Series(range(11)), columns=["x"]))
display(df)

In [None]:
import numpy as np


def dbl(x):
    """Return twice the given value."""
    return 2 * x


# Plain Python: doubling each element needs an explicit per-element call ...
nums1 = [1, 2, 3, 4]
nums2 = [dbl(value) for value in nums1]

# ... because `list * 2` repeats the list instead of doubling its elements.
print(nums1 * 2)

# numpy: arithmetic on an array is applied element by element.
nums3 = np.array([1, 2, 3, 4])
nums4 = nums3 * 2

print(nums3 * 2)

# Subtracting the mean centers every element in one expression.
print(nums3 - nums3.mean())


In [None]:
from pyspark.sql.functions import udf

def func1(x):
    """Return x + 1; a plain Python function that is wrapped as a UDF below."""
    return x + 1

# 1) Wrap the function inline at the point of use.
display(df.withColumn('func1', udf(func1, 'int')(df.x)))

# 2) Wrap it once and reuse the resulting UDF object.
func1x = udf(func1, 'int')
display(df.withColumn('func1x', func1x(df.x)))

# 3) Wrap a lambda.
sqr = udf(lambda x : x * x , 'int')
display(df.withColumn('x3', sqr(df.x)))

# 4) Use udf as a decorator.
@udf('int')
def square(x):
    """Return x squared (the previous body returned x * 2, which doubles)."""
    return x * x

display(df.withColumn('x2', square(df.x)))




In [None]:
# The commented-out lines from the original cell are kept for reference:
#   ! pip install pyarrow
#   import pyarrow
#   from pyspark.sql.types import LongType
#   pandas_square = pandas_udf(psquare, returnType=LongType())
from pyspark.sql.functions import PandasUDFType, pandas_udf


@pandas_udf('double', PandasUDFType.SCALAR)
def psquare(x):
    """Scalar pandas UDF declared to return 'double': multiplies its input by itself."""
    squared = x * x
    return squared


df.withColumn('x3', psquare(df.x)).show()
