In [1]:
import sys

# Root of the course checkout; datapath is derived from it and already ends with '/'.
rootpath = '/home/student/ROI/Spark/'
datapath = f'{rootpath}datasets/'
# Extend the import path so the pyspark_helpers module under rootpath can be imported.
sys.path.append(rootpath)
import pyspark_helpers as pyh
from pyspark_helpers import *
# initspark is not imported by name in this cell; presumably it is supplied by the
# star import above. It yields the SparkContext, SparkSession and configuration
# objects bound to sc, spark and conf.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# Explicit re-import of display from pyspark_helpers so that name is bound to the
# helper's version after all of the imports above.
from pyspark_helpers import display


initializing pyspark
pyspark initialized


### The following helper function builds a list of stages that convert categorical and numeric columns into a single feature vector using a Pipeline, instead of chaining the steps as a series of intermediate DataFrames

In [9]:
def MakeMLDataFramePipeline(df, categorical_features, numeric_features, target_label = None, target_is_categorical = True):
    """Build and fit a feature-engineering Pipeline, then apply it to ``df``.

    Each categorical column is string-indexed and one-hot encoded; the encoded
    vectors plus the numeric columns are assembled into a single ``features``
    vector column.

    Parameters
    ----------
    df : DataFrame
        Input data containing every referenced column.
    categorical_features : sequence of str
        Columns to index and one-hot encode (outputs ``<col>_Index`` and
        ``<col>_classVec``).
    numeric_features : sequence of str
        Columns passed to the assembler unchanged.
    target_label : str, optional
        Name of the target column. When ``None`` no label is produced and the
        transformed frame only contains ``features``.
    target_is_categorical : bool
        When true the target is string-indexed into ``label``; otherwise the
        target column is exposed as ``label`` without indexing.

    Returns
    -------
    (PipelineModel, DataFrame)
        The fitted pipeline and the transformed frame restricted to
        ``label`` (when a target was given) and ``features``.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    try:
        from pyspark.ml.feature import OneHotEncoderEstimator
    except ImportError:
        # Spark 3.x renamed OneHotEncoderEstimator to OneHotEncoder.
        from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

    # Accept any sequence (tuple, list, ...) without mutating the caller's object.
    categorical_features = list(categorical_features)
    numeric_features = list(numeric_features)

    stages = []

    for c in categorical_features:
        stringIndexer = StringIndexer(inputCol = c, outputCol = c + '_Index')
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[c + "_classVec"])
        stages += [stringIndexer, encoder]

    # Only index the target when there is one; the original built a StringIndexer
    # on a None column when target_label was left at its default.
    if target_label is not None and target_is_categorical:
        stages.append(StringIndexer(inputCol = target_label, outputCol = 'label'))

    assemblerInputs = [c + "_classVec" for c in categorical_features] + numeric_features
    stages.append(VectorAssembler(inputCols=assemblerInputs, outputCol="features"))

    dfModel = Pipeline(stages = stages).fit(df)
    dfTrans = dfModel.transform(df)

    if target_label is None:
        dfTrans = dfTrans.select('features')
    elif target_is_categorical:
        dfTrans = dfTrans.select('label', 'features')
    else:
        # No indexer created a 'label' column, so expose the raw target under that name.
        dfTrans = dfTrans.select(dfTrans[target_label].alias('label'), 'features')
    return dfModel, dfTrans



### Read the same set of data as the previous chapter

In [3]:
import os

filename = 'bank.csv'
# datapath already ends with '/', so formatting it as f'{datapath}/finance/...'
# produced a doubled separator; os.path.join copes with either form.
df = spark.read.csv(os.path.join(datapath, 'finance', filename), header = True, inferSchema = True)
display(df)

# Save a pointer to the raw data
dfRawFile = df



Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
5,42,management,single,tertiary,no,0,yes,yes,unknown,5,may,562,2,-1,0,unknown,yes
6,56,management,married,tertiary,no,830,yes,yes,unknown,6,may,1201,1,-1,0,unknown,yes
7,60,retired,divorced,secondary,no,545,yes,no,unknown,6,may,1030,1,-1,0,unknown,yes
8,37,technician,married,secondary,no,1,yes,no,unknown,6,may,608,1,-1,0,unknown,yes
9,28,services,single,secondary,no,5090,yes,no,unknown,6,may,1297,3,-1,0,unknown,yes


In [6]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from timeit import default_timer as timer

# --- Approach 1: fit and apply every stage by hand, one DataFrame per step ---
start = timer()

# marital: string -> numeric index -> one-hot vector
col = 'marital'
m_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
x1 = m_indexer.fit(df).transform(df) #.select(col, col+'_Index')

m_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
x2 = m_encoder.fit(x1).transform(x1).orderBy(col + '_Index')

# job: same two steps, chained on top of the marital result
col = 'job'
j_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
x3 = j_indexer.fit(x2).transform(x2)
j_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
x4 = j_encoder.fit(x3).transform(x3)

#display(x2.select('marital', 'marital_Index', 'marital_Vector'))
end = timer()
print('time to run', end - start)
# Only x2 (the marital columns, ordered by marital_Index) is shown here;
# x3 and x4 are built above but not displayed.
display(x2)

# --- Approach 2: declare the same stages and let a Pipeline fit them together ---
start = timer()
col = 'marital'
m_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
m_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
#pipeline = Pipeline(stages = [m_indexer, m_encoder])

col = 'job'
j_indexer = StringIndexer(inputCol = col, outputCol = col+'_Index')
j_encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])

# Combine age with the two one-hot vectors into a single 'features' column.
v_encoder = VectorAssembler(inputCols = ['age','marital_Vector', 'job_Vector'], outputCol = 'features')
# Both indexers are listed before both encoders; each encoder reads the
# '<col>_Index' column written by the matching indexer.
pipeline = Pipeline(stages = [m_indexer, j_indexer, m_encoder, j_encoder, v_encoder])
dfModel = pipeline.fit(df)
#dfModel.save()
dfML = dfModel.transform(df)
end = timer()
print('time to run', end - start)
display(dfML)

time to run 1.8039309080049861


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,marital_Index,marital_Vector
0,37,technician,married,secondary,no,1,yes,no,unknown,6,may,608,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
1,41,admin.,married,secondary,no,55,yes,no,unknown,8,may,1120,2,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
2,43,blue-collar,married,primary,no,-192,yes,no,unknown,8,may,1120,2,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
3,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
4,30,blue-collar,married,secondary,no,309,yes,no,unknown,7,may,1574,2,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
5,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
6,29,management,married,tertiary,no,199,yes,yes,unknown,7,may,1689,4,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
7,56,management,married,tertiary,no,830,yes,yes,unknown,6,may,1201,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
8,49,services,married,secondary,no,-8,yes,no,unknown,8,may,1119,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"
9,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes,0.0,"(1.0, 0.0)"


time to run 1.2853768169879913


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,campaign,pdays,previous,poutcome,deposit,marital_Index,job_Index,marital_Vector,job_Vector,features
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,...,1,-1,0,unknown,yes,0.0,3.0,"(1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(59.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
1,56,admin.,married,secondary,no,45,no,no,unknown,5,...,1,-1,0,unknown,yes,0.0,3.0,"(1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(56.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,...,1,-1,0,unknown,yes,0.0,2.0,"(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(41.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
3,55,services,married,secondary,no,2476,yes,no,unknown,5,...,1,-1,0,unknown,yes,0.0,4.0,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(55.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,...,2,-1,0,unknown,yes,0.0,3.0,"(1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(54.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
5,42,management,single,tertiary,no,0,yes,yes,unknown,5,...,2,-1,0,unknown,yes,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(42.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,56,management,married,tertiary,no,830,yes,yes,unknown,6,...,1,-1,0,unknown,yes,0.0,0.0,"(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(56.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7,60,retired,divorced,secondary,no,545,yes,no,unknown,6,...,1,-1,0,unknown,yes,2.0,5.0,"(0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(60.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
8,37,technician,married,secondary,no,1,yes,no,unknown,6,...,1,-1,0,unknown,yes,0.0,2.0,"(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(37.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
9,28,services,single,secondary,no,5090,yes,no,unknown,6,...,3,-1,0,unknown,yes,1.0,4.0,"(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(28.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."


In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer

# Index and one-hot encode a single categorical column, step by step.
col = 'marital'
index_col = col + '_Index'
vector_col = col + '_Vector'

# Step 1: map each distinct string to a numeric index.
indexer = StringIndexer(inputCol=col, outputCol=index_col)
indexer_model = indexer.fit(df)
x1 = indexer_model.transform(df)

# Step 2: turn the index into a one-hot vector and sort by the index for display.
encoder = OneHotEncoderEstimator(inputCols=[index_col], outputCols=[vector_col])
encoder_model = encoder.fit(x1)
x2 = encoder_model.transform(x1).orderBy(index_col)

display(x2.select('marital', 'marital_Index', 'marital_Vector'))


### Use the same categorical and numeric features

In [10]:
# Let's just keep a few fields to start with for simplicity
numeric_features = ['age','balance', 'duration', 'pdays']
# NOTE(review): 'campaign' is listed as categorical although the recorded
# df.take(1) output shows it holding an integer value; confirm that is intended.
categorical_features = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign', 'poutcome', 'deposit']

# numeric_features = ['balance', 'duration', 'age']
# categorical_features = ['marital', 'education']
# Column the model will be trained to predict.
target_label = 'default'


# Rebinds df to just the chosen columns, taken from the untouched raw frame
# so this cell can be re-run safely.
df = dfRawFile.select(numeric_features + categorical_features + [target_label])
display(df)
# Show the first record as a Row to make the column names and value types visible.
print(df.take(1))

Unnamed: 0,age,balance,duration,pdays,job,marital,education,housing,loan,contact,campaign,poutcome,deposit,default
0,59,2343,1042,-1,admin.,married,secondary,yes,no,unknown,1,unknown,yes,no
1,56,45,1467,-1,admin.,married,secondary,no,no,unknown,1,unknown,yes,no
2,41,1270,1389,-1,technician,married,secondary,yes,no,unknown,1,unknown,yes,no
3,55,2476,579,-1,services,married,secondary,yes,no,unknown,1,unknown,yes,no
4,54,184,673,-1,admin.,married,tertiary,no,no,unknown,2,unknown,yes,no
5,42,0,562,-1,management,single,tertiary,yes,yes,unknown,2,unknown,yes,no
6,56,830,1201,-1,management,married,tertiary,yes,yes,unknown,1,unknown,yes,no
7,60,545,1030,-1,retired,divorced,secondary,yes,no,unknown,1,unknown,yes,no
8,37,1,608,-1,technician,married,secondary,yes,no,unknown,1,unknown,yes,no
9,28,5090,1297,-1,services,single,secondary,yes,no,unknown,3,unknown,yes,no


[Row(age=59, balance=2343, duration=1042, pdays=-1, job='admin.', marital='married', education='secondary', housing='yes', loan='no', contact='unknown', campaign=1, poutcome='unknown', deposit='yes', default='no')]


### Try this using the original helper vs the Pipeline version to see if there is a time difference

In [11]:
from timeit import default_timer as timer

# The timed region covers building/fitting the pipeline and the display,
# printSchema and groupBy/count calls below it.
start = timer()

# Uses the MakeMLDataFramePipeline defined in this notebook; the commented lines
# are the alternative helpers from pyspark_helpers to compare against.
dfModel, dfML = MakeMLDataFramePipeline(df, categorical_features, numeric_features, target_label)
#dfML = pyh.MakeMLDataFramePipeline(df, categorical_features, numeric_features, target_label)
#dfML = pyh.MakeMLDataFrame(df, categorical_features, numeric_features, target_label)

display(dfML)
dfML.printSchema()
# Row count per label value, to see how the target classes are distributed.
labelCnt = dfML.groupBy('label').count()
display(labelCnt)

end = timer()
print('time to run', end - start)


Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
8,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)



Unnamed: 0,label,count
0,0.0,10994
1,1.0,168


time to run 7.885473164002178


### Train and test as normal

In [12]:
# 70/30 random split; the fixed seed is passed so the split can be repeated.
splits = dfML.randomSplit([0.7, 0.3], seed=10)
train, test = splits

train_rows = train.count()
print(f'Training set row count {train_rows}')
test_rows = test.count()
print(f'Testing set row count {test_rows}')
      

Training set row count 7871
Testing set row count 3291


In [13]:
from pyspark.ml.classification import DecisionTreeClassifier

# Depth-limited decision tree fitted on the training split.
dt = DecisionTreeClassifier(
    maxDepth=3,
    labelCol='label',
    featuresCol='features',
)
dtModel = dt.fit(train)
print('DT Trained')


DT Trained


In [14]:
# Helper from pyspark_helpers: scores the fitted tree on the test split. Per the
# recorded output it prints the test area under ROC and a table of predictions;
# its two return values are kept as dtPredictions and dtLog.
dtPredictions, dtLog = pyh.predict_and_evaluate(dtModel, test)


Test Area Under ROC 0.7228598787942387
+-----+-------------+----------+--------------------+
|label|rawPrediction|prediction|         probability|
+-----+-------------+----------+--------------------+
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[1203.0,39.0]|       0.0|[0.96859903381642...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[6130.0,22.0]|       0.0|[0.99642392717815...|
|  0.0|[1203.0,39.0]|       0.0|[0.96859903

In [17]:
# Persist the fitted decision tree and the fitted feature pipeline so they can be
# loaded back from disk. overwrite() replaces an existing copy, which keeps this
# cell re-runnable instead of failing once the target paths exist.
dtModel.write().overwrite().save('DTModel1')
dfModel.write().overwrite().save('PipelineModel')

In [21]:
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.classification import DecisionTreeClassificationModel

# Reload both models from the relative paths 'PipelineModel' and 'DTModel1';
# these must have been written beforehand with the models' write().save(...).
pl = PipelineModel.load('PipelineModel')
# Note: this rebinds dt, which earlier named the DecisionTreeClassifier estimator,
# to the loaded classification model.
dt = DecisionTreeClassificationModel.load('DTModel1')
print('Done')

Done


In [22]:
# One customer record expressed as a plain dict; no 'default' (target) value is supplied.
predict = dict(age=59, balance=2343, duration=1042, pdays=-1, job='admin.', marital='married', education='secondary', housing='yes', loan='no', contact='unknown', campaign=1, poutcome='unknown', deposit='yes')
print(predict)
# predict is rebound from the dict to a one-row DataFrame; the recorded output
# shows the inferred columns listed in alphabetical order.
predict = spark.createDataFrame(sc.parallelize([predict]))
print(predict)
# Apply the reloaded feature pipeline to produce the 'features' vector.
predictML = pl.transform(predict)
#x = dtModel.transform(predict)

print(predictML.take(1))

# Score with the reloaded tree; selecting 'prediction' first makes [0][0] the predicted class.
prediction = dt.transform(predictML).select('prediction')
print(prediction.collect()[0][0])

{'age': 59, 'balance': 2343, 'duration': 1042, 'pdays': -1, 'job': 'admin.', 'marital': 'married', 'education': 'secondary', 'housing': 'yes', 'loan': 'no', 'contact': 'unknown', 'campaign': 1, 'poutcome': 'unknown', 'deposit': 'yes'}
DataFrame[age: bigint, balance: bigint, campaign: bigint, contact: string, deposit: string, duration: bigint, education: string, housing: string, job: string, loan: string, marital: string, pdays: bigint, poutcome: string]




[Row(age=59, balance=2343, campaign=1, contact='unknown', deposit='yes', duration=1042, education='secondary', housing='yes', job='admin.', loan='no', marital='married', pdays=-1, poutcome='unknown', job_Index=3.0, job_classVec=SparseVector(11, {3: 1.0}), marital_Index=0.0, marital_classVec=SparseVector(2, {0: 1.0}), education_Index=0.0, education_classVec=SparseVector(3, {0: 1.0}), housing_Index=1.0, housing_classVec=SparseVector(1, {}), loan_Index=0.0, loan_classVec=SparseVector(1, {0: 1.0}), contact_Index=1.0, contact_classVec=SparseVector(2, {1: 1.0}), campaign_Index=0.0, campaign_classVec=SparseVector(35, {0: 1.0}), poutcome_Index=0.0, poutcome_classVec=SparseVector(3, {0: 1.0}), deposit_Index=1.0, deposit_classVec=SparseVector(1, {}), features=SparseVector(63, {3: 1.0, 11: 1.0, 13: 1.0, 17: 1.0, 19: 1.0, 20: 1.0, 55: 1.0, 59: 59.0, 60: 2343.0, 61: 1042.0, 62: -1.0}))]
0.0


In [None]:
def predict_bankdefault(transformModel, predictionModel, d): #age, balance, duration, pdays, job, marital, education, housing, loan, contact, campaign, poutcome, deposit):
    """Predict the target class for a single record.

    Relies on the notebook-level ``spark`` session and ``sc`` context.

    Parameters
    ----------
    transformModel : object with ``transform(DataFrame)``
        Fitted feature pipeline that produces the columns the predictor needs.
    predictionModel : object with ``transform(DataFrame)``
        Fitted model that adds a ``prediction`` column.
    d : dict
        One record, keyed by input column name.

    Returns
    -------
    The value of the ``prediction`` column for the record.
    """
    newDF = spark.createDataFrame(sc.parallelize([d]))
    predictML = transformModel.transform(newDF)
    prediction = predictionModel.transform(predictML)
    # Select the prediction column explicitly: indexing the full row with [0][0]
    # returned the first input column instead of the model's prediction.
    return prediction.select('prediction').collect()[0][0]

# Same record as the earlier single-record example except age=19. This call passes
# the in-memory dfModel and dtModel rather than the copies reloaded from disk.
predict = dict(age=19, balance=2343, duration=1042, pdays=-1, job='admin.', marital='married', education='secondary', housing='yes', loan='no', contact='unknown', campaign=1, poutcome='unknown', deposit='yes')
print (predict_bankdefault(dfModel, dtModel, predict))

### Pipelines and writing your own models

In [23]:
from pyspark.ml.pipeline import Estimator, Model, Pipeline, Transformer
from pyspark.ml.param.shared import *
from pyspark.sql.functions import avg, stddev_samp


class myTransformer(Transformer):
    """Empty Transformer subclass; it defines no behavior of its own."""
    pass

class HasMean(Params):
    """Mixin declaring a float-converted ``mean`` param with a setter and getter."""

    mean = Param(
        Params._dummy(),
        "mean",
        "mean",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def setMean(self, value):
        """Store ``value`` under the ``mean`` param; returns the result of ``_set``."""
        return self._set(mean=value)

    def getMean(self):
        """Return the ``mean`` param value via ``getOrDefault``."""
        return self.getOrDefault(self.mean)
    
class HasStandardDeviation(Params):
    """Mixin declaring a float-converted ``stddev`` param with a setter and getter."""

    stddev = Param(
        Params._dummy(),
        "stddev",
        "stddev",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def setStddev(self, value):
        """Store ``value`` under the ``stddev`` param; returns the result of ``_set``."""
        return self._set(stddev=value)

    def getStddev(self):
        """Return the ``stddev`` param value via ``getOrDefault``."""
        return self.getOrDefault(self.stddev)

class HasCenteredThreshold(Params):
    """Mixin declaring a float-converted ``centered_threshold`` param with a setter and getter."""

    centered_threshold = Param(
        Params._dummy(),
        "centered_threshold",
        "centered_threshold",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self):
        super().__init__()

    def setCenteredThreshold(self, value):
        """Store ``value`` under ``centered_threshold``; returns the result of ``_set``."""
        return self._set(centered_threshold=value)

    def getCenteredThreshold(self):
        """Return the ``centered_threshold`` param value via ``getOrDefault``."""
        return self.getOrDefault(self.centered_threshold)
    
class NormalDeviation(Estimator, HasInputCol, HasPredictionCol, HasCenteredThreshold):
    """Estimator that learns the mean and sample standard deviation of the input column."""

    def _fit(self, dataset):
        """Aggregate ``dataset`` once and return a configured NormalDeviationModel."""
        input_col = self.getInputCol()

        # A single aggregation yields both statistics as one row.
        stats_row = dataset.agg(avg(input_col), stddev_samp(input_col)).first()
        mu, sigma = stats_row

        # Copy the learned statistics and this estimator's settings onto the model,
        # one setter at a time (each setter's return value feeds the next call).
        fitted = NormalDeviationModel()
        fitted = fitted.setInputCol(input_col)
        fitted = fitted.setMean(mu)
        fitted = fitted.setStddev(sigma)
        fitted = fitted.setCenteredThreshold(self.getCenteredThreshold())
        fitted = fitted.setPredictionCol(self.getPredictionCol())
        return fitted

class NormalDeviationModel(
    Model,
    HasInputCol,
    HasPredictionCol,
    HasMean,
    HasStandardDeviation,
    HasCenteredThreshold,
):
    """Fitted model that flags values lying far above the learned mean."""

    def _transform(self, dataset):
        """Add the prediction column: (value - mean) > centered_threshold * stddev.

        The comparison is one-sided, so only values above the mean can be flagged.
        """
        input_col = self.getInputCol()
        output_col = self.getPredictionCol()
        threshold = self.getCenteredThreshold()
        mean_value = self.getMean()
        stddev_value = self.getStddev()

        centered = dataset[input_col] - mean_value
        exceeds_cutoff = centered > threshold * stddev_value
        return dataset.withColumn(output_col, exceeds_cutoff)

# Tiny demo frame: three small values and one large one (99.0).
# Note that this rebinds df, replacing the frame used earlier in the notebook.
df = sc.parallelize([(1, 2.0), (2, 3.0), (3, 0.0), (4, 99.0)]).toDF(["id", "x"])

#normal_deviation = NormalDeviation(inputCol="x", centeredThreshold = 1.0)
                                   
# Configure through the setters; NormalDeviation defines no keyword-argument
# constructor of its own in this notebook.
normal_deviation = NormalDeviation().setInputCol("x").setCenteredThreshold(1.0)

# The custom estimator is used as the single stage of a Pipeline.
model  = Pipeline(stages=[normal_deviation]).fit(df)
#print(normal_deviation.getMean())
model.transform(df).show()


+---+----+----------+
| id|   x|prediction|
+---+----+----------+
|  1| 2.0|     false|
|  2| 3.0|     false|
|  3| 0.0|     false|
|  4|99.0|      true|
+---+----+----------+



### UDFs and Pandas UDFs

In [28]:
# Single-column frame holding the integers 0..10 in column 'x' (rebinds df).
df = spark.createDataFrame(pd.DataFrame(pd.Series(range(11)), columns=["x"]))
display(df)

from pyspark.sql.functions import udf

def sqr(x):
    """Return x multiplied by itself."""
    return x * x

# Way 1: wrap an existing Python function, declaring 'int' as the return type.
sqrUDF=udf(sqr, 'int')

# Way 2: use udf as a decorator so the function itself becomes the UDF.
@udf('int')
def sqr2(x):
    """Return x multiplied by itself (registered as a UDF by the decorator)."""
    return x * x

# Way 3 (column x3): build the UDF inline at the call site.
display(df.withColumn('x1', sqrUDF(df.x)).withColumn('x2', sqr2(df.x)).withColumn('x3', udf(sqr, 'int')(df.x)))

Unnamed: 0,x
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


Unnamed: 0,x,x1,x2,x3
0,0,0,0,0
1,1,1,1,1
2,2,4,4,4
3,3,9,9,9
4,4,16,16,16
5,5,25,25,25
6,6,36,36,36
7,7,49,49,49
8,8,64,64,64
9,9,81,81,81


In [29]:
import numpy as np


def dbl(x):
    """Return ``x`` multiplied by 2."""
    return x * 2


# Plain Python: doubling each element needs an explicit iteration.
nums1 = [1, 2, 3, 4]
nums2 = [dbl(value) for value in nums1]
print(nums2)

# Multiplying a list by 2 repeats the list instead of doubling its elements.
print(nums1 * 2)

# NumPy applies the arithmetic to every element of the array.
nums3 = np.array([1, 2, 3, 4])
nums4 = nums3 * 2
print(nums4)

# Centre the values by subtracting the array's mean from each element.
print(nums3 - nums3.mean())


[2, 4, 6, 8]
[1, 2, 3, 4, 1, 2, 3, 4]
[2 4 6 8]
[-1.5 -0.5  0.5  1.5]


In [None]:
from pyspark.sql.functions import udf

def func1(x):
    """Return x plus one."""
    return x + 1

# Wrap the function as a UDF inline, at the point of use.
display(df.withColumn('func1', udf(func1, 'int')(df.x)))

# Same UDF, bound to a name first so it can be reused.
func1x = udf(func1, 'int')
display(df.withColumn('func1x', func1x(df.x)))

# UDF built from a lambda. Note: this rebinds the name sqr, which an earlier
# cell defines as a plain Python function.
sqr = udf(lambda x : x * x , 'int') 
display(df.withColumn('x3', sqr(df.x)))

@udf('int')
def square(x):
    """Return x squared (registered as a UDF with an 'int' return type)."""
    # The original body returned x * 2, which doubles the value instead of squaring it.
    return x * x

display(df.withColumn('x2', square(df.x)))




In [30]:
#! pip install pyarrow
#import pyarrow
from pyspark.sql.functions import pandas_udf, PandasUDFType
#from pyspark.sql.types import LongType

# Scalar pandas UDF with a declared 'double' return type.
@pandas_udf('double', PandasUDFType.SCALAR)
def psquare(x):
      """Return x multiplied by itself."""
      return x * x
   
#pandas_square = pandas_udf(psquare, returnType=LongType())

# NOTE(review): the recorded output of this cell is a Py4JJavaError whose root cause
# is java.lang.IllegalArgumentException raised from Arrow's
# MessageSerializer.readMessage. That looks like the incompatibility between
# PyArrow >= 0.15 and Spark 2.3.x/2.4.x described in the Spark documentation
# (ARROW_PRE_0_15_IPC_FORMAT=1 setting) - confirm the installed pyarrow version.
df.withColumn('x3', psquare(df.x)).show()


Py4JJavaError: An error occurred while calling o6001.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 483.0 failed 1 times, most recent failure: Lost task 0.0 in stage 483.0 (TID 1076, localhost, executor driver): java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.arrow.vector.ipc.message.MessageChannelReader.readNext(MessageChannelReader.java:58)
	at org.apache.arrow.vector.ipc.ArrowStreamReader.readSchema(ArrowStreamReader.java:132)
	at org.apache.arrow.vector.ipc.ArrowReader.initialize(ArrowReader.java:181)
	at org.apache.arrow.vector.ipc.ArrowReader.ensureInitialized(ArrowReader.java:172)
	at org.apache.arrow.vector.ipc.ArrowReader.getVectorSchemaRoot(ArrowReader.java:65)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:162)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.arrow.vector.ipc.message.MessageChannelReader.readNext(MessageChannelReader.java:58)
	at org.apache.arrow.vector.ipc.ArrowStreamReader.readSchema(ArrowStreamReader.java:132)
	at org.apache.arrow.vector.ipc.ArrowReader.initialize(ArrowReader.java:181)
	at org.apache.arrow.vector.ipc.ArrowReader.ensureInitialized(ArrowReader.java:172)
	at org.apache.arrow.vector.ipc.ArrowReader.getVectorSchemaRoot(ArrowReader.java:65)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:162)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:98)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:96)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
