# Initializing SparkSession

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session using a parenthesised builder chain.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option — confirm it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

spark

# Reading Data

In [2]:
# Read the housing CSV: first row is the header, column types are inferred.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("housing.csv")
)

In [3]:
# Print the inferred schema (column name, type, nullability) as a tree.
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [4]:
# Preview the first five rows.
df.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  -122.25|   37.85|              

In [5]:
# Number of rows in the DataFrame.
df.count()

20640

In [6]:
# Column names, in schema order.
df.columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [7]:
# (column name, type name) pairs for every column.
df.dtypes

[('longitude', 'double'),
 ('latitude', 'double'),
 ('housing_median_age', 'double'),
 ('total_rooms', 'double'),
 ('total_bedrooms', 'double'),
 ('population', 'double'),
 ('households', 'double'),
 ('median_income', 'double'),
 ('median_house_value', 'double'),
 ('ocean_proximity', 'string')]

# Adding Unique Index Column

In [8]:
from pyspark.sql.functions import monotonically_increasing_id

# Put a unique `id` first, followed by every other column.
# Selecting "all columns except `id`" (instead of slicing off the last column)
# keeps this cell idempotent: re-running it when `id` already exists simply
# rebuilds the id, rather than dropping `ocean_proximity` and duplicating `id`.
# NOTE(review): monotonically_increasing_id() yields unique, increasing values
# that are not guaranteed to be consecutive across partitions — confirm nothing
# downstream relies on 0..n-1.
df = df.select(
    monotonically_increasing_id().alias('id'),
    *[c for c in df.columns if c != 'id'],
)
df.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  0|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
only s

# Descriptive Statistics

In [9]:
# Summary statistics for every column. In the output below the `count` for
# total_bedrooms (20433) is lower than the row count (20640), i.e. it has nulls.
df.describe().show()

+-------+-----------------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|summary|               id|          longitude|         latitude|housing_median_age|       total_rooms|    total_bedrooms|        population|       households|     median_income|median_house_value|ocean_proximity|
+-------+-----------------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|  count|            20640|              20640|            20640|             20640|             20640|             20433|             20640|            20640|             20640|             20640|          20640|
|   mean|          10319.5|-119.56970445736148| 35.6318614341087|28.639486434108527|2635.7630813953488| 537.8705525375618|1425.4767441860465|499

# Useful Commands

## select

In [10]:
# Narrow to the columns of interest, then apply one aggregate per column
# using a {column: function} mapping.
(
    df.select('id', 'housing_median_age', 'total_rooms', 'population', 'households')
      .agg(dict(
          id='count',
          housing_median_age='min',
          total_rooms='avg',
          population='median',
          households='max',
      ))
      .show()
)

+---------------+------------------+---------+------------------+-----------------------+
|max(households)|median(population)|count(id)|  avg(total_rooms)|min(housing_median_age)|
+---------------+------------------+---------+------------------+-----------------------+
|         6082.0|            1166.0|    20640|2635.7630813953488|                    1.0|
+---------------+------------------+---------+------------------+-----------------------+



## Statistical Parameters

In [11]:
from pyspark.sql.functions import mean, median, stddev, count

# Median of every column in a single select; the string column
# (ocean_proximity) comes back as NULL in the output below.
df.select([median(name) for name in df.columns]).show()

+----------+-----------------+----------------+--------------------------+-------------------+----------------------+------------------+------------------+---------------------+--------------------------+-----------------------+
|median(id)|median(longitude)|median(latitude)|median(housing_median_age)|median(total_rooms)|median(total_bedrooms)|median(population)|median(households)|median(median_income)|median(median_house_value)|median(ocean_proximity)|
+----------+-----------------+----------------+--------------------------+-------------------+----------------------+------------------+------------------+---------------------+--------------------------+-----------------------+
|   10319.5|          -118.49|           34.26|                      29.0|             2127.0|                 435.0|            1166.0|             409.0|   3.5347999999999997|                  179700.0|                   NULL|
+----------+-----------------+----------------+--------------------------+----------

## Statistical Parameters ( Grouped by SubCats )

In [12]:
# Per-category medians. The slice [5:-1] picks total_bedrooms .. median_house_value
# (see the column list printed earlier, with `id` now in first position).
(
    df.groupby('ocean_proximity')
      .agg(dict.fromkeys(df.columns[5:-1], 'median'))
      .show()
)

+---------------+------------------+------------------+----------------------+---------------------+--------------------------+
|ocean_proximity|median(households)|median(population)|median(total_bedrooms)|median(median_income)|median(median_house_value)|
+---------------+------------------+------------------+----------------------+---------------------+--------------------------+
|         ISLAND|             288.0|             733.0|                 512.0|               2.7361|                  414700.0|
|     NEAR OCEAN|             429.0|            1136.5|                 464.0|              3.64705|                  229450.0|
|       NEAR BAY|             406.0|            1033.5|                 423.0|              3.81865|                  233800.0|
|      <1H OCEAN|             421.0|            1247.0|                 438.0|                3.875|                  214850.0|
|         INLAND|             385.0|            1124.0|                 423.0|               2.9877|    

# Train & Test

In [13]:
# Random 70/30 split into training and test DataFrames, with a fixed seed.
train, test = df.randomSplit([0.7, 0.3], seed=0)
# Echo both DataFrames (schema-only repr) as the cell output.
train, test 

(DataFrame[id: bigint, longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, ocean_proximity: string],
 DataFrame[id: bigint, longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, ocean_proximity: string])

# Numerical Features

In [14]:
# Start from all column names and strip the identifier, the categorical
# column and the regression target; what remains are the numeric features.
numerical_features_lst = df.columns
for excluded in ('id', 'ocean_proximity', 'median_house_value'):
    numerical_features_lst.remove(excluded)

numerical_features_lst

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

# Imputing

In [15]:
from pyspark.ml.feature import Imputer

# Fit the imputer on the training split only, then apply the fitted model to
# both splits. Input and output columns are the same, so values are filled
# in place.
imputer = Imputer(
    inputCols=numerical_features_lst,
    outputCols=numerical_features_lst,
).fit(train)

train, test = [imputer.transform(split) for split in (train, test)]

train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  3|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
only s

# Vectorizing Numerical Features

In [16]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric feature columns into a single vector column.
numerical_vector_assembler = VectorAssembler(
    inputCols=numerical_features_lst,
    outputCol='numerical_feature_vector',
)

train, test = [numerical_vector_assembler.transform(split) for split in (train, test)]

train.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|    [-122.24,37.85,52...|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------

## Viewing Vectors

In [17]:
# First two assembled numeric vectors, returned as Row objects.
train.select('numerical_feature_vector').head(2)

[Row(numerical_feature_vector=DenseVector([-122.22, 37.86, 21.0, 7099.0, 1106.0, 2401.0, 1138.0, 8.3014])),
 Row(numerical_feature_vector=DenseVector([-122.24, 37.85, 52.0, 1467.0, 190.0, 496.0, 177.0, 7.2574]))]

# Scaling the Numerical Feature Vector

In [18]:
from pyspark.ml.feature import StandardScaler

# Standardise the numeric vector (centre and scale); statistics are learned
# from the training split only and then applied to both splits.
scaler = StandardScaler(
    inputCol='numerical_feature_vector',
    outputCol='scaled_numerical_feature_vector',
    withMean=True,
    withStd=True,
).fit(train)

train, test = [scaler.transform(split) for split in (train, test)]

train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3202293423673...|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|    [-122.24,37.85,52...|           [-1.3302106262727...|
|  3|

## Viewing Scaled Vectors

In [19]:
# First two scaled numeric vectors, returned as Row objects.
train.select('scaled_numerical_feature_vector').head(2)

[Row(scaled_numerical_feature_vector=DenseVector([-1.3202, 1.041, -0.607, 2.0492, 1.3485, 0.8481, 1.6587, 2.3305])),
 Row(scaled_numerical_feature_vector=DenseVector([-1.3302, 1.0363, 1.856, -0.5369, -0.8279, -0.8123, -0.8392, 1.7819]))]

# Index-Transforming Categorical Columns

In [20]:
from pyspark.ml.feature import StringIndexer

# Learn the category -> numeric index mapping from the training split and
# apply it to both splits.
# NOTE(review): no handleInvalid is set; presumably a category that appears
# only in `test` would fail at transform time — confirm against the data.
indexer = StringIndexer(
    inputCol='ocean_proximity',
    outputCol='ocean_category_index',
).fit(train)

train, test = [indexer.transform(split) for split in (train, test)]

train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3202293423673...|                 3.0|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          3521

## Viewing Index-Transformed Categorical Columns

In [21]:
# Distinct category indices present in the training split.
# Deduplicate in Spark first so only the handful of distinct values is
# collected, instead of pulling the entire column to the driver and
# deduplicating there. The resulting set of Rows is the same.
set(train.select('ocean_category_index').distinct().collect())

{Row(ocean_category_index=0.0),
 Row(ocean_category_index=1.0),
 Row(ocean_category_index=2.0),
 Row(ocean_category_index=3.0),
 Row(ocean_category_index=4.0)}

# One-Hot Encoding Index-Transformed Categorical Columns

In [22]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the category index; the encoder is fitted on the training
# split and applied to both splits. The output below shows a sparse vector of
# size 4 for the five index values.
one_hot_encoder = OneHotEncoder(
    inputCol='ocean_category_index',
    outputCol='ocean_category_one_hot',
).fit(train)

train, test = [one_hot_encoder.transform(split) for split in (train, test)]

train.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3202293423673...|                 3.0|         (4,[3],[1.0])|
|  2|  -122.24|   37.85|    

# Assembling --- *all feature columns* --- into *features*

In [23]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer

# Concatenate the scaled numeric vector and the one-hot category vector into
# the single "features" column that the regressors consume.
vectorAssembler = VectorAssembler(
    inputCols=['scaled_numerical_feature_vector', 'ocean_category_one_hot'],
    outputCol='features',
)

# A VectorIndexer step was sketched here but is disabled and not used:
#vectorIndexer   = VectorIndexer( inputCol="final_feature_vector", outputCol="features", maxCategories=4)

train, test = [vectorAssembler.transform(split) for split in (train, test)]

train.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|            features|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3202293423673...|          

## Viewing Vectors

In [24]:
# First two final feature vectors (8 scaled numeric values + 4 one-hot slots).
train.select('features').head(2)

[Row(features=DenseVector([-1.3202, 1.041, -0.607, 2.0492, 1.3485, 0.8481, 1.6587, 2.3305, 0.0, 0.0, 0.0, 1.0])),
 Row(features=DenseVector([-1.3302, 1.0363, 1.856, -0.5369, -0.8279, -0.8123, -0.8392, 1.7819, 0.0, 0.0, 0.0, 1.0]))]

#####   
# Models

In [25]:
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression 
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor 
from pyspark.ml.regression import GBTRegressor, IsotonicRegression, FMRegressor

from pyspark.ml.tuning     import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml            import Pipeline

# Column names passed to every estimator in the cells below.
featuresCol = 'features'  # assembled feature vector column
labelCol    = 'median_house_value'  # regression target column


#####   

In [26]:
# Factorization Machines Regressor
# Disabled: everything below is commented out, so `fmr` is never defined
# in this notebook.

#fmr = FMRegressor( featuresCol=featuresCol, labelCol=labelCol )

#fmr = fmr.fit(train)
#fmr


#####   

In [27]:
# Isotonic Regression
# NOTE(review): per the PySpark docs, with a vector `featuresCol` the model is
# fitted on the single component selected by `featureIndex` (default 0) —
# confirm that using only the first feature is intended.
ir = IsotonicRegression(labelCol=labelCol, featuresCol=featuresCol).fit(train)

# Report the fitted step function: boundary points and the value at each one.
print("Boundaries in increasing order: %s\n" % (ir.boundaries,))
print("Predictions associated with the boundaries: %s\n" % (ir.predictions,))


Boundaries in increasing order: [-2.3832360782888644,-2.2983951650932597,-2.293404523140574,-2.28342323923521,-2.2784325972825243,-2.2584700294717894,-2.253479387519111,-2.2484887455664255,-2.2434981036137396,-2.1187320547966646,-2.113741412843986,-1.8392561054464223,-1.8342654634937365,-1.8043216117776375,-1.799330969824959,-1.7643964761561743,-1.7594058342034886,-1.7444339083454392,-1.724471340534711,-1.6595929951498343,-1.6546023531971488,-1.5647707980488512,-1.5597801560961726,-1.5198550204747094,-1.514864378522024,-1.5098737365693384,0.33167314397068987,0.3366637859233683,2.5475181709619434]

Predictions associated with the boundaries: [79750.0,79750.0,83538.09523809524,83538.09523809524,90150.0,90150.0,95850.0,102260.0,107628.57142857143,107628.57142857143,108787.03703703704,108787.03703703704,111968.18181818182,111968.18181818182,114860.0,114860.0,115500.0,115500.0,153836.36363636365,153836.36363636365,184007.26363636364,184007.26363636364,188829.57391304348,188829.57391304348,1

#####   

In [28]:
# Gradient-Boosted Trees
# Build the estimator and fit it in one expression; `gbt` ends up bound to
# the fitted model, which is echoed as the cell output.
gbt = GBTRegressor(labelCol=labelCol, featuresCol=featuresCol).fit(train)
gbt

GBTRegressionModel: uid=GBTRegressor_ca1c25305887, numTrees=20, numFeatures=12

#####   

In [35]:
# Random Forest
# Seeded so the forest's random sampling is repeatable between runs
# (same seed value as the train/test split).
Reg = RandomForestRegressor( featuresCol=featuresCol, labelCol=labelCol, seed=0 )

# Param Grid: 2 depths x 3 bin counts x 4 forest sizes = 24 candidates
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = (
    ParamGridBuilder()
    .addGrid( Reg.maxDepth, [4,5])
    .addGrid( Reg.maxBins,  [32,64,128])
    .addGrid( Reg.numTrees, [5,25,50,100])
    .build()
)

# Evaluator: RMSE between the label column and the estimator's prediction column
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=Reg.getLabelCol(),
                                predictionCol=Reg.getPredictionCol())

# CV: 3-fold cross-validation over the grid. Seeded so the fold assignment,
# and therefore the selected hyper-parameters, do not change between runs.
cv = CrossValidator( numFolds=3, estimator=Reg, evaluator=evaluator,
                     estimatorParamMaps=paramGrid, seed=0 )

# Pipe Line: a single-stage pipeline wrapping the cross-validator
rf_pipeline = Pipeline( stages=[ cv ] )
#rf_pipeline.save('/tmp/rf_pipeline_001')

# fit: `rf` is the fitted pipeline (see the model-selection cell, which
# treats it interchangeably with the other fitted models)
rf = rf_pipeline.fit(train)
#rf.save('/tmp/rf_001')

#####   

In [40]:
# Decision Tree
# Seeded for repeatability (same seed value as the train/test split).
Reg = DecisionTreeRegressor( featuresCol=featuresCol, labelCol=labelCol, seed=0 )

# Param Grid: 4 depths x 2 bin counts = 8 candidates
paramGrid = (
    ParamGridBuilder()
    .addGrid( Reg.maxDepth, [2,3,4,5])
    .addGrid( Reg.maxBins,  [32, 64])
    .build()
)

# Evaluator: RMSE between the label column and the estimator's prediction column
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=Reg.getLabelCol(),
                                predictionCol=Reg.getPredictionCol())

# CV: 3-fold cross-validation over the grid. Seeded so the fold assignment,
# and therefore the selected hyper-parameters, do not change between runs.
cv = CrossValidator( numFolds=3, estimator=Reg, evaluator=evaluator,
                     estimatorParamMaps=paramGrid, seed=0 )

# Pipe Line: a single-stage pipeline wrapping the cross-validator
dt_pipeline = Pipeline( stages=[ cv ] )
#dt_pipeline.save('/tmp/dt_pipeline_001')

# fit
dt = dt_pipeline.fit(train)
#dt.save('/tmp/dt_001')


#####   

In [67]:
# Generalized Linear Regression
Reg = GeneralizedLinearRegression( featuresCol=featuresCol, labelCol=labelCol 
                                   #, family="gaussian", link="identity", maxIter=10, regParam=0.3 
                                 )

# Param Grid: 1 family x 3 link functions x 3 regularisation strengths = 9 candidates
paramGrid = (
    ParamGridBuilder()
    .addGrid( Reg.family,   ["gaussian"])
    .addGrid( Reg.link,     ["identity", "log", "inverse"])
    .addGrid( Reg.regParam, [.25,.3,.35])
    .build()
)

# Evaluator: RMSE between the label column and the estimator's prediction column
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=Reg.getLabelCol(),
                                predictionCol=Reg.getPredictionCol())

# CV: 3-fold cross-validation over the grid. Seeded so the fold assignment,
# and therefore the selected hyper-parameters, do not change between runs
# (same seed value as the train/test split).
cv = CrossValidator( numFolds=3, estimator=Reg, evaluator=evaluator,
                     estimatorParamMaps=paramGrid, seed=0 )

# Pipe Line: a single-stage pipeline wrapping the cross-validator
glr_pipeline = Pipeline( stages=[ cv ] )
#glr_pipeline.save('/tmp/glr_pipeline_001')
 
# fit
glr = glr_pipeline.fit(train) 
#glr.save('/tmp/glr_001')


# Disabled snippet, kept as a bare string literal so it never executes.
# NOTE(review): `glr` above is the fitted pipeline, so `.coefficients` /
# `.summary` would presumably have to be read from the underlying fitted
# regression model rather than from `glr` itself — verify before re-enabling.
''' 
Reg = GeneralizedLinearRegression( 
    featuresCol=featuresCol, labelCol=labelCol,
    family="gaussian", link="identity", maxIter=10, regParam=0.3 )

# Print the coefficients and intercept for generalized linear regression model
print("\nCoefficients: %s" % str(glr.coefficients))
print("\nIntercept: %s"    % str(glr.intercept))
print('')

# Summarize the model over the training set and print out some metrics
summary = glr.summary
print("\nCoefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("\nT Values: " + str(summary.tValues))
print("\nP Values: " + str(summary.pValues))
print("\nDispersion: " + str(summary.dispersion))
print("\nNull Deviance: " + str(summary.nullDeviance))
print("\nResidual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("\nDeviance: " + str(summary.deviance))
print("\nResidual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("\nAIC: " + str(summary.aic))
print("\nDeviance Residuals: ")
summary.residuals().show(5)
''' 
# `pass` keeps the string above from being echoed as the cell's output.
pass

#####   

In [57]:
# Linear Regression
Reg = LinearRegression( featuresCol=featuresCol, labelCol=labelCol
                        #, maxIter=10, regParam=0.3, elasticNetParam=0.8 
                      )

# Param Grid: 2 solvers x 3 elastic-net mixes x 3 regularisation strengths = 18 candidates
paramGrid = (
    ParamGridBuilder()
    .addGrid( Reg.solver,   ["normal","auto"])
    .addGrid( Reg.elasticNetParam, [.9,.8,.7])
    .addGrid( Reg.regParam,     [.35,.30,.25])
    .build()
)

# Evaluator: RMSE between the label column and the estimator's prediction column
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=Reg.getLabelCol(),
                                predictionCol=Reg.getPredictionCol())

# CV: 3-fold cross-validation over the grid. Seeded so the fold assignment,
# and therefore the selected hyper-parameters, do not change between runs
# (same seed value as the train/test split).
cv = CrossValidator( numFolds=3, estimator=Reg, evaluator=evaluator,
                     estimatorParamMaps=paramGrid, seed=0 )

# Pipe Line: a single-stage pipeline wrapping the cross-validator
lr_pipeline = Pipeline( stages=[ cv ] )
#lr_pipeline.save('/tmp/lr_pipeline_001')
 
# fit
lr = lr_pipeline.fit(train) 
#lr.save('/tmp/lr_001')


# Disabled snippet, kept as a bare string literal so it never executes.
# NOTE(review): `lr` above is the fitted pipeline, so `.coefficients` /
# `.summary` would presumably have to be read from the underlying fitted
# regression model rather than from `lr` itself — verify before re-enabling.
''' 
lr = LinearRegression( featuresCol=featuresCol, labelCol=labelCol, maxIter=10, regParam=0.3, elasticNetParam=0.8 )

# Print the coefficients and intercept for linear regression
print("\nCoefficients: %s" % str(lr.coefficients))
print("\nIntercept: %s"    % str(lr.intercept))
print('')

# Summarize the model over the training set and print out some metrics
trainingSummary = lr.summary
print("\nnumIterations: %d"    %     trainingSummary.totalIterations)
print("\nobjectiveHistory: %s" % str(trainingSummary.objectiveHistory))
print('')
trainingSummary.residuals.show(5)
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f"   % trainingSummary.r2)
''' 
# `pass` keeps the string above from being echoed as the cell's output.
pass

#####   
# Model Selection

In [68]:
# Choose which fitted model the prediction/evaluation cells below will use.
# lr, glr, dt and rf are fitted single-stage pipelines wrapping a CrossValidator;
# gbt and ir are directly fitted models; fmr is never defined because its
# cell is commented out.
model = glr # lr, glr, dt, rf, gbt, ir, fmr
model 

PipelineModel_fd75a0cf06f0

#####   
# Predictions

In [69]:
# Score the test split and give the prediction column a descriptive name.
pred_test_df = (
    model
    .transform(test)
    .withColumnRenamed(existing='prediction', new='predicted_median_house_value')
)
pred_test_df.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+----------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|            features|predicted_median_house_value|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+----------------------------+
|  0|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          45260

In [70]:
# Actual vs. predicted target, side by side, for the first five test rows.
pred_test_df.select( "median_house_value", "predicted_median_house_value" ).show(5)

+------------------+----------------------------+
|median_house_value|predicted_median_house_value|
+------------------+----------------------------+
|          452600.0|          406596.92625686596|
|          342200.0|          255582.89971551462|
|          226700.0|           200815.9755520383|
|          147500.0|          143602.71321899802|
|          159800.0|          163263.29187966406|
+------------------+----------------------------+
only showing top 5 rows



## RMSE

In [71]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE of the renamed prediction column against the true label on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="predicted_median_house_value",
    labelCol="median_house_value",
)
rmse = evaluator.evaluate(pred_test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 68900.5


# To Pandas

In [72]:
# Convert the scored test DataFrame (including its vector columns) to pandas
# for local inspection, then preview the first two rows.
pred_test_pd_df = pred_test_df.toPandas()
pred_test_pd_df.head(2)

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,numerical_feature_vector,scaled_numerical_feature_vector,ocean_category_index,ocean_category_one_hot,features,predicted_median_house_value
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,"[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 12...","[-1.325219984320072, 1.050342203966381, 0.9820...",3.0,"(0.0, 0.0, 0.0, 1.0)","[-1.325219984320072, 1.050342203966381, 0.9820...",406596.926257
1,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,"[-122.25, 37.85, 52.0, 1627.0, 280.0, 565.0, 2...","[-1.335201268225436, 1.0362969138441056, 1.856...",3.0,"(0.0, 0.0, 0.0, 1.0)","[-1.335201268225436, 1.0362969138441056, 1.856...",255582.899716


#####   

#####   