In this notebook I am trying to built a regression model for car price predictions

In [0]:
# Import libs
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# read the data
df1 = spark.read.format("csv").option("header", "true").option('inferSchema', 'True').load("dbfs:/FileStore/shared_uploads/dawar.rohan@gmail.com/CarPrice_Assignment.csv")
display(df1)

car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0
6,2,audi fox,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250.0
7,1,audi 100ls,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710.0
8,1,audi 5000,gas,std,four,wagon,fwd,front,105.8,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920.0
9,1,audi 4000,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875.0
10,0,audi 5000s (diesel),gas,turbo,two,hatchback,4wd,front,99.5,178.2,67.9,52.0,3053,ohc,five,131,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167


In [0]:
#Checking the shape of the data
df1.count(), len(df1.columns)

Out[7]: (205, 26)

In [0]:
# Checking the nulls in the data
display(df1.select([count(when(col(c).contains('None')| col(c).contains('NULL') | (col(c)=='') | col(c).isNull()| isnan(c),c)).alias(c) for c in df1.columns]))

car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The data is cleaned, lets dig into the Modelling

In [0]:
# train & test split
train_df, test_df = df1.randomSplit([0.8,0.2], seed=42)
print('Train: ',train_df.cache().count())
print('Test: ',test_df.cache().count())

Train:  173
Test:  32


In [0]:
# creating a baseline, adding the Mean & Medium values from training data
avg_price = train_df.select(avg('price')).collect()[0][0]
median_price = train_df.approxQuantile('price', [0.5],0)[0]
pred_df = test_df.withColumn('avgPrediction', lit(avg_price))\
                 .withColumn('medianPrediction', lit(median_price))

In [0]:
# Evaluatin the baseline
regression_mean_evaluator = RegressionEvaluator(predictionCol='avgPrediction', labelCol='price', metricName='rmse')
print(f'The RMSE for predicting the average Price is:  {regression_mean_evaluator.evaluate(pred_df)}')
r2_mean = regression_mean_evaluator.setMetricName('r2')
print(f'The R2 for predicting the average Price is:  {r2_mean.evaluate(pred_df)}')
regression_median_evaluator = RegressionEvaluator(predictionCol='medianPrediction', labelCol='price', metricName='rmse')
print(f'The RMSE for predicting the Median Price is:  {regression_median_evaluator.evaluate(pred_df)}')
r2_median = regression_median_evaluator.setMetricName('r2')
print(f'The R2 for predicting the Median Price is:  {r2_median.evaluate(pred_df)}')

The RMSE for predicting the average Price is:  8438.142040971874
The R2 for predicting the average Price is:  -0.022734470632052473
The RMSE for predicting the Median Price is:  9314.635957224791
The R2 for predicting the Median Price is:  -0.24623803763701013


In [0]:
# databricks related pre-processing for catrgorical columns
categorical_cols = [field for (field,dataType) in train_df.dtypes if dataType=='string']
index_output_cols = [x + 'Index' for x in categorical_cols]
ohe_output_cols = [x + 'OHE' for x in categorical_cols]

String_indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_output_cols, handleInvalid='skip')
ohe_encoder = OneHotEncoder(inputCols=index_output_cols, outputCols=ohe_output_cols)

In [0]:
# Data brick related pre-processing for numerical columns
numerical_cols = [field for (field,dataType) in train_df.dtypes if ((dataType!='string') & (field!='price' ) & (field!='car_ID'))]
assembler_input = ohe_output_cols + numerical_cols
vector_Assembler = VectorAssembler(inputCols=assembler_input, outputCol='features')

In [0]:
# Instiate Linear Regression Model
lr = LinearRegression(featuresCol='features', labelCol='price')

In [0]:
all_stages = [String_indexer, ohe_encoder, vector_Assembler, lr]
pipeline = Pipeline(stages=all_stages)

In [0]:
# fitting into training data
pipeline_model = pipeline.fit(train_df)

In [0]:
# Saving the model
pipeline_model.write().overwrite().save('dbfs:/FileStore/shared_uploads/dawar.rohan@gmail.com/new_model')

In [0]:
saved_pipeline_model = pipeline_model.load('dbfs:/FileStore/shared_uploads/dawar.rohan@gmail.com/new_model')

In [0]:
pred_train_df = saved_pipeline_model.transform(train_df)

In [0]:
display(pred_train_df.select('price', 'prediction'))

price,prediction
13495.0,13501.610758344606
16500.0,16501.366277124584
13950.0,13938.946017370588
17450.0,17452.379471910317
15250.0,15250.92184372661
18920.0,18931.74365696852
17859.167,17868.22526557301
16430.0,16379.664917521655
16925.0,16984.201372901225
20970.0,20973.999385147297


In [0]:
pred_test_df = saved_pipeline_model.transform(test_df)
display(pred_test_df)

car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarNameIndex,fueltypeIndex,aspirationIndex,doornumberIndex,carbodyIndex,drivewheelIndex,enginelocationIndex,enginetypeIndex,cylindernumberIndex,fuelsystemIndex,CarNameOHE,fueltypeOHE,aspirationOHE,doornumberOHE,carbodyOHE,drivewheelOHE,enginelocationOHE,enginetypeOHE,cylindernumberOHE,fuelsystemOHE,features,prediction
7,1,audi 100ls,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,"Map(vectorType -> sparse, length -> 129, indices -> List(33), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(33, 129, 130, 131, 132, 136, 138, 139, 146, 149, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 105.8, 192.7, 71.4, 55.7, 2844.0, 136.0, 3.19, 3.4, 8.5, 110.0, 5500.0, 19.0, 25.0))",16544.532295092155
14,0,bmw x3,gas,std,four,sedan,rwd,front,101.2,176.8,64.8,54.3,2765,ohc,six,164,mpfi,3.31,3.19,9.0,121,4250,21,28,21105.0,38.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,"Map(vectorType -> sparse, length -> 129, indices -> List(38), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(38, 129, 130, 131, 132, 137, 138, 139, 145, 149, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 101.2, 176.8, 64.8, 54.3, 2765.0, 164.0, 3.31, 3.19, 9.0, 121.0, 4250.0, 21.0, 28.0))",30972.4390654298
56,3,mazda 626,gas,std,two,hatchback,rwd,front,95.3,169.0,65.7,49.6,2380,rotor,two,70,4bbl,3.33,3.255,9.4,101,6000,17,23,10945.0,12.0,0.0,0.0,1.0,1.0,1.0,0.0,5.0,4.0,5.0,"Map(vectorType -> sparse, length -> 129, indices -> List(12), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 5, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 168, indices -> List(12, 129, 130, 133, 137, 138, 148, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 95.3, 169.0, 65.7, 49.6, 2380.0, 70.0, 3.33, 3.255, 9.4, 101.0, 6000.0, 17.0, 23.0))",10317.93201245389
63,0,mazda rx-4,gas,std,four,sedan,fwd,front,98.8,177.8,66.5,55.5,2410,ohc,four,122,2bbl,3.39,3.39,8.6,84,4800,26,32,10245.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 129, indices -> List(70), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(70, 129, 130, 131, 132, 136, 138, 139, 144, 150, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 98.8, 177.8, 66.5, 55.5, 2410.0, 122.0, 3.39, 3.39, 8.6, 84.0, 4800.0, 26.0, 32.0))",9754.395092708217
117,0,peugeot 504,diesel,turbo,four,sedan,rwd,front,107.9,186.7,68.4,56.7,3252,l,four,152,idi,3.7,3.52,21.0,95,4150,28,33,17950.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,4.0,0.0,2.0,"Map(vectorType -> sparse, length -> 129, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(0, 131, 132, 137, 138, 143, 144, 151, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 107.9, 186.7, 68.4, 56.7, 3252.0, 152.0, 3.7, 3.52, 21.0, 95.0, 4150.0, 28.0, 33.0))",15327.353106471295
119,1,plymouth fury iii,gas,std,two,hatchback,fwd,front,93.7,157.3,63.8,50.8,1918,ohc,four,90,2bbl,2.97,3.23,9.4,68,5500,37,41,5572.0,92.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 129, indices -> List(92), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(92, 129, 130, 133, 136, 138, 139, 144, 150, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 93.7, 157.3, 63.8, 50.8, 1918.0, 90.0, 2.97, 3.23, 9.4, 68.0, 5500.0, 37.0, 41.0))",4538.419924269379
153,1,toyota corolla 1200,gas,std,four,hatchback,fwd,front,95.7,158.7,63.6,54.5,2015,ohc,four,92,2bbl,3.05,3.03,9.0,62,4800,31,38,6488.0,109.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 129, indices -> List(109), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(109, 129, 130, 131, 133, 136, 138, 139, 144, 150, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 95.7, 158.7, 63.6, 54.5, 2015.0, 92.0, 3.05, 3.03, 9.0, 62.0, 4800.0, 31.0, 38.0))",7095.266880424217
161,0,toyota corona,gas,std,four,sedan,fwd,front,95.7,166.3,64.4,53.0,2094,ohc,four,98,2bbl,3.19,3.03,9.0,70,4800,38,47,7738.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 129, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(3, 129, 130, 131, 132, 136, 138, 139, 144, 150, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 95.7, 166.3, 64.4, 53.0, 2094.0, 98.0, 3.19, 3.03, 9.0, 70.0, 4800.0, 38.0, 47.0))",8996.350634317801
165,1,toyota corona,gas,std,two,hatchback,rwd,front,94.5,168.7,64.0,52.6,2204,ohc,four,98,2bbl,3.19,3.03,9.0,70,4800,29,34,8238.0,3.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 129, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(3, 129, 130, 133, 137, 138, 139, 144, 150, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 94.5, 168.7, 64.0, 52.6, 2204.0, 98.0, 3.19, 3.03, 9.0, 70.0, 4800.0, 29.0, 34.0))",6913.03322618363
169,2,toyota corolla,gas,std,two,hardtop,rwd,front,98.4,176.2,65.6,52.0,2536,ohc,four,146,mpfi,3.62,3.5,9.3,116,4800,24,30,9639.0,2.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,"Map(vectorType -> sparse, length -> 129, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 168, indices -> List(2, 129, 130, 135, 137, 138, 139, 144, 149, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 98.4, 176.2, 65.6, 52.0, 2536.0, 146.0, 3.62, 3.5, 9.3, 116.0, 4800.0, 24.0, 30.0))",15121.547752723023


In [0]:
display(pred_test_df.select('price', 'prediction'))

price,prediction
17710.0,16544.532295092155
21105.0,30972.4390654298
10945.0,10317.93201245389
10245.0,9754.395092708217
17950.0,15327.353106471295
5572.0,4538.419924269379
6488.0,7095.266880424217
7738.0,8996.350634317801
8238.0,6913.03322618363
9639.0,15121.547752723023


In [0]:
regress_eval = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')
rmse_train = regress_eval.evaluate(pred_train_df)
rmse_test = regress_eval.evaluate(pred_test_df)
r2_train = regress_eval.setMetricName('r2').evaluate(pred_train_df)
r2_test = regress_eval.setMetricName('r2').evaluate(pred_train_df)
print('RMSE for Model Training Set: ', rmse_train)
print('RMSE for Model Testinf Set: ', rmse_test)
print('R2 for Model Training Set: ', r2_train)
print('R2 for Model Testing Set: ', r2_test)

RMSE for Model Training Set:  336.3667447980873
RMSE for Model Testinf Set:  3454.5547473903107
R2 for Model Training Set:  0.9981790368144307
R2 for Model Testing Set:  0.9981790368144307
