##### Importing all the libraries

In [548]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col
from pyspark.ml.feature import Imputer
from pyspark.sql.types import FloatType
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import mean
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.util import Saveable
import os
import pandas as pd
import numpy as np
from models.helper import Helper

In [549]:
helper = Helper()

##### Initiating the spark local session

In [550]:
spark = SparkSession.builder\
        .master("local")\
        .appName("HousePricePredict")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

spark

##### Reading and analyzing the dataset

In [551]:
df = spark.read.format("csv").load("./data/austinHousingData.csv", header=True, inferSchema=True)

df.printSchema()

root
 |-- zpid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- streetAddress: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- description: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- propertyTaxRate: string (nullable = true)
 |-- garageSpaces: string (nullable = true)
 |-- hasAssociation: string (nullable = true)
 |-- hasCooling: string (nullable = true)
 |-- hasGarage: string (nullable = true)
 |-- hasHeating: string (nullable = true)
 |-- hasSpa: string (nullable = true)
 |-- hasView: string (nullable = true)
 |-- homeType: string (nullable = true)
 |-- parkingSpaces: string (nullable = true)
 |-- yearBuilt: string (nullable = true)
 |-- latestPrice: string (nullable = true)
 |-- numPriceChanges: string (nullable = true)
 |-- latest_saledate: string (nullable = true)
 |-- latest_salemonth: string (nullable = true)
 |-- latest_saleyear: string (nullable = true)
 |-- latestPriceSo

In [552]:
df.show(5)

+--------------------+--------------------+--------------------+-----------------+--------------------+------------------+------------------+---------------+------------+--------------+----------+---------+----------+-------------+-------+-------------+-------------+---------+-----------+---------------+---------------+--------------------+---------------+-----------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+------------------+-----------------+------------------+-------------+------------------------+--------------+--------------------+------------+--------------------+
|                zpid|                city|       streetAddress|          zipcode|         description|          latitude|         longitude|propertyTaxRate|garageSpaces|hasAsso

In [553]:
len(df.columns)

47

In [554]:
df = df.withColumn('id', monotonically_increasing_id())

df = df[['id'] + df.columns[:-1]]

df.show(3)

+---+--------------------+--------------------+--------------------+-----------------+--------------------+-----------------+------------------+---------------+------------+--------------+----------+---------+----------+-------------+-------+-------------+-------------+---------+-----------+---------------+---------------+--------------------+---------------+-----------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+------------------+-----------------+------------------+-------------+------------------------+--------------+--------------------+------------+--------------------+
| id|                zpid|                city|       streetAddress|          zipcode|         description|         latitude|         longitude|propertyTaxRate|garageSpaces|h

In [555]:
df.count()

21534

In [556]:
df = df.dropna()

In [557]:
df.count()

11197

In [558]:
df.show(3)

+---+----------+------------+-------------------+-------+--------------------+------------------+------------------+---------------+------------+--------------+----------+---------+----------+------+-------+-------------+-------------+---------+-----------+---------------+---------------+----------------+---------------+-----------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+------------------+-------------+------------------------+--------------+-------------+------------+--------------------+
| id|      zpid|        city|      streetAddress|zipcode|         description|          latitude|         longitude|propertyTaxRate|garageSpaces|hasAssociation|hasCooling|hasGarage|hasHeating|hasSpa|hasView|     homeType|p

In [559]:
numerical_features_list = helper.numerical_features_list

In [560]:
len(numerical_features_list)

25

In [561]:
categorical_features_list = helper.categorical_features_list

In [562]:
#categorical_features_list = []

In [563]:
len(categorical_features_list)

6

In [564]:
df = df.select(numerical_features_list + categorical_features_list + ['latestPrice'])

In [565]:
categorical_1hotindexfeatures_list = [feat+'_index' for feat in categorical_features_list]

In [566]:
categorical_1hotendcodingfeatures_list = [feat+'_1hot' for feat in categorical_features_list]

In [567]:
df.columns

['propertyTaxRate',
 'garageSpaces',
 'parkingSpaces',
 'numOfPhotos',
 'numOfAccessibilityFeatures',
 'numOfAppliances',
 'numOfParkingFeatures',
 'numOfPatioAndPorchFeatures',
 'numOfSecurityFeatures',
 'numOfWaterfrontFeatures',
 'numOfWindowFeatures',
 'numOfCommunityFeatures',
 'lotSizeSqFt',
 'livingAreaSqFt',
 'numOfPrimarySchools',
 'numOfElementarySchools',
 'numOfMiddleSchools',
 'numOfHighSchools',
 'avgSchoolDistance',
 'avgSchoolRating',
 'avgSchoolSize',
 'MedianStudentsPerTeacher',
 'numOfBathrooms',
 'numOfBedrooms',
 'numOfStories',
 'hasAssociation',
 'hasCooling',
 'hasGarage',
 'hasHeating',
 'hasSpa',
 'hasView',
 'latestPrice']

##### Converting numerical columns to float type

In [568]:
for colname in numerical_features_list+['latestPrice']:
    df = df.withColumn(colname, col(colname).cast(FloatType()))

In [569]:
df = df.dropna()

##### Splitting data into train and test

In [570]:
train, test = df.randomSplit([0.7, 0.3])

In [571]:
print(train.count())
print(test.count())

7450
3218


##### Imputing the numerical data

In [572]:
imputer = Imputer(inputCols=numerical_features_list,
                  outputCols=numerical_features_list)
imputer = imputer.fit(train)

train = imputer.transform(train)
test = imputer.transform(test)

train.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeacher|numOfBathrooms|numOfBedrooms|numOfStories|hasAssociat

##### Creating the feature vector

In [573]:
numerical_vector_assembler = VectorAssembler(inputCols=numerical_features_list,
                                             outputCol='numerical_feature_vector')

train = numerical_vector_assembler.transform(train)
test = numerical_vector_assembler.transform(test)
train.show(2)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeacher|numOfBathrooms|numOfBedrooms

In [574]:
train.select('numerical_feature_vector').take(2)

[Row(numerical_feature_vector=SparseVector(25, {0: 1.98, 3: 1.0, 12: 3049.0, 13: 1300.0, 14: 1.0, 16: 1.0, 17: 1.0, 18: 1.4333, 19: 6.6667, 20: 1057.0, 21: 15.0, 22: 2.0, 23: 3.0, 24: 1.0})),
 Row(numerical_feature_vector=SparseVector(25, {0: 1.98, 3: 1.0, 12: 5140.0, 13: 1238.0, 14: 1.0, 16: 1.0, 17: 1.0, 18: 1.2333, 19: 4.0, 20: 413.0, 21: 11.0, 22: 1.0, 23: 2.0, 24: 1.0}))]

##### standardizing the numerical feature values

In [575]:
scaler = StandardScaler(inputCol='numerical_feature_vector',
                        outputCol='scaled_numerical_feature_vector',
                        withStd=True, withMean=True
                        )

scaler = scaler.fit(train)

train = scaler.transform(train)
test = scaler.transform(test)

train.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+-------------------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeac

In [576]:
train.select('scaled_numerical_feature_vector').take(2)

[Row(scaled_numerical_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0122, -0.5545, 0.2502, -0.2066, -0.1352, 0.0692, -0.368, 0.4922, -0.5521, 0.0878, -0.6544, -0.5194, -0.8777])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0121, -0.5927, 0.2502, -0.2066, -0.1352, 0.0692, -0.5549, -0.9321, -2.52, -2.1944, -1.6253, -1.7024, -0.8777]))]

##### Converting string values to unique indices

In [577]:
indexer = StringIndexer(inputCols=categorical_features_list,
                        outputCols=categorical_1hotindexfeatures_list)

indexer = indexer.fit(train)

train = indexer.transform(train)
test = indexer.transform(test)

In [578]:
train.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+-------------------------------+--------------------+----------------+---------------+----------------+------------+-------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMi

In [579]:
train.select('scaled_numerical_feature_vector').take(3)

[Row(scaled_numerical_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0122, -0.5545, 0.2502, -0.2066, -0.1352, 0.0692, -0.368, 0.4922, -0.5521, 0.0878, -0.6544, -0.5194, -0.8777])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0121, -0.5927, 0.2502, -0.2066, -0.1352, 0.0692, -0.5549, -0.9321, -2.52, -2.1944, -1.6253, -1.7024, -0.8777])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0121, -0.0638, 0.2502, -0.2066, -0.1352, 0.0692, -0.8975, -1.2882, -0.9524, -1.0533, 0.3166, 0.6636, -0.8777]))]

##### Converting categorical values to one hot encoding format

In [580]:
one_hot_encoder = OneHotEncoder(inputCols=categorical_1hotindexfeatures_list,
                                outputCols=categorical_1hotendcodingfeatures_list)

one_hot_encoder = one_hot_encoder.fit(train)

train = one_hot_encoder.transform(train)
test = one_hot_encoder.transform(test)
train.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+-------------------------------+--------------------+----------------+---------------+----------------+------------+-------------+-------------------+---------------+--------------+---------------+-------------+-------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOf

In [581]:
train.select(categorical_1hotendcodingfeatures_list).take(3)

[Row(hasAssociation_1hot=SparseVector(1, {}), hasCooling_1hot=SparseVector(1, {0: 1.0}), hasGarage_1hot=SparseVector(1, {}), hasHeating_1hot=SparseVector(1, {0: 1.0}), hasSpa_1hot=SparseVector(1, {0: 1.0}), hasView_1hot=SparseVector(1, {0: 1.0})),
 Row(hasAssociation_1hot=SparseVector(1, {}), hasCooling_1hot=SparseVector(1, {}), hasGarage_1hot=SparseVector(1, {}), hasHeating_1hot=SparseVector(1, {}), hasSpa_1hot=SparseVector(1, {0: 1.0}), hasView_1hot=SparseVector(1, {0: 1.0})),
 Row(hasAssociation_1hot=SparseVector(1, {}), hasCooling_1hot=SparseVector(1, {}), hasGarage_1hot=SparseVector(1, {}), hasHeating_1hot=SparseVector(1, {}), hasSpa_1hot=SparseVector(1, {0: 1.0}), hasView_1hot=SparseVector(1, {0: 1.0}))]

##### Creating the final feature vector

In [582]:
['scaled_numerical_feature_vector']+[x for x in categorical_1hotendcodingfeatures_list]

['scaled_numerical_feature_vector',
 'hasAssociation_1hot',
 'hasCooling_1hot',
 'hasGarage_1hot',
 'hasHeating_1hot',
 'hasSpa_1hot',
 'hasView_1hot']

In [583]:
assembler = VectorAssembler(inputCols=['scaled_numerical_feature_vector']+
                                       [x for x in categorical_1hotendcodingfeatures_list],
                            outputCol='final_feature_vector')

train = assembler.transform(train)
test = assembler.transform(test)

In [584]:
train.show(2)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+-------------------------------+--------------------+----------------+---------------+----------------+------------+-------------+-------------------+---------------+--------------+---------------+-------------+-------------+--------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numO

In [585]:
train.select('final_feature_vector').take(3)

[Row(final_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0122, -0.5545, 0.2502, -0.2066, -0.1352, 0.0692, -0.368, 0.4922, -0.5521, 0.0878, -0.6544, -0.5194, -0.8777, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0])),
 Row(final_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0121, -0.5927, 0.2502, -0.2066, -0.1352, 0.0692, -0.5549, -0.9321, -2.52, -2.1944, -1.6253, -1.7024, -0.8777, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 Row(final_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0121, -0.0638, 0.2502, -0.2066, -0.1352, 0.0692, -0.8975, -1.2882, -0.9524, -1.0533, 0.3166, 0.6636, -0.8777, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]))]

In [586]:
train.select('final_feature_vector').take(1)

[Row(final_feature_vector=DenseVector([-0.2592, -0.9422, -0.9368, -1.6534, -0.0696, -1.8757, -2.2352, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.0122, -0.5545, 0.2502, -0.2066, -0.1352, 0.0692, -0.368, 0.4922, -0.5521, 0.0878, -0.6544, -0.5194, -0.8777, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]))]

In [587]:
lr = LinearRegression(featuresCol='final_feature_vector',
                      labelCol='latestPrice',
                      regParam=0.01)

lr

LinearRegression_a3923b5baf93

In [588]:
linearModel = lr.fit(train)

linearModel

LinearRegressionModel: uid=LinearRegression_a3923b5baf93, numFeatures=31

In [589]:
linearModel.coefficients

DenseVector([-51606.0546, 112556.4379, -110466.5976, 18765.2947, 4594.7902, -1872.7643, 33056.3579, 9428.0167, 1460.216, 44053.2231, -338.537, -10956.2114, 8670.2485, 85467.3761, 48911.6209, 53746.6737, 2756.1814, -28997.336, -2294.6339, 108062.3781, -38433.8679, -21829.2417, 231753.2216, -40607.6898, -48280.1734, -165097.1307, -183589.9816, -54155.0252, -105970.8515, -84552.0183, -26500.5804])

##### Generating predictions

In [590]:
len(test.select(numerical_features_list + categorical_features_list).columns)

31

In [591]:
linearModel.coefficients

DenseVector([-51606.0546, 112556.4379, -110466.5976, 18765.2947, 4594.7902, -1872.7643, 33056.3579, 9428.0167, 1460.216, 44053.2231, -338.537, -10956.2114, 8670.2485, 85467.3761, 48911.6209, 53746.6737, 2756.1814, -28997.336, -2294.6339, 108062.3781, -38433.8679, -21829.2417, 231753.2216, -40607.6898, -48280.1734, -165097.1307, -183589.9816, -54155.0252, -105970.8515, -84552.0183, -26500.5804])

In [592]:
len(linearModel.coefficients)

31

In [593]:
predictions = linearModel.transform(test)

In [594]:
predictions.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+-----------+------------------------+-------------------------------+--------------------+----------------+---------------+----------------+------------+-------------+-------------------+---------------+--------------+---------------+-------------+-------------+--------------------+-----------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWater

##### Evaluating the predictions

In [595]:
predictions_and_actuals = predictions.select('prediction',
                                        'latestPrice')



In [596]:
predictions_and_actuals_rdd = predictions_and_actuals.rdd

In [597]:
predictions_and_actuals_rdd.take(2)

[Row(prediction=597217.2436677001, latestPrice=674900.0),
 Row(prediction=710677.7033209288, latestPrice=154900.0)]

In [598]:
predictions_and_actuals_rdd = predictions_and_actuals_rdd.map(tuple)

predictions_and_actuals_rdd.take(2)

[(597217.2436677001, 674900.0), (710677.7033209288, 154900.0)]

In [599]:
metrics = RegressionMetrics(predictions_and_actuals_rdd)

s = '''
Mean Squared Error:      {0}
Root Mean Squared Error: {1}
Mean Absolute Error:     {2}
R**2:                    {3}
'''.format(metrics.meanSquaredError,
           metrics.rootMeanSquaredError,
           metrics.meanAbsoluteError,
           metrics.r2
           )

print(s)




Mean Squared Error:      91464855001.31107
Root Mean Squared Error: 302431.57077479706
Mean Absolute Error:     174045.99378944412
R**2:                    0.4493320622259688



##### Inspect the model coefficients

In [600]:
len(linearModel.coefficients)

31

In [601]:
linearModel.intercept

1008022.8505894007

In [602]:
featureCols = numerical_features_list + categorical_features_list

In [603]:
len(featureCols)

31

In [604]:
coeff_df = pd.DataFrame({"Feature": ["Intercept"] + featureCols, "Co-efficients": np.insert(linearModel.coefficients.toArray(), 0, linearModel.intercept)})
coeff_df = coeff_df[["Feature", "Co-efficients"]]

In [605]:
coeff_df

Unnamed: 0,Feature,Co-efficients
0,Intercept,1008023.0
1,propertyTaxRate,-51606.05
2,garageSpaces,112556.4
3,parkingSpaces,-110466.6
4,numOfPhotos,18765.29
5,numOfAccessibilityFeatures,4594.79
6,numOfAppliances,-1872.764
7,numOfParkingFeatures,33056.36
8,numOfPatioAndPorchFeatures,9428.017
9,numOfSecurityFeatures,1460.216


In [606]:
coeff_df.to_excel('features-coefficients.xlsx', index=False)

In [607]:
#sample_data_df = df.toPandas()
#sample_data_df = sample_data_df[featureCols]

##### Saving the models for future use

In [608]:
os.system('rm -rf ./lrmodel/')

0

In [609]:
linearModel.save('./lrmodel/lrm_model.model')

In [610]:
numerical_vector_assembler.save('./lrmodel/numerical_vector_assembler')

In [611]:
imputer.save('./lrmodel/imputer')

In [612]:
scaler.save('./lrmodel/scaler')

In [613]:
indexer.save('./lrmodel/indexer')

In [614]:
one_hot_encoder.save('./lrmodel/one_hot_encoder')

In [615]:
assembler.save('./lrmodel/assembler')