In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col
from pyspark.ml.feature import Imputer
from pyspark.sql.types import FloatType, StringType
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler, StandardScalerModel
from pyspark.ml.feature import Normalizer
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.sql.functions import mean
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.util import Saveable
import os
import pandas as pd
import numpy as np
from models.helper import Helper

In [2]:
helper = Helper()

In [3]:
spark = SparkSession.builder\
        .master("local")\
        .appName("HousePricePredict")\
        .config('spark.ui.port', '4051')\
        .getOrCreate()

spark

23/12/10 02:21:08 WARN Utils: Your hostname, shivalienware resolves to a loopback address: 127.0.1.1; using 172.28.191.26 instead (on interface eth0)
23/12/10 02:21:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/10 02:21:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


##### Reading user input data

In [4]:
df = spark.read.format("csv").load("./data/data-to-predict-house-prices.csv", header=True, inferSchema=True)

In [5]:
df.show()

23/12/10 02:21:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeacher|numOfBathrooms|numOfBedrooms|numOfStories|hasAssociation|hasCooli

In [6]:
len(df.columns)

31

In [7]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[propertyTaxRate: double, garageSpaces: int, parkingSpaces: int, numOfPhotos: int, numOfAccessibilityFeatures: int, numOfAppliances: int, numOfParkingFeatures: int, numOfPatioAndPorchFeatures: int, numOfSecurityFeatures: int, numOfWaterfrontFeatures: int, numOfWindowFeatures: int, numOfCommunityFeatures: int, lotSizeSqFt: int, livingAreaSqFt: int, numOfPrimarySchools: int, numOfElementarySchools: int, numOfMiddleSchools: int, numOfHighSchools: int, avgSchoolDistance: double, avgSchoolRating: double, avgSchoolSize: int, MedianStudentsPerTeacher: int, numOfBathrooms: int, numOfBedrooms: int, numOfStories: int, hasAssociation: string, hasCooling: string, hasGarage: string, hasHeating: string, hasSpa: string, hasView: string]>

#### Perform data transformations

##### Converting numerical columns to float type

In [8]:
for colname in helper.numerical_features_list:
    df = df.withColumn(colname, col(colname).cast(FloatType()))

In [9]:
for colname in helper.categorical_features_list:
    df = df.withColumn(colname, col(colname).cast(StringType()))

In [10]:
categorical_1hotindexfeatures_list = [feat+'_index' for feat in helper.categorical_features_list]

In [11]:
categorical_1hotendcodingfeatures_list = [feat+'_1hot' for feat in helper.categorical_features_list]

##### Creating the feature vector

In [12]:
numerical_vector_assembler = VectorAssembler.load('./lrmodel/numerical_vector_assembler/')

df = numerical_vector_assembler.transform(df)
df.show()

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+------------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeacher|numOfBathrooms|numOfBedrooms|numOfStorie

In [13]:
df.select('numerical_feature_vector').take(1)

[Row(numerical_feature_vector=DenseVector([1.98, 2.0, 2.0, 29.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6185.0, 1200.0, 1.0, 0.0, 1.0, 1.0, 1.4, 2.6667, 1063.0, 14.0, 2.0, 4.0, 1.0]))]

##### Normalizing the numerical feature values

In [14]:
scaler = StandardScalerModel.load('./lrmodel/scaler/')

df = scaler.transform(df)
df.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+------------------------+-------------------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfHighSchools|avgSchoolDistance|avgSchoolRating|avgSchoolSize|MedianStudentsPerTeacher|numOfBat

In [15]:
df.select('scaled_numerical_feature_vector').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled_numerical_feature_vector                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

##### Converting string values to unique indices

In [16]:
indexer = StringIndexer(inputCols=helper.categorical_features_list,
                        outputCols=categorical_1hotendcodingfeatures_list)

indexer = indexer.fit(df)
df = indexer.transform(df)

In [17]:
df.show(3)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+------------------------+-------------------------------+-------------------+---------------+--------------+---------------+-----------+------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numOfMiddleSchools|numOfH

##### Creating the final feature vector

In [18]:
assembler = VectorAssembler.load('./lrmodel/assembler/')

df = assembler.transform(df)

In [19]:
df.show(2)

+---------------+------------+-------------+-----------+--------------------------+---------------+--------------------+--------------------------+---------------------+-----------------------+-------------------+----------------------+-----------+--------------+-------------------+----------------------+------------------+----------------+-----------------+---------------+-------------+------------------------+--------------+-------------+------------+--------------+----------+---------+----------+------+-------+------------------------+-------------------------------+-------------------+---------------+--------------+---------------+-----------+------------+--------------------+
|propertyTaxRate|garageSpaces|parkingSpaces|numOfPhotos|numOfAccessibilityFeatures|numOfAppliances|numOfParkingFeatures|numOfPatioAndPorchFeatures|numOfSecurityFeatures|numOfWaterfrontFeatures|numOfWindowFeatures|numOfCommunityFeatures|lotSizeSqFt|livingAreaSqFt|numOfPrimarySchools|numOfElementarySchools|numO

In [20]:
df.select('final_feature_vector').take(1)

[Row(final_feature_vector=DenseVector([-0.2592, 0.5295, 0.5337, -0.2152, -0.0696, -1.3365, 0.3702, -0.6819, -0.5658, -0.0492, -0.4184, -0.0968, -0.012, -0.6161, 0.2502, -0.2066, -0.1352, 0.0692, -0.3992, -1.6443, -0.5337, -0.4828, -0.6544, 0.6636, -0.8777, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))]

##### Loading the saved LR model

In [21]:
linearModel = LinearRegressionModel.load('./lrmodel/lrm_model.model')

In [22]:
linearModel.coefficients

DenseVector([-51606.0546, 112556.4379, -110466.5976, 18765.2947, 4594.7902, -1872.7643, 33056.3579, 9428.0167, 1460.216, 44053.2231, -338.537, -10956.2114, 8670.2485, 85467.3761, 48911.6209, 53746.6737, 2756.1814, -28997.336, -2294.6339, 108062.3781, -38433.8679, -21829.2417, 231753.2216, -40607.6898, -48280.1734, -165097.1307, -183589.9816, -54155.0252, -105970.8515, -84552.0183, -26500.5804])

In [23]:
prediction = linearModel.transform(df)

In [24]:
prediction.select('prediction').show()

+-----------------+
|       prediction|
+-----------------+
|688239.2758163748|
+-----------------+

