In [None]:
#installing pyspark
# !pip install pyspark

In [None]:
#Importing required libraries
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression


In [None]:
# Create (or reuse) the Spark session for this notebook.
# "HouseRegression" is the application's display name, so it is passed to
# appName(). master() expects a cluster URL (e.g. "local[*]", "spark://host:port",
# "yarn") and fails to parse an arbitrary name. The master is deliberately left
# unset so the environment's default / spark-submit configuration is used.
spark = (
    SparkSession.builder
    .appName("HouseRegression")
    .getOrCreate()
)

In [None]:
# Load the training and test CSVs. The first row is treated as the header and
# column types are inferred from the data.
train = spark.read.csv(
    "./dataset/train.csv",
    header=True,
    inferSchema=True,
)
test = spark.read.csv(
    "./dataset/test.csv",
    header=True,
    inferSchema=True,
)

# Show the inferred schema of the training data.
train.printSchema()

In [None]:
# StringIndexer encodes a string column as numeric category indices.
# Only "LotShape" is indexed here (written to "LotShape2"); the same step
# could be repeated for the other string columns.
indexer = StringIndexer(inputCol="LotShape", outputCol="LotShape2")
lot_shape_model = indexer.fit(train)
indexed = lot_shape_model.transform(train)

# Peek at the first row, which now carries the extra "LotShape2" column.
indexed.head(1)


In [None]:
# VectorAssembler packs the listed input columns into one vector column
# ("features") that is used as the model input. The selection below is the
# set of integer-typed columns picked for this model, plus the indexed
# "LotShape2" column.
feature_columns = [
    "MSSubClass", "LotArea", "OverallQual", "OverallCond",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "YearBuilt", "YearRemodAdd", "GarageCars", "GarageArea",
    "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold",
    "LotShape2",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Apply the assembler to the indexed data; this appends the "features" column.
output = assembler.transform(indexed)

# Display the two columns the model will use.
output.select("features", "SalePrice")

In [None]:
# Keep only the assembled feature vector and the label, then inspect the
# first three rows to see what the "features" vectors look like.
final = output.select("features", "SalePrice")

final.head(3)

In [None]:
# Split the prepared data into training (70%) and validation (30%) sets.
# A fixed seed makes the split, and therefore the reported metrics,
# reproducible across Restart & Run All.
train_df, valid_df = final.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of the training portion.
train_df.describe().show()

In [None]:
# Configure a linear regression that predicts "SalePrice" from the assembled
# "features" column, then fit it on the training split.
lr = LinearRegression(featuresCol="features", labelCol="SalePrice")
model = lr.fit(train_df)

In [None]:
# Evaluate the fitted model on the held-out validation split.
validate = model.evaluate(valid_df)

In [None]:
# Report validation performance: RMSE on the first line, R^2 on the second.
print(validate.rootMeanSquaredError, validate.r2, sep="\n")