In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pyspark

In [None]:
#Load Module
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [None]:
# Create the Spark session: local master, application name
# 'Regression House Price'. getOrCreate() is the final builder call.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Regression House Price')
    .getOrCreate()
)

In [None]:
# Load dataset
# Read the CSV with its first row as the header and let Spark infer column types.
csv_path = '../input/housesalesprediction/kc_house_data.csv'
data = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)
# Preview the first five rows of the raw table.
data.show(5)

In [None]:
# Print the schema of the raw table (the CSV was read with inferSchema=True).
data.printSchema()

In [None]:
# Keep only the target column (price) and the numeric columns used for modelling.
selected_columns = [
    "price", 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15',
]
df = data.select(*selected_columns)
# Preview the narrowed frame `df` (not the full `data` table) so the result of
# the column selection is what gets displayed.
df.show(5)

In [None]:
# Cast price, bathrooms and floors to integer columns, one after another.
# NOTE(review): if bathrooms/floors hold fractional values, the integer cast
# drops the fractional part - confirm this is intended.
for column_name in ("price", "bathrooms", "floors"):
    df = df.withColumn(column_name, df[column_name].cast(IntegerType()))
# Preview the frame after the casts.
df.show(5)

In [None]:
# Summary statistics of `df`, converted to pandas and transposed for display.
summary_stats = df.describe().toPandas()
summary_stats.T

In [None]:
#Split dataset
# 80/20 train/test split. The fixed seed makes the partition (and therefore
# the metrics computed later) reproducible across fresh kernel runs.
splitdata = df.randomSplit([0.8, 0.2], seed=42)
train, test = splitdata

# Report how many rows landed in each partition.
print('train:\n', train.count(), '\ntest:\n',test.count())

In [None]:
# Print the schema of `df` to check the column types after the integer casts.
df.printSchema()

In [None]:
#Assembler
assembler=VectorAssembler(inputCols=("price",'bedrooms','bathrooms','sqft_living','sqft_lot','floors','sqft_above','sqft_basement','sqft_living15','sqft_lot15'),
                         outputCol=('features'))
trainingdatasetfinal=assembler.transform(train).select(col('features'),(col('price').cast('Int').alias('Price')))
trainingdatasetfinal.show(5)


In [None]:
#Modelling
algoritma = LinearRegression(
    labelCol="Price",featuresCol="features", maxIter=10, regParam=0.3)
#mentraining model kita dengan training data final
model = algoritma.fit(trainingdatasetfinal)
print ("Model Trained")

In [None]:
#Preparing Data Testing
assembler=VectorAssembler(inputCols=["price",'bedrooms','bathrooms','sqft_living','sqft_lot','floors','sqft_above','sqft_basement','sqft_living15','sqft_lot15'],outputCol='features')
testingdatasetfinal=assembler.transform(test).select(col('features'),(col('price').cast('Int').alias("Price")))
testingdatasetfinal.show(5)

In [None]:
# Testing Data
# Run the fitted model over the assembled test set and preview five rows of
# the result; the evaluation cell reads its "prediction" column.
data_pred=model.transform(testingdatasetfinal)
data_pred.show(5)

In [None]:
# Import the module used to evaluate the accuracy of our regression
from pyspark.ml.evaluation import RegressionEvaluator
# Define the evaluator by giving it the "label" and "prediction" columns of
# our data, together with the evaluation metric we want (rmse)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Price",
    predictionCol="prediction",
)
# Compute the RMSE with the evaluator defined above
rmse = evaluator.evaluate(data_pred)
print ("Root Mean Square Error (RMSE):", rmse)

 If you found this notebook helpful, please upvote. Thanks!