In [1]:
from pyspark.sql import SparkSession

In [3]:
# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [17]:
# Load the house-prices CSV: header=True takes the column names from the first
# row, inferSchema=True lets Spark infer each column's type from the data.
df = spark.read.csv(
    "../lecture-2/house_prices.csv",
    header=True,
    inferSchema=True,
)

In [18]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: double (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [19]:
# Collect the names of the string-typed columns; df.dtypes yields
# (column name, type name) pairs.
columnList = [
    col_name
    for col_name, type_name in df.dtypes
    if type_name.startswith('string')
]

In [20]:
# Show the string-typed column names collected above.
columnList

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [21]:
# Drop every string-typed column listed in columnList.
df = df.drop(*columnList)

In [28]:
# Drop the Id column so it is not picked up as a model feature later on.
df = df.drop('Id')

In [29]:
# Re-check the schema after the column drops.
df.printSchema()

root
 |-- MSSubClass: integer (nullable = true)
 |-- LotFrontage: double (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- MasVnrArea: double (nullable = true)
 |-- BsmtFinSF1: integer (nullable = true)
 |-- BsmtFinSF2: integer (nullable = true)
 |-- BsmtUnfSF: integer (nullable = true)
 |-- TotalBsmtSF: integer (nullable = true)
 |-- 1stFlrSF: integer (nullable = true)
 |-- 2ndFlrSF: integer (nullable = true)
 |-- LowQualFinSF: integer (nullable = true)
 |-- GrLivArea: integer (nullable = true)
 |-- BsmtFullBath: integer (nullable = true)
 |-- BsmtHalfBath: integer (nullable = true)
 |-- FullBath: integer (nullable = true)
 |-- HalfBath: integer (nullable = true)
 |-- BedroomAbvGr: integer (nullable = true)
 |-- KitchenAbvGr: integer (nullable = true)
 |-- TotRmsAbvGrd: integer (nullable = true)
 |--

In [30]:
# Peek at the first two rows of the remaining columns.
df.head(2)

[Row(MSSubClass=60, LotFrontage=65.0, LotArea=8450, OverallQual=7, OverallCond=5, YearBuilt=2003, YearRemodAdd=2003, MasVnrArea=196.0, BsmtFinSF1=706, BsmtFinSF2=0, BsmtUnfSF=150, TotalBsmtSF=856, 1stFlrSF=856, 2ndFlrSF=854, LowQualFinSF=0, GrLivArea=1710, BsmtFullBath=1, BsmtHalfBath=0, FullBath=2, HalfBath=1, BedroomAbvGr=3, KitchenAbvGr=1, TotRmsAbvGrd=8, Fireplaces=0, GarageYrBlt=2003.0, GarageCars=2, GarageArea=548, WoodDeckSF=0, OpenPorchSF=61, EnclosedPorch=0, 3SsnPorch=0, ScreenPorch=0, PoolArea=0, MiscVal=0, MoSold=2, YrSold=2008, SalePrice=208500),
 Row(MSSubClass=20, LotFrontage=80.0, LotArea=9600, OverallQual=6, OverallCond=8, YearBuilt=1976, YearRemodAdd=1976, MasVnrArea=0.0, BsmtFinSF1=978, BsmtFinSF2=0, BsmtUnfSF=284, TotalBsmtSF=1262, 1stFlrSF=1262, 2ndFlrSF=0, LowQualFinSF=0, GrLivArea=1262, BsmtFullBath=0, BsmtHalfBath=1, FullBath=2, HalfBath=0, BedroomAbvGr=3, KitchenAbvGr=1, TotRmsAbvGrd=6, Fireplaces=1, GarageYrBlt=1976.0, GarageCars=2, GarageArea=460, WoodDeckSF=2

In [25]:
# Remove rows that contain a null in any column (dropna() default behaviour).
df = df.dropna()

In [42]:
from pyspark.ml.feature import VectorAssembler

In [43]:
# Use every column except the target as a model input. Excluding the label by
# name rather than by position keeps SalePrice out of the feature vector even
# if the column order of the DataFrame changes.
train_cols = [col_name for col_name in df.columns if col_name != 'SalePrice']

In [44]:
# Assembler that packs the input columns into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=train_cols,
    outputCol='features',
)

In [45]:
# Apply the assembler: the result is df plus the new 'features' vector column.
v_df = vectorAssembler.transform(df)

In [47]:
# Keep only the feature vector and the SalePrice label for modelling.
v_df = v_df.select(['features', 'SalePrice'])

In [49]:
# Preview the first three feature/label rows.
v_df.show(3)

+--------------------+---------+
|            features|SalePrice|
+--------------------+---------+
|[60.0,65.0,8450.0...|   208500|
|[20.0,80.0,9600.0...|   181500|
|[60.0,68.0,11250....|   223500|
+--------------------+---------+
only showing top 3 rows



In [51]:
# 80/20 train/test split. The seed is fixed so that the split, and therefore
# every metric reported below, is reproducible across kernel restarts.
(train_df, test_df) = v_df.randomSplit([0.8, 0.2], seed=42)

In [35]:
from pyspark.ml.regression import LinearRegression

In [52]:
# Linear regression estimator reading the 'features' vector and predicting SalePrice.
lr = LinearRegression(
    featuresCol='features',
    labelCol='SalePrice',
)

In [53]:
# Fit the estimator on the training split only.
lr_model = lr.fit(train_df)

In [56]:
# Error metrics stored on the fitted model's summary. These are computed on
# the data the model was fitted on (train_df), not on the held-out split.
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("MSE: %f" % trainingSummary.meanSquaredError)
print("MAE: %f" % trainingSummary.meanAbsoluteError)

RMSE: 30686.356866
MSE: 941652497.706340
MAE: 19785.592053


In [57]:
# Score the held-out split; transform() adds a 'prediction' column.
lr_predictions = lr_model.transform(test_df)
# Compare the first five predictions with the actual SalePrice values.
lr_predictions.select("prediction","SalePrice","features").show(5)

+------------------+---------+--------------------+
|        prediction|SalePrice|            features|
+------------------+---------+--------------------+
|211581.38057713123|   214000|(36,[0,1,2,3,4,5,...|
|146763.15091166965|   131500|(36,[0,1,2,3,4,5,...|
|157105.80743606825|   124500|(36,[0,1,2,3,4,5,...|
|216287.06258818269|   233170|(36,[0,1,2,3,4,5,...|
|209218.00053931278|   227680|(36,[0,1,2,3,4,5,...|
+------------------+---------+--------------------+
only showing top 5 rows



In [61]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the 'prediction' column with the SalePrice label
# using root-mean-squared error.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="SalePrice",
    metricName="rmse",
)

# The evaluator is configured with metricName="rmse", so the value printed here
# is the RMSE (same units as SalePrice), not the MSE.
print("RMSE on test data = %g" % lr_evaluator.evaluate(lr_predictions))

MSE on test data = 51465.2
