## Initializing Spark Session

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() returns the session that
# is already active, or starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("PySpark pipeline")
    .getOrCreate()
)

In [3]:
# Read the housing CSV: the first row supplies the column names and Spark
# infers each column's type from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/housing.csv")
)

# Show the inferred column names and types.
data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [4]:
import pyspark.sql.functions as F

# One aggregate per column: `when` yields 1 only for null cells (and null
# otherwise), so `count` tallies exactly the null cells of that column.
null_count_exprs = [
    F.count(F.when(F.col(c).isNull(), 1)).alias(c)
    for c in data.columns
]

# Single-row summary of missing values per column.
data.select(null_count_exprs).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+



In [5]:
# Discard rows where total_bedrooms is null (the only column with missing
# values in the null-count summary), then report how many rows remain.
filtered_data = data.dropna(subset=['total_bedrooms'])
filtered_data.count()

20433

#### Splitting the data into train and test datasets

In [6]:
# 80/20 random split; the fixed seed keeps the split repeatable across runs.
train_data, test_data = filtered_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Each count() triggers a Spark job, so compute them once and then report.
train_count = train_data.count()
test_count = test_data.count()
print("Train size: ", train_count)
print("Test size: ", test_count)

Train size:  16395
Test size:  4038


#### Import the required modules

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

#### Categorical indexer and one-hot encoder

In [8]:
# Map each ocean_proximity string to a numeric category index.
# handleInvalid='keep' sends any label that was not seen while fitting
# (e.g. a rare category that landed only in the test split, or a new value in
# future data) to one extra "unknown" index. The default ('error') would make
# the fitted pipeline raise as soon as transform() meets such a row.
indexer = StringIndexer(
    inputCol='ocean_proximity',
    outputCol='ocean_proximity_index',
    handleInvalid='keep',
)

# One-hot encode the category index. dropLast=False keeps one vector slot per
# index, including the extra "unknown" slot introduced by the indexer above.
encoder = OneHotEncoder(
    inputCol='ocean_proximity_index',
    outputCol='ocean_proximity_vec',
    dropLast=False,
)

#### Assembling the features and applying standard scaling

In [9]:
# Model inputs: six numeric columns plus the one-hot encoded ocean_proximity
# vector. longitude and latitude are not part of the feature set.
feature_cols = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity_vec',
]

# Concatenate the input columns into a single vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='unscaled_features',
)

# Standardize every component of the assembled vector: subtract the mean
# (withMean) and divide by the standard deviation (withStd).
scaler = StandardScaler(
    inputCol='unscaled_features',
    outputCol='features',
    withMean=True,
    withStd=True,
)


#### Initialize the linear regression model

In [10]:
# Linear regression on the scaled feature vector, predicting
# median_house_value, with a regularization strength of 0.001.
lr = LinearRegression(
    featuresCol='features',
    labelCol='median_house_value',
    regParam=0.001,
)

#### Build and fit the pipeline

In [None]:
# Stages run in order: index the category, one-hot encode it, assemble the
# feature vector, standardize it, then fit the regression.
pipeline = Pipeline(
    stages=[
        indexer,
        encoder,
        assembler,
        scaler,
        lr,
    ]
)

# Fit every stage on the training split only.
pipeline_model = pipeline.fit(train_data)

#### Predictions

In [12]:
# Run the held-out test split through the fitted pipeline; the result carries
# the columns added by each stage, including the model's predictions.
test_predictions = pipeline_model.transform(dataset=test_data)