### Initializing Spark Session

In [None]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session (or reuse one that is already running),
# registered under the application name "Feature engineering".
spark = (
    SparkSession.builder
    .appName("Feature engineering")
    .getOrCreate()
)

In [3]:
# Read the housing CSV: the first row is treated as the header and column
# types are inferred from the file contents.
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
data = csv_reader.csv("data/housing.csv")

# Show the resulting column names and types.
data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [4]:
import pyspark.sql.functions as F

# Build one aggregate per column that counts its null cells: F.when emits 1
# only where the value is null, and F.count tallies those non-null markers.
null_counts = [
    F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
    for column_name in data.columns
]

# A single-row result: number of missing values for every column.
data.select(null_counts).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+



In [5]:
# Discard the rows where total_bedrooms is missing, then report how many
# rows are left (the bare expression is the cell's displayed output).
filtered_data = data.dropna(subset=['total_bedrooms'])
filtered_data.count()

20433

#### Splitting the filtered dataset into training and test datasets

In [6]:
# 80/20 random split; the fixed seed is passed so the split can be repeated.
train_data, test_data = filtered_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the row count of each split.
for label, split in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(label, split.count())

Train size:  16395
Test size:  4038


#### StringIndexer imported from the feature module

In [7]:
from pyspark.ml.feature import StringIndexer

# Encode the categorical ocean_proximity column as a numeric index column.
# The indexer is fitted on the training split only.
indexer = StringIndexer(inputCol='ocean_proximity', outputCol='ocean_proximity_index')

# Make the cell safe to re-run: train_data is reassigned below, so on a second
# execution it already carries the output column and StringIndexer would reject
# it. DataFrame.drop is a no-op when the column is absent (i.e. on the first run).
train_data = train_data.drop(indexer.getOutputCol())

indexer_model = indexer.fit(train_data)
train_data = indexer_model.transform(train_data)
train_data.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|           85800.0|     NEAR OCEAN|                  2.0|
|  -124.27|   40.69|              36.0|     2349.0|         528.0|    1194.0|     465.0|       2.5179|           79000.0|     NEAR OCEAN|                  2.0|
|  -124.26|   40.58|              52.0| 

In [9]:
# Show which numeric index was assigned to each ocean_proximity category.
index_mapping = train_data.select('ocean_proximity', 'ocean_proximity_index').distinct()
index_mapping.show(5)

+---------------+---------------------+
|ocean_proximity|ocean_proximity_index|
+---------------+---------------------+
|     NEAR OCEAN|                  2.0|
|         INLAND|                  1.0|
|       NEAR BAY|                  3.0|
|         ISLAND|                  4.0|
|      <1H OCEAN|                  0.0|
+---------------+---------------------+



#### Using OneHotEncoder

In [10]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the category index into a sparse vector column.
# dropLast=False keeps a slot for every category instead of omitting the last one.
encoder = OneHotEncoder(inputCol='ocean_proximity_index', outputCol='ocean_proximity_vec', dropLast=False)

# Make the cell safe to re-run: train_data is reassigned below, so on a second
# execution it already carries the output column and the encoder would reject
# it. DataFrame.drop is a no-op when the column is absent (i.e. on the first run).
train_data = train_data.drop(encoder.getOutputCol())

encoder_model = encoder.fit(train_data)
train_data = encoder_model.transform(train_data)
train_data.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------------+
|  -124.35|   40.54|              52.0|     1820.0|         300.0|     806.0|     270.0|       3.0147|           94600.0|     NEAR OCEAN|                  2.0|      (5,[2],[1.0])|
|   -124.3|    41.8|              19.0|     2672.0|         552.0|    1298.0|     478.0|       1.9797|           85800.0|     NEAR OCEAN|                  2.0|      (5,[2],[1.0])|
|  -124.27|   40.69|              36.0|     2349.0|         528.0|    1194.0|     465.0|       2.517

In [12]:
# Show each category alongside its assigned index and one-hot vector.
encoding_columns = ['ocean_proximity', 'ocean_proximity_index', 'ocean_proximity_vec']
encoding_mapping = train_data.select(*encoding_columns).distinct()
encoding_mapping.show(5)

+---------------+---------------------+-------------------+
|ocean_proximity|ocean_proximity_index|ocean_proximity_vec|
+---------------+---------------------+-------------------+
|      <1H OCEAN|                  0.0|      (5,[0],[1.0])|
|         ISLAND|                  4.0|      (5,[4],[1.0])|
|         INLAND|                  1.0|      (5,[1],[1.0])|
|       NEAR BAY|                  3.0|      (5,[3],[1.0])|
|     NEAR OCEAN|                  2.0|      (5,[2],[1.0])|
+---------------+---------------------+-------------------+

