# California housing cost prediction
# Data (http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html)

In [65]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,IndexToString,VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import DenseVector
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
import pandas as pd
import numpy as np

In [36]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Cal_Housing_Spark_Classif")
    .config('spark.some.config.option', 'some-value')
    .getOrCreate()
)

# The session exposes its SparkContext directly; keep a short handle for the RDD API.
sc = spark.sparkContext
print(sc.version)

2.0.2


In [37]:
# Directory that holds the California-housing files.
# Kept in one place so the data can be relocated by editing a single value.
DATA_DIR = '/home/ramscrux7757/SPARK/CaliforniaHousing'

# Loading the data: one comma-separated record per text line
rdd = sc.textFile(DATA_DIR + '/cal_housing.data')

# Load the header: the .domain file lists one column name (and its kind) per line
header = sc.textFile(DATA_DIR + '/cal_housing.domain')

In [38]:
# Data Exploration
# Show the column descriptions read from the .domain file.
header.collect() # instead try: header.take(2)
# collect() brings every element of the RDD into the driver; take(n) fetches only the first n.

[u'longitude: continuous.',
 u'latitude: continuous.',
 u'housingMedianAge: continuous. ',
 u'totalRooms: continuous. ',
 u'totalBedrooms: continuous. ',
 u'population: continuous. ',
 u'households: continuous. ',
 u'medianIncome: continuous. ',
 u'medianHouseValue: continuous. ']

In [39]:
# Peek at the first two raw records (each is still one comma-separated string).
rdd.take(2)

[u'-122.230000,37.880000,41.000000,880.000000,129.000000,322.000000,126.000000,8.325200,452600.000000',
 u'-122.220000,37.860000,21.000000,7099.000000,1106.000000,2401.000000,1138.000000,8.301400,358500.000000']

In [40]:
# Break every comma-separated text line into a list of field strings.
# `rdd` is rebound here, so re-running this cell on the already-split RDD
# would fail: the elements are lists by then and lists have no split().
rdd = rdd.map(lambda record: record.split(','))
rdd.take(2)

[[u'-122.230000',
  u'37.880000',
  u'41.000000',
  u'880.000000',
  u'129.000000',
  u'322.000000',
  u'126.000000',
  u'8.325200',
  u'452600.000000'],
 [u'-122.220000',
  u'37.860000',
  u'21.000000',
  u'7099.000000',
  u'1106.000000',
  u'2401.000000',
  u'1138.000000',
  u'8.301400',
  u'358500.000000']]

In [41]:
# Map the RDD to a DataFrame
def _to_row(fields):
    """Build a named Row from one split record (positional fields -> column names)."""
    return Row(
        longitude=fields[0],
        latitude=fields[1],
        housingMedianAge=fields[2],
        totalRooms=fields[3],
        totalBedRooms=fields[4],
        population=fields[5],
        households=fields[6],
        medianIncome=fields[7],
        medianHouseValue=fields[8],
    )


# Every value is still a string at this point; the columns are cast in a later cell.
df = rdd.map(_to_row).toDF()

In [42]:
# Confirm that the conversion produced a Spark DataFrame, then preview its first rows.
print(type(df))
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+-----------+----------------+---------+-----------+----------------+------------+-----------+-------------+-----------+
| households|housingMedianAge| latitude|  longitude|medianHouseValue|medianIncome| population|totalBedRooms| totalRooms|
+-----------+----------------+---------+-----------+----------------+------------+-----------+-------------+-----------+
| 126.000000|       41.000000|37.880000|-122.230000|   452600.000000|    8.325200| 322.000000|   129.000000| 880.000000|
|1138.000000|       21.000000|37.860000|-122.220000|   358500.000000|    8.301400|2401.000000|  1106.000000|7099.000000|
| 177.000000|       52.000000|37.850000|-122.240000|   352100.000000|    7.257400| 496.000000|   190.000000|1467.000000|
| 219.000000|       52.000000|37.850000|-122.250000|   341300.000000|    5.643100| 558.000000|   235.000000|1274.000000|
| 259.000000|       52.000000|37.850000|-122.250000|   342200.000000|    3.846200| 565.000000|   280.000000|162

In [43]:
# Careful: columns built from split text lines are all read as strings.
print(df.dtypes)
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.printSchema()

[('households', 'string'), ('housingMedianAge', 'string'), ('latitude', 'string'), ('longitude', 'string'), ('medianHouseValue', 'string'), ('medianIncome', 'string'), ('population', 'string'), ('totalBedRooms', 'string'), ('totalRooms', 'string')]
root
 |-- households: string (nullable = true)
 |-- housingMedianAge: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- medianHouseValue: string (nullable = true)
 |-- medianIncome: string (nullable = true)
 |-- population: string (nullable = true)
 |-- totalBedRooms: string (nullable = true)
 |-- totalRooms: string (nullable = true)

None


In [44]:
# Changing the data types of the fields. Casting column by column, as in the commented-out snippet below, works but is verbose; a loop-based helper is used instead.


#df = df.withColumn("longitude", df["longitude"].cast(FloatType())) \
#   .withColumn("latitude", df["latitude"].cast(FloatType())) \
#   .withColumn("housingMedianAge",df["housingMedianAge"].cast(FloatType())) \
#   .withColumn("totalRooms", df["totalRooms"].cast(FloatType())) \ 
#   .withColumn("totalBedRooms", df["totalBedRooms"].cast(FloatType())) \ 
#   .withColumn("population", df["population"].cast(FloatType())) \ 
#   .withColumn("households", df["households"].cast(FloatType())) \ 
#   .withColumn("medianIncome", df["medianIncome"].cast(FloatType())) \ 
#   .withColumn("medianHouseValue", df["medianHouseValue"].cast(FloatType()))

In [45]:
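# Defining a helper function (a plain Python function, not a Spark UDF) to cast the column dtypes
# import all from `sql.functions` (type classes such as `FloatType` come from `sql.types`, imported at the top)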
from pyspark.sql.functions import *

# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, dtype):
    """Cast every column of `df` to `dtype` and return the resulting DataFrame.

    Parameters
    ----------
    df : DataFrame whose columns are all to be cast.
    dtype : value accepted by `Column.cast` (for example `FloatType()`).

    Returns
    -------
    A new DataFrame with the same column names, in the same order, each one
    cast to `dtype`. The input `df` is not modified.
    """
    # Build all casts in a single projection. Calling `withColumn` once per
    # column in a loop stacks one projection per column in the query plan.
    # The loop variable is `name`, not `col`, so it does not shadow the `col`
    # function that the star import from pyspark.sql.functions brings in.
    return df.select([df[name].cast(dtype).alias(name) for name in df.columns])

# Convert the `df` columns to `FloatType()`
# NOTE: FloatType is single precision; this is why a value read as 37.880000
# shows up as 37.880001068115234 when rows are printed in later cells.
df = convertColumn(df, FloatType())

In [46]:
# Verify the cast: every column should now be reported as float.
# printSchema() prints by itself and returns None, so it is not wrapped in print().
df.printSchema()

root
 |-- households: float (nullable = true)
 |-- housingMedianAge: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- medianHouseValue: float (nullable = true)
 |-- medianIncome: float (nullable = true)
 |-- population: float (nullable = true)
 |-- totalBedRooms: float (nullable = true)
 |-- totalRooms: float (nullable = true)

None


In [47]:
# Stats
# describe() returns a small summary DataFrame (count, mean, stddev, ...).
summary_df = df.describe()
summary_df.show()

# show() only prints and returns None, so assigning its result left `df_pd`
# as None. Keep the name, but bind it to a pandas copy of the summary,
# which is small enough to bring to the driver.
df_pd = summary_df.toPandas()

+-------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+
|summary|        households|  housingMedianAge|          latitude|          longitude|  medianHouseValue|      medianIncome|        population|    totalBedRooms|        totalRooms|
+-------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+
|  count|             20640|             20640|             20640|              20640|             20640|             20640|             20640|            20640|             20640|
|   mean| 499.5396802325581|28.639486434108527| 35.63186143109965|-119.56970444871473|206855.81690891474|3.8706710030346416|1425.4767441860465|537.8980135658915|2635.7630813953488|
| stddev|382.32975283161136|12.585557612111613|2.1359523806029554| 2.0035317429328914|115395.61

In [48]:
# extracting columns: project two columns by name and print the first 5 rows
df.select('population','totalBedRooms').show(5)

+----------+-------------+
|population|totalBedRooms|
+----------+-------------+
|     322.0|        129.0|
|    2401.0|       1106.0|
|     496.0|        190.0|
|     558.0|        235.0|
|     565.0|        280.0|
+----------+-------------+
only showing top 5 rows



In [49]:
# Number of records per housing median age, oldest first (show() prints the top 20).
age_counts = df.groupBy('housingMedianAge').count()
age_counts.sort('housingMedianAge', ascending=False).show()

+----------------+-----+
|housingMedianAge|count|
+----------------+-----+
|            52.0| 1273|
|            51.0|   48|
|            50.0|  136|
|            49.0|  134|
|            48.0|  177|
|            47.0|  198|
|            46.0|  245|
|            45.0|  294|
|            44.0|  356|
|            43.0|  353|
|            42.0|  368|
|            41.0|  296|
|            40.0|  304|
|            39.0|  369|
|            38.0|  394|
|            37.0|  537|
|            36.0|  862|
|            35.0|  824|
|            34.0|  689|
|            33.0|  615|
+----------------+-----+
only showing top 20 rows



In [50]:
# Data Pre-processing

# Rescale the target variable `medianHouseValue` to units of 100,000.
# `df` is rebound here, so running this cell twice divides the target twice.
value_in_100k = col('medianHouseValue') / 100000
df = df.withColumn('medianHouseValue', value_in_100k)
df.take(2)

[Row(households=126.0, housingMedianAge=41.0, latitude=37.880001068115234, longitude=-122.2300033569336, medianHouseValue=4.526, medianIncome=8.325200080871582, population=322.0, totalBedRooms=129.0, totalRooms=880.0),
 Row(households=1138.0, housingMedianAge=21.0, latitude=37.86000061035156, longitude=-122.22000122070312, medianHouseValue=3.585, medianIncome=8.301400184631348, population=2401.0, totalBedRooms=1106.0, totalRooms=7099.0)]

In [51]:
# Feature Engineering

# Ratio expressions, each written once and reused below.
rooms_per_household = col("totalRooms") / col("households")            # `totalRooms` / `households`
population_per_household = col("population") / col("households")       # `population` / `households`
bedrooms_per_room = col("totalBedRooms") / col("totalRooms")           # `totalBedRooms` / `totalRooms`

# Stand-alone single-column projections of each ratio.
# NOTE(review): nothing else in the visible notebook reads these three frames.
roomsPerHousehold = df.select(rooms_per_household)
populationPerHousehold = df.select(population_per_household)
bedroomsPerRoom = df.select(bedrooms_per_room)

# Append the three ratios to `df` as new columns.
df = (
    df.withColumn("roomsPerHousehold", rooms_per_household)
    .withColumn("populationPerHousehold", population_per_household)
    .withColumn("bedroomsPerRoom", bedrooms_per_room)
)

# Inspect the result
df.first()

Row(households=126.0, housingMedianAge=41.0, latitude=37.880001068115234, longitude=-122.2300033569336, medianHouseValue=4.526, medianIncome=8.325200080871582, population=322.0, totalBedRooms=129.0, totalRooms=880.0, roomsPerHousehold=6.984126984126984, populationPerHousehold=2.5555555555555554, bedroomsPerRoom=0.14659090909090908)

In [52]:
# Leave out `longitude`, `latitude`, `housingMedianAge` and `totalRooms`.
# The target comes first and the predictors follow; this ordering is relied on
# when the label and the feature vector are split by position in the next step.
selected_columns = [
    "medianHouseValue",
    "totalBedRooms",
    "population",
    "households",
    "medianIncome",
    "roomsPerHousehold",
    "populationPerHousehold",
    "bedroomsPerRoom",
]
df = df.select(selected_columns)

In [53]:
# Separate the label (target) from the predictors and pack the predictors into one vector.
def _to_label_and_features(row):
    """Return (first column, DenseVector of the remaining columns) for one row."""
    return (row[0], DenseVector(row[1:]))


# Define the `input_data`
input_data = df.rdd.map(_to_label_and_features)

# Replace `df` with the new two-column DataFrame
df = spark.createDataFrame(input_data, ["label", "features"])
df.take(2)

[Row(label=4.526, features=DenseVector([129.0, 322.0, 126.0, 8.3252, 6.9841, 2.5556, 0.1466])),
 Row(label=3.585, features=DenseVector([1106.0, 2401.0, 1138.0, 8.3014, 6.2381, 2.1098, 0.1558]))]

In [54]:
# Standardizing the data

# Configure the scaler: read `features`, write `features_scaled`.
standardScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
)

# Learn the scaling statistics from `df`.
scaler = standardScaler.fit(df)

# Apply the fitted scaler; the original `features` column is kept alongside the new one.
scaled_df = scaler.transform(df)

# Inspect the result
scaled_df.take(2)

[Row(label=4.526, features=DenseVector([129.0, 322.0, 126.0, 8.3252, 6.9841, 2.5556, 0.1466]), features_scaled=DenseVector([0.3062, 0.2843, 0.3296, 4.3821, 2.8228, 0.2461, 2.5264])),
 Row(label=3.585, features=DenseVector([1106.0, 2401.0, 1138.0, 8.3014, 6.2381, 2.1098, 0.1558]), features_scaled=DenseVector([2.6255, 2.1202, 2.9765, 4.3696, 2.5213, 0.2031, 2.6851]))]

In [55]:
# Building a machine learning model with SparkML
# Random 80/20 split into train and test sets; the fixed seed keeps the split repeatable.
splits = scaled_df.randomSplit([.8, .2], seed=101)
train_data, test_data = splits

In [56]:
# Configure the linear regression estimator (`LinearRegression` is imported at the top).
#   regParam        -> lambda
#   elasticNetParam -> alpha
# NOTE(review): no features column is named here, so the estimator's default is
# used and the `features_scaled` column produced earlier is never read.
# Confirm whether `featuresCol="features_scaled"` was intended.
lr = LinearRegression(
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit the model on the training split
linearModel = lr.fit(train_data)

In [None]:
# NOTE(review): `rf`, `train_cv` and `test_cv` are not defined anywhere in the
# visible notebook, and this cell has no execution count. It looks like a
# leftover from another experiment and would raise NameError on a clean
# "Run All" -- confirm, then either remove the cell or define those names.
model1 = rf.fit(train_cv)
predictions = model1.transform(test_cv)
# Model evaluation
# (both imports below repeat imports already made in the first cell)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()
mse = evaluator.evaluate(predictions,{evaluator.metricName:"mse" })
import numpy as np
np.sqrt(mse), mse

In [67]:
# Predictions on the test data
predictions = linearModel.transform(test_data)

# Score the predictions; the metric is chosen per call through a param map.
evaluator = RegressionEvaluator()
mse_params = {evaluator.metricName: "mse"}
mse = evaluator.evaluate(predictions, mse_params)

# Display (RMSE, MSE)
rmse = np.sqrt(mse)
rmse, mse

(0.88590578429888789, 0.7848290586542276)

In [68]:
# Extract the predictions and the "known" correct labels.
# These two lazy RDDs keep their original names so any code that refers to
# them still works; they are no longer needed to build the pairs below.
prediction_label = predictions.select("prediction").rdd.map(lambda x: x[0])
labels = predictions.select("label").rdd.map(lambda x: x[0])

# Build the (prediction, label) pairs from ONE projection of `predictions`.
# RDD.zip only works when both RDDs have the same number of partitions and the
# same number of elements in each partition; reading both values from the same
# row does not depend on that.
predictionAndLabel = (
    predictions.select("prediction", "label")
    .rdd.map(lambda row: (row[0], row[1]))
    .collect()
)

# Show the first 5 instances of `predictionAndLabel`
predictionAndLabel[:5]

[(1.638626132814976, 0.266),
 (1.7313439228736005, 0.3),
 (1.4903490537133215, 0.332),
 (1.6196211718628302, 0.375),
 (1.4872181846990133, 0.379)]

In [69]:
# Model Evaluation
# Coefficients for the model.
# They print in sparse form: `(7,[3],[...])` means 7 coefficients of which only
# index 3 is non-zero. Index 3 is `medianIncome` in the feature order selected earlier.
print(linearModel.coefficients)

# Intercept for the model
print(linearModel.intercept)

(7,[3],[0.274638009624])
1.00660166786


In [70]:
# Get the RMSE
# NOTE(review): `summary` presumably describes the fit on the training data, which
# would explain why this RMSE differs from the test-set value computed earlier -- confirm.
print(linearModel.summary.rootMeanSquaredError)

# Get the R2
print(linearModel.summary.r2)

0.879138532701
0.418198839718


In [71]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()