In [1]:
from __future__ import print_function

In [2]:
# Imported explicitly so this cell does not depend on the kernel's PySpark
# start-up script having already placed SQLContext in the namespace.
from pyspark.sql import SQLContext

# `sc` (the SparkContext) is not created in this notebook; it is expected to be
# provided by the PySpark-enabled kernel.
sqlContext = SQLContext(sc)

# Load house prices

In [3]:
# Read the raw house listings as an RDD of text lines.
# The file:// scheme points at an absolute path on a specific scratch filesystem,
# so this cell only runs where that path exists.
# minPartitions requests at least 48 partitions for the resulting RDD.
text_RDD = sc.textFile(
    "file:///oasis/scratch/comet/zonca/temp_project/houses_1mil.txt",
    minPartitions=48)

In [4]:
def mapper_parse_lines(line):
    """Parse one text record into a (neighborhood, price, bedrooms) tuple.

    The line is split on whitespace. The first field is the bedroom count,
    the second the neighborhood name and the third the price; any further
    fields are ignored.

    Returns:
        tuple: (neighborhood as str, price as float, bedrooms as int).

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if the price or bedroom field is not numeric.
    """
    fields = line.split()
    neighborhood = fields[1]
    price = float(fields[2])
    bedrooms = int(fields[0])
    return (neighborhood, price, bedrooms)

In [5]:
# Turn every raw text line into a (neighborhood, price, bedrooms) tuple.
# map() is a lazy transformation: nothing is read or parsed until an action
# (such as take()) is executed.
house_prices_RDD = text_RDD.map(mapper_parse_lines)

In [6]:
house_prices_RDD.take(5)

[(u'Port', 93000.0, 2),
 (u'Hilltop', 293000.0, 2),
 (u'Hilltop', 605000.0, 4),
 (u'Hilltop', 290000.0, 2),
 (u'Downtown', 418000.0, 4)]

In [7]:
# Build a DataFrame from the parsed tuples; the list names the three tuple
# positions in the order mapper_parse_lines emits them.
house_prices_df = sqlContext.createDataFrame(
    house_prices_RDD,
    ["neighborhood", "price", "bedrooms"]
)

In [8]:
house_prices_df

DataFrame[neighborhood: string, price: double, bedrooms: bigint]

In [9]:
house_prices_df.printSchema()

root
 |-- neighborhood: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: long (nullable = true)



# Prepare features

In [10]:
from pyspark.ml.feature import StringIndexer

# Estimator that encodes the string "neighborhood" column as a numeric index,
# written to a new "neighborhood_index" column.
stringIndexer = StringIndexer(inputCol="neighborhood",
                              outputCol="neighborhood_index")


In [11]:
# Learn the neighborhood -> index mapping from the data, then apply it to the
# same DataFrame to append the "neighborhood_index" column.
house_prices_df_indexed = (
    stringIndexer
    .fit(house_prices_df)
    .transform(house_prices_df)
)

In [12]:
house_prices_df_indexed.show()

+------------+--------+--------+------------------+
|neighborhood|   price|bedrooms|neighborhood_index|
+------------+--------+--------+------------------+
|        Port| 93000.0|       2|               0.0|
|     Hilltop|293000.0|       2|               1.0|
|     Hilltop|605000.0|       4|               1.0|
|     Hilltop|290000.0|       2|               1.0|
|    Downtown|418000.0|       4|               2.0|
|     Hilltop|271000.0|       2|               1.0|
|     Hilltop|296000.0|       2|               1.0|
|        Port|236000.0|       4|               0.0|
|    Downtown|219000.0|       2|               2.0|
|    Downtown|289000.0|       3|               2.0|
|     Hilltop|404000.0|       3|               1.0|
|        Port|149000.0|       3|               0.0|
|        Port|157000.0|       3|               0.0|
|     Hilltop|552000.0|       4|               1.0|
|    Downtown|217000.0|       2|               2.0|
|    Downtown|438000.0|       4|               2.0|
|    Downtow

In [13]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

In [14]:
def create_features(row):
    """Convert one indexed house row into a Row with `label` and `features`.

    The label is the row's price. The feature vector is a dense vector holding,
    in order, the bedroom count and the numeric neighborhood index.
    """
    target = row.price
    feature_vector = Vectors.dense([row.bedrooms, row.neighborhood_index])
    return Row(label=target, features=feature_vector)
    
# Map every indexed row to a (label, features) Row and build a DataFrame from
# the result. The map goes through `.rdd` explicitly: DataFrame.map was only a
# shortcut for `.rdd.map` in Spark 1.x and no longer exists in Spark 2.0+, so
# this form behaves the same on 1.x and keeps working on later versions.
house_prices_features = sqlContext.createDataFrame(
    house_prices_df_indexed.rdd.map(create_features))

In [15]:
house_prices_features.show()

+---------+--------+
| features|   label|
+---------+--------+
|[2.0,0.0]| 93000.0|
|[2.0,1.0]|293000.0|
|[4.0,1.0]|605000.0|
|[2.0,1.0]|290000.0|
|[4.0,2.0]|418000.0|
|[2.0,1.0]|271000.0|
|[2.0,1.0]|296000.0|
|[4.0,0.0]|236000.0|
|[2.0,2.0]|219000.0|
|[3.0,2.0]|289000.0|
|[3.0,1.0]|404000.0|
|[3.0,0.0]|149000.0|
|[3.0,0.0]|157000.0|
|[4.0,1.0]|552000.0|
|[2.0,2.0]|217000.0|
|[4.0,2.0]|438000.0|
|[3.0,2.0]|299000.0|
|[4.0,1.0]|601000.0|
|[4.0,0.0]|230000.0|
|[4.0,0.0]|241000.0|
+---------+--------+
only showing top 20 rows



In [16]:
# Keep the feature DataFrame cached, since it is used both to fit the model
# and to compute predictions below.
house_prices_features = house_prices_features.cache()

# Fit a Random Forest Regressor

In [17]:
from pyspark.ml.regression import RandomForestRegressor

In [18]:
RandomForestRegressor?

In [19]:
# Configure a forest of 10 trees; "all" makes every feature a candidate for
# the split at each tree node.
regr = RandomForestRegressor(numTrees=10, featureSubsetStrategy="all")
# Print out the parameters, documentation, and any default values.
# The label names the estimator actually used here (it previously said
# "LogisticRegression").
print("RandomForestRegressor parameters:\n" + regr.explainParams())

LogisticRegression parameters:
cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. (default: False)
checkpointInterval: checkpoint interval (>= 1) (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2 (default: auto, current: all)
featuresCol: features column name (default: features)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: variance (default: variance)
labelCol: label column name (default: label)
maxBins: Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature. (default: 32)
maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5)
maxM

In [20]:
# Learn a RandomForestRegressor model. This uses the parameters stored in regr.
model = regr.fit(house_prices_features)

In [21]:
# Append a "prediction" column to the feature DataFrame.
# NOTE: this is the same DataFrame the model was fitted on, so any error
# computed from `output` is a training error, not a held-out estimate.
output = model.transform(house_prices_features)

In [22]:
output.show()

+---------+--------+------------------+
| features|   label|        prediction|
+---------+--------+------------------+
|[2.0,0.0]| 93000.0|104511.63102449726|
|[2.0,1.0]|293000.0|279509.60688933445|
|[4.0,1.0]|605000.0|  599573.747561463|
|[2.0,1.0]|290000.0|279509.60688933445|
|[4.0,2.0]|418000.0| 449537.9869135598|
|[2.0,1.0]|271000.0|279509.60688933445|
|[2.0,1.0]|296000.0|279509.60688933445|
|[4.0,0.0]|236000.0| 224458.9185392844|
|[2.0,2.0]|219000.0|209501.75616016105|
|[3.0,2.0]|289000.0|299523.14600544784|
|[3.0,1.0]|404000.0| 399542.4684480513|
|[3.0,0.0]|149000.0|149472.14182606322|
|[3.0,0.0]|157000.0|149472.14182606322|
|[4.0,1.0]|552000.0|  599573.747561463|
|[2.0,2.0]|217000.0|209501.75616016105|
|[4.0,2.0]|438000.0| 449537.9869135598|
|[3.0,2.0]|299000.0|299523.14600544784|
|[4.0,1.0]|601000.0|  599573.747561463|
|[4.0,0.0]|230000.0| 224458.9185392844|
|[4.0,0.0]|241000.0| 224458.9185392844|
+---------+--------+------------------+
only showing top 20 rows



# Compute mean squared error

In [23]:
from pyspark.sql.functions import pow

In [24]:
# Per-row squared residual between the true label and the prediction.
# `pow` here is pyspark.sql.functions.pow, which operates on columns.
squared_error_col = pow(output.label - output.prediction, 2)

# Average the squared residuals over all rows; collect() brings the result
# back to the driver as a list of Rows.
mean_squared_error = (
    output
    .withColumn("squared_error", squared_error_col)
    .agg({"squared_error": "mean"})
    .collect()
)

In [25]:
mean_squared_error

[Row(avg(squared_error)=282413983.0559324)]

In [26]:
import numpy as np
# Root mean squared error. `mean_squared_error` is a list of Rows, so [0][0]
# takes the first field of the first Row before the square root is applied.
np.sqrt(mean_squared_error[0][0])

16805.177269399224