In [1]:
# Build the SQLContext that the cells below use to create DataFrames.
# NOTE(review): `sc` and `SQLContext` are neither defined nor imported in this
# notebook; presumably the PySpark shell/kernel startup provides them -- confirm.
sqlContext = SQLContext(sc)

## Load house prices

In [2]:
# Read the raw house listings as an RDD of text lines.
# Each line is parsed by mapper_parse_lines as "<bedrooms> <neighborhood> <price>".
text_RDD = sc.textFile("/data/houses.txt")

In [3]:
def mapper_parse_lines(line):
    """Parse one whitespace-separated record into a tuple.

    The record is read positionally as
    ``"<bedrooms> <neighborhood> <price>"``; any further fields are ignored.

    Returns:
        A ``(neighborhood, price, bedrooms)`` tuple, where ``price`` is a
        float and ``bedrooms`` is an int.
    """
    fields = line.split()
    neighborhood = fields[1]
    price = float(fields[2])
    bedrooms = int(fields[0])
    return (neighborhood, price, bedrooms)

In [4]:
# Turn every text line into a (neighborhood, price, bedrooms) tuple.
house_prices_RDD = text_RDD.map(mapper_parse_lines)

In [5]:
# Bring all parsed records back to the driver to eyeball them
# (acceptable here because the sample file only holds a few rows).
house_prices_RDD.collect()

[(u'Downtown', 400000.0, 3),
 (u'Downtown', 240000.0, 2),
 (u'Hilltop', 650000.0, 3)]

In [6]:
# Build a DataFrame from the parsed tuples; the names are given in the same
# order as the tuple fields produced by mapper_parse_lines.
house_prices_df = sqlContext.createDataFrame(
    house_prices_RDD,
    ["neighborhood", "price", "bedrooms"],
)

In [7]:
# Display the DataFrame to check the column names and values.
house_prices_df.show()

+------------+--------+--------+
|neighborhood|   price|bedrooms|
+------------+--------+--------+
|    Downtown|400000.0|       3|
|    Downtown|240000.0|       2|
|     Hilltop|650000.0|       3|
+------------+--------+--------+



In [8]:
# Show the column types that Spark inferred from the Python tuples.
house_prices_df.printSchema()

root
 |-- neighborhood: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: long (nullable = true)



In [9]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

In [10]:
def create_features(row):
    """Convert one house-price row into a Row with a feature vector.

    Args:
        row: record exposing ``neighborhood``, ``bedrooms`` and ``price``
            attributes.

    Returns:
        Row with ``neighborhood`` copied through and ``features`` set to the
        dense vector ``[bedrooms, price]``.
    """
    return Row(neighborhood=row.neighborhood,
               features=Vectors.dense([row.bedrooms, row.price]))

# Map over the underlying RDD explicitly: DataFrame.map() was removed in
# Spark 2.0, while going through .rdd behaves the same on Spark 1.x and keeps
# this call valid on later versions.
house_prices_features = sqlContext.createDataFrame(
    house_prices_df.rdd.map(create_features))

In [11]:
# Display the feature vectors next to their neighborhood.
house_prices_features.show()

+--------------+------------+
|      features|neighborhood|
+--------------+------------+
|[3.0,400000.0]|    Downtown|
|[2.0,240000.0]|    Downtown|
|[3.0,650000.0]|     Hilltop|
+--------------+------------+



In [12]:
from pyspark.ml.clustering import KMeans

In [13]:
# K-means estimator with every parameter left at its default.
kmeans = KMeans()

In [14]:
# List the estimator's parameters together with their documentation and defaults.
print(kmeans.explainParams())

featuresCol: features column name (default: features)
initMode: the initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: steps for k-means initialization mode (default: 5)
k: number of clusters to create (default: 2)
maxIter: max number of iterations (>= 0) (default: 20)
predictionCol: prediction column name (default: prediction)
seed: random seed (default: -7649703878154674547)
tol: the convergence tolerance for iterative algorithms (default: 0.0001)


In [15]:
# Fit k-means on the [bedrooms, price] feature vectors and keep the centroids.
# NOTE(review): the features are not scaled, so price (hundreds of thousands)
# will dominate the distance over bedrooms (single digits); consider
# standardizing the features first -- confirm this is intended.
model = kmeans.fit(house_prices_features)
centers = model.clusterCenters()

In [16]:
# Inspect the learned cluster centers (one [bedrooms, price] array per cluster).
centers

[array([  2.50000000e+00,   3.20000000e+05]),
 array([  3.00000000e+00,   6.50000000e+05])]

In [17]:
# Assign each training row to a cluster; the result gains a "prediction" column.
transformed = model.transform(house_prices_features)

In [18]:
# Bring the clustered rows back to the driver to inspect the assignments.
transformed.collect()

[Row(features=DenseVector([3.0, 400000.0]), neighborhood=u'Downtown', prediction=0),
 Row(features=DenseVector([2.0, 240000.0]), neighborhood=u'Downtown', prediction=0),
 Row(features=DenseVector([3.0, 650000.0]), neighborhood=u'Hilltop', prediction=1)]

In [19]:
# Two unseen houses, given directly as [bedrooms, price] feature vectors in a
# single-column ("features") DataFrame.
new_houses = sqlContext.createDataFrame(
    [
        (Vectors.dense([3.0, 450000]),),
        (Vectors.dense([2.0, 500000]),),
    ],
    ["features"],
)

In [20]:
# Display the new, unlabeled houses.
new_houses.show()

+--------------+
|      features|
+--------------+
|[3.0,450000.0]|
|[2.0,500000.0]|
+--------------+



In [21]:
# Assign the new houses to the clusters learned by the k-means model.
model.transform(new_houses).collect()

[Row(features=DenseVector([3.0, 450000.0]), prediction=0),
 Row(features=DenseVector([2.0, 500000.0]), prediction=1)]

In [22]:
from pyspark.ml.classification import LogisticRegression

In [23]:
from pyspark.ml.feature import StringIndexer

# Indexer that reads the "neighborhood" string column and writes its numeric
# encoding to a "label" column.
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="label")


In [24]:
# Fit the indexer on the feature DataFrame, then apply it to that same
# DataFrame so every row gains a numeric "label" column.
house_prices_features_labels = (
    stringIndexer
    .fit(house_prices_features)
    .transform(house_prices_features)
)

In [25]:
# Inspect the rows together with the numeric label derived from the neighborhood.
house_prices_features_labels.collect()

[Row(features=DenseVector([3.0, 400000.0]), neighborhood=u'Downtown', label=0.0),
 Row(features=DenseVector([2.0, 240000.0]), neighborhood=u'Downtown', label=0.0),
 Row(features=DenseVector([3.0, 650000.0]), neighborhood=u'Hilltop', label=1.0)]

In [26]:
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
# The call form of print (already used for the k-means parameters) emits the
# same text on Python 2 and is also valid syntax on Python 3.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(house_prices_features_labels)

LogisticRegression parameters:
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name (default: label)
maxIter: max number of iterations (>= 0) (default: 100, current: 10)
predictionCol: prediction column name (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
rawPredictionCol: raw prediction (a.k.a. confidence) column name (default: rawPrediction)
regParam: regularization parameter (>= 0) (default: 0.1, current: 0.01)
threshold: Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are

In [27]:
# Score the training rows with the fitted logistic regression model.
# NOTE(review): these are the same rows the model was fitted on, so this is a
# sanity check rather than an evaluation on held-out data.
model1.transform(house_prices_features).collect()

[Row(features=DenseVector([3.0, 400000.0]), neighborhood=u'Downtown', rawPrediction=DenseVector([2.9666, -2.9666]), probability=DenseVector([0.951, 0.049]), prediction=0.0),
 Row(features=DenseVector([2.0, 240000.0]), neighborhood=u'Downtown', rawPrediction=DenseVector([3.9863, -3.9863]), probability=DenseVector([0.9818, 0.0182]), prediction=0.0),
 Row(features=DenseVector([3.0, 650000.0]), neighborhood=u'Hilltop', rawPrediction=DenseVector([-1.9976, 1.9976]), probability=DenseVector([0.1195, 0.8805]), prediction=1.0)]

In [28]:
# Predict the neighborhood label index for the new, unlabeled houses.
model1.transform(new_houses).collect()

[Row(features=DenseVector([3.0, 450000.0]), rawPrediction=DenseVector([1.9737, -1.9737]), probability=DenseVector([0.878, 0.122]), prediction=0.0),
 Row(features=DenseVector([2.0, 500000.0]), rawPrediction=DenseVector([-1.1764, 1.1764]), probability=DenseVector([0.2357, 0.7643]), prediction=1.0)]