In [None]:
# Wrap the SparkContext `sc` in a SQLContext; the later cells use it to build DataFrames.
# NOTE(review): neither `sc` nor `SQLContext` is defined or imported in the visible
# cells -- presumably both are supplied by the PySpark shell/kernel; confirm.
sqlContext = SQLContext(sc)

## Load house prices

In [None]:
# Load the raw house listings as an RDD of text lines (parsed in the next cells).
# NOTE(review): the path is hard-coded; confirm it exists on the cluster's default filesystem.
text_RDD = sc.textFile("/data/houses.txt")

In [None]:
def mapper_parse_lines(line):
    """Parse one text record into a (neighborhood, price, bedrooms) tuple.

    The line is split on whitespace and must hold at least three fields in
    the order ``bedrooms neighborhood price``; any further fields are ignored.

    Returns:
        tuple: (neighborhood as str, price as float, bedrooms as int).

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if price or bedrooms cannot be converted to a number.
    """
    fields = line.split()
    # Fields are read in output order (neighborhood, price, bedrooms).
    neighborhood = fields[1]
    price = float(fields[2])
    bedrooms = int(fields[0])
    return (neighborhood, price, bedrooms)

In [None]:
# Apply the line parser to every record, giving an RDD of
# (neighborhood, price, bedrooms) tuples.
house_prices_RDD = text_RDD.map(mapper_parse_lines)

In [None]:
# Pull every parsed record back to the driver to eyeball the result.
# NOTE(review): collect() returns the whole dataset -- fine only while the file is small.
house_prices_RDD.collect()

In [None]:
# Turn the tuple RDD into a DataFrame; the column names follow the tuple
# order returned by mapper_parse_lines.
house_prices_df = sqlContext.createDataFrame(
    house_prices_RDD,
    ["neighborhood", "price", "bedrooms"])

In [None]:
# Print the first rows of the DataFrame as a text table.
house_prices_df.show()

In [None]:
# Print the column names together with the types Spark inferred for them.
house_prices_df.printSchema()

In [None]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

In [None]:
def create_features(row):
    """Convert a house row into a Row with its neighborhood and a feature vector.

    The feature vector is dense and holds [bedrooms, price], in that order.
    """
    neighborhood = row.neighborhood
    feature_vector = Vectors.dense([row.bedrooms, row.price])
    return Row(neighborhood=neighborhood, features=feature_vector)
    
# Build the (neighborhood, features) DataFrame used by the models below.
# The mapping goes through the underlying RDD explicitly: DataFrame.map was
# only a shortcut for DataFrame.rdd.map and is no longer available in
# Spark 2.x, whereas .rdd.map behaves the same on every version.
house_prices_features = sqlContext.createDataFrame(
    house_prices_df.rdd.map(create_features))

In [None]:
# Print the first rows of the feature DataFrame to check the vectors look right.
house_prices_features.show()

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# K-means estimator with every parameter left at its default
# (no k and no seed are passed here; the next cell prints the defaults).
kmeans = KMeans()

In [None]:
# Show each estimator parameter with its documentation and default/current value.
print(kmeans.explainParams())

In [None]:
# Fit k-means on the feature DataFrame.
# NOTE(review): relies on the estimator's default input column, presumably
# "features" -- confirm against the explainParams output.
model = kmeans.fit(house_prices_features)
# Keep the fitted cluster centers for inspection in the next cell.
centers = model.clusterCenters()

In [None]:
# Display the fitted cluster centers; each follows the [bedrooms, price] feature order.
centers

In [None]:
# Run the fitted k-means model over the training rows.
# NOTE(review): presumably this appends a cluster-assignment column -- confirm in the collected output.
transformed = model.transform(house_prices_features)

In [None]:
# Bring the clustered rows back to the driver to inspect them.
transformed.collect()

In [None]:
# Two unseen houses as single-column rows; each vector is [bedrooms, price],
# the same order used by create_features.
new_houses = sqlContext.createDataFrame(
    [
        (Vectors.dense([3.0, 450000]),),
        (Vectors.dense([2.0, 500000]),),
    ],
    ["features"])

In [None]:
# Print the new-house feature rows.
new_houses.show()

In [None]:
# Run the fitted k-means model on the two unseen houses and collect the result.
model.transform(new_houses).collect()

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
from pyspark.ml.feature import StringIndexer

# Indexer that encodes the string "neighborhood" column as a numeric "label" column.
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="label")


In [None]:
# Fit the indexer on the feature DataFrame, then apply it to that same
# DataFrame to add the "label" column.
house_prices_features_labels = (
    stringIndexer
    .fit(house_prices_features)
    .transform(house_prices_features)
)

In [None]:
# Collect the labelled rows on the driver to check the neighborhood -> label encoding.
house_prices_features_labels.collect()

In [None]:
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
# print is called with a single parenthesised argument so this cell runs on
# Python 3 as well as Python 2, consistent with the KMeans explainParams cell.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
# The training DataFrame carries the feature vectors plus the "label" column
# produced by the StringIndexer.
model1 = lr.fit(house_prices_features_labels)

In [None]:
# Run the fitted logistic regression model over the training features and collect the output.
# Note: the unlabelled house_prices_features DataFrame is passed here, not
# house_prices_features_labels.
model1.transform(house_prices_features).collect()

In [None]:
# Run the fitted logistic regression model on the two unseen houses and collect the output.
model1.transform(new_houses).collect()