In [None]:
# Location of the raw housing data set, relative to the notebook's working directory.
HOUSING_DATA_PATH = 'data-files/housing.data'

# Read the file as an RDD of text lines (one element per line).
housing_lines = sc.textFile(HOUSING_DATA_PATH)

In [None]:
# Peek at the first five raw text lines to check that the file was read.
housing_lines.take(5)

In [None]:
from pyspark.mllib.linalg import Vector, Vectors

def parse_housing_line(line):
    """Convert one comma-separated text record into a dense vector of floats.

    Args:
        line: a single line of the input file, with fields separated by ','.

    Returns:
        A dense MLlib vector holding every field of the line as a float.

    Raises:
        ValueError: if any field cannot be parsed as a number.
    """
    return Vectors.dense([float(field.strip()) for field in line.split(',')])


# Skip blank / whitespace-only lines before parsing: float('') raises
# ValueError, and because RDD transformations are lazy the failure would only
# surface later, at the first action that touches this RDD.
housing_values = (
    housing_lines
    .filter(lambda line: line.strip() != '')
    .map(parse_housing_line)
)

In [None]:
# Inspect the first parsed record to confirm the text-to-vector conversion.
housing_values.take(1)

In [None]:
from pyspark.mllib.regression import LabeledPoint

In [None]:
def toLabelPoint(row):
    """Turn a parsed row vector into a LabeledPoint.

    The last element of the row is used as the label and all preceding
    elements as the feature vector.

    Args:
        row: a vector exposing ``toArray()``.

    Returns:
        LabeledPoint(label, features) built from the row.
    """
    values = row.toArray()
    label, features = values[-1], values[:-1]
    return LabeledPoint(label, features)


# Label every parsed record.
housing_data = housing_values.map(toLabelPoint)

In [29]:
# Show the first five labeled records (label, followed by the feature vector).
housing_data.take(5)

[LabeledPoint(24.0, [0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]),
 LabeledPoint(21.6, [0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]),
 LabeledPoint(34.7, [0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]),
 LabeledPoint(33.4, [0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94]),
 LabeledPoint(36.2, [0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33])]

In [30]:
# Fix the seed so that the train/test partition -- and every metric computed
# from it further down -- is identical on each Restart & Run All. Without an
# explicit seed, randomSplit picks a new random one on every call.
SPLIT_SEED = 42

# The weights are sampling proportions (roughly 80% train / 20% test); the
# resulting sizes are approximate, not exact.
sets = housing_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
housing_train, housing_test = sets

In [31]:
# Look at the first two records of the training split.
housing_train.take(2)

[LabeledPoint(21.6, [0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]),
 LabeledPoint(33.4, [0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94])]

In [32]:
# Look at the first two records of the test split.
housing_test.take(2)

[LabeledPoint(24.0, [0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]),
 LabeledPoint(34.7, [0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03])]

In [33]:
from pyspark.mllib.feature import StandardScaler

In [34]:
# Fit the standardizer on the TRAINING features only, so that no statistics
# from the test split leak into the preprocessing step.
# withMean=True centers each feature; withStd=True scales it to unit std-dev.
training_features_for_fit = housing_train.map(lambda point: point.features)
scaler = StandardScaler(withMean=True, withStd=True).fit(training_features_for_fit)

In [35]:
# Split each data set into its label column and its feature vectors.
train_label = housing_train.map(lambda point: point.label)
train_features = housing_train.map(lambda point: point.features)

test_label = housing_test.map(lambda point: point.label)
test_features = housing_test.map(lambda point: point.features)

# Standardize the features with the already-fitted scaler, then pair every
# scaled vector back up with its label. zip() matches elements by position;
# both sides derive from the same parent RDD through element-wise transforms.
train_scaled = (
    train_label
    .zip(scaler.transform(train_features))
    .map(lambda pair: LabeledPoint(pair[0], pair[1]))
)
test_scaled = (
    test_label
    .zip(scaler.transform(test_features))
    .map(lambda pair: LabeledPoint(pair[0], pair[1]))
)

In [36]:
# Look at the first three test records after feature standardization.
test_scaled.take(3)

[LabeledPoint(24.0, [-0.38175516133295884,0.2520515206344977,-1.2378621624170678,-0.27388696921142547,-0.09754261883638317,0.39575622682898876,-0.10007459686717556,0.07359727326804959,-0.9640283856251545,-0.6272281460164654,-1.4667804474163397,0.42162506233415986,-1.0420343578629656]),
 LabeledPoint(34.7, [-0.37944295686088814,-0.5023115917361637,-0.5496890349734455,-0.27388696921142547,-0.6986555807485609,1.2616890332925192,-0.24477816653551013,0.47492407483158094,-0.8450277598332422,-0.9536605316140085,-0.30658115909112166,0.37492510841300825,-1.17493201859798]),
 LabeledPoint(18.9, [-0.3637029860269301,0.021551680743462295,-0.43402968582325685,-0.27388696921142547,-0.21950756763015836,-0.4148136624999898,0.6305019621900256,1.2184607896197899,-0.48802588245750533,-0.5365524833504811,-1.513188418949349,0.30470306467899144,0.6534600085668992])]

In [38]:
from pyspark.mllib.regression import LinearRegressionWithSGD

In [39]:
# Mark both scaled data sets for caching: training makes repeated passes over
# train_scaled, and test_scaled is reused by the evaluation cells.
train_scaled.cache()
test_scaled.cache()

# Fit a linear regression with stochastic gradient descent, including an
# intercept term, on the standardized training data.
alg = LinearRegressionWithSGD()
model = alg.train(
    train_scaled,
    iterations=100,
    intercept=True,
)

In [40]:
# Build (prediction, actual label) pairs for every test record.
test_predicts = test_scaled.map(
    lambda point: (float(model.predict(point.features)), point.label)
)
test_predicts.take(5)

[(30.48875909993463, 24.0),
 (30.745688610465734, 34.7),
 (19.02826658260388, 18.9),
 (20.964208742202324, 23.1),
 (21.08325978712002, 21.0)]

In [41]:
import math

# Root-mean-squared error computed by hand: square each (prediction - label)
# residual, average over the test set, then take the square root.
squared_errors = test_predicts.map(lambda pair: (pair[0] - pair[1]) ** 2)
rmse = math.sqrt(squared_errors.mean())
print(rmse)

4.772372174162127


In [42]:
from pyspark.mllib.evaluation import RegressionMetrics

In [43]:
# RegressionMetrics consumes (prediction, observation) pairs, which is the
# tuple order test_predicts was built with.
test_metrics = RegressionMetrics(test_predicts)

In [44]:
# Report RMSE, MSE and R^2 for the test set, one value per line in that order.
for metric_name in ('rootMeanSquaredError', 'meanSquaredError', 'r2'):
    print(getattr(test_metrics, metric_name))

4.772372174162128
22.775536168716954
0.7235460245798648
