In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [3]:
# Read the sample regression data set, which is stored in LIBSVM format.
training = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

In [4]:
# Preview the loaded rows: a numeric `label` column and a `features` vector column.
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+

In [9]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: which column holds the inputs, which holds the
# target, and what to name the column that receives the model's output.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [10]:
# Fit the linear regression on the full data set loaded above.
lrModel = lr.fit(dataset=training)

In [11]:
# Display the fitted weight vector (one entry per feature).
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [12]:
# Display the fitted intercept (bias) term.
lrModel.intercept

0.14228558260358093

In [13]:
# Print the fitted weights and bias as one report; the empty middle field
# reproduces the blank separator line between the two entries.
print(
    'Coefficients: {}'.format(str(lrModel.coefficients)),
    '',
    'Intercept: {}'.format(str(lrModel.intercept)),
    sep='\n',
)

Coefficients: [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]

Intercept: 0.14228558260358093


In [14]:
# Keep a handle on the model's fit summary; the residuals, RMSE and r2
# shown in the following cells are all read from this object.
training_summary = lrModel.summary

In [15]:
# Show the per-row residuals of the fit (show() prints only the top 20 rows).
training_summary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-11.011130022096554|
| 0.9236590911176538|
|-4.5957401897776675|
|  -20.4201774575836|
|-10.339160314788181|
|-5.9552091439610555|
|-10.726906349283922|
|  2.122807193191233|
|  4.077122222293811|
|-17.316168071241652|
| -4.593044343959059|
|  6.380476690746936|
| 11.320566035059846|
|-20.721971774534094|
| -2.736692773777401|
| -16.66886934252847|
|  8.242186378876315|
|-1.3723486332690233|
|-0.7060332131264666|
|-1.1591135969994064|
+-------------------+
only showing top 20 rows



In [16]:
# Report the two headline fit metrics, one per line.
print(
    'RMSE: {}'.format(training_summary.rootMeanSquaredError),
    'r2: {}'.format(training_summary.r2),
    sep='\n',
)

RMSE: 10.16309157133015
r2: 0.027839179518600154


# Train Test Split

In [17]:
# Read the LIBSVM sample file again under its own name so the train/test
# section does not depend on variables from the cells above.
all_data = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

In [18]:
# Randomly split the rows roughly 70/30 into a training set and a held-out
# test set. The fixed seed makes the split -- and therefore the row counts,
# fitted model and evaluation metrics computed from it -- repeatable across
# kernel restarts; without it every run draws a different split.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics (count, mean, stddev, min, max) of the training split's label.
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                352|
|   mean|0.21754583342607117|
| stddev| 10.667756790253167|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [21]:
# Same summary statistics for the held-out test split, for comparison.
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               149|
|   mean|0.3498333325752573|
| stddev| 9.473779660848347|
|    min|-23.51088409032297|
|    max| 19.64829023536192|
+-------+------------------+



In [22]:
# Keep only the feature vectors (no label column) to stand in for
# unlabeled data that the model will later be asked to predict on.
unlabeled_data = test_data.select('features')

In [23]:
# Preview the features-only frame (show() prints only the top 20 rows).
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [24]:
# Fit a second model on the training split only, so the test split stays unseen.
lr_Model = lr.fit(dataset=train_data)

In [25]:
# Score the split-trained model against the held-out (labeled) test rows.
test_results = lr_Model.evaluate(dataset=test_data)

In [27]:
# Per-row residuals on the held-out set (show() prints only the top 20 rows).
test_results.residuals.show()

# Overall error on the held-out set.
print(
    'RMSE: {}'.format(
        test_results.rootMeanSquaredError
    )
)

+-------------------+
|          residuals|
+-------------------+
|-22.297108454490015|
|-15.925213185244461|
| -19.84106893845962|
|-18.081212560507375|
|-16.492919662388196|
| -19.55637651886081|
|-14.303428381284697|
|-15.774145829018444|
| -14.73840368479504|
|-16.253515043369593|
| -13.23162509495321|
|-16.662900121547683|
| -12.74014656137501|
|-18.157074310348854|
| -11.97133886201457|
|-12.857897262071095|
|   -17.775435433996|
|-13.913681494029854|
|-13.088894543623589|
|  -14.1156623936409|
+-------------------+
only showing top 20 rows

RMSE: 9.662562875215492


In [28]:
# Apply the split-trained model to the features-only frame to get predictions.
predictions = lr_Model.transform(dataset=unlabeled_data)

In [29]:
# Preview each feature vector alongside its new `prediction` column.
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-1.2137756358329577|
|(10,[0,1,2,3,4,5,...|-1.5689871716388828|
|(10,[0,1,2,3,4,5,...| 2.4123943675201156|
|(10,[0,1,2,3,4,5,...| 1.0158129346313605|
|(10,[0,1,2,3,4,5,...|-0.5335726018213497|
|(10,[0,1,2,3,4,5,...| 2.8641694975497023|
|(10,[0,1,2,3,4,5,...|-1.8479209699924155|
|(10,[0,1,2,3,4,5,...| -0.177366736776129|
|(10,[0,1,2,3,4,5,...|-1.1236056427755197|
|(10,[0,1,2,3,4,5,...| 0.5214267711303476|
|(10,[0,1,2,3,4,5,...|-2.1442326283590862|
|(10,[0,1,2,3,4,5,...| 1.3033552417150047|
|(10,[0,1,2,3,4,5,...| -2.608724594004244|
|(10,[0,1,2,3,4,5,...| 2.8223068304265113|
|(10,[0,1,2,3,4,5,...|  -3.33964172740172|
|(10,[0,1,2,3,4,5,...|-1.9048609908600322|
|(10,[0,1,2,3,4,5,...| 3.4464569249205597|
|(10,[0,1,2,3,4,5,...|-0.0624494371228498|
|(10,[0,1,2,3,4,5,...| 0.5303187547674008|
|(10,[0,1,2,3,4,5,...| 1.6148886082858451|
+----------