In [1]:
# Linear Regression Exercise from tutorials

In [2]:
# Initialize PySpark path.
# Prefer the SPARK_HOME environment variable so the notebook also runs on
# machines where Spark lives elsewhere; fall back to the original local
# install path when the variable is not set.
import os
import findspark
findspark.init(os.environ.get('SPARK_HOME', '/home/jing/spark-2.1.0-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Get (or create) the SparkSession used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [5]:
from pyspark.ml.regression import LinearRegression

In [6]:
# Load the sample dataset with the 'libsvm' data source; the resulting
# DataFrame has a `label` column and a `features` vector column.
training = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

In [9]:
# Preview the first 10 rows of the loaded DataFrame
training.show(10)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 10 rows



In [11]:
# Configure the estimator: which columns hold the inputs and the target,
# and the name of the column that will receive the model's output.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='Prediction',
)

In [12]:
# Fit the linear regression on the full `training` DataFrame
lrModel = lr.fit(training)

In [13]:
# Display the fitted coefficients (a vector with one weight per feature)
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [14]:
# Display the fitted intercept (bias) term
lrModel.intercept

0.14228558260358093

In [15]:
# Keep the model's training summary; fit metrics such as the RMSE are read from it
training_summary = lrModel.summary

In [18]:
# Root mean squared error reported by the training summary
training_summary.rootMeanSquaredError

10.16309157133015

In [19]:
# The model above was fit and scored on the same rows, with no held-out set.
# Correction: reload the full dataset so it can be split into training and
# test sets.
all_data = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

In [20]:
# Split the rows into roughly 70% training / 30% test.
# A fixed seed keeps the split (and every metric computed from it below)
# reproducible across kernel restarts; without it each run yields a
# different partition and different RMSE values.
split_object = all_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Inspect the result of the split: a list of two DataFrames
split_object

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [22]:
# Unpack the two splits: the first (weight 0.7) is the training set,
# the second (weight 0.3) is the test set
train_data, test_data = split_object

In [24]:
# Preview the first 10 rows of the training split
train_data.show(10)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 10 rows



In [26]:
# Summary statistics of the training split (count, mean, stddev, min, max)
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                351|
|   mean|0.26448177281198837|
| stddev| 10.500824686844417|
|    min|-26.805483428483072|
|    max|  27.78383192005107|
+-------+-------------------+



In [28]:
# Summary statistics of the test split (count, mean, stddev, min, max)
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                150|
|   mean|0.23912131775121442|
| stddev|  9.910924133134388|
|    min|-28.571478869743427|
|    max|  22.31738046492344|
+-------+-------------------+



In [29]:
# Refit the same estimator `lr`, this time on the training split only
correct_model = lr.fit(train_data)

In [36]:
# Evaluate the model on the training data (in-sample metrics)
train_result = correct_model.evaluate(train_data)

In [39]:
# RMSE on the training split (the rows the model was fit to)
train_result.rootMeanSquaredError

10.341346508941509

In [40]:
# Evaluate the model on the held-out test data
test_result = correct_model.evaluate(test_data)  

In [41]:
# RMSE on the held-out test split
test_result.rootMeanSquaredError

9.879556169307575

In [42]:
# Simulate deployment: use the model to make predictions on unlabeled data.

# Build the unlabeled input by keeping only the `features` column of the test split
unlabeled_data = test_data.select('features')

In [43]:
# Preview the unlabeled data: only the `features` column remains
unlabeled_data.show(10)

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 10 rows



In [44]:
# Predict with transform(): the result keeps the input columns and adds the
# model's `Prediction` column
predictions = correct_model.transform(unlabeled_data)

In [45]:
# Display the features alongside the new `Prediction` column
predictions.show()

+--------------------+--------------------+
|            features|          Prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -1.2571140782696446|
|(10,[0,1,2,3,4,5,...|  -2.418222481657683|
|(10,[0,1,2,3,4,5,...| -0.0863424363951012|
|(10,[0,1,2,3,4,5,...|  1.5010022964302325|
|(10,[0,1,2,3,4,5,...|  2.4416693343870657|
|(10,[0,1,2,3,4,5,...| -2.0295949513127645|
|(10,[0,1,2,3,4,5,...| -1.6002518194398756|
|(10,[0,1,2,3,4,5,...|-0.13811361705658304|
|(10,[0,1,2,3,4,5,...| -1.0313950641223828|
|(10,[0,1,2,3,4,5,...| -1.9549267724511057|
|(10,[0,1,2,3,4,5,...|  3.3283462671176958|
|(10,[0,1,2,3,4,5,...|  -3.269679032992041|
|(10,[0,1,2,3,4,5,...|  0.6981786375346232|
|(10,[0,1,2,3,4,5,...| -0.8925962619680655|
|(10,[0,1,2,3,4,5,...|-0.08210169343211127|
|(10,[0,1,2,3,4,5,...|  2.2745132672828197|
|(10,[0,1,2,3,4,5,...|  3.2665541974988805|
|(10,[0,1,2,3,4,5,...|  -4.149345634300545|
|(10,[0,1,2,3,4,5,...|  1.0783465808587058|
|(10,[0,1,2,3,4,5,...| -0.835217