# Linear Regression

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression


In [3]:
# Obtain the SparkSession used by every cell below; 'lrex' is the application name.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [8]:
spark

In [10]:
# Read the sample regression file in libsvm format; the resulting DataFrame
# has a 'label' column and a 'features' vector column (see the output below).
training = (
    spark.read
    .format('libsvm')
    .load('./data/sample_linear_regression_data.txt')
)

In [11]:
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [12]:
# Configure the estimator: which column holds the feature vectors, which holds
# the target, and what the output column of predictions will be called.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [13]:
# Fit the estimator on the full `training` DataFrame; the fitted model exposes
# the coefficients, intercept and summary inspected in the cells below.
lrModel = lr.fit(training)

In [14]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [15]:
lrModel.intercept

0.14228558260358093

In [25]:
# Print the fitted parameters. format() already renders each value via str(),
# so no explicit str() conversion is needed.
print("Coefficients: {}".format(lrModel.coefficients))  # one coefficient per feature
print('\n')  # extra vertical space between the two values
print("Intercept:{}".format(lrModel.intercept))

Coefficients: [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]


Intercept:0.14228558260358093


In [16]:
# Summary object attached to the fitted model; r2, rootMeanSquaredError and
# residuals are read from it in the following cells.
training_summary = lrModel.summary

In [17]:
training_summary.r2

0.027839179518600154

In [18]:
training_summary.rootMeanSquaredError

10.16309157133015

In [28]:
# Show the per-row residuals (a single 'residuals' column), then the two
# aggregate fit metrics. These are computed on the same data the model was
# fitted on, so they are training metrics, not a held-out evaluation.
training_summary.residuals.show()
print("RMSE: {}".format(training_summary.rootMeanSquaredError))
print("r2: {}".format(training_summary.r2))

+-------------------+
|          residuals|
+-------------------+
|-11.011130022096554|
| 0.9236590911176538|
|-4.5957401897776675|
|  -20.4201774575836|
|-10.339160314788181|
|-5.9552091439610555|
|-10.726906349283922|
|  2.122807193191233|
|  4.077122222293811|
|-17.316168071241652|
| -4.593044343959059|
|  6.380476690746936|
| 11.320566035059846|
|-20.721971774534094|
| -2.736692773777401|
| -16.66886934252847|
|  8.242186378876315|
|-1.3723486332690233|
|-0.7060332131264666|
|-1.1591135969994064|
+-------------------+
only showing top 20 rows

RMSE: 10.16309157133015
r2: 0.027839179518600154


In [21]:
# Load the same libsvm file again under the name `all_data`, which is the
# DataFrame that gets split into train and test subsets below.
all_data = (
    spark.read
    .format('libsvm')
    .load('./data/sample_linear_regression_data.txt')
)

In [33]:
all_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                501|
|   mean|0.25688882219498976|
| stddev| 10.317884030544564|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [30]:
# Pass the train/test proportions to randomSplit() as a list of weights.
# There is no single correct ratio, but 70/30 or 60/40 splits are commonly used,
# depending on how much data you have and how unbalanced it is.
# A fixed seed keeps the split, and every metric computed from it below,
# the same each time the notebook is re-run from a fresh kernel.
# NOTE: the (misspelt) name `traing_data` is kept because later cells reference it.
traing_data,test_data = all_data.randomSplit([0.7,0.3], seed=42)

In [31]:
traing_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
|-17.803626188664516|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
|-17.428674570939506|(10,[0,1,2,3,4,5,...|
|-17.065399625876015|(10,[0,1,2,3,4,5,...|
|-17.026492264209548|(10,[0,1,2,3,4,5,...|
| -16.71909683360509|(10,[0,1,2,3,4,5,...|
|-16.692207021311106|(10,[0,1,2,3,4,5,...|
|-16.151349351277112|(10,[0,1,2,3,4,5,...|
| -16.08565904102149|(10,[0,1,2,3,4,5,...|
+----------

In [36]:
traing_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                358|
|   mean|0.18269170241593344|
| stddev| 10.387991331657636|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [32]:
test_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -19.66731861537172|(10,[0,1,2,3,4,5,...|
|-19.402336030214553|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
| -16.26143027545273|(10,[0,1,2,3,4,5,...|
| -15.72351561304857|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -13.15333560636553|(10,[0,1,2,3,4,5,...|
| -12.92222310337042|(10,[0,1,2,3,4,5,...|
|-12.773226999251197|(10,[0,1,2,3,4,5,...|
| -12.41094640284016|(10,[0,1,2,3,4,5,...|
|-11.857350365429426|(10,[0,1,2,3,4,5,...|
|-11.827072996392571|(10,[0,1,2,3,4,5,...|
|-11.615775265015627|(10,[0,1,2,3,4,5,...|
| -11.43180236554046|(10,[0,1,2,3,4,5,...|
+----------

In [37]:
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                143|
|   mean|0.44264105213136923|
| stddev| 10.174000486161127|
|    min|-28.046018037776633|
|    max| 24.290551295953957|
+-------+-------------------+



In [38]:
# Refit the same estimator on the training split only, so that `test_data`
# is not seen during fitting and can be used for evaluation.
correct_model = lr.fit(traing_data)

In [39]:
# Evaluate the model on the held-out split; the returned object provides the
# residuals and rootMeanSquaredError used in the next cells.
test_results = correct_model.evaluate(test_data)

In [40]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -25.69711139227846|
| -26.66436136421379|
|-20.226643728678592|
|-21.297192605436724|
| -21.53710330393629|
| -22.00313291567406|
| -17.99591604169406|
|-20.802654588045026|
| -17.93697825891584|
|-18.912326216471744|
| -18.26439137491711|
|-19.267315619018326|
|-11.429936287503589|
|-17.040687970425104|
| -8.221139774358438|
| -9.727894725362587|
|-10.516998880094395|
|-16.564271154335657|
|-14.456713508550235|
|-15.013421458341274|
+-------------------+
only showing top 20 rows



In [41]:
# RMSE on the held-out test split. Compare it with the label stddev reported by
# test_data.describe() to judge how much the model improves on a constant guess.
print("RMSE: {}".format(test_results.rootMeanSquaredError))

RMSE: 10.48569468325466


That is nice, but realistically we will eventually want to apply this model to unlabeled data; after all, that is the whole point of building the model in the first place. We can again do this with a convenient method call, in this case `transform()`, which is actually what the `evaluate()` method calls internally.

In [42]:
# Simulate unlabeled data by keeping only the 'features' column of the test split.
unlabeled_data = test_data.select('features')

In [43]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [44]:
# transform() returns the input rows with an added 'prediction' column
# (the name configured via predictionCol on the estimator).
predictions = correct_model.transform(unlabeled_data)

In [45]:
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-2.3489066454981704|
|(10,[0,1,2,3,4,5,...|  3.714535428017714|
|(10,[0,1,2,3,4,5,...|-2.6108166882407486|
|(10,[0,1,2,3,4,5,...| 1.5144298158221856|
|(10,[0,1,2,3,4,5,...| 1.8697846885645713|
|(10,[0,1,2,3,4,5,...| 2.6007968854595065|
|(10,[0,1,2,3,4,5,...|-1.1723765812697016|
|(10,[0,1,2,3,4,5,...| 1.9567321151464427|
|(10,[0,1,2,3,4,5,...| 0.6102575262398926|
|(10,[0,1,2,3,4,5,...| 2.6508959410190136|
|(10,[0,1,2,3,4,5,...|  2.540875761868542|
|(10,[0,1,2,3,4,5,...|  5.494874057315457|
|(10,[0,1,2,3,4,5,...|-1.7233993188619425|
|(10,[0,1,2,3,4,5,...| 4.1184648670546835|
|(10,[0,1,2,3,4,5,...| -4.552087224892758|
|(10,[0,1,2,3,4,5,...| -2.683051677477573|
|(10,[0,1,2,3,4,5,...|-1.3403514853350316|
|(10,[0,1,2,3,4,5,...|  4.737198157943087|
|(10,[0,1,2,3,4,5,...| 2.8409382435346093|
|(10,[0,1,2,3,4,5,...|  3.581619092800813|
+----------

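Okay, so this data is a bit meaningless (the R² of the training fit above was only about 0.03, so the features explain almost none of the variation in the label). Let's explore this same process with some data that actually makes a little more intuitive sense!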