# Linear Regression Example Program

In [1]:
import os

import findspark

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one machine; fall back to the original
# local install path when the variable is unset or empty.
# This call must stay ahead of the pyspark imports below.
findspark.init(os.environ.get('SPARK_HOME') or '/home/raj/spark-2.1.0-bin-hadoop2.7')

import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that every later cell accesses as `spark`.
spark = SparkSession.builder.appName('linreg').getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Location of the sample linear-regression data set on the local filesystem.
training_data_path = '/home/raj/Documents/Udemy-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/sample_linear_regression_data.txt'

# Read the file through Spark's 'libsvm' data source into the `training` DataFrame.
libsvm_reader = spark.read.format('libsvm')
training = libsvm_reader.load(training_data_path)

In [4]:
# Peek at the first five rows as Row objects: each has a numeric label and a feature vector.
training.head(5)

[Row(label=-9.490009878824548, features=SparseVector(10, {0: 0.4551, 1: 0.3664, 2: -0.3826, 3: -0.4458, 4: 0.3311, 5: 0.8067, 6: -0.2624, 7: -0.4485, 8: -0.0727, 9: 0.5658})),
 Row(label=0.2577820163584905, features=SparseVector(10, {0: 0.8387, 1: -0.127, 2: 0.4998, 3: -0.2269, 4: -0.6452, 5: 0.1887, 6: -0.5805, 7: 0.6519, 8: -0.6556, 9: 0.1749})),
 Row(label=-4.438869807456516, features=SparseVector(10, {0: 0.5026, 1: 0.1421, 2: 0.16, 3: 0.505, 4: -0.9372, 5: -0.2842, 6: 0.6356, 7: -0.1646, 8: 0.9481, 9: 0.4268})),
 Row(label=-19.782762789614537, features=SparseVector(10, {0: -0.0389, 1: -0.4167, 2: 0.8997, 3: 0.641, 4: 0.2733, 5: -0.2618, 6: -0.2795, 7: -0.1307, 8: -0.0854, 9: -0.0546})),
 Row(label=-7.966593841555266, features=SparseVector(10, {0: -0.062, 1: 0.6546, 2: -0.6979, 3: 0.6677, 4: -0.0794, 5: -0.4389, 6: -0.6081, 7: -0.6415, 8: 0.7314, 9: -0.0268}))]

In [5]:
# Print the first five rows as a table (long feature vectors are abbreviated in the display).
training.show(5)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 5 rows



In [6]:
# Configure (but do not yet fit) the linear regression estimator: name the input
# feature column, the target column, and the column that will receive predictions.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [7]:
# Fit the estimator on the complete `training` DataFrame (no hold-out at this stage).
lrModel = lr.fit(training)

In [8]:
# Coefficient vector of the fitted model (the recorded output has 10 entries, one per feature).
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [9]:
# Intercept term of the fitted model.
lrModel.intercept

0.14228558260358093

In [12]:
# Keep a handle on the fitted model's summary; later cells read r2 and residuals from it.
trainingSummary = lrModel.summary

In [13]:
# R-squared from the model summary. The recorded value (~0.028) is very low,
# i.e. the fitted line explains almost none of the variation in the label.
trainingSummary.r2

0.027839179518600154

In [15]:
# Pull out the per-row residuals DataFrame from the summary, then print it
# (show() with no argument prints the first 20 rows).
residuals_df = trainingSummary.residuals
residuals_df.show()

+-------------------+
|          residuals|
+-------------------+
|-11.011130022096554|
| 0.9236590911176538|
|-4.5957401897776675|
|  -20.4201774575836|
|-10.339160314788181|
|-5.9552091439610555|
|-10.726906349283922|
|  2.122807193191233|
|  4.077122222293811|
|-17.316168071241652|
| -4.593044343959059|
|  6.380476690746936|
| 11.320566035059846|
|-20.721971774534094|
| -2.736692773777401|
| -16.66886934252847|
|  8.242186378876315|
|-1.3723486332690233|
|-0.7060332131264666|
|-1.1591135969994064|
+-------------------+
only showing top 20 rows



In [16]:
# Same sample file that was loaded into `training` earlier; it is read again here
# as `all_data` so it can be divided into train and test portions.
all_data_path = '/home/raj/Documents/Udemy-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/sample_linear_regression_data.txt'

all_data_reader = spark.read.format('libsvm')
all_data = all_data_reader.load(all_data_path)


In [17]:
# Randomly split the rows roughly 70% / 30%. The result is a list of two DataFrames.
# A fixed seed keeps the split identical across kernel restarts, so this cell is
# reproducible under "Restart & Run All".
split_object = all_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Inspect the split result: a list of two DataFrames sharing the (label, features) schema.
split_object

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [19]:
# Randomly split the data roughly 70% / 30% and unpack the two parts directly
# into the training set and the held-out test set.
# The fixed seed makes the split reproducible; without it, the fitted model and
# every metric computed from these two sets would change on each run.
trainData, testData = all_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Preview the first five rows of the training portion of the split.
trainData.show(5)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 5 rows



In [23]:
# Compute summary statistics (count, mean, stddev, min, max) for the training
# split and print them as a table.
train_stats = trainData.describe()
train_stats.show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                336|
|   mean|0.37232695356736717|
| stddev| 10.475501792173047|
|    min|-28.571478869743427|
|    max| 27.111027963108548|
+-------+-------------------+



In [24]:
# Refit the same estimator `lr`, this time on the training split only, so the
# test split stays unseen by the model.
correctModel = lr.fit(trainData)

In [25]:
# Evaluate the split-trained model on the held-out test set; the returned object
# carries the evaluation metrics (rootMeanSquaredError is read in the next cell).
testResults = correctModel.evaluate(testData)

In [26]:
# Root mean squared error on the test set. The recorded value (~10.58) is about
# the same size as the label's standard deviation in the training split (~10.48),
# which suggests the model predicts little better than a constant — worth confirming.
testResults.rootMeanSquaredError

10.58400278266528

In [27]:
# Simulate unlabeled data by keeping only the features column of the test set.
# (The variable name's spelling is kept as-is because later cells reference it.)
unlabledData = testData.select('features')

In [28]:
# Confirm that only the features column remains (first three rows).
unlabledData.show(3)

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 3 rows



In [29]:
# Apply the split-trained model to the label-free features; the resulting
# DataFrame carries the features plus a `prediction` column.
predictions = correctModel.transform(unlabledData)

In [30]:
# Print the predictions alongside the feature vectors they were computed from
# (show() with no argument prints the first 20 rows).
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  3.5483961535221393|
|(10,[0,1,2,3,4,5,...|   -5.22840266815192|
|(10,[0,1,2,3,4,5,...| -0.4970387228428386|
|(10,[0,1,2,3,4,5,...|  2.2473476165658886|
|(10,[0,1,2,3,4,5,...|  0.3974032935469799|
|(10,[0,1,2,3,4,5,...| -2.1665495258119716|
|(10,[0,1,2,3,4,5,...|-0.15654512518487368|
|(10,[0,1,2,3,4,5,...|  0.3065412334471875|
|(10,[0,1,2,3,4,5,...|   2.764601275895918|
|(10,[0,1,2,3,4,5,...|   3.158018865239736|
|(10,[0,1,2,3,4,5,...|   4.818004744067035|
|(10,[0,1,2,3,4,5,...|  6.5836216457193935|
|(10,[0,1,2,3,4,5,...|  -4.339849037214491|
|(10,[0,1,2,3,4,5,...|   3.945573360263716|
|(10,[0,1,2,3,4,5,...|  1.2228514523179517|
|(10,[0,1,2,3,4,5,...|  1.0405184193859132|
|(10,[0,1,2,3,4,5,...|   6.382822315832802|
|(10,[0,1,2,3,4,5,...|  0.4239236604237225|
|(10,[0,1,2,3,4,5,...| -1.0229973253695344|
|(10,[0,1,2,3,4,5,...|   5.64595