In [3]:
import findspark
findspark.init()
from __future__ import print_function
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [10]:
# Start (or reuse) the Spark session that the rest of the notebook works through.
session_builder = SparkSession.builder.appName("LinearRegression")
spark = session_builder.getOrCreate()


def _to_label_and_features(line):
    """Parse one comma-separated text line into a (label, features) pair.

    The first field becomes the float label; the second field becomes a
    one-element dense vector. Any further fields on the line are ignored.
    """
    fields = line.split(",")
    return (float(fields[0]), Vectors.dense(float(fields[1])))


# Read the raw text file and turn every line into the (label, features) shape
# that the regression estimator is trained on.
inputLines = spark.sparkContext.textFile("data/regression.txt")
data = inputLines.map(_to_label_and_features)

# Column names to apply when the RDD is converted to a DataFrame.
colNames = ["label", "features"]

In [11]:
# Build the DataFrame from the parsed RDD, naming its two columns via colNames.
df = spark.createDataFrame(data, schema=colNames)

In [12]:
# Preview the parsed data: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out for the reader).
df.show(n=20, truncate=True)

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
| 0.29|  [-0.4]|
|-0.13|  [0.09]|
|-0.39|  [0.38]|
|-1.79|  [1.73]|
| 0.71| [-0.77]|
| 1.39| [-1.48]|
| 1.15| [-1.43]|
| 0.13| [-0.07]|
| 0.05| [-0.07]|
|  1.9|  [-1.8]|
| 1.48| [-1.42]|
| 0.32|  [-0.3]|
|-1.11|   [1.0]|
| 0.51| [-0.62]|
|-1.58|  [1.45]|
|-0.46|  [0.44]|
|-0.49|  [0.37]|
| 0.31|  [-0.3]|
+-----+--------+
only showing top 20 rows



In [26]:
# Split the data into a training part and a held-out test part. The weights
# are relative proportions (about 70% / 30%); the split is random per row, so
# the resulting sizes are approximate.
# A fixed seed pins the random assignment so that re-running the notebook on
# the same input yields the same split, and therefore comparable models and
# predictions from run to run.
SPLIT_SEED = 42
trainTest = df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
trainingDF, testDF = trainTest

In [27]:
# Linear regression estimator. The hyperparameters are fixed by hand here
# rather than tuned.
lir = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [28]:
# Fit the estimator on the training split only; the test split is kept
# aside for scoring.
model = lir.fit(dataset=trainingDF)

In [29]:
# Score the held-out rows with the fitted model, then mark the scored
# DataFrame as cached because it is read more than once afterwards.
scoredTestDF = model.transform(testDF)
fullPredictions = scoredTestDF.cache()

In [30]:
# Preview the scored test rows: first 20 rows, long cell values truncated
# (show()'s defaults, written out explicitly).
fullPredictions.show(n=20, truncate=True)

+-----+--------+-------------------+
|label|features|         prediction|
+-----+--------+-------------------+
|-3.74|  [3.75]|-2.6686050290519376|
|-2.58|  [2.57]|-1.8280051698722457|
|-2.54|  [2.39]|-1.6997780727092422|
|-2.36|  [2.63]| -1.870747535593247|
|-2.29|  [2.35]|-1.6712831622285746|
|-2.12|   [1.9]|-1.3507154193210649|
|-1.94|  [1.94]|-1.3792103298017324|
|-1.94|  [1.98]|   -1.4077052402824|
|-1.88|  [1.89]| -1.343591691700898|
|-1.48|  [1.38]|-0.9802815830723872|
| -1.4|  [1.32]| -0.937539217351386|
|-1.37|  [1.25]|-0.8876731240102179|
|-1.33|  [1.48]| -1.051518859274056|
|-1.25|  [1.32]| -0.937539217351386|
|-1.23|  [1.19]|-0.8449307582892166|
|-1.22|   [1.2]|-0.8520544859093835|
|-1.17|  [1.24]| -0.880549396390051|
|-1.17|  [1.25]|-0.8876731240102179|
|-1.12|   [1.1]|-0.7808172097077147|
|-1.11|   [1.0]|-0.7095799335060459|
+-----+--------+-------------------+
only showing top 20 rows



In [31]:
# Extract the "prediction" column as an RDD of bare values: each single-column
# Row is reduced to its first (and only) field.
predictionRows = fullPredictions.select("prediction").rdd
predictions = predictionRows.map(lambda row: row[0])

In [32]:
# Extract the "label" column as an RDD of bare values: each single-column
# Row is reduced to its first (and only) field.
labelRows = fullPredictions.select('label').rdd
labels = labelRows.map(lambda row: row[0])

In [33]:
# Collect (prediction, label) pairs to the driver in one pass over the scored
# DataFrame. Selecting both columns together keeps every prediction next to
# its own label by construction, instead of pairing two separately derived
# RDDs with RDD.zip, which only works when both sides have the same number of
# partitions and the same number of elements in each partition.
# Each Row is converted to a plain tuple so the collected items are ordinary
# (prediction, label) tuples, exactly as zip produced.
# NOTE: the variable name's spelling is kept unchanged because the cell that
# prints the pairs refers to it by this name.
predcitionAndLabel = (
    fullPredictions.select("prediction", "label")
    .rdd.map(tuple)
    .collect()
)

In [34]:
# Print every collected pair on its own line as a (prediction, label) tuple.
for predicted, actual in predcitionAndLabel:
    print((predicted, actual))

(-2.6686050290519376, -3.74)
(-1.8280051698722457, -2.58)
(-1.6997780727092422, -2.54)
(-1.870747535593247, -2.36)
(-1.6712831622285746, -2.29)
(-1.3507154193210649, -2.12)
(-1.3792103298017324, -1.94)
(-1.4077052402824, -1.94)
(-1.343591691700898, -1.88)
(-0.9802815830723872, -1.48)
(-0.937539217351386, -1.4)
(-0.8876731240102179, -1.37)
(-1.051518859274056, -1.33)
(-0.937539217351386, -1.25)
(-0.8449307582892166, -1.23)
(-0.8520544859093835, -1.22)
(-0.880549396390051, -1.17)
(-0.8876731240102179, -1.17)
(-0.7808172097077147, -1.12)
(-0.7095799335060459, -1.11)
(-0.8734256687698841, -1.11)
(-0.766569754467381, -1.1)
(-0.8378070306690497, -1.09)
(-0.7095799335060459, -1.05)
(-0.8306833030488828, -1.04)
(-0.6027240192035427, -1.03)
(-0.7594460268472141, -1.03)
(-0.7309511163665465, -0.98)
(-0.645466384924544, -0.97)
(-0.7095799335060459, -0.94)
(-0.7238273887463796, -0.94)
(-0.6739612954052114, -0.92)
(-0.6240952020640433, -0.89)
(-0.7380748439867134, -0.89)
(-0.5671053811027083, -0.85