# Linear Regression with Spark

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=de3793d8a250656c4d0b9c9952470f0933ed26a369a14a114478b838cd5f417d
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [3]:
def parse_line(line):
    """Parse one "label,feature" CSV record into a pair of floats.

    Args:
        line: A text line whose first two comma-separated fields are numeric.
            Any further fields are ignored.

    Returns:
        A ``(label, feature)`` tuple of floats.

    Raises:
        ValueError: If either of the first two fields is not a valid float.
        IndexError: If the line has fewer than two comma-separated fields.
    """
    fields = line.split(",")
    return float(fields[0]), float(fields[1])


if __name__ == "__main__":

    # Fixed seed so the train/test split (and therefore the printed
    # predictions) is the same on every run.
    SPLIT_SEED = 42

    # Create a SparkSession.
    # NOTE(review): the warehouse dir is a Windows-style path; presumably it is
    # a leftover workaround and is not needed on other platforms -- confirm
    # before removing.
    spark = (
        SparkSession.builder
        .config("spark.sql.warehouse.dir", "file:///C:/temp")
        .appName("LinearRegression")
        .getOrCreate()
    )

    try:
        # Load up our data and convert it to the (label, feature-vector)
        # format MLlib expects.
        inputLines = spark.sparkContext.textFile("regression.txt")
        data = (
            inputLines
            .filter(lambda line: line.strip())  # skip blank lines instead of failing in float()
            .map(parse_line)
            .map(lambda pair: (pair[0], Vectors.dense(pair[1])))
        )

        # Convert this RDD to a DataFrame
        colNames = ["label", "features"]
        df = data.toDF(colNames)

        # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
        # Perhaps you're importing data from a real database. Or you are using structured streaming
        # to get your data.

        # Split the data 50/50 into training data and testing data.
        trainingDF, testDF = df.randomSplit([0.5, 0.5], seed=SPLIT_SEED)

        # Now create our linear regression model
        lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

        # Train the model using our training data
        model = lir.fit(trainingDF)

        # Generate predictions for every row of the test DataFrame.
        fullPredictions = model.transform(testDF)

        # Pull the prediction and the known label out together in one pass, so
        # each pair stays aligned by construction. Zipping two separately
        # derived RDDs only works while both have identical partitioning.
        predictionAndLabel = [
            (row["prediction"], row["label"])
            for row in fullPredictions.select("prediction", "label").collect()
        ]

        # Print out the predicted and actual values for each point
        for pair in predictionAndLabel:
            print(pair)
    finally:
        # Always release the session, even if loading or training failed.
        spark.stop()

(-2.65503517136323, -3.74)
(-1.8204754884863215, -2.58)
(-1.6931697741491663, -2.54)
(-1.8629107265987068, -2.36)
(-1.664879615407576, -2.29)
(-1.5941542185536006, -2.26)
(-1.5517189804412155, -2.17)
(-1.4314858057894575, -2.0)
(-1.4031956470478673, -1.96)
(-1.318325170823097, -1.91)
(-1.3395427898792895, -1.88)
(-1.4031956470478673, -1.87)
(-1.226382154912929, -1.79)
(-1.1768743771151462, -1.77)
(-1.1768743771151462, -1.74)
(-1.0354235834071956, -1.67)
(-1.2193096152275316, -1.61)
(-1.1485842183735562, -1.59)
(-0.9788432659240154, -1.48)
(-1.0354235834071956, -1.47)
(-1.000060884980208, -1.46)
(-0.9364080278116304, -1.39)
(-1.000060884980208, -1.36)
(-1.0495686627779908, -1.33)
(-0.8373924722160648, -1.3)
(-1.0283510437217982, -1.3)
(-1.0424961230925933, -1.29)
(-0.8444650119014624, -1.27)
(-0.8303199325306673, -1.26)
(-0.8586100912722574, -1.26)
(-0.8515375515868598, -1.25)
(-0.8444650119014624, -1.23)
(-0.8515375515868598, -1.22)
(-0.87982771032845, -1.17)
(-0.8869002500138475, -1.1