The following example demonstrates training an elastic-net-regularized linear regression model with Spark ML and extracting summary statistics from the fitted model.

# Imports

In [None]:
from os import environ
# Point SPARK_HOME at the local Spark 2.2.0 installation. This is set before
# findspark.init() runs (presumably so findspark can locate Spark through
# this variable — confirm against the findspark documentation).
environ["SPARK_HOME"] = "/home/students/spark-2.2.0"

import findspark
# Initialise findspark before any pyspark import is attempted.
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.ml.regression import LinearRegression

## Get Some Context

In [None]:
# Create (or reuse) a SparkContext, then wrap it in a SQLContext.
# SparkContext.getOrCreate returns the already-active context when this cell
# is re-run in the same kernel, instead of failing because a second
# SparkContext cannot be started in the same process.
from pyspark import SparkConf

sc = SparkContext.getOrCreate(
    SparkConf().setAppName("Linear Regression with Spark")
)
sqlContext = SQLContext(sc)

## Import the Data

In [None]:
# Path to the sample linear-regression data file (a single file, not a directory)
DATA_FILE   = "/home/students/data/mllib/sample_linear_regression_data.txt"

In [None]:
# Load the training data into a DataFrame using the "libsvm" data source
training = sqlContext.read.format("libsvm").load(DATA_FILE)
# Print the first rows of the DataFrame as a quick sanity check
training.show()

## Train a Model

In [None]:
# Configure the (not yet fitted) LinearRegression estimator:
#   maxIter         - upper bound on the number of optimizer iterations
#   regParam        - overall regularization strength
#   elasticNetParam - elastic net mixing between the L2 (0.0) and L1 (1.0) penalties
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
print(lr)

In [None]:
# Fit (train) the estimator on the training DataFrame; fit() returns the
# fitted model, which is kept in lr_model for the cells that follow
lr_model = lr.fit(training)
print(lr_model)

In [None]:
# Print the learned coefficients and the intercept of the fitted model
print("Coefficients: {}".format(lr_model.coefficients))
print("Intercept: {}".format(lr_model.intercept))

## Summarize the model over the training set and print out some metrics

In [None]:
# Get the fitted model's summary; the metrics printed in the following
# cells are all read from this object
training_summary = lr_model.summary

In [None]:
# Number of training iterations reported by the summary (totalIterations)
print("numIterations: {}".format(training_summary.totalIterations))

In [None]:
# Print the objective value recorded at each training iteration, one per line
print("objectiveHistory")
for objective_value in training_summary.objectiveHistory:
    print(objective_value)

In [None]:
# Display the residuals (label - predicted value, per the Spark ML docs)
# computed on the training data
training_summary.residuals.show()

In [None]:
# Root mean squared error of the model on the training data
print("RMSE: %f" % training_summary.rootMeanSquaredError)

In [None]:
# R^2 (coefficient of determination) of the model on the training data
print("r2: %f" % training_summary.r2)

## Shut it Down

In [None]:
# Stop the SparkContext now that the example is finished
sc.stop()