The following example demonstrates training an elastic-net-regularized linear regression model and extracting its model summary statistics.

In [None]:
import findspark
findspark.init()

from os import getlogin, path

from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.ml.regression import LinearRegression

In [None]:
# Directories
#
# HOME_DIR is the project root that contains the "data" directory. It defaults
# to the original author's checkout; set the DAAMLOBD_HOME environment variable
# before starting the notebook to point at your own copy instead of editing
# this cell. An unset or empty variable falls back to the default.
from os import environ

HOME_DIR = environ.get("DAAMLOBD_HOME") or "/Users/robert.dempsey/Dev/daamlobd"
DATA_DIR = path.join(HOME_DIR, "data")
MLLIB_DATA_DIR = path.join(DATA_DIR, "mllib")
DATA_FILE = path.join(MLLIB_DATA_DIR, "sample_linear_regression_data.txt")

# Echo the resolved locations so a wrong path is noticed before the data is loaded
print("Home Directory: {}".format(HOME_DIR))
print("Data Directory: {}".format(DATA_DIR))
print("MLlib Data Directory: {}".format(MLLIB_DATA_DIR))
print("Data File: {}".format(DATA_FILE))

In [None]:
# Create (or reuse) the SparkContext, then wrap it in a SQLContext.
#
# getOrCreate() hands back the already-running context when this cell is
# executed again; constructing SparkContext(...) a second time in the same
# session fails with "Cannot run multiple SparkContexts at once".
# The app name only takes effect when a new context is actually created.
from pyspark import SparkConf

conf = SparkConf().setAppName("Linear Regression with Spark")
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)

In [None]:
# Read the LIBSVM-formatted sample file into a DataFrame, then preview it
libsvm_reader = sqlContext.read.format("libsvm")
training = libsvm_reader.load(DATA_FILE)
training.show()

In [None]:
# Configure the (not yet fitted) LinearRegression estimator with an iteration
# cap, a regularization parameter and an elastic-net parameter, then print it
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
print(lr)

In [None]:
# Train the estimator on the training DataFrame; fit() returns the fitted model
lr_model = lr.fit(dataset=training)
print(lr_model)

In [None]:
# Pull the learned parameters off the fitted model, then report them
coefficients = lr_model.coefficients
intercept = lr_model.intercept
print("Coefficients: {}".format(coefficients))
print("Intercept: {}".format(intercept))

## Summarize the model over the training set and print out some metrics

In [None]:
# Get the model summary.
# The cells below read its totalIterations, objectiveHistory, residuals,
# rootMeanSquaredError and r2 attributes.
training_summary = lr_model.summary

In [None]:
# Report the iteration count recorded in the training summary
total_iterations = training_summary.totalIterations
print("numIterations: {}".format(total_iterations))

In [None]:
# Print a header, then each entry of the summary's objective history on its own line
print("objectiveHistory")
objective_history = training_summary.objectiveHistory
for objective_value in objective_history:
    print(objective_value)

In [None]:
# Print the summary's residuals via show(), the same call used earlier to
# preview the training DataFrame.
training_summary.residuals.show()

In [None]:
# Report the root mean squared error from the training summary
rmse = training_summary.rootMeanSquaredError
print("RMSE: %f" % rmse)

In [None]:
# Report the r2 value from the training summary
r_squared = training_summary.r2
print("r2: %f" % r_squared)

In [None]:
# Stop the SparkContext created earlier in this notebook now that the example is done.
sc.stop()