The following example demonstrates training an elastic-net-regularized linear regression model, extracting model summary statistics, and saving the model to disk.

In [1]:
import findspark
findspark.init()

from os import environ
from os import getlogin, path
import pickle

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.ml.regression import LinearRegression, LinearRegressionModel

In [2]:
# Directories
#
# The project root is taken from the DAAMLOBD_HOME environment variable when
# it is set (and non-empty), so the notebook can run on another machine
# without editing this cell. Otherwise it falls back to the original
# hard-coded checkout location.
DEFAULT_HOME_DIR = "/Users/robert.dempsey/Dev/daamlobd"
HOME_DIR = environ.get("DAAMLOBD_HOME") or DEFAULT_HOME_DIR

# Every other location is derived from HOME_DIR.
DATA_DIR = path.join(HOME_DIR, "data")
MLLIB_DATA_DIR = path.join(DATA_DIR, "mllib")
DATA_FILE = path.join(MLLIB_DATA_DIR, "sample_linear_regression_data.txt")
MODEL_FILE_PATH = path.join(DATA_DIR, "linear_model")

# Echo the resolved paths so a wrong root is obvious before Spark starts
print("Home Directory: {}".format(HOME_DIR))
print("Data Directory: {}".format(DATA_DIR))
print("MLlib Data Directory: {}".format(MLLIB_DATA_DIR))
print("Data File: {}".format(DATA_FILE))
print("Model File Path: {}".format(MODEL_FILE_PATH))

Home Directory: /Users/robert.dempsey/Dev/daamlobd
Data Directory: /Users/robert.dempsey/Dev/daamlobd/data
MLlib Data Directory: /Users/robert.dempsey/Dev/daamlobd/data/mllib
Data File: /Users/robert.dempsey/Dev/daamlobd/data/mllib/sample_linear_regression_data.txt
Model File Path: /Users/robert.dempsey/Dev/daamlobd/data/linear_model


In [3]:
# Obtain a SparkContext and wrap it in a SQLContext.
#
# getOrCreate returns the already-active SparkContext if there is one and only
# starts a new one otherwise. Calling the SparkContext(...) constructor
# directly raises an error when this cell is executed a second time in the
# same kernel, because only one SparkContext may be active at a time.
spark_conf = SparkConf().setAppName("Linear Regression with Spark")
sc = SparkContext.getOrCreate(spark_conf)
sqlContext = SQLContext(sc)

In [5]:
# Load the training data into a dataframe
training = sqlContext.read.format("libsvm").load(DATA_FILE)
type(training)

pyspark.sql.dataframe.DataFrame

In [6]:
# Configure the (not yet trained) linear regression estimator.
# Per the Spark ML documentation: maxIter caps the number of optimizer
# iterations, regParam is the regularization strength, and elasticNetParam is
# the elastic net mixing ratio (0 = pure L2 penalty, 1 = pure L1 penalty).
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr

LinearRegression_4bf0b0872b8375f5b6cf

In [7]:
# Fit (train) the model: run the estimator `lr` on the `training` DataFrame
# and keep the fitted result as `lr_model`. The bare name on the last line
# makes the notebook display the fitted model's identifier.
lr_model = lr.fit(training)
lr_model

LinearRegression_4bf0b0872b8375f5b6cf

In [8]:
# Display the coefficient vector of the fitted model (rendered as a
# DenseVector in the cell output).
lr_model.coefficients

DenseVector([0.0, 0.3229, -0.3439, 1.9156, 0.0529, 0.766, 0.0, -0.1511, -0.2159, 0.2203])

## Serialize the Model

In [9]:
# Persist the fitted model under MODEL_FILE_PATH. overwrite() is requested on
# the writer so a model saved there by a previous run does not block the save.
model_writer = lr_model.write().overwrite()
model_writer.save(MODEL_FILE_PATH)

## Deserialize the Model

In [10]:
# Load the model back from MODEL_FILE_PATH (the location it was saved to) into
# a separate variable, then print it so its identifier can be compared with
# the one displayed for the originally fitted model.
new_lr_model = LinearRegressionModel.load(MODEL_FILE_PATH)
print(new_lr_model)

LinearRegression_4bf0b0872b8375f5b6cf


In [None]:
# Display the reloaded model. This cell previously contained only the pasted
# model identifier text, which is not a defined name and would raise a
# NameError when the notebook is run from top to bottom.
new_lr_model

In [11]:
# Show some summary
# Display the coefficient vector of the reloaded model; it is expected to
# match the coefficients shown for `lr_model` before it was saved.
new_lr_model.coefficients

DenseVector([0.0, 0.3229, -0.3439, 1.9156, 0.0529, 0.766, 0.0, -0.1511, -0.2159, 0.2203])

## Clean Up

In [12]:
# Stop the SparkContext `sc` created earlier in this notebook now that the
# model has been trained, saved and reloaded.
sc.stop()