# Lasso Regression with PySpark

This notebook creates and measures a LASSO regression model using PySpark MLlib.

* Method: LASSO regression
* Dataset: MLlib Ridge Data

## Imports

In [None]:
# Python core libs
from os import getlogin, path, environ
import numpy as np

# Set SPARK_HOME
environ["SPARK_HOME"] = "/home/students/spark-2.2.0"

# Findspark
import findspark
findspark.init()

# PySpark and PySpark SQL
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# PySpark MLlib
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Get Some Context

In [None]:
# Create (or reuse) a SparkContext and a SQLContext to use.
# getOrCreate keeps this cell re-runnable: constructing SparkContext(...) a second
# time in the same kernel fails because only one active context is allowed.
spark_conf = SparkConf().setAppName("LASSO Regression with Spark")
sc = SparkContext.getOrCreate(conf=spark_conf)
sqlContext = SQLContext(sc)

## Load and Prepare the Data

In [None]:
# Location of the data file that is read with sc.textFile below
DATA_FILE = "/home/students/data/mllib/ridge-data/lpsa.data"

In [None]:
def parse_point(line):
    '''
    Returns a labeled point for the record.

    A labeled point is a local vector, either dense or sparse, associated with a label/response.

    The record is a string of numbers separated by commas and/or whitespace;
    the first number becomes the label and the remaining numbers the features.
    Raises ValueError if any token is not a valid float.
    '''
    # split() with no argument collapses runs of whitespace and ignores leading/trailing
    # whitespace, so repeated or trailing separators do not produce empty tokens
    # (split(' ') would, and float('') raises).
    values = [float(token) for token in line.replace(',', ' ').split()]
    return LabeledPoint(values[0], values[1:])

# Read the data file as an RDD of text lines, then turn each line into a LabeledPoint
data = sc.textFile(DATA_FILE)
parsed_data = data.map(parse_point)

In [None]:
# Show a single parsed record: LabeledPoint(label, [features])
parsed_data.take(1)

## Fit a Linear Regression Model with LASSO

Train a linear regression model using [Stochastic Gradient Descent (SGD)](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) and L1 (LASSO) regularization.

* iterations:  number of iterations. (default: 100)
* step: step parameter used in SGD. (default: 1.0)
* regParam: the regularization strength. (default: 0.0) With the default of 0.0 no penalty is applied, whichever `regType` is chosen.
* regType: the type of regularizer used for training our model. Supported values:
    * `l1` for using L1 regularization
    * `l2` for using L2 regularization
    * None for no regularization (default)

In [None]:
# Train a LinearRegressionWithSGD model on the RDD of LabeledPoints.
# regParam must be non-zero for the L1 (LASSO) penalty to take effect: with the
# default regParam=0.0 the penalty term is multiplied by zero and the fit is
# ordinary least squares, whatever regType says.
# intercept=True asks the trainer to fit an intercept term, which the next cell
# reports; per the MLlib docs the default is False (no intercept is fitted).
model = LinearRegressionWithSGD.train(
    parsed_data,
    iterations=100,
    step=0.00000001,
    regParam=0.01,
    regType='l1',
    intercept=True
)

In [None]:
# Report the intercept term of the fitted model
intercept_value = model.intercept
print('Estimated intercept coefficient: {}'.format(intercept_value))

## Create Predictions

In [None]:
def _actual_and_predicted(point):
    '''Pairs a labeled point's label with the model's prediction for its features.'''
    return (point.label, model.predict(point.features))

# Build an RDD of (actual, predicted) pairs and peek at the first one
values_and_predictions = parsed_data.map(_actual_and_predicted)
values_and_predictions.take(1)

In [None]:
# Create a plot to compare the actuals (values) and predictions

# Bring the (actual, predicted) pairs to the driver once and cast each prediction
# to a plain Python float so createDataFrame can infer the column type.
vp_list = values_and_predictions.collect()
vp_list_prepared = [(x[0], float(x[1])) for x in vp_list]

# DataFrame view of the pairs; not needed for the plot itself, but kept so the
# name stays available to any other cell that uses it.
values_predictions_df = sqlContext.createDataFrame(vp_list_prepared, ["actual", "predicted"])

# The pairs are already local, so split them here instead of launching two more
# Spark jobs to collect the same values back out of the DataFrame.
actuals = [pair[0] for pair in vp_list_prepared]
predictions = [pair[1] for pair in vp_list_prepared]

print("Min actual: {}".format(min(actuals)))
print("Mean actual: {}".format(np.mean(actuals)))
print("Max actual: {}\n".format(max(actuals)))
print("Min prediction: {}".format(min(predictions)))
print("Mean prediction: {}".format(np.mean(predictions)))
print("Max prediction: {}".format(max(predictions)))

# Explicit figure/axes interface rather than the pyplot state machine
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(actuals, predictions)
ax.set_xlabel("Actuals")
ax.set_ylabel("Predictions")
ax.set_title("Actuals vs. Predictions")
plt.show()

## Model Evaluation

### Mean Squared Error

* The average of the squared differences between the actual and predicted values. Squaring removes the direction of each error and weights large errors more heavily; used to measure accuracy for continuous variables.
* Always non-negative
* Values closer to zero (0) are better

In [None]:
# Calculate the Mean Squared Error: sum of squared residuals divided by the record count
squared_errors = values_and_predictions.map(lambda pair: (pair[0] - pair[1]) ** 2)
sum_of_squares = squared_errors.reduce(lambda total, err: total + err)
MSE = sum_of_squares / values_and_predictions.count()

print("Mean Squared Error = " + str(MSE))

## Cleanup

In [None]:
# Shut down the SparkContext created earlier in the notebook
sc.stop()