# 6.5 Linear Model Selection and Regularization

In [1]:
# -> Define SparkSession

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('selection methods').getOrCreate()

In [2]:
# -> Load Modules:

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.regression import LinearRegression

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

import pandas as pd
import numpy as np

import utils

## *Lab 1: Subset Selection Methods*

### *6.5.1 Best Subset Selection*

In [3]:
# -> Load Hitters Data:

Hitters = spark.read.csv('data/Hitters.csv',header=True,inferSchema=True)
Hitters = Hitters.drop('_c0')
Hitters = Hitters.filter(F.col('Salary') != 'NA')
Hitters = Hitters.withColumn('Salary', F.col('Salary').cast(T.DoubleType()))

print('\nHitters data:'); Hitters.show(5)
print('\nData types:'); Hitters.printSchema()


Hitters data:
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+
|AtBat|Hits|HmRun|Runs|RBI|Walks|Years|CAtBat|CHits|CHmRun|CRuns|CRBI|CWalks|League|Division|PutOuts|Assists|Errors|Salary|NewLeague|
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+
|  315|  81|    7|  24| 38|   39|   14|  3449|  835|    69|  321| 414|   375|     N|       W|    632|     43|    10| 475.0|        N|
|  479| 130|   18|  66| 72|   76|    3|  1624|  457|    63|  224| 266|   263|     A|       W|    880|     82|    14| 480.0|        A|
|  496| 141|   20|  65| 78|   37|   11|  5628| 1575|   225|  828| 838|   354|     N|       E|    200|     11|     3| 500.0|        N|
|  321|  87|   10|  39| 42|   30|    2|   396|  101|    12|   48|  46|    33|     N|       E|    805|     40|     4|  91.5|        N|
|  594| 169|    4|  74| 51|   35|   11|  4408| 

In [4]:
categoricalCol = ['League', 'NewLeague', 'Division']  
continuousCol = [ 'AtBat', 'Hits', 'HmRun']
Cols = categoricalCol + continuousCol

container = utils.best_subset_selection_GLM(df = Hitters,
                       labelCol = 'Salary', 
                       Cols = Cols,
                       label_is_categorical=False)

n = 5
print('\nTop {} models:'.format(n))
print(container.head(n))

Total number of iterations: 64
Feature/s: ('League',), AIC=3966.076
Feature/s: ('NewLeague',), AIC=3966.128
Feature/s: ('Division',), AIC=3956.198
Feature/s: ('AtBat',), AIC=3921.573
Feature/s: ('Hits',), AIC=3909.918
Feature/s: ('HmRun',), AIC=3933.206
Feature/s: ('League', 'NewLeague'), AIC=3967.984
Feature/s: ('League', 'Division'), AIC=3958.138
Feature/s: ('League', 'AtBat'), AIC=3922.892
Feature/s: ('League', 'Hits'), AIC=3910.978
Feature/s: ('League', 'HmRun'), AIC=3933.989
Feature/s: ('NewLeague', 'Division'), AIC=3958.195
Feature/s: ('NewLeague', 'AtBat'), AIC=3923.238
Feature/s: ('NewLeague', 'Hits'), AIC=3911.422
Feature/s: ('NewLeague', 'HmRun'), AIC=3933.868
Feature/s: ('Division', 'AtBat'), AIC=3914.352
Feature/s: ('Division', 'Hits'), AIC=3903.815
Feature/s: ('Division', 'HmRun'), AIC=3925.292
Feature/s: ('AtBat', 'Hits'), AIC=3908.260
Feature/s: ('AtBat', 'HmRun'), AIC=3916.569
Feature/s: ('Hits', 'HmRun'), AIC=3906.349
Feature/s: ('League', 'NewLeague', 'Division'), AIC

### *6.5.2 Forward and Backward Stepwise Selection*

## *Lab 2: Ridge Regression and the Lasso*

### *6.6.1 Ridge regression*

In [5]:
# -> Prepare data:

data = utils.prepare_data(df = Hitters,
             labelCol = 'Salary',
             label_is_categorical = False,
             categoricalCols = ['League', 'NewLeague', 'Division'],
             continuousCols = [ 'AtBat',
                                'Hits',
                                'HmRun',
                                'Runs',
                                'RBI',
                                'Walks',
                                'Years',
                                'CAtBat',
                                'CHits',
                                'CHmRun',
                                'CRuns',
                                'CRBI',
                                'CWalks',
                                'PutOuts',
                                'Assists',
                                'Errors'])

# -> Define the model:

model = LinearRegression(featuresCol="features", 
                         labelCol="label",
                         predictionCol="prediction", 
                         elasticNetParam=0.0) # !

# -> Define grid values for lambda (regParam):

grid = list(np.linspace(1000, 1, 11))

# -> Estimate MSE for different lambda grid values with cross validation:

pipeline = Pipeline(stages=[model])
paramGrid = ParamGridBuilder().addGrid(model.regParam, grid).build()
modelEvaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=RegressionEvaluator(),
                      numFolds=3)  

cvModel = crossval.fit(data)

# -> Print the results:

summary_table = pd.DataFrame({'lambda':grid,  'MSE': cvModel.avgMetrics})
best_lambda_index = np.argmin(cvModel.avgMetrics)

print('\nLambda and corresponding MSE values:')
print(summary_table)

# -> Select the best model, get estimates and make predictions:

bestModel = cvModel.bestModel

print('\nBest Lambda value: {}'.format(bestModel.stages[-1]._java_obj.parent().getRegParam()))
print('\nCoefficient estimates : {}'.format(bestModel.stages[-1].coefficients))
print('\nIntercept estimate: {}'.format(bestModel.stages[-1].intercept))

print('\nPredictions:')
bestModel.transform(data).select('label', 'prediction').show(5)


Lambda and corresponding MSE values:
           MSE  lambda
0   333.004713  1000.0
1   331.595949   900.1
2   330.192991   800.2
3   328.804227   700.3
4   327.440657   600.4
5   326.116949   500.5
6   324.852554   400.6
7   323.670414   300.7
8   322.574203   200.8
9   321.321780   100.9
10  317.568241     1.0

Best Lambda value: 1.0

Coefficient estimates : [-62.42279068704056,26.867903449723745,-119.37567601471598,-1.9048904399579303,6.920138818440071,2.7432131262174924,-1.5300579746423604,-0.555542001339356,5.922990635648117,-6.746208980485967,-0.12152039845383315,0.1904404515636138,0.32656007735829073,1.1066452768372486,0.5719914092898067,-0.7368855917013047,0.2817738119789803,0.3430558690955312,-3.5309239818643996]

Intercept estimate: 197.87294915232937

Predictions:
+-----+------------------+
|label|        prediction|
+-----+------------------+
|475.0| 372.5635026211904|
|480.0| 711.5729492500842|
|500.0|1159.6195590092332|
| 91.5| 553.6505422507867|
|750.0|509.04120411137455

### *6.6.2 The Lasso*

In [6]:
# -> Hitters data:

print('Hitters data :'); data.show(5)

# -> Build the model:

model = LinearRegression(featuresCol="features", 
                         labelCol="label",
                         predictionCol="prediction",
                         elasticNetParam=1.0) # !

# -> Define grid values:

grid = list(np.linspace(1000, 1, 10))

# -> Estimate MSE for different lambda grid values with cross validation:

pipeline = Pipeline(stages=[model])

paramGrid = ParamGridBuilder().addGrid(model.regParam, grid).build()
modelEvaluator = RegressionEvaluator(labelCol="mpg", predictionCol="prediction", metricName="mse")

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=RegressionEvaluator(),
                      numFolds=3)  

cvModel = crossval.fit(data)

# -> Print the results:

summary_table = pd.DataFrame({'lambda':grid,  'MSE': cvModel.avgMetrics})
best_lambda_index = np.argmin(cvModel.avgMetrics)

print('\nLambda and corresponding MSE values:')
print(summary_table)

# -> Select the best model, get estimates and make predictions:

bestModel = cvModel.bestModel

print('\nBest Lambda value: {}'.format(bestModel.stages[-1]._java_obj.parent().getRegParam()))
print('\nCoefficient estimates : {}'.format(bestModel.stages[-1].coefficients))
print('\nIntercept estimate: {}'.format(bestModel.stages[-1].intercept))

print('\nPredictions:')
bestModel.transform(data).select('label', 'prediction').show(5)

Hitters data :
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+-----+--------------+-----------------+----------------+----------------------+-------------------------+------------------------+--------------------+
|AtBat|Hits|HmRun|Runs|RBI|Walks|Years|CAtBat|CHits|CHmRun|CRuns|CRBI|CWalks|League|Division|PutOuts|Assists|Errors|Salary|NewLeague|label|League_indexed|NewLeague_indexed|Division_indexed|League_indexed_encoded|NewLeague_indexed_encoded|Division_indexed_encoded|            features|
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+-----+--------------+-----------------+----------------+----------------------+-------------------------+------------------------+--------------------+
|  315|  81|    7|  24| 38|   39|   14|  3449|  835|    69|  321| 414|   375|     N|       W|    632|     43|    10| 475.0|       

## 6.7 Lab 3: PCR and PLS Regression