# 6.5 Linear Model Selection and Regularization

In [1]:
# -> Define SparkSession

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('selection methods').getOrCreate()

In [2]:
# -> Load Modules:

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.regression import LinearRegression

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

import pandas as pd
import numpy as np

import utils

## *Lab 1: Subset Selection Methods*

### *6.5.1 Best Subset Selection*

In [3]:
# -> Load Hitters Data:

Hitters = spark.read.csv('data/Hitters.csv',header=True,inferSchema=True)
Hitters = Hitters.drop('_c0')
Hitters = Hitters.filter(F.col('Salary') != 'NA')
Hitters = Hitters.withColumn('Salary', F.col('Salary').cast(T.DoubleType()))

print('\nHitters data:'); Hitters.show(5)
print('\nData types:'); Hitters.printSchema()


Hitters data:
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+
|AtBat|Hits|HmRun|Runs|RBI|Walks|Years|CAtBat|CHits|CHmRun|CRuns|CRBI|CWalks|League|Division|PutOuts|Assists|Errors|Salary|NewLeague|
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+
|  315|  81|    7|  24| 38|   39|   14|  3449|  835|    69|  321| 414|   375|     N|       W|    632|     43|    10| 475.0|        N|
|  479| 130|   18|  66| 72|   76|    3|  1624|  457|    63|  224| 266|   263|     A|       W|    880|     82|    14| 480.0|        A|
|  496| 141|   20|  65| 78|   37|   11|  5628| 1575|   225|  828| 838|   354|     N|       E|    200|     11|     3| 500.0|        N|
|  321|  87|   10|  39| 42|   30|    2|   396|  101|    12|   48|  46|    33|     N|       E|    805|     40|     4|  91.5|        N|
|  594| 169|    4|  74| 51|   35|   11|  4408| 

In [4]:
categoricalCol = ['League', 'NewLeague', 'Division']  
continuousCol = [ 'AtBat', 'Hits', 'HmRun']
Cols = categoricalCol + continuousCol

container = utils.best_subset_selection_GLM(df = Hitters,
                       labelCol = 'Salary', 
                       Cols = Cols,
                       label_is_categorical=False)

n = 5
print('\nTop {} models:'.format(n))
print(container.head(n))

Total number of iterations: 64
Completed fitting feature/s: ('League',), AIC=3966.076
Completed fitting feature/s: ('NewLeague',), AIC=3966.128
Completed fitting feature/s: ('Division',), AIC=3956.198
Completed fitting feature/s: ('AtBat',), AIC=3921.573
Completed fitting feature/s: ('Hits',), AIC=3909.918
Completed fitting feature/s: ('HmRun',), AIC=3933.206
Completed fitting feature/s: ('League', 'NewLeague'), AIC=3967.984
Completed fitting feature/s: ('League', 'Division'), AIC=3958.138
Completed fitting feature/s: ('League', 'AtBat'), AIC=3922.892
Completed fitting feature/s: ('League', 'Hits'), AIC=3910.978
Completed fitting feature/s: ('League', 'HmRun'), AIC=3933.989
Completed fitting feature/s: ('NewLeague', 'Division'), AIC=3958.195
Completed fitting feature/s: ('NewLeague', 'AtBat'), AIC=3923.238
Completed fitting feature/s: ('NewLeague', 'Hits'), AIC=3911.422
Completed fitting feature/s: ('NewLeague', 'HmRun'), AIC=3933.868
Completed fitting feature/s: ('Division', 'AtBat'),

### *6.5.2 Forward and Backward Stepwise Selection*

## *Lab 2: Ridge Regression and the Lasso*

### *6.6.1 Ridge regression*

In [319]:
# -> Prepare data:

data = utils.prepare_data(df = Hitters,
             labelCol = 'Salary',
             label_is_categorical = False,
             categoricalCols = ['League', 'NewLeague', 'Division'],
             continuousCols = [ 'AtBat',
                                'Hits',
                                'HmRun',
                                'Runs',
                                'RBI',
                                'Walks',
                                'Years',
                                'CAtBat',
                                'CHits',
                                'CHmRun',
                                'CRuns',
                                'CRBI',
                                'CWalks',
                                'PutOuts',
                                'Assists',
                                'Errors'])

# -> Define the model:

model = LinearRegression(featuresCol="features", 
                         labelCol="label",
                         predictionCol="prediction", 
                         elasticNetParam=0.0) # !

# -> Define grid values for lambda (regParam):

grid = list(np.linspace(1000, 1, 11))

# -> Estimate MSE for different lambda grid values with cross validation:

pipeline = Pipeline(stages=[model])
paramGrid = ParamGridBuilder().addGrid(model.regParam, grid).build()
modelEvaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=RegressionEvaluator(),
                      numFolds=3)  

cvModel = crossval.fit(data)

# -> Print the results:

summary_table = pd.DataFrame({'lambda':grid,  'MSE': cvModel.avgMetrics})
best_lambda_index = np.argmin(cvModel.avgMetrics)

print('\nLambda and corresponding MSE values:')
print(summary_table)

# -> Select the best model, get estimates and make predictions:

bestModel = cvModel.bestModel

print('\nBest Lambda value: {}'.format(bestModel.stages[-1]._java_obj.parent().getRegParam()))
print('\nCoefficient estimates : {}'.format(bestModel.stages[-1].coefficients))
print('\nIntercept estimate: {}'.format(bestModel.stages[-1].intercept))

print('\nPredictions:')
bestModel.transform(data).select('label', 'prediction').show(5)


Lambda and corresponding MSE values:
           MSE  lambda
0   343.819151  1000.0
1   342.552647   900.1
2   341.312171   800.2
3   340.113320   700.3
4   338.978464   600.4
5   337.941137   500.5
6   337.054406   400.6
7   336.408027   300.7
8   336.167232   200.8
9   336.683349   100.9
10  346.375369     1.0

Best Lambda value: 200.79999999999995

Coefficient estimates : [-27.828336859541572,-6.8740744257379545,-93.04437759047963,0.02443551516885328,1.0303026074063057,0.08583898600646346,1.1190140974994425,0.8727045871636055,1.826706113225553,-0.00682257117777462,0.011120397634311637,0.06573723781721515,0.4554615848696543,0.13043560491309092,0.139562658365293,0.025847458246861334,0.194355378769608,0.044359744572198656,-1.8709215934185555]

Intercept estimate: 44.30873730038608

Predictions:
+-----+------------------+
|label|        prediction|
+-----+------------------+
|475.0| 514.2261795834019|
|480.0| 637.6618360001133|
|500.0| 948.1892132363663|
| 91.5|458.51915335445045|
|750.

### *6.6.2 The Lasso*

In [321]:
# -> Hitters data:

print('Hitters data :'); data.show(5)

# -> Build the model:

model = LinearRegression(featuresCol="features", 
                         labelCol="label",
                         predictionCol="prediction",
                         elasticNetParam=1.0) # !

# -> Define grid values:

grid = list(np.linspace(1000, 1, 10))

# -> Estimate MSE for different lambda grid values with cross validation:

pipeline = Pipeline(stages=[model])

paramGrid = ParamGridBuilder().addGrid(model.regParam, grid).build()
modelEvaluator = RegressionEvaluator(labelCol="mpg", predictionCol="prediction", metricName="mse")

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=RegressionEvaluator(),
                      numFolds=3)  

cvModel = crossval.fit(data)

# -> Print the results:

summary_table = pd.DataFrame({'lambda':grid,  'MSE': cvModel.avgMetrics})
best_lambda_index = np.argmin(cvModel.avgMetrics)

print('\nLambda and corresponding MSE values:')
print(summary_table)

# -> Select the best model, get estimates and make predictions:

bestModel = cvModel.bestModel

print('\nBest Lambda value: {}'.format(bestModel.stages[-1]._java_obj.parent().getRegParam()))
print('\nCoefficient estimates : {}'.format(bestModel.stages[-1].coefficients))
print('\nIntercept estimate: {}'.format(bestModel.stages[-1].intercept))

print('\nPredictions:')
bestModel.transform(data).select('label', 'prediction').show(5)

Hitters data :
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+-----+--------------+-----------------+----------------+----------------------+-------------------------+------------------------+--------------------+
|AtBat|Hits|HmRun|Runs|RBI|Walks|Years|CAtBat|CHits|CHmRun|CRuns|CRBI|CWalks|League|Division|PutOuts|Assists|Errors|Salary|NewLeague|label|League_indexed|NewLeague_indexed|Division_indexed|League_indexed_encoded|NewLeague_indexed_encoded|Division_indexed_encoded|            features|
+-----+----+-----+----+---+-----+-----+------+-----+------+-----+----+------+------+--------+-------+-------+------+------+---------+-----+--------------+-----------------+----------------+----------------------+-------------------------+------------------------+--------------------+
|  315|  81|    7|  24| 38|   39|   14|  3449|  835|    69|  321| 414|   375|     N|       W|    632|     43|    10| 475.0|       

## 6.7 Lab 3: PCR and PLS Regression