# 5. Resampling Methods

In [1]:
# -> Define Spark session and SQLContext 

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('resampling_methods').getOrCreate()

In [2]:
# Load modules

import numpy as np
import pandas as pd
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as F

from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder, PolynomialExpansion
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import utils

## 5.3 Lab: Cross-Validation and the Boostrap

### *5.3.1. The Validation Set Approach*

In [3]:
# -> Load Auto dataset

Auto = spark.read.csv('data/Auto.csv',header=True,inferSchema=True)
Auto = Auto.withColumn('horsepower', F.col('horsepower').cast(DoubleType()))
Auto = Auto.na.drop()

print('\nAuto dataset:'); Auto.show(3)
print('\nData types:'); Auto.printSchema()

# -> Prepare data

data = Auto.withColumn('horsepower_power_2', F.pow(F.col('horsepower'),2))
data = utils.prepare_data(data,
                          labelCol = 'mpg',
                          label_is_categorical = False,
                          categoricalCols = [],
                          continuousCols = ['horsepower', 'horsepower_power_2'])

# -> Create train and test samples

train, test = data.randomSplit([0.7,.3], seed=42)

# -> Describe the model:

model = LinearRegression(featuresCol="features", labelCol='label')

# -> Fit the model:

model_fit = model.fit(train)

# -> Estimate models' MSE for train and test samples:

train_predictions = model_fit.transform(train)
test_predictions = model_fit.transform(test)

evaluator = RegressionEvaluator(labelCol="mpg", predictionCol="prediction", metricName="mse")
train_mse = evaluator.evaluate(train_predictions)
test_mse = evaluator.evaluate(test_predictions)

print("Models' Mean Squared Error (MSE) on train data {:.3f}".format(train_mse))
print("Models' Mean Squared Error (MSE) on test data {:.3f}".format(test_mse))


Auto dataset:
+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|     130.0|  3504|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|     165.0|  3693|        11.5|  70|     1|   buick skylark 320|
|18.0|        8|       318.0|     150.0|  3436|        11.0|  70|     1|  plymouth satellite|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 3 rows


Data types:
root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: double (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 

### *5.3.3 k-Fold Cross-Validation*

In [4]:
# -> Prepare data:

data = utils.prepare_data(df=Auto,
                    labelCol='mpg',
                    label_is_categorical=False,
                    categoricalCols=[],
                    continuousCols=['horsepower'])

# -> Describe the model:

model = LinearRegression(featuresCol="features", labelCol='label')

# -> Estimate K=3 cross validation MSE:

modelEvaluator=RegressionEvaluator(metricName="mse")

pipeline = Pipeline(stages=[model])
paramGrid = ParamGridBuilder().build()

cv = CrossValidator(estimator=model,
                          estimatorParamMaps=paramGrid,
                          evaluator=modelEvaluator,
                          numFolds=3,
                          seed=42)
cvModel = cv.fit(data)
print('CV MSE average = {:.3f}'.format(cvModel.avgMetrics[0]))

CV MSE average = 24.231


### *5.3.4 The Bootstrap*

#### Estimating the Accuracy of a Statistic of Interest

In [5]:
# Generate data

from pyspark.sql.functions import rand, randn
Portfolio = spark.createDataFrame(pd.DataFrame(np.arange(0, 1000, 1), columns=['id']))
Portfolio = Portfolio.select("id", 
               F.rand(seed=10).alias("X"), 
               F.randn(seed=27).alias("Y"))
Portfolio.show(5) 

# Define a custom function to estimate statistic of interest

def custom_function(df, col_X, col_Y):
    var_X = df.agg(F.variance(col_X)).collect()[0][0]
    var_Y = df.agg(F.variance(col_Y)).collect()[0][0]
    cov_X_Y = df.stat.cov(col_X,col_Y)
    result = ((var_Y-cov_X_Y))/( var_X+ var_Y -2 * cov_X_Y)
    return result

# Initial parameters for Ordinary NonParametric Bootstrap 

boot_n = 100
left_quantile_fraction = 2.5
right_quantile_fraction = 97.5

# Ordinary NonParametric Bootsrap output 

results = np.zeros([boot_n])
for i in range(0, boot_n):
    df_tmp = Portfolio.sample(withReplacement = True, fraction=1.0)
    results[i] = custom_function(df_tmp, 'X', 'Y')
print('estimated bootstrap lower and upper quantiles = {}'.format(np.percentile(results, [left_quantile_fraction, right_quantile_fraction])))

+---+-------------------+------------------+
| id|                  X|                 Y|
+---+-------------------+------------------+
|  0|0.41371264720975787|0.5888539012978773|
|  1| 0.7311719281896606|0.8645537008427937|
|  2| 0.9031701155118229|1.2524569684217643|
|  3|0.09430205113458567|-2.573636861034734|
|  4|0.38340505276222947|0.5469737451926588|
+---+-------------------+------------------+
only showing top 5 rows

estimated bootstrap lower and upper quantiles = [0.89564129 0.92415818]


#### Estimating the Accuracy of a Linear Regression Model

In [6]:
## -> Estimate confidence interval for regression parameters using nonparametric Bootstrap

# -> Prepare data

data = utils.prepare_data(Auto,
                    labelCol = 'mpg',
                    label_is_categorical = False,
                    categoricalCols = [],
                    continuousCols = ['horsepower'])

# -> Describe the model

model = LinearRegression(featuresCol="features", labelCol='label')

# -> Bootstrap CI estimates 

def bootstrap_confidence_intervals(df, model, boot_n, lower_quantile, upper_quantile, sample_prop_data=0.5, seed=42):
    results = []
    for i in range(0, boot_n):
        data_tmp = df.sample(withReplacement = True, fraction=sample_prop_data, seed=seed)
        model_fit = model.fit(data_tmp)
        result = np.append(model_fit.coefficients.toArray(), model_fit.intercept)
        results.append(result)
    estim_percentiles = np.apply_along_axis(lambda x: np.percentile(x, [lower_quantile, upper_quantile]),0,results).T
    return estim_percentiles

boostrap_CI = bootstrap_confidence_intervals(df=data, 
                                             model=model, 
                                             boot_n=50, 
                                             lower_quantile=2.5,
                                             upper_quantile=97.5)
print('Estimated non-parametric Bootsrap 95% confidence intervals for the estimated regression model parameters:')
print(boostrap_CI)

Estimated non-parametric Bootsrap 95% confidence intervals for the estimated regression model parameters:
[[-0.17025546 -0.17025546]
 [40.84756209 40.84756209]]
