# Hyperparameter tuning

## Spark

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Apache_Spark_logo.svg/1280px-Apache_Spark_logo.svg.png" width="400">

**Hardware**: 10 nodes, each an r5.8xlarge (32 vCPUs, 256 GB RAM)

In [2]:
import os

# Root S3 location of the NYC taxi data for this notebook.
# Presumably ml_utils reads TAXI_S3 to build `taxi_path` — confirm in ml_utils.
os.environ.update(TAXI_S3='s3://saturn-titan/nyc-taxi')

In [3]:
from ml_utils import MLUtils

# Identify this run by task / tool / model; the same three values are used
# further down to build the predictions output path.
ml_utils = MLUtils(ml_task='tip', tool='spark', model='elastic_net')

# Load data and feature engineering

In [4]:
import numpy as np
import datetime
import findspark
# Keep this call ahead of the pyspark imports below: findspark.init() is what
# makes pyspark importable when it is not already on sys.path.
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T


# getOrCreate() reuses an already-active SparkSession if there is one and only
# builds a new one otherwise. No master/app options are set here, so the
# cluster settings presumably come from the environment's Spark configuration.
spark = SparkSession.builder.getOrCreate()

In [5]:
%%time
tip_train = spark.read.parquet(f'{ml_utils.taxi_path}/data/ml/tip_train_sample')
tip_train.count()

CPU times: user 0 ns, sys: 3.55 ms, total: 3.55 ms
Wall time: 10.4 s


10994502

In [6]:
# Show a single Row to eyeball the column names and types.
tip_train.first()

Row(id='326fdd4d9a1843488a38d16a3bb6278b', pickup_datetime=datetime.datetime(2016, 7, 16, 18, 24, 40), dropoff_datetime=datetime.datetime(2016, 7, 16, 18, 49, 56), pickup_taxizone_id=237.0, dropoff_taxizone_id=249.0, pickup_weekday=5, pickup_weekofyear=28, pickup_hour=18, pickup_minute=24, pickup_week_hour=138, passenger_count=1.0, tip_fraction=0.11428571428571428)

<br>

Let's take the same sample we used in the single-node scikit-learn example.

In [7]:
# Seeded, without-replacement draw of roughly 10% of the training rows.
# Spark treats `fraction` as a target, not an exact size, so the count below
# is close to (not exactly) one tenth of the full row count.
sample = tip_train.sample(
    withReplacement=False,
    fraction=0.1,
    seed=42,
)
sample.count()

1097989

# Run grid search

In [8]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline

features = ml_utils.tip_vars.features
y_col = ml_utils.tip_vars.y_col
# Materialise once; the categorical column list drives three stages below.
categorical_feat = list(ml_utils.tip_vars.categorical_feat)

# --- Categorical features: string-index, then one-hot encode ---------------
# handleInvalid='keep' routes values not seen during fit to an extra index
# instead of raising at transform time (e.g. a taxi zone absent from a fold).
indexers = [
    StringIndexer(inputCol=c, outputCol=f'{c}_idx', handleInvalid='keep')
    for c in categorical_feat
]
encoders = [
    OneHotEncoder(inputCol=f'{c}_idx', outputCol=f'{c}_onehot')
    for c in categorical_feat
]

# --- Numeric features: assemble into one vector, then scale ----------------
num_assembler = VectorAssembler(
    inputCols=ml_utils.tip_vars.numeric_feat,
    outputCol='num_features',
)
# Default StandardScaler settings: scale to unit standard deviation, no centering.
scaler = StandardScaler(inputCol='num_features', outputCol='num_features_scaled')

# --- Final feature vector: one-hot blocks followed by the scaled numerics ---
assembler = VectorAssembler(
    inputCols=[f'{c}_onehot' for c in categorical_feat] + ['num_features_scaled'],
    outputCol='features',
)

# standardization=False: the numeric columns are already scaled by `scaler`,
# and the one-hot columns are intentionally left as 0/1.
# labelCol is not set, so the model reads the default `label` column.
lr = LinearRegression(standardization=False, maxIter=100)
pipeline = Pipeline(
    stages=indexers + encoders + [num_assembler, scaler, assembler, lr],
)

# The shared grid uses scikit-learn style keys; map them onto the Spark
# LinearRegression params (l1_ratio -> elasticNetParam, alpha -> regParam).
params = ml_utils.tip_vars.elastic_net_grid_search_params
grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, params['clf__l1_ratio'])
    .addGrid(lr.regParam, params['clf__alpha'])
    .build()
)

# The whole pipeline is the estimator, so indexing/encoding/scaling are refit
# inside every fold rather than once on the full sample.
# RegressionEvaluator() with no arguments scores RMSE on `label` vs `prediction`.
# seed pins the fold assignment: without it the split is not fixed from run to
# run, so the selected hyperparameters could not be reproduced. 42 matches the
# seed used when drawing the training sample.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=RegressionEvaluator(),
    numFolds=3,
    seed=42,
)

In [9]:
# Neither the LinearRegression stage nor the CrossValidator's evaluator sets
# labelCol, so expose the target under Spark ML's default `label` name.
# withColumn replaces an existing `label`, so re-running this cell is harmless.
sample = sample.withColumn(
    colName='label',
    col=sample[y_col],
)
# Mark the frame for caching: the grid search fits on it many times.
sample.cache()

DataFrame[id: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, pickup_taxizone_id: double, dropoff_taxizone_id: double, pickup_weekday: bigint, pickup_weekofyear: bigint, pickup_hour: bigint, pickup_minute: bigint, pickup_week_hour: bigint, passenger_count: double, tip_fraction: double, label: double]

In [10]:
%%time
# Run the full grid search (3-fold cross-validation over every parameter
# combination) on the cached sample.
# time_fit() wraps the fit; presumably it records the elapsed time that is
# later reported as `fit_seconds` alongside the metric — confirm in ml_utils.
with ml_utils.time_fit():
    fitted = crossval.fit(sample)

CPU times: user 1min 47s, sys: 27.5 s, total: 2min 14s
Wall time: 47min 45s


In [11]:
# The regression is the last pipeline stage of the winning model. Its tuned
# values are read through the underlying JVM object (`_java_obj` is a private
# handle, used here as-is).
best_lr_java = fitted.bestModel.stages[-1]._java_obj
print(f'regParam: {best_lr_java.getRegParam()}')
print(f'elasticNetParam: {best_lr_java.getElasticNetParam()}')

regParam: 0.0
elasticNetParam: 0.28


## Predict on test set

In [12]:
# Score the held-out test set with the best pipeline found by the grid search;
# transform() applies every fitted stage and appends a `prediction` column.
test_path = f'{ml_utils.taxi_path}/data/ml/tip_test'
tip_test = spark.read.parquet(test_path)
preds = fitted.transform(tip_test)

In [13]:
# Inspect one scored Row, including the intermediate feature columns.
preds.first()

Row(id='2e8f402e4dc44f2fae8b9328a237c4d2', pickup_datetime=datetime.datetime(2019, 9, 9, 10, 19, 44), dropoff_datetime=datetime.datetime(2019, 9, 9, 10, 31, 26), pickup_taxizone_id=162.0, dropoff_taxizone_id=170.0, pickup_weekday=0, pickup_weekofyear=37, pickup_hour=10, pickup_minute=19, pickup_week_hour=10, passenger_count=1.0, tip_fraction=0.11764705882352941, pickup_taxizone_id_idx=3.0, dropoff_taxizone_id_idx=3.0, pickup_taxizone_id_onehot=SparseVector(251, {3: 1.0}), dropoff_taxizone_id_onehot=SparseVector(260, {3: 1.0}), num_features=DenseVector([0.0, 37.0, 10.0, 10.0, 19.0, 1.0]), num_features_scaled=DenseVector([0.0, 2.461, 1.5953, 0.2167, 1.0965, 0.8007]), features=SparseVector(517, {3: 1.0, 254: 1.0, 512: 2.461, 513: 1.5953, 514: 0.2167, 515: 1.0965, 516: 0.8007}), prediction=0.2241735993441916)

In [14]:
# Output location for this run's predictions: <task>__<tool>__<model>
# under the shared ml_results/predictions prefix.
run_name = f'{ml_utils.ml_task}__{ml_utils.tool}__{ml_utils.model}'
path = f'{ml_utils.taxi_path}/ml_results/predictions/{run_name}'
path

's3://saturn-titan/nyc-taxi/ml_results/predictions/tip__spark__elastic_net'

In [15]:
# Quick side-by-side of the true target and the model output for a few rows.
preds.select(
    preds['id'],
    preds[y_col].alias('actual'),
    preds['prediction'].alias('predicted'),
).show(5)

+--------------------+-------------------+-------------------+
|                  id|             actual|          predicted|
+--------------------+-------------------+-------------------+
|2e8f402e4dc44f2fa...|0.11764705882352941| 0.2241735993441916|
|5f067a4121244f42b...| 0.2168421052631579| 0.2263182368770626|
|60e8442d3d434df49...|               0.15|0.22355489078050717|
|2d1537ce2ed347778...|            0.10625|0.22432047622562074|
|13bb8a9ecbd04b559...|                0.0|0.21425493006033125|
+--------------------+-------------------+-------------------+
only showing top 5 rows



In [16]:
%%time
(preds
 .select(preds.id, preds[y_col].alias('actual'), preds.prediction.alias('predicted'))
 .write.parquet(path, mode='overwrite')
)

CPU times: user 1.88 ms, sys: 4.85 ms, total: 6.73 ms
Wall time: 6.44 s


In [17]:
# Test-set RMSE, comparing the `prediction` column with the true target
# column, then handed to ml_utils to be recorded with this run's metadata.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=y_col,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(preds)
ml_utils.write_metric_df('rmse', rmse)

Unnamed: 0,ml_task,tool,model,metric,value,fit_seconds
0,tip,spark,elastic_net,rmse,0.207875,2865.376522
