# XGBoost regression

## Dask cluster

<table>
    <tr>
        <td>
            <img src="https://docs.dask.org/en/latest/_images/dask_horizontal.svg" width="300">
        </td>
        <td>
            <img src="https://upload.wikimedia.org/wikipedia/commons/6/69/XGBoost_logo.png" width="300">
        </td>
    </tr>
</table>

In [1]:
import os
import numpy as np

# Directory that trained model artifacts are written to.
MODEL_PATH = 'models'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(MODEL_PATH, exist_ok=True)

# Feature columns handled as plain numeric inputs.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
# Taxi-zone identifier columns.
categorical_feat = [
    'pickup_taxizone_id',
    'dropoff_taxizone_id',
]
# Full, ordered list of model input columns.
features = numeric_feat + categorical_feat
# Regression target column.
y_col = 'tip_fraction'

# Initialize Dask cluster

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

# Number of Dask workers requested for the cluster.
n_workers = 3
cluster = SaturnCluster(
    n_workers=n_workers,
    scheduler_size='medium',
    worker_size='large',
    nthreads=2,
)
# Connect this notebook session to the cluster's scheduler.
client = Client(cluster)
# Last expression of the cell: displays the cluster widget.
cluster

[2020-09-11 21:19:08] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

Open the dashboard (link above ^) and watch it while you execute commands; you'll see which tasks are running across the cluster.

If you created your cluster here in this notebook, it might take a few minutes for all your nodes to become available. You can run the cell below to block until all nodes are ready.

>**Pro tip**: Create and/or start your cluster from the "Dask" page in Saturn if you want to get a head start!

In [3]:
# Poll the scheduler until every requested worker has registered.
while True:
    # Query once per pass so the count that is printed is the count that was tested.
    n_ready = len(client.scheduler_info()['workers'])
    if n_ready >= n_workers:
        break
    print('Waiting for workers, got', n_ready)
    time.sleep(30)
print('Done!')

Done!


# Load data and feature engineering

Load a sample from a single month for this exercise

In [5]:
import yaml
import snowflake.connector

# Credentials live in a local YAML file rather than in the notebook.
# The context manager closes the file handle as soon as the file is parsed.
with open('/home/jovyan/snowflake_creds.yml') as creds_file:
    creds = yaml.full_load(creds_file)

# get connection info
conn_info = {
    'warehouse': 'COMPUTE_WH',
    'database': 'NYC_TAXI',
    'schema': 'PUBLIC',
    **creds,
}
conn = snowflake.connector.connect(**conn_info)
# List every distinct pickup date in the table; later cells filter this list by month.
q = "select DISTINCT(DATE(PICKUP_DATETIME)) as date from taxi_yellow"
cur = conn.cursor().execute(q)
dates = cur.fetch_pandas_all()['DATE'].tolist()

In [6]:
from dask import delayed


# SQL template that selects the model columns for a single pickup date.
# The trailing %s is a bind placeholder: `load` passes the day as a query
# parameter to `cursor.execute`, so it is not string-formatted here.
# TIP_FRACTION is TIP_AMOUNT / FARE_AMOUNT computed with DIV0; PICKUP_WEEKDAY is
# the ISO day of week shifted down by one; PICKUP_WEEK_HOUR combines weekday and hour.
query = """
SELECT
    pickup_taxizone_id,
    dropoff_taxizone_id,
    passenger_count,
    DIV0(TIP_AMOUNT, FARE_AMOUNT) as TIP_FRACTION,
    DAYOFWEEKISO(PICKUP_DATETIME) - 1 as PICKUP_WEEKDAY,
    WEEKOFYEAR(PICKUP_DATETIME) as PICKUP_WEEKOFYEAR,
    HOUR(PICKUP_DATETIME) as PICKUP_HOUR,
    (PICKUP_WEEKDAY * 24) + PICKUP_HOUR as PICKUP_WEEK_HOUR,
    MINUTE(PICKUP_DATETIME) as PICKUP_MINUTE
FROM taxi_yellow
WHERE
    date(pickup_datetime) = %s
"""


@delayed
def load(conn_info, query, day, frac=0.01):
    """Fetch one day of taxi rows from Snowflake and return a random sample.

    The function is wrapped in ``dask.delayed``, so calling it only builds a
    task; the query runs when the task is executed.

    Parameters
    ----------
    conn_info : dict
        Keyword arguments forwarded to ``snowflake.connector.connect``.
    query : str
        SQL text containing one bind placeholder for the pickup date.
    day : object
        The pickup date; it is converted with ``str(day)`` and bound to the query.
    frac : float, default 0.01
        Fraction of the day's rows to draw. Sampling is done with replacement.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with column names lower-cased.
    """
    conn = snowflake.connector.connect(**conn_info)
    try:
        cur = conn.cursor().execute(query, str(day))
        day_rows = cur.fetch_pandas_all()
    finally:
        # A connection is opened per call; release it even if the query fails.
        conn.close()
    # Honor the caller-supplied fraction (it was previously fixed at 0.01).
    taxi = day_rows.sample(frac=frac, replace=True)
    taxi.columns = [x.lower() for x in taxi.columns]
    return taxi

    

In [7]:
import datetime as dt
# Keep the pickup dates in January 2019 (start inclusive, end exclusive); skip missing dates.
_dates = [
    day
    for day in dates
    if day is not None
    and day >= dt.date(2019, 1, 1)
    and day < dt.date(2019, 2, 1)
]

In [8]:
import dask.dataframe as dd
# Build a lazy Dask DataFrame from one delayed `load` task per selected day.
taxi = dd.from_delayed(
    [load(conn_info, query, pickup_day) for pickup_day in _dates]
)

In [9]:
# Select the model inputs plus the target, cast everything to float,
# and replace missing values with the sentinel -1.
taxi_train = (
    taxi[[*features, y_col]]
    .astype(float)
    .fillna(-1)
)

In [10]:
# Show the first rows of the training frame as a quick sanity check.
taxi_train.head()

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,pickup_taxizone_id,dropoff_taxizone_id,tip_fraction
140123,6.0,2.0,15.0,159.0,14.0,1.0,141.0,236.0,0.171429
61423,6.0,2.0,9.0,153.0,24.0,1.0,234.0,138.0,0.296857
142761,6.0,2.0,16.0,160.0,8.0,4.0,107.0,236.0,0.21
163058,6.0,2.0,17.0,161.0,41.0,1.0,211.0,158.0,0.221333
96358,6.0,2.0,12.0,156.0,5.0,0.0,249.0,90.0,0.233333


Dask performs computations in a [lazy manner](https://tutorial.dask.org/01x_lazy.html), so we persist the dataframe to perform data loading and feature processing once.

In [11]:
%%time
# Persist so loading and feature processing happen once, then block until it is done.
taxi_train = taxi_train.persist()
# The result is assigned to `_` so the cell shows only the timing output.
_ = wait(taxi_train)

CPU times: user 162 ms, sys: 0 ns, total: 162 ms
Wall time: 13.4 s


# Train a model

In [13]:
import dask_xgboost

# Regressor configuration: squared-error objective, 100 trees of depth up to 8.
xgb_reg = dask_xgboost.XGBRegressor(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    tree_method='approx',
    objective='reg:squarederror',
)

In [14]:
%%time
# Fit on the persisted training frame; bind the result to `_` to keep the estimator repr out of the output.
_ = xgb_reg.fit(
    taxi_train[features],
    y=taxi_train[y_col],
)

CPU times: user 52.1 ms, sys: 4.03 ms, total: 56.2 ms
Wall time: 21 s


## Save model

In [15]:
import cloudpickle

# Serialize the fitted regressor into the model directory.
model_file_path = f'{MODEL_PATH}/xgboost_dask.pkl'
with open(model_file_path, 'wb') as f:
    cloudpickle.dump(xgb_reg, f)

## Calculate metrics on test set

Use a different month for the test set.

In [19]:
import datetime as dt
# February 2019 is held out as the test month.
_dates = [x for x in dates if x is not None and x >= dt.date(2019, 2, 1) and x < dt.date(2019, 3, 1)]
taxi_test_raw = dd.from_delayed([load(conn_info, query, day) for day in _dates])
# Apply the same column selection, float cast and -1 fill as the training frame,
# so the model is scored on inputs encoded the way it was trained.
taxi_test = taxi_test_raw[features + [y_col]].astype(float).fillna(-1)

Convert to single-node DataFrames using `compute()` to calculate metrics with scikit-learn. If the data were larger, we could use `dask_ml.metrics.mean_squared_error` instead.

In [None]:
from dask.distributed import wait
# Persist the test frame on the cluster and block until it is ready.
taxi_test = taxi_test.persist()
# The result is assigned to `_` so the cell produces no output.
_ = wait(taxi_test)

In [20]:
from sklearn.metrics import mean_squared_error

# Predictions stay lazy until `.compute()` pulls them to the notebook process.
preds = xgb_reg.predict(taxi_test[features])
# Root-mean-squared error. The square root is taken explicitly because the
# `squared=False` keyword is deprecated and removed in newer scikit-learn releases.
mean_squared_error(taxi_test[y_col].compute(), preds.compute()) ** 0.5

0.20683957637041311