# XGBoost regression

## Single-node

![xgboost](https://upload.wikimedia.org/wikipedia/commons/6/69/XGBoost_logo.png)

In [1]:
import os

# Directory (relative to the working directory) where trained models are saved.
MODEL_PATH = 'models'
# exist_ok=True keeps this cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_PATH, exist_ok=True)

# Columns derived from the pickup timestamp, plus the passenger count.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
# Taxi-zone identifiers for the pickup and drop-off locations.
categorical_feat = [
    'pickup_taxizone_id',
    'dropoff_taxizone_id',
]
# Full list of model inputs, numeric columns first.
features = numeric_feat + categorical_feat
# Regression target column.
y_col = 'tip_fraction'

# Load data and feature engineering

Load a 30% random sample of a single month of yellow-taxi trips (January 2019) for this exercise.

In [3]:
import os
import numpy as np
import datetime
import pandas as pd
import s3fs
import warnings
warnings.simplefilter("ignore")

import yaml
import snowflake.connector

# Read the Snowflake credentials inside a context manager so the file handle
# is closed as soon as parsing finishes.
# NOTE(review): yaml.full_load is kept because this is a local, trusted file;
# switch to yaml.safe_load if the file only holds plain scalars.
with open('/home/jovyan/snowflake_creds.yml') as creds_file:
    creds = yaml.full_load(creds_file)

# The connection is kept open; a later cell reuses `conn` to load the test month.
conn = snowflake.connector.connect(
    warehouse='COMPUTE_WH',
    database='NYC_TAXI',
    schema='PUBLIC',
    **creds,
)
# Feature engineering is done in SQL: the tip fraction is tip / fare (DIV0
# avoids a divide-by-zero error) and the time features are derived from the
# pickup timestamp. The single %s placeholder is the first day of the month
# to load.
query = """
SELECT 
    pickup_taxizone_id,
    dropoff_taxizone_id,
    passenger_count,
    DIV0(TIP_AMOUNT, FARE_AMOUNT) as TIP_FRACTION,
    DAYOFWEEKISO(PICKUP_DATETIME) - 1 as PICKUP_WEEKDAY,
    WEEKOFYEAR(PICKUP_DATETIME) as PICKUP_WEEKOFYEAR,
    HOUR(PICKUP_DATETIME) as PICKUP_HOUR,
    (PICKUP_WEEKDAY * 24) + PICKUP_HOUR as PICKUP_WEEK_HOUR,
    MINUTE(PICKUP_DATETIME) as PICKUP_MINUTE
FROM taxi_yellow
WHERE
    date_trunc('MONTH', pickup_datetime) = %s
"""
# Bind parameters are passed as a sequence, one entry per placeholder.
cur = conn.cursor().execute(query, ('2019-01-01',))
taxi = cur.fetch_pandas_all()
# Lower-case the column names so they match the names in `features` / `y_col`.
taxi.columns = [x.lower() for x in taxi.columns]
# Keep a 30% sample; the fixed seed makes the training set (and therefore the
# trained model) reproducible across Restart & Run All.
taxi = taxi.sample(frac=0.3, replace=False, random_state=42)

In [4]:
# Report how many rows were sampled and their in-memory footprint in megabytes.
n_rows = len(taxi)
size_mb = taxi.memory_usage(deep=True).sum() / 1e6
print(f'Num rows: {n_rows}, Size: {size_mb} MB')

Num rows: 2300294, Size: 64.408232 MB


In [6]:
# Build the training frame: keep only the model inputs plus the target, cast
# every column to float, and replace missing values with the sentinel -1.
# NOTE(review): the fill also applies to the target column, so rows with a
# missing tip_fraction are trained on as -1 — confirm this is intended.
train_columns = features + [y_col]
taxi_train = taxi[train_columns].astype(float)
taxi_train = taxi_train.fillna(-1)

In [7]:
# Preview the first five rows of the prepared training frame.
taxi_train.head(n=5)

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,pickup_taxizone_id,dropoff_taxizone_id,tip_fraction
6967932,1.0,2.0,9.0,33.0,38.0,1.0,233.0,164.0,0.0
288609,0.0,4.0,17.0,17.0,51.0,2.0,113.0,142.0,0.208889
341787,0.0,4.0,21.0,21.0,6.0,1.0,170.0,87.0,0.137931
6062910,5.0,4.0,17.0,137.0,56.0,1.0,261.0,45.0,0.0
6744498,1.0,5.0,15.0,39.0,15.0,2.0,237.0,237.0,0.206957


# Train a model

Setting `nthread=-1` tells xgboost to use all available cores on this machine to parallelize model training

In [8]:
import xgboost

# Hyperparameters for the single-node regressor, collected in one place so
# they are easy to read and tweak. nthread=-1 asks xgboost to use all
# available cores on this machine.
xgb_params = dict(
    objective="reg:squarederror",
    tree_method='approx',
    learning_rate=0.1,
    max_depth=8,
    n_estimators=100,
    nthread=-1,
)
xgb_reg = xgboost.XGBRegressor(**xgb_params)

In [9]:
%%time
_ = xgb_reg.fit(taxi_train[features], y=taxi_train[y_col])

CPU times: user 17min 23s, sys: 9.93 s, total: 17min 33s
Wall time: 11min 54s


## Save model

In [10]:
import cloudpickle

# Serialize the fitted regressor into the models directory.
model_file = f'{MODEL_PATH}/xgboost.pkl'
with open(model_file, 'wb') as handle:
    cloudpickle.dump(xgb_reg, handle)

## Calculate metrics on test set

Use a different month (February 2019) as the test set.

In [12]:
# Load the held-out month with the same query used for training; bind
# parameters are passed as a sequence, one entry per placeholder.
cur = conn.cursor().execute(query, ('2019-02-01',))
# Lower-case the column names so they match `features` / `y_col`, then keep a
# 30% sample. The fixed seed makes the reported test metric reproducible.
taxi_test = (
    cur.fetch_pandas_all()
    .rename(columns=str.lower)
    .sample(frac=0.3, replace=False, random_state=42)
)


In [13]:
from sklearn.metrics import mean_squared_error

# Prepare the test month exactly like the training frame (float cast, missing
# values -> -1). Without this the model is scored on inputs encoded differently
# from what it was trained on, and a missing target would break the metric.
taxi_test_prepared = taxi_test[features + [y_col]].astype(float).fillna(-1)

preds = xgb_reg.predict(taxi_test_prepared[features])
# Root-mean-squared error. Taking the square root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove.
mean_squared_error(taxi_test_prepared[y_col], preds) ** 0.5

17.250287388052328