# XGBoost regression (single-node)

![xgboost](https://upload.wikimedia.org/wikipedia/commons/6/69/XGBoost_logo.png)

In [1]:
import os
import warnings

import cloudpickle
import pandas as pd
import xgboost

from sklearn.metrics import mean_squared_error

warnings.simplefilter("ignore")

In [2]:
MODEL_PATH = "models"
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    
numeric_feat = [
    "pickup_weekday",
    "pickup_weekofyear",
    "pickup_hour",
    "pickup_week_hour",
    "pickup_minute",
    "passenger_count",
]
categorical_feat = [
    "PULocationID", 
    "DOLocationID",
]
features = numeric_feat + categorical_feat
y_col = "tip_fraction"

# Load data and feature engineering

Load a sample from a single month for this exercise

In [3]:
taxi = pd.read_csv(
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-01.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"]
).sample(frac=0.3, replace=False)

In [4]:
print(f"Num rows: {len(taxi)}, Size: {taxi.memory_usage(deep=True).sum() / 1e6} MB")

Num rows: 2300338, Size: 483.07098 MB


In [5]:
def prep_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate features from a raw taxi dataframe.
    """
    df = df[df.fare_amount > 0]  # avoid divide-by-zero
    df["tip_fraction"] = df.tip_amount / df.fare_amount
    
    df["pickup_weekday"] = df.tpep_pickup_datetime.dt.weekday
    df["pickup_weekofyear"] = df.tpep_pickup_datetime.dt.weekofyear
    df["pickup_hour"] = df.tpep_pickup_datetime.dt.hour
    df["pickup_week_hour"] = (df.pickup_weekday * 24) + df.pickup_hour
    df["pickup_minute"] = df.tpep_pickup_datetime.dt.minute
    df = df[features + [y_col]].astype(float).fillna(-1)
    
    return df
    
taxi_train = prep_df(taxi)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

In [6]:
taxi_train.head()

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,PULocationID,DOLocationID,tip_fraction
6154587,5.0,4.0,3.0,123.0,0.0,1.0,100.0,48.0,0.252
680337,4.0,1.0,11.0,107.0,55.0,1.0,142.0,140.0,0.332
6236863,5.0,4.0,13.0,133.0,49.0,1.0,262.0,74.0,0.15625
5375204,2.0,4.0,12.0,60.0,24.0,1.0,230.0,137.0,0.2125
5713524,3.0,4.0,17.0,89.0,34.0,1.0,100.0,161.0,0.0


# Train a model

Setting `nthread=-1` tells xgboost to use all available cores on this machine to parallelize model training

In [7]:
xgb_reg = xgboost.XGBRegressor(
    objective="reg:squarederror",
    tree_method="approx",
    learning_rate=0.1,
    max_depth=5,
    n_estimators=50,
    n_jobs=4,
    verbosity=1
)

In [8]:
%%time
_ = xgb_reg.fit(taxi_train[features], y=taxi_train[y_col])

CPU times: user 3min 52s, sys: 0 ns, total: 3min 52s
Wall time: 1min 2s


## Save model

In [9]:
with open(f"{MODEL_PATH}/xgboost.pkl", "wb") as f:
    cloudpickle.dump(xgb_reg, f)

## Calculate metrics on test set

Use a different month for test set

In [10]:
taxi_test = pd.read_csv(
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-02.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"]
).sample(frac=0.01, replace=False)

taxi_test = prep_df(taxi_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

In [11]:
preds = xgb_reg.predict(taxi_test[features])
mean_squared_error(taxi_test[y_col], preds, squared=False)

0.26804358164624675