# Random forest classification

## RAPIDS single GPU

<img src="https://rapids.ai/assets/images/RAPIDS-logo-purple.svg" width="400">

In [None]:
import os

# Directory where the trained model is written.
MODEL_PATH = 'models'
# exist_ok=True makes the creation idempotent, so re-running this cell (or a
# concurrent run creating the directory first) does not fail.
os.makedirs(MODEL_PATH, exist_ok=True)

# Columns cast to float32 and used as numeric model inputs.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
# Taxi-zone identifier columns; they are combined with the numeric columns
# below into a single feature list.
categorical_feat = [
    'pickup_taxizone_id',
    'dropoff_taxizone_id',
]
# Full, ordered list of model input columns.
features = numeric_feat + categorical_feat
# Name of the target (label) column.
y_col = 'high_tip'

# Load data and feature engineering

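Load a full month for this exercise. Note that the rows are fetched from Snowflake into a pandas DataFrame and then moved to the GPU with RAPIDS (`cudf.from_pandas`), so all downstream processing uses `cudf` rather than `pandas`.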

In [None]:
import os
import numpy as np
import datetime
import pandas as pd
from pandas.io.sql import read_sql
import s3fs
import warnings
import cudf
warnings.simplefilter("ignore")

import yaml
import snowflake.connector

# Read the Snowflake connection settings from a YAML file. The context manager
# guarantees the file handle is closed once the credentials are parsed.
# NOTE(review): yaml.safe_load is likely sufficient if the file only contains
# plain key/value pairs -- confirm before switching.
with open('/home/jovyan/snowflake_creds.yml') as creds_file:
    creds = yaml.full_load(creds_file)

# Open the Snowflake connection; the entries loaded from the credentials file
# are passed through as additional keyword arguments.
conn = snowflake.connector.connect(
    warehouse='COMPUTE_WH',
    database='NYC_TAXI',
    schema='PUBLIC',
    **creds,
)

In [None]:
# Per-trip feature columns plus the boolean `high_tip` target (tip greater
# than 20% of the fare) for a single calendar month. The first day of the
# month is bound to the %s placeholder when the query is executed.
query = """
SELECT 
    pickup_taxizone_id,
    dropoff_taxizone_id,
    passenger_count,
    DIV0(tip_amount, fare_amount) > 0.2 AS high_tip,
    DAYOFWEEKISO(pickup_datetime) - 1 AS pickup_weekday,
    WEEKOFYEAR(pickup_datetime) AS pickup_weekofyear,
    HOUR(pickup_datetime) AS pickup_hour,
    (pickup_weekday * 24) + pickup_hour AS pickup_week_hour,
    MINUTE(pickup_datetime) AS pickup_minute
FROM taxi_yellow
WHERE
    DATE_TRUNC('MONTH', pickup_datetime) = %s
"""
# Bind parameters are passed as a tuple (the DB-API convention) instead of a
# bare string, and the cursor is closed as soon as the rows have been fetched.
with conn.cursor() as cur:
    cur.execute(query, ('2019-01-01',))
    columns = [col[0] for col in cur.description]
    taxi = pd.DataFrame(cur.fetchall(), columns=columns)
# Lower-case the column names so they match the feature names used elsewhere.
taxi.columns = taxi.columns.str.lower()
# Convert the pandas frame into a cudf (GPU) DataFrame.
taxi = cudf.from_pandas(taxi)

In [None]:
# Report how many rows were loaded and the frame's memory footprint in MB.
n_rows = len(taxi)
size_mb = taxi.memory_usage(deep=True).sum() / 1e6
print(f'Num rows: {n_rows}, Size: {size_mb} MB')

In [None]:
# Keep only the model inputs and the target column.
taxi_train = taxi[[*features, y_col]]
# Cast every feature to float32 and replace missing values with -1.
taxi_train[features] = (
    taxi_train[features]
    .astype("float32")
    .fillna(-1)
)
# Cast the target to int32; missing labels are also filled with -1.
# NOTE(review): a -1 label would presumably be seen by the classifier as an
# extra class -- confirm the target contains no nulls.
taxi_train[y_col] = (
    taxi_train[y_col]
    .astype("int32")
    .fillna(-1)
)

In [None]:
# Display the first rows of the prepared training frame as a quick sanity check.
taxi_train.head()

# Train model

In [None]:
from cuml.ensemble import RandomForestClassifier
# Random forest of 100 trees, each limited to depth 10, with a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    seed=42,
)

In [None]:
%%time
# Fit on the prepared training frame; the return value is bound to `_` so the
# cell does not echo it.
_ = rfc.fit(
    taxi_train[features],
    taxi_train[y_col],
)

## Save model

In [None]:
import cloudpickle

# Serialize the fitted classifier into the model directory with cloudpickle.
model_file = f'{MODEL_PATH}/random_forest_rapids.pkl'
with open(model_file, 'wb') as model_out:
    cloudpickle.dump(rfc, model_out)

## Calculate metrics on test set

Use a different month for the test set.

In [None]:
# Re-run the same query for February 2019. Bind parameters are passed as a
# tuple (the DB-API convention) instead of a bare string, and the cursor is
# closed as soon as the rows have been fetched.
with conn.cursor() as cur:
    cur.execute(query, ('2019-02-01',))
    columns = [col[0] for col in cur.description]
    # using fetchall() because rapids requires a different pyarrow version than snowflake-connector-python
    taxi = pd.DataFrame(cur.fetchall(), columns=columns)
# Lower-case the column names so they match the feature names used elsewhere.
taxi.columns = taxi.columns.str.lower()
# Convert the pandas frame into a cudf (GPU) DataFrame.
taxi = cudf.from_pandas(taxi)
# `taxi_test` refers to the same frame object as `taxi` (no copy is made).
taxi_test = taxi

In [None]:
# Apply the same preparation as for the training data: float32 features and
# an int32 target, with missing values replaced by -1.
taxi_test[features] = (
    taxi_test[features]
    .astype("float32")
    .fillna(-1)
)
taxi_test[y_col] = (
    taxi_test[y_col]
    .astype("int32")
    .fillna(-1)
)

In [None]:
from cuml.metrics import roc_auc_score

# Score the held-out month and take entry 1 of the predict_proba output.
# NOTE(review): presumably this is the positive-class probability column when
# the output is a DataFrame -- confirm against the cuml output type.
class_probs = rfc.predict_proba(taxi_test[features])
preds = class_probs[1]
# ROC AUC of the predicted scores against the true labels.
roc_auc_score(taxi_test[y_col], preds)