<img style="float: right" src="img/saturn.png" width="300" />

# Scaling Machine Learning in Python

## Large datasets

- Load and process large dataset
    - `dask.dataframe`
- Predict over large dataset
    - `ParallelPostFit`
    - `map_partitions`
- Train model with large dataset
    - `Incremental`
    - `dask_ml`
    - XGBoost

# Load and process large dataset

## Initialize Dask cluster

In [1]:
from dask_saturn import SaturnCluster
from dask.distributed import Client

cluster = SaturnCluster()
client = Client(cluster)
client.wait_for_workers(3)

[2020-11-09 01:34:31] INFO - dask-saturn | Cluster is ready


## Load data

In [2]:
import s3fs
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")

s3 = s3fs.S3FileSystem(anon=True)

In [14]:
import dask
import dask.dataframe as dd
from dask.distributed import wait

In [3]:
files_2019 = 's3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv'
s3.glob(files_2019)

['nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-02.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-03.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-04.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-05.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-06.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-07.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-08.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-09.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-10.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-11.csv',
 'nyc-tlc/trip data/yellow_tripdata_2019-12.csv']

In [4]:
%%time

taxi = dd.read_csv(
    files_2019,
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options={'anon': True},
    assume_missing=True,
)

CPU times: user 21.5 ms, sys: 30.7 ms, total: 52.2 ms
Wall time: 136 ms


In [7]:
taxi

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
npartitions=127,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,float64,datetime64[ns],datetime64[ns],float64,float64,float64,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [11]:
taxi_bytes = taxi.memory_usage(deep=True).sum()
taxi_bytes

dd.Scalar<series-..., dtype=int64>

In [12]:
%%time
print(f"Size (MB): {taxi_bytes.compute() / 1e6}")

Size (MB): 16367.014316
CPU times: user 82.9 ms, sys: 9.2 ms, total: 92.1 ms
Wall time: 49.1 s


In [15]:
taxi = taxi.persist()

In [16]:
%%time
_ = wait(taxi)

CPU times: user 74.8 ms, sys: 7.14 ms, total: 82 ms
Wall time: 41.6 s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 0)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 109)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 106)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 53)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 77)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 102)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 28)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 30)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f70790d5b2778996ba671c483', 7)>, <Future: finished, type: pandas.DataFrame, key: ('read-csv-557a801f7079

In [18]:
%%time
taxi_bytes = taxi.memory_usage(deep=True).sum()
print(f"Size (MB): {taxi_bytes.compute() / 1e6}")

Size (MB): 16367.014316
CPU times: user 51.2 ms, sys: 3.91 ms, total: 55.1 ms
Wall time: 2.63 s


# Exploratory analysis

In [19]:
%%time
taxi_describe = taxi.describe().compute().T
np.round(taxi_describe, 3)

CPU times: user 3.46 s, sys: 55.6 ms, total: 3.51 s
Wall time: 17.2 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,84152418.0,1.645,0.498,1.0,1.0,2.0,2.0,4.0
passenger_count,84152418.0,1.563,1.208,0.0,1.0,1.0,2.0,9.0
trip_distance,84399019.0,3.001,8.091,-37264.53,1.07,1.93,8.82,45977.22
RatecodeID,84152418.0,1.061,0.76,1.0,1.0,1.0,1.0,99.0
PULocationID,84399019.0,163.158,66.016,1.0,132.0,162.0,234.0,265.0
DOLocationID,84399019.0,161.353,70.251,1.0,116.0,163.0,236.0,265.0
payment_type,84152418.0,1.289,0.479,1.0,1.0,1.0,2.0,5.0
fare_amount,84399019.0,13.344,174.375,-1856.0,7.0,11.0,32.04,943274.8
extra,84399019.0,1.087,1.249,-60.0,0.0,1.0,3.0,535.38
mta_tax,84399019.0,0.495,0.067,-0.5,0.5,0.5,0.5,212.42


## Feature engineering

In [20]:
# specify feature and label column names
raw_features = [
    'tpep_pickup_datetime', 
    'passenger_count', 
    'tip_amount', 
    'fare_amount',
]
features = [
    'pickup_weekday', 
    'pickup_weekofyear', 
    'pickup_hour', 
    'pickup_week_hour', 
    'pickup_minute', 
    'passenger_count',
]
label = 'tip_fraction'

In [21]:
def prep_df(taxi_df):
    '''
    Generate features from a raw taxi dataframe.
    '''
    df = taxi_df[taxi_df.fare_amount > 0][raw_features].copy()  # avoid divide-by-zero
    df[label] = df.tip_amount / df.fare_amount
     
    df['pickup_weekday'] = df.tpep_pickup_datetime.dt.weekday
    df['pickup_weekofyear'] = df.tpep_pickup_datetime.dt.weekofyear
    df['pickup_hour'] = df.tpep_pickup_datetime.dt.hour
    df['pickup_week_hour'] = (df.pickup_weekday * 24) + df.pickup_hour
    df['pickup_minute'] = df.tpep_pickup_datetime.dt.minute
    df = df[features + [label]].astype(float).fillna(-1)
    
    return df

In [22]:
taxi_feat = prep_df(taxi)
taxi_feat.head()

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,tip_fraction
0,1.0,1.0,0.0,24.0,46.0,1.0,0.235714
1,1.0,1.0,0.0,24.0,59.0,1.0,0.071429
2,4.0,51.0,13.0,109.0,48.0,3.0,0.0
3,2.0,48.0,15.0,63.0,52.0,5.0,0.0
4,2.0,48.0,15.0,63.0,56.0,5.0,0.0


In [23]:
from dask_ml.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    taxi_feat[features], 
    taxi_feat[label], 
    test_size=0.3,
    random_state=42
)

In [25]:
%%time
X_train, X_test, y_train, y_test = dask.persist(
    X_train, X_test, y_train, y_test,
)
_ = wait(X_train)

CPU times: user 299 ms, sys: 16.3 ms, total: 315 ms
Wall time: 9.93 s


In [30]:
len(X_train), len(y_train)

(58939024, 58939024)

In [31]:
len(X_test), len(y_test)

(25255601, 25255601)

# Predict over large dataset

## `ParallelPostFit`

In [33]:
taxi_feat_sample = taxi_feat.partitions[0].compute()

In [34]:
taxi_feat_sample.shape

(717801, 7)

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler

from dask_ml.wrappers import ParallelPostFit
from dask_ml.metrics import mean_squared_error

pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', ElasticNet(normalize=False, max_iter=100, l1_ratio=0)),
])

ppf = ParallelPostFit(estimator=pipeline)

In [97]:
%%time
ppf_fitted = ppf.fit(taxi_feat_sample[features], taxi_feat_sample[label])

CPU times: user 1.47 s, sys: 40 ms, total: 1.51 s
Wall time: 826 ms


In [98]:
ppf_fitted.predict(taxi_feat_sample[features])

array([0.13886918, 0.14424305, 0.11733346, ..., 0.1640145 , 0.17145526,
       0.18054951])

In [99]:
preds = ppf_fitted.predict(taxi_feat[features])

In [100]:
preds

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan,)","(nan,)"
Count,3937 Tasks,127 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes unknown unknown Shape (nan,) (nan,) Count 3937 Tasks 127 Chunks Type int64 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan,)","(nan,)"
Count,3937 Tasks,127 Chunks
Type,int64,numpy.ndarray


In [101]:
preds.blocks[0].compute()

array([0.13886918, 0.14424305, 0.11733346, ..., 0.1640145 , 0.17145526,
       0.18054951])

In [104]:
mean_squared_error(taxi_feat[label].values, preds, squared=False)

13.453205303896075

## `map_partitions`

In [105]:
%%time
fitted = pipeline.fit(
    taxi_feat_sample[features], 
    taxi_feat_sample[label],
)

CPU times: user 1.47 s, sys: 31.6 ms, total: 1.5 s
Wall time: 818 ms


In [106]:
import cloudpickle

with open('/tmp/model.pkl', 'wb') as f:
    cloudpickle.dump(pipeline, f)

In [107]:
model = cloudpickle.load(open('/tmp/model.pkl', 'rb'))

def predict(df):
    preds = model.predict(df[features])
    return preds

In [108]:
preds = taxi_feat.map_partitions(predict)

In [109]:
preds.blocks[0].compute()

array([0.13886918, 0.14424305, 0.11733346, ..., 0.1640145 , 0.17145526,
       0.18054951])

In [111]:
mean_squared_error(taxi_feat[label].values, preds, squared=False)

13.453205303896075

## Train model with large dataset

### `Incremental`


https://ml.dask.org/incremental.html

## `dask_ml.linear_model`

In [112]:
from sklearn.pipeline import Pipeline

from dask_ml.linear_model import LinearRegression
from dask_ml.preprocessing import StandardScaler
from dask_ml.metrics import mean_squared_error
from dask_ml.model_selection import GridSearchCV

lr = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', LinearRegression(penalty='l2', max_iter=100)),
])

In [118]:
X_train_arr = X_train.to_dask_array(lengths=True)
y_train_arr = y_train.to_dask_array(lengths=True)
X_test_arr = X_test.to_dask_array(lengths=True)
y_test_arr = y_test.to_dask_array(lengths=True)

In [116]:
%%time

lr_fitted = lr.fit(
    X_train_arr,
    y_train_arr,
)

CPU times: user 32.6 s, sys: 986 ms, total: 33.6 s
Wall time: 7min 6s


In [123]:
preds = fitted.predict(X_test_arr)
mean_squared_error(y_test_arr, preds, squared=False)

15.539181569446649

## XGBoost

`dask_xgboost` being deprecated soon in favor of `xgboost.dask`. Give example

In [126]:
from dask_xgboost import XGBRegressor

xgb = XGBRegressor(
    objective="reg:squarederror",
    tree_method='approx',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
)

In [None]:
%%time

xgb_fitted = xgb.fit(
    X_train_arr,
    y_train_arr,
)

In [129]:
preds = fitted.predict(X_test_arr)
mean_squared_error(y_test_arr, preds, squared=False)

15.539181569446649