In [1]:
import cudf
from cuml import train_test_split
from sklearn.datasets import make_regression
import pandas as pd
from cuml.linear_model import LinearRegression as cuLinearRegression
from sklearn.metrics import r2_score
from dask.distributed import Client
import dask.dataframe as dd
import numpy as np

# URL of your Saturn Dask cluster -- replace the empty string with your own value.
SATURN_DASK_CLUSTER_URL = ''

# Open a Dask client against that URL. The bare `client` expression at the end
# of the cell echoes the client object as the cell output.
client = Client(address=SATURN_DASK_CLUSTER_URL)
client

In [3]:
%%time

# Columns to load from the raw trip file; every other CSV column is skipped.
cols = ['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 'Passenger_Count', 'Trip_Distance', 'Start_Lon',
        'Start_Lat', 'End_Lon', 'End_Lat', 'Fare_Amt', 'Tip_Amt', 'Total_Amt']
# The two timestamp columns that are reduced to a day-of-month feature below.
datetime_cols = ['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime']

# Lazily read the 2009-01 yellow-taxi CSV from S3 using anonymous access.
df = dd.read_csv('s3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv', usecols=cols, storage_options={'anon': True})

# Drop rows whose fare, passenger count or coordinates fall outside the accepted ranges.
query_frags = [
    'Fare_Amt > 0 and Fare_Amt < 500',
    'Passenger_Count > 0 and Passenger_Count < 6',
    'Start_Lon > -75 and Start_Lon < -73',
    'End_Lon > -75 and End_Lon < -73',
    'Start_Lat > 40 and Start_Lat < 42',
    'End_Lat > 40 and End_Lat < 42'
]
df = df.query(' and '.join(query_frags))

# Overwrite each timestamp column with its day of the month. The vectorized
# `.dt.day` accessor replaces a per-element `apply`, so no hand-written `meta`
# is needed (the original pickup column carried the dropoff column's name in
# its meta). The int64 cast keeps the dtype the original code declared.
for col in datetime_cols:
    df[col] = df[col].astype('datetime64[ns]').dt.day.astype('int64')

# Execute the lazy graph and materialize the result as an in-memory pandas DataFrame.
df = df.compute()

CPU times: user 4.36 s, sys: 2.74 s, total: 7.1 s
Wall time: 1min 6s


In [4]:
# Fixed seed so the train/test partition -- and therefore the reported R^2 --
# is the same on every Restart & Run All.
SPLIT_SEED = 42

# Every column except the target is used as a feature.
# NOTE(review): Tip_Amt and Total_Amt stay among the features; Total_Amt
# presumably includes the fare itself, which would leak the target into the
# inputs -- confirm before reading much into the R^2 score.
not_fare_cols = [col for col in df.columns if col != 'Fare_Amt']

# Move features and target to the GPU. The target is selected with a list so
# it stays a single-column DataFrame rather than a Series.
X = cudf.DataFrame.from_pandas(df[not_fare_cols])
y = cudf.DataFrame.from_pandas(df[['Fare_Amt']])

X_cudf, X_cudf_test, y_cudf, y_cudf_test = train_test_split(X, y, random_state=SPLIT_SEED)

In [5]:
# Inspect which container type train_test_split returned for the test targets.
type(y_cudf_test)

cudf.core.dataframe.DataFrame

In [6]:
# (Currently disabled) Copy the dataset from GPU memory to host memory.
# Uncomment the lines below if you want to compare CPU and GPU results later.

#X_train = X_cudf.to_pandas()
#X_test = X_cudf_test.to_pandas()
#y_train = y_cudf.to_pandas()
#y_test = y_cudf_test.to_pandas()

## cuML Model

### Fit, predict and evaluate

In [7]:
%%time
# Build the cuML linear-regression estimator with an intercept term,
# normalization enabled and the 'eig' algorithm.
ols_cuml = cuLinearRegression(algorithm='eig', normalize=True, fit_intercept=True)

# Train on the training split. As the cell's last expression, the value
# returned by fit() is echoed as the cell output.
ols_cuml.fit(X_cudf, y_cudf)

CPU times: user 1.06 s, sys: 381 ms, total: 1.44 s
Wall time: 1.42 s


LinearRegression(algorithm='eig', fit_intercept=True, normalize=True, handle=<cuml.common.handle.Handle object at 0x7f23a82803f0>)

In [8]:
%%time
# Run the fitted model on the held-out test features.
predict_cuml = ols_cuml.predict(X_cudf_test)

CPU times: user 10.6 ms, sys: 12.2 ms, total: 22.7 ms
Wall time: 21.5 ms


In [9]:
%%time

# r2_score here is the scikit-learn implementation, so both the true targets
# and the predictions are converted to pandas objects before scoring.
y_true_host = y_cudf_test.to_pandas()
y_pred_host = predict_cuml.to_pandas()
r2_score_cuml = r2_score(y_true_host, y_pred_host)

CPU times: user 45.9 ms, sys: 6.57 ms, total: 52.5 ms
Wall time: 51.2 ms


## See Results

In [10]:
# Report the coefficient of determination computed on the test split.
print("R^2 score : %s" % (r2_score_cuml,))

R^2 score : 0.9879075387667043
