In [1]:
# load some default Python modules
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import feather
import dask.dataframe as ddf
import dask
% matplotlib inline
plt.style.use('seaborn-whitegrid')

In [2]:
train_df = feather.read_dataframe("../../datasets/kaggle/new-york-city-taxi-fare-prediction/train2.feather")

In [3]:
df_train = ddf.from_pandas(train_df, npartitions=12)

In [4]:
print(df_train.info())

<class 'dask.dataframe.core.DataFrame'>
Columns: 19 entries, fare_amount to year
dtypes: datetime64[ns](1), bool(7), float32(7), int64(3), uint8(1)None


In [5]:
print(df_train.compute().shape)
df_train.head()

(54246832, 19)


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_miles,is_pickup_JFK_new,is_dropoff_JFK_new,is_pickup_EWR_new,is_dropoff_EWR_new,is_pickup_LGA_new,is_dropoff_LGA_new,is_to_from_JFK_new,distance_to_center,hour,weekday,year
0,4.5,2009-06-15 17:26:21,-73.844315,40.721317,-73.841614,40.712276,1,0.0,False,False,False,False,False,False,False,7.298447,17,0,2009
1,16.9,2010-01-05 16:52:16,-74.016045,40.711304,-73.979271,40.782005,1,5.244397,False,False,False,False,False,False,False,5.56751,16,1,2010
2,5.7,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,0.0,False,False,False,False,False,False,False,1.714565,0,3,2011
3,7.7,2012-04-21 04:30:42,-73.987129,40.733143,-73.99157,40.758091,1,1.932986,False,False,False,False,False,False,False,3.654209,4,5,2012
4,5.3,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,1.366828,False,False,False,False,False,False,False,0.0,7,1,2010


In [6]:
import gc
del train_df
gc.collect()

64

In [7]:
df_train["hour_period"]=df_train["hour"] // 4

In [8]:
df_train = df_train.categorize(columns=["year","hour_period"])

In [9]:
df_train = ddf.get_dummies(df_train, prefix=["year","hour_period"], columns=["year","hour_period"])

In [10]:
df_train.info(verbose=True)

<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 54246832 entries, 0 to 54246831
Data columns (total 31 columns):
fare_amount           54246832 non-null float32
pickup_datetime       54246832 non-null datetime64[ns]
pickup_longitude      54246832 non-null float32
pickup_latitude       54246832 non-null float32
dropoff_longitude     54246832 non-null float32
dropoff_latitude      54246832 non-null float32
passenger_count       54246832 non-null uint8
distance_miles        54246832 non-null float32
is_pickup_JFK_new     54246832 non-null bool
is_dropoff_JFK_new    54246832 non-null bool
is_pickup_EWR_new     54246832 non-null bool
is_dropoff_EWR_new    54246832 non-null bool
is_pickup_LGA_new     54246832 non-null bool
is_dropoff_LGA_new    54246832 non-null bool
is_to_from_JFK_new    54246832 non-null bool
distance_to_center    54246832 non-null float32
hour                  54246832 non-null int64
weekday               54246832 non-null int64
year_2009             54246832 non-null

In [11]:
nyc = (-73.96, 40.78)
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a)) # 2*R*asin...

In [12]:
df_train['distance_to_center_dropoff'] = distance(nyc[1], nyc[0], df_train.dropoff_latitude, df_train.dropoff_longitude)


In [13]:
df_train.info(verbose=True)

<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 54246832 entries, 0 to 54246831
Data columns (total 32 columns):
fare_amount                   54246832 non-null float32
pickup_datetime               54246832 non-null datetime64[ns]
pickup_longitude              54246832 non-null float32
pickup_latitude               54246832 non-null float32
dropoff_longitude             54246832 non-null float32
dropoff_latitude              54246832 non-null float32
passenger_count               54246832 non-null uint8
distance_miles                54246832 non-null float32
is_pickup_JFK_new             54246832 non-null bool
is_dropoff_JFK_new            54246832 non-null bool
is_pickup_EWR_new             54246832 non-null bool
is_dropoff_EWR_new            54246832 non-null bool
is_pickup_LGA_new             54246832 non-null bool
is_dropoff_LGA_new            54246832 non-null bool
is_to_from_JFK_new            54246832 non-null bool
distance_to_center            54246832 non-null float32
hou

In [14]:
df_train['distance_to_center_pickup_log'] = np.log2(df_train[['distance_to_center']])['distance_to_center']
df_train['distance_to_center_dropoff_log'] = np.log2(df_train[['distance_to_center_dropoff']])['distance_to_center_dropoff']
df_train['distance_to_center_pickup_log'] = df_train['distance_to_center_pickup_log'].mask(df_train['distance_to_center_pickup_log'] == np.inf, np.nan)
df_train['distance_to_center_pickup_log'] = df_train['distance_to_center_pickup_log'].mask(df_train['distance_to_center_pickup_log'] == -np.inf, np.nan)
df_train['distance_to_center_dropoff_log'] = df_train['distance_to_center_dropoff_log'].mask(df_train['distance_to_center_dropoff_log'] == np.inf, np.nan)
df_train['distance_to_center_dropoff_log'] = df_train['distance_to_center_dropoff_log'].mask(df_train['distance_to_center_dropoff_log'] == -np.inf, np.nan)
values = {'distance_to_center_pickup_log': 0, 'distance_to_center_dropoff_log': 0}
df_train = df_train.fillna(value=values)
df_train['distance_to_center_pickup_log'] = df_train['distance_to_center_pickup_log'].astype('int8')
df_train['distance_to_center_dropoff_log'] = df_train['distance_to_center_dropoff_log'].astype('int8')



In [15]:
df_train.compute()
df_train.head()

  return func(*args2)
  return func(*args2)


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_miles,is_pickup_JFK_new,is_dropoff_JFK_new,...,year_2015,hour_period_4,hour_period_0,hour_period_1,hour_period_2,hour_period_5,hour_period_3,distance_to_center_dropoff,distance_to_center_pickup_log,distance_to_center_dropoff_log
0,4.5,2009-06-15 17:26:21,-73.844315,40.721317,-73.841614,40.712276,1,0.0,False,False,...,0,1,0,0,0,0,0,7.811629,2,2
1,16.9,2010-01-05 16:52:16,-74.016045,40.711304,-73.979271,40.782005,1,5.244397,False,False,...,0,1,0,0,0,0,0,1.034978,2,0
2,5.7,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,0.0,False,False,...,0,0,1,0,0,0,0,2.424832,0,1
3,7.7,2012-04-21 04:30:42,-73.987129,40.733143,-73.99157,40.758091,1,1.932986,False,False,...,0,0,0,1,0,0,0,2.254531,1,1
4,5.3,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,1.366828,False,False,...,0,0,0,1,0,0,0,0.0,0,0


In [16]:
df_train = df_train.categorize(columns=["distance_to_center_pickup_log","distance_to_center_dropoff_log"])
df_train = ddf.get_dummies(df_train, prefix=["pickup_zone","dropoff_zone"], columns=["distance_to_center_pickup_log","distance_to_center_dropoff_log"])
df_train.head()

  return func(*args2)


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_miles,is_pickup_JFK_new,is_dropoff_JFK_new,...,pickup_zone_4,pickup_zone_5,pickup_zone_6,dropoff_zone_2,dropoff_zone_0,dropoff_zone_1,dropoff_zone_3,dropoff_zone_4,dropoff_zone_5,dropoff_zone_6
0,4.5,2009-06-15 17:26:21,-73.844315,40.721317,-73.841614,40.712276,1,0.0,False,False,...,0,0,0,1,0,0,0,0,0,0
1,16.9,2010-01-05 16:52:16,-74.016045,40.711304,-73.979271,40.782005,1,5.244397,False,False,...,0,0,0,0,1,0,0,0,0,0
2,5.7,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,0.0,False,False,...,0,0,0,0,0,1,0,0,0,0
3,7.7,2012-04-21 04:30:42,-73.987129,40.733143,-73.99157,40.758091,1,1.932986,False,False,...,0,0,0,0,0,1,0,0,0,0
4,5.3,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,1.366828,False,False,...,0,0,0,0,1,0,0,0,0,0


In [17]:
df_train.info(verbose=True)

  return func(*args2)


<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 54246832 entries, 0 to 54246831
Data columns (total 46 columns):
fare_amount                   54246832 non-null float32
pickup_datetime               54246832 non-null datetime64[ns]
pickup_longitude              54246832 non-null float32
pickup_latitude               54246832 non-null float32
dropoff_longitude             54246832 non-null float32
dropoff_latitude              54246832 non-null float32
passenger_count               54246832 non-null uint8
distance_miles                54246832 non-null float32
is_pickup_JFK_new             54246832 non-null bool
is_dropoff_JFK_new            54246832 non-null bool
is_pickup_EWR_new             54246832 non-null bool
is_dropoff_EWR_new            54246832 non-null bool
is_pickup_LGA_new             54246832 non-null bool
is_dropoff_LGA_new            54246832 non-null bool
is_to_from_JFK_new            54246832 non-null bool
distance_to_center            54246832 non-null float32
hou

In [18]:
path_train = '../../datasets/kaggle/new-york-city-taxi-fare-prediction/train_full_2'
df_train.to_parquet(path_train)

  return func(*args2)


In [19]:
test_df = feather.read_dataframe("../../datasets/kaggle/new-york-city-taxi-fare-prediction/test2.feather")
df_test = ddf.from_pandas(test_df, npartitions=1)

In [20]:
df_test["hour_period"]=df_test["hour"] // 4
df_test = df_test.categorize(columns=["year","hour_period"])
df_test = ddf.get_dummies(df_test, prefix=["year","hour_period"], columns=["year","hour_period"])
df_test.info(verbose=True)

<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 31 columns):
key                   9914 non-null object
pickup_datetime       9914 non-null datetime64[ns]
pickup_longitude      9914 non-null float32
pickup_latitude       9914 non-null float32
dropoff_longitude     9914 non-null float32
dropoff_latitude      9914 non-null float32
passenger_count       9914 non-null uint8
distance_miles        9914 non-null float32
is_pickup_JFK_new     9914 non-null bool
is_dropoff_JFK_new    9914 non-null bool
is_pickup_EWR_new     9914 non-null bool
is_dropoff_EWR_new    9914 non-null bool
is_pickup_LGA_new     9914 non-null bool
is_dropoff_LGA_new    9914 non-null bool
is_to_from_JFK_new    9914 non-null bool
distance_to_center    9914 non-null float32
hour                  9914 non-null int64
weekday               9914 non-null int64
year_2015             9914 non-null uint8
year_2011             9914 non-null uint8
year_2012             9914 non-null 

In [21]:
df_test['distance_to_center_dropoff'] = distance(nyc[1], nyc[0], df_test.dropoff_latitude, df_test.dropoff_longitude)
df_test['distance_to_center_pickup_log'] = np.log2(df_test[['distance_to_center']])['distance_to_center']
df_test['distance_to_center_dropoff_log'] = np.log2(df_test[['distance_to_center_dropoff']])['distance_to_center_dropoff']
df_test['distance_to_center_pickup_log'] = df_test['distance_to_center_pickup_log'].mask(df_test['distance_to_center_pickup_log'] == np.inf, np.nan)
df_test['distance_to_center_pickup_log'] = df_test['distance_to_center_pickup_log'].mask(df_test['distance_to_center_pickup_log'] == -np.inf, np.nan)
df_test['distance_to_center_dropoff_log'] = df_test['distance_to_center_dropoff_log'].mask(df_test['distance_to_center_dropoff_log'] == np.inf, np.nan)
df_test['distance_to_center_dropoff_log'] = df_test['distance_to_center_dropoff_log'].mask(df_test['distance_to_center_dropoff_log'] == -np.inf, np.nan)
values = {'distance_to_center_pickup_log': 0, 'distance_to_center_dropoff_log': 0}
df_test = df_test.fillna(value=values)
df_test['distance_to_center_pickup_log'] = df_test['distance_to_center_pickup_log'].astype('int8')
df_test['distance_to_center_dropoff_log'] = df_test['distance_to_center_dropoff_log'].astype('int8')


In [22]:
df_test = df_test.categorize(columns=["distance_to_center_pickup_log","distance_to_center_dropoff_log"])
df_test = ddf.get_dummies(df_test, prefix=["pickup_zone","dropoff_zone"], columns=["distance_to_center_pickup_log","distance_to_center_dropoff_log"])
df_test.info(verbose=True)


<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 46 columns):
key                           9914 non-null object
pickup_datetime               9914 non-null datetime64[ns]
pickup_longitude              9914 non-null float32
pickup_latitude               9914 non-null float32
dropoff_longitude             9914 non-null float32
dropoff_latitude              9914 non-null float32
passenger_count               9914 non-null uint8
distance_miles                9914 non-null float32
is_pickup_JFK_new             9914 non-null bool
is_dropoff_JFK_new            9914 non-null bool
is_pickup_EWR_new             9914 non-null bool
is_dropoff_EWR_new            9914 non-null bool
is_pickup_LGA_new             9914 non-null bool
is_dropoff_LGA_new            9914 non-null bool
is_to_from_JFK_new            9914 non-null bool
distance_to_center            9914 non-null float32
hour                          9914 non-null int64
weekday                   

  return func(*args2)


In [23]:
path_test = '../../datasets/kaggle/new-york-city-taxi-fare-prediction/test_full_2'
df_test.to_parquet(path_test)

  return func(*args2)
