In [103]:
import os
import opendatasets as od
import random
import joblib
import numpy as np
import pandas as pd
import pyspark
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import zipfile

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.metrics import max_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from sklearn.model_selection import GridSearchCV

In [2]:
dataset_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/overview'

In [3]:
%%time
od.download(dataset_url)

Skipping, found downloaded files in ".\new-york-city-taxi-fare-prediction" (use force=True to force download)
CPU times: total: 0 ns
Wall time: 0 ns


In [4]:
data_dir = './new-york-city-taxi-fare-prediction'

In [5]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
selected_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [6]:
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}

In [7]:
sample_frac = 0.10
def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)

In [8]:
%%time
df = pd.read_csv(
    data_dir+'/train.csv',
    usecols = selected_cols,
    dtype=dtypes,
    parse_dates=['pickup_datetime'],
    skiprows=skip_row
)

df

CPU times: total: 3min 54s
Wall time: 8min 28s


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.991600,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
...,...,...,...,...,...,...,...
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.724380,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.716700,1.0


### Data Exploration

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5542602 entries, 0 to 5542601
Data columns (total 7 columns):
 #   Column             Dtype              
---  ------             -----              
 0   fare_amount        float32            
 1   pickup_datetime    datetime64[ns, UTC]
 2   pickup_longitude   float32            
 3   pickup_latitude    float32            
 4   dropoff_longitude  float32            
 5   dropoff_latitude   float64            
 6   passenger_count    float32            
dtypes: datetime64[ns, UTC](1), float32(5), float64(1)
memory usage: 190.3 MB


In [10]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5542602.0,5542602.0,5542602.0,5542569.0,5542569.0,5542602.0
mean,11.34315,-70.44813,42.25435,-70.44518,39.9175,1.686349
std,40.99119,12.70321,10.35936,12.86386,9.81618,1.308499
min,-300.0,-3439.245,-3492.264,-3379.079,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73402,1.0
50%,8.5,-73.9818,40.75265,-73.98016,40.75314,1.0
75%,12.5,-73.96708,40.76712,-73.96368,40.76809,2.0
max,93963.36,3457.626,3376.602,3442.185,3400.392,208.0


In [11]:
df.head(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.9916,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
5,6.5,2011-02-07 20:01:00+00:00,0.0,0.0,0.0,0.0,1.0
6,4.5,2011-06-28 19:47:00+00:00,-73.988892,40.760159,-73.986443,40.757857,3.0
7,10.9,2011-10-15 10:55:24+00:00,-74.003899,40.725513,-73.976486,40.765537,2.0
8,5.3,2011-03-04 18:12:00+00:00,-73.99704,40.737095,-73.98613,40.735043,1.0
9,16.5,2014-04-29 18:28:00+00:00,-73.970322,40.750839,-73.99839,40.724862,6.0


In [12]:
df.tail(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
5542592,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756217,1.0
5542593,16.5,2013-09-12 08:30:10+00:00,-73.956657,40.778259,-73.988197,40.740953,2.0
5542594,13.5,2015-01-21 10:34:15+00:00,-73.985809,40.740952,-73.974899,40.762432,2.0
5542595,9.0,2013-02-17 03:38:00+00:00,-73.993782,40.725643,-73.990845,40.748162,6.0
5542596,7.5,2013-03-02 03:13:05+00:00,-74.002953,40.728493,-73.985329,40.726105,1.0
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.72438,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.7167,1.0
5542601,10.5,2010-08-10 11:19:42+00:00,-73.97464,40.787758,-74.001518,40.762081,1.0


In [13]:
df.sample(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1914746,21.299999,2009-05-07 15:11:00+00:00,-74.008316,40.719887,-73.972321,40.78607,5.0
5524014,10.5,2011-12-29 18:29:39+00:00,-73.975502,40.751289,-73.990028,40.757004,1.0
1797454,21.700001,2012-07-27 15:26:00+00:00,-74.004494,40.730503,-73.976448,40.782307,1.0
1574670,7.3,2009-09-16 09:01:00+00:00,-73.965591,40.755108,-73.978226,40.761173,1.0
3705320,28.27,2010-08-11 13:27:02+00:00,-73.985573,40.744061,-73.976936,40.756455,1.0
4874417,57.330002,2014-05-29 15:26:00+00:00,-73.983337,40.759548,-73.78244,40.64876,2.0
301986,14.1,2012-04-16 03:12:26+00:00,-73.993095,40.743088,-73.953857,40.743302,1.0
3959232,5.3,2010-07-24 13:59:00+00:00,-73.997498,40.714085,-73.99601,40.717578,1.0
1455697,5.5,2015-03-11 10:01:18+00:00,-73.954681,40.820984,-73.960945,40.809994,1.0
2103322,18.0,2014-07-14 21:46:43+00:00,-74.005623,40.740364,-73.949684,40.771063,1.0


In [14]:
df['pickup_datetime'].min(), df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:01:56+0000', tz='UTC'),
 Timestamp('2015-06-30 23:59:54+0000', tz='UTC'))

In [15]:
%%time
test_df = pd.read_csv(
    data_dir+'/test.csv',
    dtype=dtypes,
    parse_dates=['pickup_datetime']
)

test_df

CPU times: total: 109 ms
Wall time: 175 ms


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   key                9914 non-null   object             
 1   pickup_datetime    9914 non-null   datetime64[ns, UTC]
 2   pickup_longitude   9914 non-null   float32            
 3   pickup_latitude    9914 non-null   float32            
 4   dropoff_longitude  9914 non-null   float32            
 5   dropoff_latitude   9914 non-null   float64            
 6   passenger_count    9914 non-null   float32            
dtypes: datetime64[ns, UTC](1), float32(4), float64(1), object(1)
memory usage: 387.4+ KB


In [17]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.976181,40.750954,-73.974945,40.751743,1.671273
std,0.042799,0.033542,0.039093,0.035435,1.278756
min,-74.25219,40.573143,-74.263245,40.568973,1.0
25%,-73.9925,40.736125,-73.991249,40.735254,1.0
50%,-73.982327,40.753052,-73.980015,40.754065,1.0
75%,-73.968012,40.767113,-73.964062,40.768757,2.0
max,-72.986534,41.709557,-72.990967,41.696683,6.0


In [18]:
test_df.head(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0
5,2012-12-01 21:12:12.0000005,2012-12-01 21:12:12+00:00,-73.960983,40.765549,-73.979179,40.740053,1.0
6,2011-10-06 12:10:20.0000001,2011-10-06 12:10:20+00:00,-73.949013,40.773205,-73.959625,40.770893,1.0
7,2011-10-06 12:10:20.0000003,2011-10-06 12:10:20+00:00,-73.777283,40.646637,-73.985085,40.759368,1.0
8,2011-10-06 12:10:20.0000002,2011-10-06 12:10:20+00:00,-74.014099,40.709637,-73.99511,40.741365,1.0
9,2014-02-18 15:22:20.0000002,2014-02-18 15:22:20+00:00,-73.969582,40.765518,-73.980682,40.770725,1.0


In [19]:
test_df.tail(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
9904,2015-06-30 20:03:50.0000002,2015-06-30 20:03:50+00:00,-73.776848,40.645035,-73.95546,40.652458,6.0
9905,2015-02-27 19:36:02.0000006,2015-02-27 19:36:02+00:00,-73.989647,40.767406,-73.941177,40.845695,6.0
9906,2015-06-15 01:00:06.0000002,2015-06-15 01:00:06+00:00,-73.988052,40.720776,-73.991043,40.718346,6.0
9907,2015-02-03 09:00:58.0000001,2015-02-03 09:00:58+00:00,-73.863457,40.769611,-73.980995,40.763241,6.0
9908,2015-05-19 13:58:11.0000001,2015-05-19 13:58:11+00:00,-73.987968,40.718922,-73.982124,40.732956,6.0
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.8036,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.9916,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0
9913,2015-01-18 14:06:23.0000006,2015-01-18 14:06:23+00:00,-73.988022,40.75407,-74.000282,40.75922,6.0


In [20]:
test_df.sample(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
7457,2014-07-21 18:19:00.000000157,2014-07-21 18:19:00+00:00,-73.973778,40.75518,-73.976616,40.762287,2.0
6368,2009-12-23 20:14:19.0000002,2009-12-23 20:14:19+00:00,-73.96051,40.769657,-73.988983,40.748018,1.0
7079,2015-02-27 14:15:52.0000002,2015-02-27 14:15:52+00:00,-73.99456,40.74017,-73.977089,40.755638,2.0
851,2010-08-27 18:45:00.00000012,2010-08-27 18:45:00+00:00,-73.99427,40.742817,-73.989487,40.748472,1.0
9577,2012-01-26 07:33:00.000000120,2012-01-26 07:33:00+00:00,-74.007576,40.740952,-73.979858,40.764632,5.0
6606,2015-03-11 16:36:39.0000003,2015-03-11 16:36:39+00:00,-74.006447,40.716583,-73.992088,40.738323,1.0
1040,2014-11-16 08:47:12.0000001,2014-11-16 08:47:12+00:00,-73.988991,40.758591,-73.984741,40.769982,1.0
1729,2011-10-04 09:37:00.000000126,2011-10-04 09:37:00+00:00,-73.962456,40.770554,-73.962151,40.772167,1.0
3722,2013-03-06 23:49:53.0000003,2013-03-06 23:49:53+00:00,-73.99926,40.743969,-73.998581,40.76014,1.0
2562,2011-03-06 21:01:00.000000107,2011-03-06 21:01:00+00:00,-74.005829,40.740379,-73.968277,40.680248,1.0


In [21]:
test_df['pickup_datetime'].min(), test_df['pickup_datetime'].max()

(Timestamp('2009-01-01 11:04:24+0000', tz='UTC'),
 Timestamp('2015-06-30 20:03:50+0000', tz='UTC'))

Observations:

- This is a supervised learning regression problem
- Training data is 5.5 GB in size
- Training data has 5.5 million rows
- Test set is much smaller (< 10,000 rows)
- The training set has 8 columns:
    - `key` (a unique identifier)
    - `fare_amount` (target column)
    - `pickup_datetime`
    - `pickup_longitude`
    - `pickup_latitude`
    - `dropoff_longitude`
    - `dropoff_latitude`
    - `passenger_count`
- The test set has all columns except the target column `fare_amount`.
- The submission file should contain the `key` and `fare_amount` for each test sample.


## 3. Prepare Dataset for Training

- Split Training & Validation Set
- Fill/Remove Missing Values
- Extract Inputs & Outputs
   - Training
   - Validation
   - Test

### Split Training & Validation Set

We'll set aside 20% of the training data as the validation set, to evaluate the models we train on previously unseen data. 

Since the test set and training set have the same date ranges, we can pick a random 20% fraction.

In [22]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=25)

In [23]:
train_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2303415,5.5,2015-02-04 08:24:53+00:00,-73.948738,40.776600,-73.951958,40.769390,3.0
5119610,4.5,2010-10-02 12:06:31+00:00,-73.994873,40.745243,-73.998978,40.734206,1.0
1426135,14.9,2011-05-09 21:12:04+00:00,-73.983437,40.770870,-73.988327,40.722847,1.0
2858955,28.1,2010-10-22 03:53:09+00:00,-73.988220,40.723492,-73.792419,40.745597,1.0
2270080,7.3,2012-01-24 18:02:52+00:00,-73.967003,40.793743,-73.962250,40.779168,1.0
...,...,...,...,...,...,...,...
1564927,16.1,2011-07-22 23:39:00+00:00,-73.988411,40.723236,-73.982887,40.778265,1.0
5016438,10.5,2009-05-06 13:46:00+00:00,-73.975616,40.749329,-73.981918,40.768450,1.0
1055194,8.5,2012-08-09 12:05:22+00:00,-73.971191,40.782944,-73.973457,40.764785,1.0
3236158,11.0,2013-08-21 12:52:00+00:00,-74.004715,40.742008,-73.979591,40.746370,1.0


In [24]:
val_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2990758,3.7,2010-11-01 12:46:00+00:00,-73.973495,40.756996,-73.970581,40.762472,1.0
202499,8.1,2011-08-13 01:49:36+00:00,-74.014557,40.709663,-74.007309,40.741237,1.0
1357534,9.7,2009-02-14 22:19:00+00:00,-73.992615,40.697113,-73.997520,40.722172,3.0
389329,6.1,2010-02-05 16:54:06+00:00,-73.987984,40.737877,-73.974380,40.755862,1.0
2961984,22.5,2012-09-21 10:18:38+00:00,-74.009666,40.705090,-73.978798,40.754816,1.0
...,...,...,...,...,...,...,...
4199408,3.7,2010-01-09 19:06:00+00:00,-73.987808,40.732620,-73.992928,40.734870,1.0
638639,7.5,2015-02-11 14:09:37+00:00,-74.003418,40.732555,-73.988945,40.737392,1.0
499094,19.0,2013-09-12 22:10:52+00:00,-74.007141,40.703735,-73.991257,40.750139,1.0
461426,8.5,2015-01-08 10:36:05+00:00,-73.989716,40.756683,-73.972771,40.780453,1.0


### Fill/Remove Missing Values

There are no missing values in our sample, but if there were, we could simply drop the rows with missing values instead of trying to fill them (since we have a lot of training data)>

In [25]:
train_df = train_df.dropna()
val_df = val_df.dropna()

### Extract Inputs and Outputs

In [26]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [27]:
input_cols = ['pickup_datetime','pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [28]:
target_col = 'fare_amount'

In [29]:
## Training Data
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]

In [30]:
train_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2303415,2015-02-04 08:24:53+00:00,-73.948738,40.776600,-73.951958,40.769390,3.0
5119610,2010-10-02 12:06:31+00:00,-73.994873,40.745243,-73.998978,40.734206,1.0
1426135,2011-05-09 21:12:04+00:00,-73.983437,40.770870,-73.988327,40.722847,1.0
2858955,2010-10-22 03:53:09+00:00,-73.988220,40.723492,-73.792419,40.745597,1.0
2270080,2012-01-24 18:02:52+00:00,-73.967003,40.793743,-73.962250,40.779168,1.0
...,...,...,...,...,...,...
1564927,2011-07-22 23:39:00+00:00,-73.988411,40.723236,-73.982887,40.778265,1.0
5016438,2009-05-06 13:46:00+00:00,-73.975616,40.749329,-73.981918,40.768450,1.0
1055194,2012-08-09 12:05:22+00:00,-73.971191,40.782944,-73.973457,40.764785,1.0
3236158,2013-08-21 12:52:00+00:00,-74.004715,40.742008,-73.979591,40.746370,1.0


In [31]:
train_targets

2303415     5.5
5119610     4.5
1426135    14.9
2858955    28.1
2270080     7.3
           ... 
1564927    16.1
5016438    10.5
1055194     8.5
3236158    11.0
4224132    13.0
Name: fare_amount, Length: 4434055, dtype: float32

In [32]:
## Training Data
val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

In [33]:
val_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2990758,2010-11-01 12:46:00+00:00,-73.973495,40.756996,-73.970581,40.762472,1.0
202499,2011-08-13 01:49:36+00:00,-74.014557,40.709663,-74.007309,40.741237,1.0
1357534,2009-02-14 22:19:00+00:00,-73.992615,40.697113,-73.997520,40.722172,3.0
389329,2010-02-05 16:54:06+00:00,-73.987984,40.737877,-73.974380,40.755862,1.0
2961984,2012-09-21 10:18:38+00:00,-74.009666,40.705090,-73.978798,40.754816,1.0
...,...,...,...,...,...,...
4199408,2010-01-09 19:06:00+00:00,-73.987808,40.732620,-73.992928,40.734870,1.0
638639,2015-02-11 14:09:37+00:00,-74.003418,40.732555,-73.988945,40.737392,1.0
499094,2013-09-12 22:10:52+00:00,-74.007141,40.703735,-73.991257,40.750139,1.0
461426,2015-01-08 10:36:05+00:00,-73.989716,40.756683,-73.972771,40.780453,1.0


In [34]:
val_targets

2990758     3.7
202499      8.1
1357534     9.7
389329      6.1
2961984    22.5
           ... 
4199408     3.7
638639      7.5
499094     19.0
461426      8.5
4868465     8.1
Name: fare_amount, Length: 1108514, dtype: float32

In [35]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0


In [36]:
## Test Data
test_inputs = test_df[input_cols]
test_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...
9909,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0


# Train & Evaluate hardcoded models

### Train Hardcoded Models

### Train Baseline Model

# Feature Engineering

#### Extract parts of Date

In [37]:
def add_datetime(df, col):
    df[col+ '_year'] = df[col].dt.year
    df[col+ '_month'] = df[col].dt.month
    df[col+ '_day'] = df[col].dt.day
    df[col+ '_weekday'] = df[col].dt.weekday
    df[col+ '_hour'] = df[col].dt.hour

In [38]:
add_datetime(train_df, 'pickup_datetime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_year'] = df[col].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_month'] = df[col].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_day'] = df[col].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [39]:
add_datetime(val_df, 'pickup_datetime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_year'] = df[col].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_month'] = df[col].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+ '_day'] = df[col].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [40]:
add_datetime(test_df, 'pickup_datetime')
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0,2015,1,27,1,13
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21
...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0,2015,5,10,6,12
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0,2015,1,12,0,17
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0,2015,4,19,6,20
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0,2015,1,31,5,1


#### Add distance between Pickup and Drop 

In [41]:
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [42]:
def add_trip_distance(df):
    df['trip_distance'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])

In [43]:
add_trip_distance(train_df)
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trip_distance'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
2303415,5.5,2015-02-04 08:24:53+00:00,-73.948738,40.776600,-73.951958,40.769390,3.0,2015,2,4,2,8,0.845513
5119610,4.5,2010-10-02 12:06:31+00:00,-73.994873,40.745243,-73.998978,40.734206,1.0,2010,10,2,5,12,1.274384
1426135,14.9,2011-05-09 21:12:04+00:00,-73.983437,40.770870,-73.988327,40.722847,1.0,2011,5,9,0,21,5.352537
2858955,28.1,2010-10-22 03:53:09+00:00,-73.988220,40.723492,-73.792419,40.745597,1.0,2010,10,22,4,3,16.669256
2270080,7.3,2012-01-24 18:02:52+00:00,-73.967003,40.793743,-73.962250,40.779168,1.0,2012,1,24,1,18,1.668192
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1564927,16.1,2011-07-22 23:39:00+00:00,-73.988411,40.723236,-73.982887,40.778265,1.0,2011,7,22,4,23,6.132624
5016438,10.5,2009-05-06 13:46:00+00:00,-73.975616,40.749329,-73.981918,40.768450,1.0,2009,5,6,2,13,2.190121
1055194,8.5,2012-08-09 12:05:22+00:00,-73.971191,40.782944,-73.973457,40.764785,1.0,2012,8,9,3,12,2.026860
3236158,11.0,2013-08-21 12:52:00+00:00,-74.004715,40.742008,-73.979591,40.746370,1.0,2013,8,21,2,12,2.170448


In [44]:
add_trip_distance(val_df)
val_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trip_distance'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
2990758,3.7,2010-11-01 12:46:00+00:00,-73.973495,40.756996,-73.970581,40.762472,1.0,2010,11,1,0,12,0.656220
202499,8.1,2011-08-13 01:49:36+00:00,-74.014557,40.709663,-74.007309,40.741237,1.0,2011,8,13,5,1,3.561519
1357534,9.7,2009-02-14 22:19:00+00:00,-73.992615,40.697113,-73.997520,40.722172,3.0,2009,2,14,5,22,2.815285
389329,6.1,2010-02-05 16:54:06+00:00,-73.987984,40.737877,-73.974380,40.755862,1.0,2010,2,5,4,16,2.303480
2961984,22.5,2012-09-21 10:18:38+00:00,-74.009666,40.705090,-73.978798,40.754816,1.0,2012,9,21,4,10,6.106875
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4199408,3.7,2010-01-09 19:06:00+00:00,-73.987808,40.732620,-73.992928,40.734870,1.0,2010,1,9,5,19,0.498657
638639,7.5,2015-02-11 14:09:37+00:00,-74.003418,40.732555,-73.988945,40.737392,1.0,2015,2,11,2,14,1.332042
499094,19.0,2013-09-12 22:10:52+00:00,-74.007141,40.703735,-73.991257,40.750139,1.0,2013,9,12,3,22,5.327522
461426,8.5,2015-01-08 10:36:05+00:00,-73.989716,40.756683,-73.972771,40.780453,1.0,2015,1,8,3,10,3.001998


In [45]:
add_trip_distance(test_df)
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0,2015,1,27,1,13,2.321720
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13,2.423889
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11,0.618009
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21,1.959910
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21,5.383931
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0,2015,5,10,6,12,2.123367
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0,2015,1,12,0,17,3.268916
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0,2015,4,19,6,20,19.171439
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0,2015,1,31,5,1,8.338278


#### Adding distance from popular landmarks
-- Both dropoff and pickup

In [46]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

In [47]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    lon, lat = landmark_lonlat
    df[landmark_name + '_drop_distance'] = haversine_np(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])

In [60]:
%%time
for a_df in [train_df, val_df, test_df]:
    for name, lonlat in [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]:
        add_landmark_dropoff_distance(a_df, name, lonlat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[landmark_name + '_drop_distance'] = haversine_np(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])


CPU times: total: 1.97 s
Wall time: 2.22 s


#### Removing outliers & invalid data

In [61]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) & 
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) & 
              (df['pickup_longitude'] <= -72) & 
              (df['dropoff_longitude'] >= -75) & 
              (df['dropoff_longitude'] <= -72) & 
              (df['pickup_latitude'] >= 40) & 
              (df['pickup_latitude'] <= 42) & 
              (df['dropoff_latitude'] >=40) & 
              (df['dropoff_latitude'] <= 42) & 
              (df['passenger_count'] >= 1) & 
              (df['passenger_count'] <= 6)]

In [62]:
train_df = remove_outliers(train_df)

In [63]:
val_df = remove_outliers(val_df)

In [64]:
def remove_outliers(df):
    return df[(df['pickup_longitude'] >= -75) & 
              (df['pickup_longitude'] <= -72) & 
              (df['dropoff_longitude'] >= -75) & 
              (df['dropoff_longitude'] <= -72) & 
              (df['pickup_latitude'] >= 40) & 
              (df['pickup_latitude'] <= 42) & 
              (df['dropoff_latitude'] >=40) & 
              (df['dropoff_latitude'] <= 42) & 
              (df['passenger_count'] >= 1) & 
              (df['passenger_count'] <= 6)]

In [65]:
test_df = remove_outliers(test_df)

In [66]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0,2015,1,27,1,13,2.321720,20.574949,9.760151,17.346891,4.239318,4.218800
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13,2.423889,21.550981,11.315998,15.789650,5.382910,3.098180
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11,0.618009,20.594007,9.526878,17.576952,3.946866,4.514444
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21,1.959910,21.689160,10.195201,16.969532,3.844222,4.636742
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21,5.383931,21.113870,10.295947,16.808312,4.434000,3.967057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0,2015,5,10,6,12,2.123367,21.507015,6.880891,21.014938,0.645636,8.809757
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0,2015,1,12,0,17,3.268916,21.462229,7.254930,20.464521,0.420324,8.229248
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0,2015,4,19,6,20,19.171439,1.169152,16.084441,32.772347,20.734211,19.933692
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0,2015,1,31,5,1,8.338278,22.402436,6.138515,23.410822,3.200773,11.556237


## Scaling & One Hot Encoding

### Split inputs & targets

In [71]:
input_cols = ['pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip_distance',
       'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
       'met_drop_distance', 'wtc_drop_distance']

In [72]:
target_col = 'fare_amount'

In [73]:
train_inputs = train_df[input_cols]
train_target = train_df[target_col]
val_inputs = val_df[input_cols]
val_target = val_df[target_col]
test_inputs = test_df[input_cols]

In [74]:
def add_dateparts(df, col):
    df[col + '_year'] = df[col].dt.year
    df[col + '_month'] = df[col].dt.month
    df[col + '_day'] = df[col].dt.day
    df[col + '_weekday'] = df[col].dt.weekday
    df[col + '_hour'] = df[col].dt.hour

In [75]:
add_dateparts(train_df, 'pickup_datetime')

In [76]:
add_dateparts(val_df, 'pickup_datetime')

In [77]:
add_dateparts(test_df, 'pickup_datetime')

In [78]:
train_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
2303415,-73.948738,40.776600,-73.951958,40.769390,3.0,2015,2,4,2,8,0.845513,20.423068,6.613098,20.737149,1.460475,7.976788
5119610,-73.994873,40.745243,-73.998978,40.734206,1.0,2010,10,2,5,12,1.274384,21.283724,11.540781,15.596907,5.856048,2.571082
1426135,-73.983437,40.770870,-73.988327,40.722847,1.0,2011,5,9,0,21,5.352537,19.898871,11.344980,16.115241,6.630834,2.144481
2858955,-73.988220,40.723492,-73.792419,40.745597,1.0,2010,10,22,4,3,16.669256,11.652659,7.697350,32.778989,14.856932,18.677560
2270080,-73.967003,40.793743,-73.962250,40.779168,1.0,2012,1,24,1,18,1.668192,21.802059,7.430176,20.462616,0.084231,8.415249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1564927,-73.988411,40.723236,-73.982887,40.778265,1.0,2011,7,22,4,23,6.132624,23.005028,9.164123,18.911015,1.661490,7.643196
5016438,-73.975616,40.749329,-73.981918,40.768450,1.0,2009,5,6,2,13,2.190121,22.236259,9.130337,18.438540,1.990452,6.638471
1055194,-73.971191,40.782944,-73.973457,40.764785,1.0,2012,8,9,3,12,2.026860,21.427787,8.477557,18.884854,1.839171,6.560900
3236158,-74.004715,40.742008,-73.979591,40.746370,1.0,2013,8,21,2,12,2.170448,20.604131,9.512928,17.590807,3.921110,4.538506


In [79]:
val_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
2990758,-73.973495,40.756996,-73.970581,40.762472,1.0,2010,11,1,0,12,0.656220,21.077390,8.284742,18.991358,1.981002,6.455639
202499,-74.014557,40.709663,-74.007309,40.741237,1.0,2011,8,13,5,1,3.561519,22.277986,11.900043,15.210869,5.636243,3.189780
1357534,-73.992615,40.697113,-73.997520,40.722172,3.0,2009,2,14,5,22,2.815285,20.558814,12.046282,15.344725,6.984932,1.489636
389329,-73.987984,40.737877,-73.974380,40.755862,1.0,2010,2,5,4,16,2.303480,20.869106,8.765653,18.396865,2.779685,5.662068
2961984,-74.009666,40.705090,-73.978798,40.754816,1.0,2012,9,21,4,10,6.106875,21.095731,9.155189,18.009475,3.030897,5.372839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4199408,-73.987808,40.732620,-73.992928,40.734870,1.0,2010,1,9,5,19,0.498657,20.875854,11.046949,16.103791,5.545145,2.857863
638639,-74.003418,40.732555,-73.988945,40.737392,1.0,2015,2,11,2,14,1.332042,20.728290,10.625103,16.510606,5.146622,3.271747
499094,-74.007141,40.703735,-73.991257,40.750139,1.0,2013,9,12,3,22,5.327522,21.651956,10.307348,16.840345,4.018539,4.457213
461426,-73.989716,40.756683,-73.972771,40.780453,1.0,2015,1,8,3,10,3.001998,22.539090,8.320886,19.765726,0.813933,8.162350


In [80]:
test_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
0,-73.973320,40.763805,-73.981430,40.743835,1.0,2015,1,27,1,13,2.321720,20.574949,9.760151,17.346891,4.239318,4.218800
1,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13,2.423889,21.550981,11.315998,15.789650,5.382910,3.098180
2,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11,0.618009,20.594007,9.526878,17.576952,3.946866,4.514444
3,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21,1.959910,21.689160,10.195201,16.969532,3.844222,4.636742
4,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21,5.383931,21.113870,10.295947,16.808312,4.434000,3.967057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,-73.968124,40.796997,-73.955643,40.780388,6.0,2015,5,10,6,12,2.123367,21.507015,6.880891,21.014938,0.645636,8.809757
9910,-73.945511,40.803600,-73.960213,40.776371,6.0,2015,1,12,0,17,3.268916,21.462229,7.254930,20.464521,0.420324,8.229248
9911,-73.991600,40.726608,-73.789742,40.647011,6.0,2015,4,19,6,20,19.171439,1.169152,16.084441,32.772347,20.734211,19.933692
9912,-73.985573,40.735432,-73.939178,40.801731,6.0,2015,1,31,5,1,8.338278,22.402436,6.138515,23.410822,3.200773,11.556237


# Save intermediate DataFrames

# Train & Evaluate different models 

In [81]:
def evaluate(model):
    train_preds = model.predict(train_inputs)
    train_rmse = mean_squared_error(train_target, train_preds, squared=False)
    val_preds = model.predict(val_inputs)
    val_rmse = mean_squared_error(val_target, val_preds, squared=False)
    return train_rmse, val_rmse, train_preds, val_preds

In [82]:
def predict_and_submit(model, fname):
    test_preds = model.predict(test_inputs)
    sub_df = pd.read_csv(data_dir+'/sample_submission.csv')
    sub_df['fare_amount'] = test_preds
    sub_df.to_csv(fname, index=None)
    return sub_df

#### Linear Regression

In [83]:
lr =  LinearRegression().fit(train_inputs, train_target)

In [84]:
evaluate(lr)

(5.123407853265336,
 5.144734213282815,
 array([ 7.6202788 ,  6.29109827, 13.31805216, ...,  7.79086387,
         8.95246216, 12.22423443]),
 array([ 5.1657183 , 11.59417335,  8.75609537, ..., 15.83429693,
        10.71271507, 10.50124108]))

In [85]:
predict_and_submit(lr, 'lrv1.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.128323
1,2015-01-27 13:08:24.0000003,11.457968
2,2011-10-08 11:53:44.0000002,5.422538
3,2012-12-01 21:12:12.0000002,8.706219
4,2012-12-01 21:12:12.0000003,14.405156
...,...,...
9909,2015-05-10 12:37:51.0000002,8.994604
9910,2015-01-12 17:05:51.0000001,11.222609
9911,2015-04-19 20:44:15.0000001,47.662044
9912,2015-01-31 01:05:19.0000005,22.443973


In [86]:
lr.predict(test_inputs)

array([10.12832279, 11.45796822,  5.42253846, ..., 47.66204424,
       22.44397311,  9.03203221])

#### Ridge Regression

In [87]:
ridge = Ridge(random_state=42).fit(train_inputs, train_target)

In [88]:
evaluate(ridge)

(5.123407930050416,
 5.144735213200609,
 array([ 7.62073329,  6.29086406, 13.3186418 , ...,  7.79138339,
         8.95264999, 12.2245251 ]),
 array([ 5.16584129, 11.59359154,  8.75546453, ..., 15.83400087,
        10.71298447, 10.50134552]))

In [89]:
predict_and_submit(ridge, 'ridgev1.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.128478
1,2015-01-27 13:08:24.0000003,11.457350
2,2011-10-08 11:53:44.0000002,5.422510
3,2012-12-01 21:12:12.0000002,8.706268
4,2012-12-01 21:12:12.0000003,14.405728
...,...,...
9909,2015-05-10 12:37:51.0000002,8.995537
9910,2015-01-12 17:05:51.0000001,11.223566
9911,2015-04-19 20:44:15.0000001,47.660645
9912,2015-01-31 01:05:19.0000005,22.443741


#### Random Forest Regressor

In [102]:
%%time
rfr = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=10, n_estimators=50).fit(train_inputs, train_target)

CPU times: total: 1h 3min 9s
Wall time: 13min 30s


In [94]:
%%time
evaluate(rfr)

CPU times: total: 58.7 s
Wall time: 8.56 s


(3.813471578819109,
 3.9420868277445043,
 array([ 4.80527903,  6.6669364 , 13.52100146, ...,  7.69585458,
        10.14889919, 12.17995959]),
 array([ 5.1107022 ,  9.60602668,  8.81980254, ..., 16.09707032,
        12.34721201, 10.65697131]))

In [95]:
predict_and_submit(rfr, 'rfr.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.649380
1,2015-01-27 13:08:24.0000003,10.649380
2,2011-10-08 11:53:44.0000002,5.110702
3,2012-12-01 21:12:12.0000002,8.951939
4,2012-12-01 21:12:12.0000003,13.541100
...,...,...
9909,2015-05-10 12:37:51.0000002,8.501313
9910,2015-01-12 17:05:51.0000001,12.994134
9911,2015-04-19 20:44:15.0000001,55.715389
9912,2015-01-31 01:05:19.0000005,21.776482


## Final Gradient Boosting Regressor Model

In [194]:
# Best params include random_state=42, objective= 'reg:squarederror', 'n_jobs=-1, max_depth=9, n_estimators=800 & learning_rate=0.05

In [209]:
%%time
xgb = XGBRegressor(random_state=42, n_jobs=-1, max_depth= 9, n_estimators= 800, learning_rate=0.05, objective='reg:squarederror').fit(train_inputs, train_target)

CPU times: total: 33min 59s
Wall time: 6min 41s


In [212]:
%%time
evaluate(xgb)

CPU times: total: 3min 32s
Wall time: 15min 22s


(2.9553912,
 3.5693164,
 array([ 5.926566 ,  5.4743724, 14.822488 , ...,  8.581012 , 12.190047 ,
        12.463433 ], dtype=float32),
 array([ 4.7655873,  8.360509 ,  9.945253 , ..., 17.1786   , 11.206909 ,
         8.668226 ], dtype=float32))

In [213]:
predict_and_submit(xgb, 'xgb.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.193446
1,2015-01-27 13:08:24.0000003,10.616613
2,2011-10-08 11:53:44.0000002,4.661512
3,2012-12-01 21:12:12.0000002,9.463488
4,2012-12-01 21:12:12.0000003,16.952761
...,...,...
9909,2015-05-10 12:37:51.0000002,8.925991
9910,2015-01-12 17:05:51.0000001,11.456564
9911,2015-04-19 20:44:15.0000001,54.319321
9912,2015-01-31 01:05:19.0000005,18.441191
