In [1]:
import os
import opendatasets as od
import random
import joblib
import numpy as np
import pandas as pd
import pyspark
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import zipfile

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.metrics import max_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

In [2]:
dataset_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/overview'

In [3]:
%%time
od.download(dataset_url)

Skipping, found downloaded files in ".\new-york-city-taxi-fare-prediction" (use force=True to force download)
CPU times: total: 0 ns
Wall time: 0 ns


In [4]:
data_dir = './new-york-city-taxi-fare-prediction'

In [5]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
selected_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [6]:
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}

In [7]:
sample_frac = 0.10
def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)

In [8]:
%%time
df = pd.read_csv(
    data_dir+'/train.csv',
    usecols = selected_cols,
    dtype=dtypes,
    parse_dates=['pickup_datetime'],
    skiprows=skip_row
)

df

CPU times: total: 3min 55s
Wall time: 8min 55s


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.991600,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
...,...,...,...,...,...,...,...
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.724380,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.716700,1.0


### Data Exploration

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5542602 entries, 0 to 5542601
Data columns (total 7 columns):
 #   Column             Dtype              
---  ------             -----              
 0   fare_amount        float32            
 1   pickup_datetime    datetime64[ns, UTC]
 2   pickup_longitude   float32            
 3   pickup_latitude    float32            
 4   dropoff_longitude  float32            
 5   dropoff_latitude   float64            
 6   passenger_count    float32            
dtypes: datetime64[ns, UTC](1), float32(5), float64(1)
memory usage: 190.3 MB


In [10]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5542602.0,5542602.0,5542602.0,5542569.0,5542569.0,5542602.0
mean,11.34315,-70.44813,42.25435,-70.44518,39.9175,1.686349
std,40.99119,12.70321,10.35936,12.86386,9.81618,1.308499
min,-300.0,-3439.245,-3492.264,-3379.079,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73402,1.0
50%,8.5,-73.9818,40.75265,-73.98016,40.75314,1.0
75%,12.5,-73.96708,40.76712,-73.96368,40.76809,2.0
max,93963.36,3457.626,3376.602,3442.185,3400.392,208.0


In [11]:
df.head(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.9916,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
5,6.5,2011-02-07 20:01:00+00:00,0.0,0.0,0.0,0.0,1.0
6,4.5,2011-06-28 19:47:00+00:00,-73.988892,40.760159,-73.986443,40.757857,3.0
7,10.9,2011-10-15 10:55:24+00:00,-74.003899,40.725513,-73.976486,40.765537,2.0
8,5.3,2011-03-04 18:12:00+00:00,-73.99704,40.737095,-73.98613,40.735043,1.0
9,16.5,2014-04-29 18:28:00+00:00,-73.970322,40.750839,-73.99839,40.724862,6.0


In [12]:
df.tail(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
5542592,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756217,1.0
5542593,16.5,2013-09-12 08:30:10+00:00,-73.956657,40.778259,-73.988197,40.740953,2.0
5542594,13.5,2015-01-21 10:34:15+00:00,-73.985809,40.740952,-73.974899,40.762432,2.0
5542595,9.0,2013-02-17 03:38:00+00:00,-73.993782,40.725643,-73.990845,40.748162,6.0
5542596,7.5,2013-03-02 03:13:05+00:00,-74.002953,40.728493,-73.985329,40.726105,1.0
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.72438,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.7167,1.0
5542601,10.5,2010-08-10 11:19:42+00:00,-73.97464,40.787758,-74.001518,40.762081,1.0


In [13]:
df.sample(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
683185,6.1,2011-03-17 23:48:00+00:00,-74.009712,40.705395,-74.003609,40.720703,1.0
694856,4.5,2013-03-05 06:42:44+00:00,-73.984871,40.747887,-73.993111,40.749622,1.0
3367116,16.9,2012-02-08 00:02:00+00:00,0.0,0.0,0.0,0.0,5.0
4221594,9.7,2009-12-29 11:57:49+00:00,-73.909904,40.743393,-73.909904,40.743394,2.0
3107517,16.0,2013-10-19 04:32:51+00:00,-73.984215,40.743652,-73.941826,40.787178,2.0
2662465,18.0,2014-01-26 02:44:00+00:00,-73.988632,40.718803,-73.985703,40.775477,1.0
2083466,9.3,2010-01-28 08:07:30+00:00,-73.997253,40.741879,-74.010468,40.709202,1.0
3780818,10.9,2010-02-25 17:20:00+00:00,-73.971519,40.728645,-73.987656,40.76036,5.0
3635986,10.9,2012-04-21 12:00:57+00:00,-73.978111,40.748867,-73.981949,40.778435,2.0
1544183,18.5,2015-01-13 06:29:18+00:00,-73.95813,40.732948,-73.982719,40.767578,1.0


In [24]:
df['pickup_datetime'].min(), df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:01:56+0000', tz='UTC'),
 Timestamp('2015-06-30 23:59:54+0000', tz='UTC'))

In [17]:
%%time
main_test_df = pd.read_csv(
    data_dir+'/test.csv',
    dtype=dtypes
)

main_test_df

CPU times: total: 15.6 ms
Wall time: 20.2 ms


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6.0


In [18]:
main_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                9914 non-null   object 
 1   pickup_datetime    9914 non-null   object 
 2   pickup_longitude   9914 non-null   float32
 3   pickup_latitude    9914 non-null   float32
 4   dropoff_longitude  9914 non-null   float32
 5   dropoff_latitude   9914 non-null   float64
 6   passenger_count    9914 non-null   float32
dtypes: float32(4), float64(1), object(2)
memory usage: 387.4+ KB


In [19]:
main_test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.976181,40.750954,-73.974945,40.751743,1.671273
std,0.042799,0.033542,0.039093,0.035435,1.278756
min,-74.25219,40.573143,-74.263245,40.568973,1.0
25%,-73.9925,40.736125,-73.991249,40.735254,1.0
50%,-73.982327,40.753052,-73.980015,40.754065,1.0
75%,-73.968012,40.767113,-73.964062,40.768757,2.0
max,-72.986534,41.709557,-72.990967,41.696683,6.0


In [20]:
main_test_df.head(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966049,40.789776,-73.988564,40.744427,1.0
5,2012-12-01 21:12:12.0000005,2012-12-01 21:12:12 UTC,-73.960983,40.765549,-73.979179,40.740053,1.0
6,2011-10-06 12:10:20.0000001,2011-10-06 12:10:20 UTC,-73.949013,40.773205,-73.959625,40.770893,1.0
7,2011-10-06 12:10:20.0000003,2011-10-06 12:10:20 UTC,-73.777283,40.646637,-73.985085,40.759368,1.0
8,2011-10-06 12:10:20.0000002,2011-10-06 12:10:20 UTC,-74.014099,40.709637,-73.99511,40.741365,1.0
9,2014-02-18 15:22:20.0000002,2014-02-18 15:22:20 UTC,-73.969582,40.765518,-73.980682,40.770725,1.0


In [21]:
main_test_df.tail(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
9904,2015-06-30 20:03:50.0000002,2015-06-30 20:03:50 UTC,-73.776848,40.645035,-73.95546,40.652458,6.0
9905,2015-02-27 19:36:02.0000006,2015-02-27 19:36:02 UTC,-73.989647,40.767406,-73.941177,40.845695,6.0
9906,2015-06-15 01:00:06.0000002,2015-06-15 01:00:06 UTC,-73.988052,40.720776,-73.991043,40.718346,6.0
9907,2015-02-03 09:00:58.0000001,2015-02-03 09:00:58 UTC,-73.863457,40.769611,-73.980995,40.763241,6.0
9908,2015-05-19 13:58:11.0000001,2015-05-19 13:58:11 UTC,-73.987968,40.718922,-73.982124,40.732956,6.0
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.8036,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.9916,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6.0
9913,2015-01-18 14:06:23.0000006,2015-01-18 14:06:23 UTC,-73.988022,40.75407,-74.000282,40.75922,6.0


In [22]:
main_test_df.sample(10)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
8735,2015-04-10 11:56:54.0000006,2015-04-10 11:56:54 UTC,-73.977158,40.756481,-73.966286,40.761829,3.0
7694,2012-11-20 21:54:00.0000002,2012-11-20 21:54:00 UTC,-73.998665,40.726212,-73.982475,40.764985,2.0
3083,2010-12-09 07:29:00.000000129,2010-12-09 07:29:00 UTC,-73.99292,40.73077,-74.007622,40.705523,1.0
743,2011-06-01 07:37:00.000000174,2011-06-01 07:37:00 UTC,-73.985313,40.768517,-73.987534,40.753053,1.0
5048,2013-01-14 18:42:54.0000001,2013-01-14 18:42:54 UTC,-73.954735,40.769779,-73.956268,40.778728,1.0
2849,2012-01-26 07:33:00.00000083,2012-01-26 07:33:00 UTC,-73.978722,40.747108,-73.994072,40.761742,1.0
3988,2014-04-26 22:02:17.0000001,2014-04-26 22:02:17 UTC,-73.993156,40.75515,-73.98822,40.722696,1.0
3066,2010-12-09 07:29:00.00000087,2010-12-09 07:29:00 UTC,-73.964134,40.756603,-73.956726,40.766875,1.0
5214,2012-10-09 07:47:46.0000004,2012-10-09 07:47:46 UTC,-73.972298,40.765244,-73.973419,40.78992,1.0
9378,2012-11-03 17:11:00.000000105,2012-11-03 17:11:00 UTC,-73.976089,40.748554,-73.973557,40.752808,5.0


In [23]:
main_test_df['pickup_datetime'].min(), main_test_df['pickup_datetime'].max()

('2009-01-01 11:04:24 UTC', '2015-06-30 20:03:50 UTC')

Observations:

- This is a supervised learning regression problem
- Training data is 5.5 GB in size
- Training data has 5.5 million rows
- Test set is much smaller (< 10,000 rows)
- The training set has 8 columns:
    - `key` (a unique identifier)
    - `fare_amount` (target column)
    - `pickup_datetime`
    - `pickup_longitude`
    - `pickup_latitude`
    - `dropoff_longitude`
    - `dropoff_latitude`
    - `passenger_count`
- The test set has all columns except the target column `fare_amount`.
- The submission file should contain the `key` and `fare_amount` for each test sample.


## 3. Prepare Dataset for Training

- Split Training & Validation Set
- Fill/Remove Missing Values
- Extract Inputs & Outputs
   - Training
   - Validation
   - Test

### Split Training & Validation Set

We'll set aside 20% of the training data as the validation set, to evaluate the models we train on previously unseen data. 

Since the test set and training set have the same date ranges, we can pick a random 20% fraction.

In [26]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=25)

In [27]:
train_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2303415,5.5,2015-02-04 08:24:53+00:00,-73.948738,40.776600,-73.951958,40.769390,3.0
5119610,4.5,2010-10-02 12:06:31+00:00,-73.994873,40.745243,-73.998978,40.734206,1.0
1426135,14.9,2011-05-09 21:12:04+00:00,-73.983437,40.770870,-73.988327,40.722847,1.0
2858955,28.1,2010-10-22 03:53:09+00:00,-73.988220,40.723492,-73.792419,40.745597,1.0
2270080,7.3,2012-01-24 18:02:52+00:00,-73.967003,40.793743,-73.962250,40.779168,1.0
...,...,...,...,...,...,...,...
1564927,16.1,2011-07-22 23:39:00+00:00,-73.988411,40.723236,-73.982887,40.778265,1.0
5016438,10.5,2009-05-06 13:46:00+00:00,-73.975616,40.749329,-73.981918,40.768450,1.0
1055194,8.5,2012-08-09 12:05:22+00:00,-73.971191,40.782944,-73.973457,40.764785,1.0
3236158,11.0,2013-08-21 12:52:00+00:00,-74.004715,40.742008,-73.979591,40.746370,1.0


In [28]:
val_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2990758,3.7,2010-11-01 12:46:00+00:00,-73.973495,40.756996,-73.970581,40.762472,1.0
202499,8.1,2011-08-13 01:49:36+00:00,-74.014557,40.709663,-74.007309,40.741237,1.0
1357534,9.7,2009-02-14 22:19:00+00:00,-73.992615,40.697113,-73.997520,40.722172,3.0
389329,6.1,2010-02-05 16:54:06+00:00,-73.987984,40.737877,-73.974380,40.755862,1.0
2961984,22.5,2012-09-21 10:18:38+00:00,-74.009666,40.705090,-73.978798,40.754816,1.0
...,...,...,...,...,...,...,...
4199408,3.7,2010-01-09 19:06:00+00:00,-73.987808,40.732620,-73.992928,40.734870,1.0
638639,7.5,2015-02-11 14:09:37+00:00,-74.003418,40.732555,-73.988945,40.737392,1.0
499094,19.0,2013-09-12 22:10:52+00:00,-74.007141,40.703735,-73.991257,40.750139,1.0
461426,8.5,2015-01-08 10:36:05+00:00,-73.989716,40.756683,-73.972771,40.780453,1.0


### Fill/Remove Missing Values

There are no missing values in our sample, but if there were, we could simply drop the rows with missing values instead of trying to fill them (since we have a lot of training data)>

In [29]:
train_df = train_df.dropna()
val_df = val_df.dropna()

### Extract Inputs and Outputs

In [30]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [31]:
input_cols = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [32]:
target_col = 'fare_amount'

In [33]:
## Training Data
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]

In [34]:
train_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2303415,2015-02-04 08:24:53+00:00,-73.948738,40.776600,-73.951958,40.769390,3.0
5119610,2010-10-02 12:06:31+00:00,-73.994873,40.745243,-73.998978,40.734206,1.0
1426135,2011-05-09 21:12:04+00:00,-73.983437,40.770870,-73.988327,40.722847,1.0
2858955,2010-10-22 03:53:09+00:00,-73.988220,40.723492,-73.792419,40.745597,1.0
2270080,2012-01-24 18:02:52+00:00,-73.967003,40.793743,-73.962250,40.779168,1.0
...,...,...,...,...,...,...
1564927,2011-07-22 23:39:00+00:00,-73.988411,40.723236,-73.982887,40.778265,1.0
5016438,2009-05-06 13:46:00+00:00,-73.975616,40.749329,-73.981918,40.768450,1.0
1055194,2012-08-09 12:05:22+00:00,-73.971191,40.782944,-73.973457,40.764785,1.0
3236158,2013-08-21 12:52:00+00:00,-74.004715,40.742008,-73.979591,40.746370,1.0


In [35]:
train_targets

2303415     5.5
5119610     4.5
1426135    14.9
2858955    28.1
2270080     7.3
           ... 
1564927    16.1
5016438    10.5
1055194     8.5
3236158    11.0
4224132    13.0
Name: fare_amount, Length: 4434055, dtype: float32

In [36]:
## Training Data
val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

In [37]:
val_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2990758,2010-11-01 12:46:00+00:00,-73.973495,40.756996,-73.970581,40.762472,1.0
202499,2011-08-13 01:49:36+00:00,-74.014557,40.709663,-74.007309,40.741237,1.0
1357534,2009-02-14 22:19:00+00:00,-73.992615,40.697113,-73.997520,40.722172,3.0
389329,2010-02-05 16:54:06+00:00,-73.987984,40.737877,-73.974380,40.755862,1.0
2961984,2012-09-21 10:18:38+00:00,-74.009666,40.705090,-73.978798,40.754816,1.0
...,...,...,...,...,...,...
4199408,2010-01-09 19:06:00+00:00,-73.987808,40.732620,-73.992928,40.734870,1.0
638639,2015-02-11 14:09:37+00:00,-74.003418,40.732555,-73.988945,40.737392,1.0
499094,2013-09-12 22:10:52+00:00,-74.007141,40.703735,-73.991257,40.750139,1.0
461426,2015-01-08 10:36:05+00:00,-73.989716,40.756683,-73.972771,40.780453,1.0


In [38]:
val_targets

2990758     3.7
202499      8.1
1357534     9.7
389329      6.1
2961984    22.5
           ... 
4199408     3.7
638639      7.5
499094     19.0
461426      8.5
4868465     8.1
Name: fare_amount, Length: 1108514, dtype: float32

In [39]:
## Test Data
test_inputs = test_df[input_cols]
test_inputs

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44 UTC,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12 UTC,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12 UTC,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...
9909,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6.0
