In [1]:
import os
import opendatasets as od
import random
import joblib
import numpy as np
import pandas as pd
import pyspark
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import zipfile

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.metrics import max_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

In [2]:
dataset_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/overview'

In [3]:
%%time
od.download(dataset_url)

Skipping, found downloaded files in ".\new-york-city-taxi-fare-prediction" (use force=True to force download)
CPU times: total: 0 ns
Wall time: 0 ns


In [4]:
data_dir = './new-york-city-taxi-fare-prediction'

In [5]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
selected_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [6]:
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}

In [7]:
sample_frac = 0.10
def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)

In [8]:
%%time
df = pd.read_csv(
    data_dir+'/train.csv',
    usecols = selected_cols,
    dtype=dtypes,
    parse_dates=['pickup_datetime'],
    skiprows=skip_row
)

df

CPU times: total: 3min 55s
Wall time: 8min 55s


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.991600,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
...,...,...,...,...,...,...,...
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.724380,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.716700,1.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5542602 entries, 0 to 5542601
Data columns (total 7 columns):
 #   Column             Dtype              
---  ------             -----              
 0   fare_amount        float32            
 1   pickup_datetime    datetime64[ns, UTC]
 2   pickup_longitude   float32            
 3   pickup_latitude    float32            
 4   dropoff_longitude  float32            
 5   dropoff_latitude   float64            
 6   passenger_count    float32            
dtypes: datetime64[ns, UTC](1), float32(5), float64(1)
memory usage: 190.3 MB


In [10]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5542602.0,5542602.0,5542602.0,5542569.0,5542569.0,5542602.0
mean,11.34315,-70.44813,42.25435,-70.44518,39.9175,1.686349
std,40.99119,12.70321,10.35936,12.86386,9.81618,1.308499
min,-300.0,-3439.245,-3492.264,-3379.079,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73402,1.0
50%,8.5,-73.9818,40.75265,-73.98016,40.75314,1.0
75%,12.5,-73.96708,40.76712,-73.96368,40.76809,2.0
max,93963.36,3457.626,3376.602,3442.185,3400.392,208.0


In [11]:
df.head(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782004,1.0
1,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751048,1.0
2,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758138,2.0
3,4.1,2009-11-06 01:04:03+00:00,-73.9916,40.744713,-73.983078,40.744682,2.0
4,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
5,6.5,2011-02-07 20:01:00+00:00,0.0,0.0,0.0,0.0,1.0
6,4.5,2011-06-28 19:47:00+00:00,-73.988892,40.760159,-73.986443,40.757857,3.0
7,10.9,2011-10-15 10:55:24+00:00,-74.003899,40.725513,-73.976486,40.765537,2.0
8,5.3,2011-03-04 18:12:00+00:00,-73.99704,40.737095,-73.98613,40.735043,1.0
9,16.5,2014-04-29 18:28:00+00:00,-73.970322,40.750839,-73.99839,40.724862,6.0


In [12]:
df.tail(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
5542592,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756217,1.0
5542593,16.5,2013-09-12 08:30:10+00:00,-73.956657,40.778259,-73.988197,40.740953,2.0
5542594,13.5,2015-01-21 10:34:15+00:00,-73.985809,40.740952,-73.974899,40.762432,2.0
5542595,9.0,2013-02-17 03:38:00+00:00,-73.993782,40.725643,-73.990845,40.748162,6.0
5542596,7.5,2013-03-02 03:13:05+00:00,-74.002953,40.728493,-73.985329,40.726105,1.0
5542597,6.0,2014-10-18 07:51:00+00:00,-73.997681,40.72438,-73.994148,40.717797,1.0
5542598,5.7,2010-11-18 07:08:58+00:00,-73.997589,40.735889,-73.984558,40.754055,1.0
5542599,8.0,2013-12-21 14:03:00+00:00,-73.976486,40.765919,-73.991524,40.759857,6.0
5542600,13.0,2013-09-04 20:20:00+00:00,-73.995605,40.725712,-73.954651,40.7167,1.0
5542601,10.5,2010-08-10 11:19:42+00:00,-73.97464,40.787758,-74.001518,40.762081,1.0


In [13]:
df.sample(10)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
683185,6.1,2011-03-17 23:48:00+00:00,-74.009712,40.705395,-74.003609,40.720703,1.0
694856,4.5,2013-03-05 06:42:44+00:00,-73.984871,40.747887,-73.993111,40.749622,1.0
3367116,16.9,2012-02-08 00:02:00+00:00,0.0,0.0,0.0,0.0,5.0
4221594,9.7,2009-12-29 11:57:49+00:00,-73.909904,40.743393,-73.909904,40.743394,2.0
3107517,16.0,2013-10-19 04:32:51+00:00,-73.984215,40.743652,-73.941826,40.787178,2.0
2662465,18.0,2014-01-26 02:44:00+00:00,-73.988632,40.718803,-73.985703,40.775477,1.0
2083466,9.3,2010-01-28 08:07:30+00:00,-73.997253,40.741879,-74.010468,40.709202,1.0
3780818,10.9,2010-02-25 17:20:00+00:00,-73.971519,40.728645,-73.987656,40.76036,5.0
3635986,10.9,2012-04-21 12:00:57+00:00,-73.978111,40.748867,-73.981949,40.778435,2.0
1544183,18.5,2015-01-13 06:29:18+00:00,-73.95813,40.732948,-73.982719,40.767578,1.0


In [17]:
%%time
main_test_df = pd.read_csv(
    data_dir+'/test.csv',
    dtype=dtypes
)

main_test_df

CPU times: total: 15.6 ms
Wall time: 20.2 ms


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6.0


Observations:

- This is a supervised learning regression problem
- Training data is 5.5 GB in size
- Training data has 5.5 million rows
- Test set is much smaller (< 10,000 rows)
- The training set has 8 columns:
    - `key` (a unique identifier)
    - `fare_amount` (target column)
    - `pickup_datetime`
    - `pickup_longitude`
    - `pickup_latitude`
    - `dropoff_longitude`
    - `dropoff_latitude`
    - `passenger_count`
- The test set has all columns except the target column `fare_amount`.
- The submission file should contain the `key` and `fare_amount` for each test sample.
