<a href="https://colab.research.google.com/github/rautaditya2606/NY_TAXi_Fare/blob/main/NYC_Taxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [292]:
#!pip install opendatasets --quiet
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import matplotlib
%matplotlib inline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import random
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [293]:
database_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction'
od.download(database_url)

Skipping, found downloaded files in "./new-york-city-taxi-fare-prediction" (use force=True to force download)


In [294]:
data_dir = '/content/new-york-city-taxi-fare-prediction/'

In [295]:
# !pip install dask-ml --quiet


In [296]:
!head {data_dir + '/train.csv'}

key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
2012-12-03 13:10:00.000000125,9,2012-12-03 13:10:00 UTC,-74.006462,40.7267

In [297]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')

In [298]:
selected_cols
dtypes = {
 'fare_amount': 'float32',
 'pickup_longitude':'float32',
 'pickup_latitude':'float32',
 'dropoff_longitude':'float32',
 'dropoff_latitude':'float32',
 'passenger_count': 'uint8'
}

In [299]:
def skip_rows(row_idx):
    if row_idx == 0:
        return False
    return random.random() > 0.01
df = pd.read_csv(
    data_dir + '/train.csv',
    usecols=selected_cols,
    dtype=dtypes,
    parse_dates=['pickup_datetime'],
    skiprows=skip_rows
)


In [300]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,10.1,2012-05-21 15:20:00+00:00,-73.977791,40.771839,-73.966202,40.778114,2
1,7.3,2009-06-20 19:28:37+00:00,-73.979347,40.776630,-73.973839,40.757072,1
2,12.0,2014-09-13 12:17:00+00:00,-73.984230,40.766895,-73.962387,40.776035,1
3,6.0,2012-10-22 10:43:00+00:00,-73.968658,40.770302,-73.982574,40.774700,1
4,11.7,2012-03-02 13:13:00+00:00,-73.966690,40.765072,-73.978241,40.791958,1
...,...,...,...,...,...,...,...
554686,5.0,2012-10-01 19:19:00+00:00,-73.912537,40.770298,-73.910400,40.766842,5
554687,9.0,2014-09-20 22:17:11+00:00,-74.001350,40.731415,-73.978531,40.752445,1
554688,12.0,2013-01-04 16:32:14+00:00,-74.012756,40.707821,-73.985428,40.738548,1
554689,8.0,2013-09-26 06:42:30+00:00,-73.952393,40.786648,-73.958405,40.765530,1


In [301]:
test_df = pd.read_csv(data_dir + '/test.csv', parse_dates=['pickup_datetime'])

In [302]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6


In [303]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,554691.0,554691.0,554691.0,554690.0,554690.0,554691.0
mean,11.336429,-72.501442,39.925476,-72.511871,39.926926,1.690976
std,9.753218,12.535655,9.576158,12.340616,10.314622,1.315154
min,-145.0,-2635.390137,-2459.046631,-2635.390137,-3493.651855,0.0
25%,6.0,-73.992073,40.734901,-73.991417,40.733986,1.0
50%,8.5,-73.981819,40.752579,-73.980156,40.753166,1.0
75%,12.5,-73.967087,40.766998,-73.963638,40.768066,2.0
max,498.0,2551.566162,3315.077881,2551.566162,3322.083252,6.0


In [304]:
!nvidia-smi

Thu Aug  7 16:59:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             32W /   70W |     114MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [305]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [306]:
train_df.shape, val_df.shape

((443752, 7), (110939, 7))

In [307]:
train_df = train_df.dropna()
val_df = val_df.dropna()

In [308]:
list(train_df.columns)

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [309]:
input_cols = [
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']
target_cols = 'fare_amount'

In [310]:
train_inputs = train_df[input_cols]
train_targets = train_df[target_cols]
val_inputs = val_df[input_cols]
val_targets = val_df[target_cols]

In [311]:
def add_dateparts(df, col):
  df[col+'_year'] = df[col].dt.year
  df[col+'_month'] = df[col].dt.month
  df[col+'_day'] = df[col].dt.day
  df[col+'_weekday'] = df[col].dt.weekday
  df[col+'_hour'] = df[col].dt.hour

In [312]:
add_dateparts(train_df, 'pickup_datetime')
add_dateparts(val_df, 'pickup_datetime')
add_dateparts(test_df, 'pickup_datetime')

In [313]:
train_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
185116,10.0,2013-07-13 09:16:09+00:00,-74.008278,40.737148,-73.978943,40.752647,1,2013,7,13,5,9
142082,14.1,2010-07-24 01:01:00+00:00,-73.954628,40.778164,-73.945908,40.827831,2,2010,7,24,5,1
548008,4.1,2011-08-26 19:00:00+00:00,-73.990776,40.734451,-73.996376,40.727596,5,2011,8,26,4,19
461574,64.5,2013-06-19 22:43:27+00:00,-73.872948,40.774139,-73.962051,40.607670,2,2013,6,19,2,22
219319,5.3,2009-03-31 09:46:04+00:00,-73.982361,40.727844,-73.998528,40.728512,1,2009,3,31,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...
110268,8.0,2014-03-04 11:02:18+00:00,-73.944168,40.775982,-73.962616,40.773132,1,2014,3,4,1,11
259178,19.0,2015-05-09 22:13:35+00:00,-73.995453,40.725163,-73.988571,40.778561,1,2015,5,9,5,22
365838,6.9,2010-09-15 08:24:18+00:00,-73.972389,40.793991,-73.951279,40.771687,1,2010,9,15,2,8
131932,8.0,2014-09-23 11:12:31+00:00,-73.991570,40.749138,-74.007812,40.745171,1,2014,9,23,1,11


In [314]:
val_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
506681,12.5,2014-05-07 15:31:33+00:00,-74.003181,40.749031,-73.977959,40.741917,2,2014,5,7,2,15
48872,8.0,2013-04-07 13:55:00+00:00,-73.987343,40.729099,-73.989128,40.716698,5,2013,4,7,6,13
137443,5.3,2009-05-17 01:25:53+00:00,-73.986702,40.702335,-73.988213,40.689957,2,2009,5,17,6,1
31863,10.0,2014-04-21 13:20:00+00:00,-73.990028,40.734009,-73.976700,40.758057,1,2014,4,21,0,13
5535,7.0,2014-04-25 13:53:25+00:00,-73.979271,40.753304,-73.988785,40.752842,1,2014,4,25,4,13
...,...,...,...,...,...,...,...,...,...,...,...,...
409999,13.0,2014-06-01 23:18:31+00:00,-73.955856,40.692936,-73.994827,40.684639,1,2014,6,1,6,23
386421,9.7,2012-06-25 17:38:00+00:00,-73.978836,40.756016,-73.990639,40.740051,2,2012,6,25,0,17
402475,9.0,2014-05-25 19:10:00+00:00,-74.007248,40.743805,-73.984062,40.764969,2,2014,5,25,6,19
484323,10.1,2009-04-12 04:13:00+00:00,-74.011436,40.658218,-74.074532,40.666206,3,2009,4,12,6,4


In [315]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1,2015,1,27,1,13
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,1,13
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1,2011,10,8,5,11
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1,2012,12,1,5,21
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,5,21
...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6,2015,5,10,6,12
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6,2015,1,12,0,17
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6,2015,4,19,6,20
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6,2015,1,31,5,1


In [316]:
def haversine_vectorized(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c
    return km

def add_trip_dist(df):
    df['trip_distance'] = haversine_vectorized(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude']
    )


In [317]:
add_trip_dist(train_df)
add_trip_dist(val_df)
add_trip_dist(test_df)

In [318]:
val_df.head(1)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
506681,12.5,2014-05-07 15:31:33+00:00,-74.003181,40.749031,-73.977959,40.741917,2,2014,5,7,2,15,2.267388


In [319]:
train_df.head(1)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
185116,10.0,2013-07-13 09:16:09+00:00,-74.008278,40.737148,-73.978943,40.752647,1,2013,7,13,5,9,3.013058


In [320]:
test_df.head(1)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,27,1,13,2.32326


In [321]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

In [322]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
  lon, lat = landmark_lonlat
  df[landmark_name+'_drop_distance'] = haversine_vectorized(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])

In [323]:
def add_landmarks(a_df):
  landmarks = [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]
  for name, lonlat in landmarks:
    add_landmark_dropoff_distance(a_df, name, lonlat)

In [324]:
add_landmarks(train_df)
add_landmarks(val_df)
add_landmarks(test_df)

In [325]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) &
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) &
              (df['pickup_longitude'] <= -72) &
              (df['dropoff_longitude'] >= -75) &
              (df['dropoff_longitude'] <= -72) &
              (df['pickup_latitude'] >= 40) &
              (df['pickup_latitude'] <= 42) &
              (df['dropoff_latitude'] >=40) &
              (df['dropoff_latitude'] <= 42) &
              (df['passenger_count'] >= 1) &
              (df['passenger_count'] <= 6)]

In [326]:
train_df = remove_outliers(train_df)

In [327]:
val_df = remove_outliers(val_df)

In [328]:
# train_df.to_parquet('train.parquet')

In [329]:
# val_df.to_parquet('val.parquet')

In [330]:
# test_df.to_parquet('test.parquet')

In [331]:
list(train_df.columns)

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count',
 'pickup_datetime_year',
 'pickup_datetime_month',
 'pickup_datetime_day',
 'pickup_datetime_weekday',
 'pickup_datetime_hour',
 'trip_distance',
 'jfk_drop_distance',
 'lga_drop_distance',
 'ewr_drop_distance',
 'met_drop_distance',
 'wtc_drop_distance']

In [332]:
input_cols = ['pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count',
 'pickup_datetime_year',
 'pickup_datetime_month',
 'pickup_datetime_day',
 'pickup_datetime_weekday',
 'pickup_datetime_hour',
 'trip_distance',
 'jfk_drop_distance',
 'lga_drop_distance',
 'ewr_drop_distance',
 'met_drop_distance',
 'wtc_drop_distance']

In [333]:
target_cols = 'fare_amount'

In [334]:
train_inputs = train_df[input_cols]
val_inputs = val_df[input_cols]
test_inputs = test_df[input_cols]

In [335]:
train_targets = train_df[target_cols]
val_targets = val_df[target_cols]

In [336]:
def evaluate(model):
  train_pred = model.predict(train_inputs)
  train_rmse = np.sqrt(mean_squared_error(train_pred, train_targets))
  val_pred = model.predict(val_inputs)
  val_rmse = np.sqrt(mean_squared_error(val_pred, val_targets))
  return train_pred, val_pred, train_rmse, val_rmse

In [337]:
# model1 = Ridge(random_state=42).fit(train_inputs, train_targets)

In [338]:
evaluate(model1)

(array([10.40795076, 14.10743238,  6.7502594 , ...,  8.87557039,
         9.52258816,  2.89024113]),
 array([ 9.6267539 ,  8.36524658,  7.01473702, ..., 11.0091306 ,
        29.34867509,  6.14561969]),
 np.float64(4.96365144854102),
 np.float64(5.015948316709888))

In [339]:
# model2 = RandomForestRegressor(n_estimators=200, max_depth=10, n_jobs=-1, random_state=42).fit(train_inputs, train_targets)

In [340]:
# evaluate(model3)

In [341]:
def predict_and_submit(model, fname):
    test_preds = model.predict(test_inputs)
    sub_df = pd.read_csv(data_dir+'/sample_submission.csv')
    sub_df['fare_amount'] = test_preds
    sub_df.to_csv(fname, index=None)
    return sub_df

In [342]:
# predict_and_submit(model3, 'xgb_submission.csv')

In [343]:
import matplotlib.pyplot as plt

def test_params(ModelClass, **params):
    model = ModelClass(**params).fit(train_inputs, train_targets)
    train_preds = model.predict(train_inputs)
    val_preds = model.predict(val_inputs)
    train_rmse = np.sqrt(mean_squared_error(train_targets, train_preds))
    val_rmse = np.sqrt(mean_squared_error(val_targets, val_preds))
    return train_rmse, val_rmse


def test_param_and_plot(ModelClass, param_name, param_values, **other_params):
    """Trains multiple models by varying the value of param_name according to param_values"""
    train_errors, val_errors = [], []
    for value in param_values:
        params = dict(other_params)
        params[param_name] = value
        train_rmse, val_rmse = test_params(ModelClass, **params)
        train_errors.append(train_rmse)
        val_errors.append(val_rmse)

    plt.figure(figsize=(10,6))
    plt.title('Overfitting curve: ' + param_name)
    plt.plot(param_values, train_errors, 'b-o')
    plt.plot(param_values, val_errors, 'r-o')
    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])

In [344]:
best_params = {
    'random_state': 42,
    'n_jobs': -1,
    'objective': 'reg:squarederror',
    'learning_rate' : 0.05
}

In [345]:
# %%time
# test_param_and_plot(XGBRegressor, 'n_estimators', [100, 250, 300], **best_params)

In [346]:
# %%time
# test_param_and_plot(XGBRegressor, 'max_depth', [3,5,7], **best_params)

In [347]:
#3.65725

In [348]:
model4 = LGBMRegressor(num_leaves=64, max_depth=5, learning_rate=0.1, n_estimators=600, n_jobs=-1).fit(train_inputs, train_targets)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2641
[LightGBM] [Info] Number of data points in the train set: 432730, number of used features: 16
[LightGBM] [Info] Start training from score 11.324710


In [349]:
evaluate(model4)#3.65725

(array([11.27760179, 12.66704924,  5.15678813, ..., 10.64167712,
         8.02689323, 12.60201805]),
 array([11.88704547,  7.74030524,  6.63978603, ..., 11.10259497,
        17.61353618,  5.35691985]),
 np.float64(3.2981855232623953),
 np.float64(3.6573200864064117))

In [350]:
predict_and_submit(model4, 'xgb_submission.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.562187
1,2015-01-27 13:08:24.0000003,11.412982
2,2011-10-08 11:53:44.0000002,4.600595
3,2012-12-01 21:12:12.0000002,8.815906
4,2012-12-01 21:12:12.0000003,15.785061
...,...,...
9909,2015-05-10 12:37:51.0000002,8.623473
9910,2015-01-12 17:05:51.0000001,11.932009
9911,2015-04-19 20:44:15.0000001,55.309121
9912,2015-01-31 01:05:19.0000005,20.149030


In [351]:
# gpu_df = dask_cudf.read_csv(data_dir + '/train.csv')