In [1]:
import numpy as np 
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os

In [2]:
import gc
gc.collect()

20

In [3]:
yellow_taxi = pd.read_csv('../data/yellow_taxi_2019.csv')

In [4]:
yellow_taxi.shape

(44327014, 18)

In [5]:
yellow_taxi.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
RatecodeID                 int64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
congestion_surcharge     float64
dtype: object

In [6]:
#Identify null values
print(yellow_taxi.isnull().sum())

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count                0
trip_distance                  0
RatecodeID                     0
store_and_fwd_flag             0
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     4855978
dtype: int64


In [7]:
yellow_taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.0,1,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.0,2,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [8]:
def clean_df(df):
    return df[(df.fare_amount > 0) & (df.trip_distance > 0) & (df.fare_amount <= 50) &
                  (df.passenger_count > 0) & (df.passenger_count < 10)]

In [9]:
y_sub_2 = yellow_taxi[['tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','fare_amount']]
y_sub_2.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount
0,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,7.0
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,14.0
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.0,4.5
3,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.0,3.5
4,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.0,52.0


In [10]:
y_clean = clean_df(y_sub_2)
print(len(y_clean))

41756269


In [11]:
y_clean.dtypes

tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
fare_amount              float64
dtype: object

In [12]:
#Convert to datetime format
y_clean['tpep_dropoff_datetime'] = pd.to_datetime(y_clean['tpep_dropoff_datetime'], infer_datetime_format=True)
y_clean['tpep_pickup_datetime'] = pd.to_datetime(y_clean['tpep_pickup_datetime'], infer_datetime_format=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
y_clean.dtypes 

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
dtype: object

In [14]:
y_clean.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount
0,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,7.0
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,14.0
7,2019-01-01 00:21:28,2019-01-01 00:28:37,1,1.3,6.5
8,2019-01-01 00:32:01,2019-01-01 00:45:39,1,3.7,13.5
9,2019-01-01 00:57:32,2019-01-01 01:09:32,2,2.1,10.0


In [15]:
y_clean['ride_duration'] = y_clean['tpep_dropoff_datetime'] - y_clean['tpep_pickup_datetime']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
y_clean.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,ride_duration
0,1,1.5,7.0,00:06:40
1,1,2.6,14.0,00:19:12
7,1,1.3,6.5,00:07:09
8,1,3.7,13.5,00:13:38
9,2,2.1,10.0,00:12:00


In [17]:
y_clean.drop(columns=['tpep_pickup_datetime','tpep_dropoff_datetime'], inplace=True)
y_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,passenger_count,trip_distance,fare_amount,ride_duration
0,1,1.5,7.0,00:06:40
1,1,2.6,14.0,00:19:12
7,1,1.3,6.5,00:07:09
8,1,3.7,13.5,00:13:38
9,2,2.1,10.0,00:12:00


In [19]:
y_clean.dtypes

passenger_count              int64
trip_distance              float64
fare_amount                float64
ride_duration      timedelta64[ns]
dtype: object

In [28]:
y_clean['ride_duration'] = y_clean['ride_duration'] / np.timedelta64(1, 's')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
y_clean.dtypes

passenger_count      int64
trip_distance      float64
fare_amount        float64
ride_duration      float64
ride_duration_2    float64
dtype: object

In [32]:
y_clean.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,ride_duration
0,1,1.5,7.0,400.0
1,1,2.6,14.0,1152.0
7,1,1.3,6.5,429.0
8,1,3.7,13.5,818.0
9,2,2.1,10.0,720.0


In [33]:
y_clean.describe()

Unnamed: 0,passenger_count,trip_distance,fare_amount,ride_duration
count,41756270.0,41756270.0,41756270.0,41756270.0
mean,1.599404,2.4772,11.43362,996.5398
std,1.217323,2.596429,7.674,4362.563
min,1.0,0.01,0.01,-5056830.0
25%,1.0,0.97,6.5,391.0
50%,1.0,1.6,9.0,645.0
75%,2.0,2.8,13.5,1032.0
max,9.0,831.8,50.0,2618881.0


In [34]:
y = y_clean['fare_amount']
train = y_clean.drop(columns=['fare_amount'])

x_train,x_test,y_train,y_test = train_test_split(train,y,random_state=0,test_size=0.01)

In [35]:
#Cross-validation
params = {
    # Parameters that we are going to tune.
    'max_depth': 8, #Result of tuning with CV
    'eta':.03, #Result of tuning with CV
    'subsample': 1, #Result of tuning with CV
    'colsample_bytree': 0.8, #Result of tuning with CV
    # Other parameters
    'objective':'reg:linear',
    'eval_metric':'rmse',
    'silent': 1
}

#Block of code used for hypertuning parameters. Adapt to each round of parameter tuning.
#Turn off CV in submission
CV=False
if CV:
    dtrain = xgb.DMatrix(train,label=y)
    gridsearch_params = [
        (eta)
        for eta in np.arange(.04, 0.12, .02)
    ]

    # Define initial best params and RMSE
    min_rmse = float("Inf")
    best_params = None
    for (eta) in gridsearch_params:
        print("CV with eta={} ".format(
                                 eta))

        # Update our parameters
        params['eta'] = eta

        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=1000,
            nfold=3,
            metrics={'rmse'},
            early_stopping_rounds=10
        )

        # Update best RMSE
        mean_rmse = cv_results['test-rmse-mean'].min()
        boost_rounds = cv_results['test-rmse-mean'].argmin()
        print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
        if mean_rmse < min_rmse:
            min_rmse = mean_rmse
            best_params = (eta)

    print("Best params: {}, RMSE: {}".format(best_params, min_rmse))
else:
    #Print final params to use for the model
    params['silent'] = 0 #Turn on output
    print(params)

{'max_depth': 8, 'eta': 0.03, 'subsample': 1, 'colsample_bytree': 0.8, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': 0}


In [36]:
def XGBmodel(x_train,x_test,y_train,y_test,params):
    matrix_train = xgb.DMatrix(x_train,label=y_train)
    matrix_test = xgb.DMatrix(x_test,label=y_test)
    model=xgb.train(params=params,
                    dtrain=matrix_train,num_boost_round=5000, 
                    early_stopping_rounds=10,evals=[(matrix_test,'test')])
    return model

In [37]:
model = XGBmodel(x_train,x_test,y_train,y_test,params)

[0]	test-rmse:12.9841
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:12.5967


KeyboardInterrupt: 