In [None]:
%matplotlib inline
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile

# Both files are read with the same options: zip-compressed CSV, indexed by
# the `id` column.
zip_csv_options = {"compression": "zip", "index_col": "id"}

train = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/train.zip", **zip_csv_options)
test = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/test.zip", **zip_csv_options)

train.head()

Turns out that `store_and_fwd_flag == "Y"` may be a MUCH cleaner and more authoritative subset of the data, with the added benefit that it is MUCH smaller.

In [None]:
# Keep only the rows flagged "Y" in `store_and_fwd_flag`.
# `.copy()` makes the subset an independent frame: later cells add many columns
# to `train`, and without the copy those writes target a slice of the original
# frame, which pandas reports as SettingWithCopyWarning.
train = train.loc[train["store_and_fwd_flag"] == "Y", :].copy()
train.shape

Let's include some sanity checks for the data.

In [None]:
# Parse the pickup timestamp and derive a calendar-date column on both frames.
for frame in (train, test):
    frame['pickup_datetime'] = pd.to_datetime(frame.pickup_datetime)
    frame.loc[:, 'pickup_date'] = frame['pickup_datetime'].dt.date

# The drop-off timestamp is only parsed for the training frame.
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)

In [None]:
EARTH_RADIUS = 6378.137  ## km


def haversine(xy1, xy2):
    """Great-circle distance in km between paired points on a sphere.

    Parameters
    ----------
    xy1, xy2 : array-like of shape (n, 2)
        Point coordinates **in radians**, one point per row, with columns
        ordered ``[longitude, latitude]`` (x, y).

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance between ``xy1[i]`` and ``xy2[i]`` in kilometres, using
        ``EARTH_RADIUS`` as the sphere radius.
    """
    lon1, lat1 = xy1[:, 0], xy1[:, 1]
    lon2, lat2 = xy2[:, 0], xy2[:, 1]
    # Haversine formula:
    #   a = sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlon / 2)
    # Both sine terms are squared, and the longitude difference is taken
    # between the two points (not a point with itself).
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))
# Add a `distance` column to both frames: pickup and drop-off coordinates are
# converted from degrees to radians and passed to `haversine` as
# [longitude, latitude] pairs.
for frame in (train, test):
    pickup_rad = np.radians(frame[["pickup_longitude", "pickup_latitude"]].values)
    dropoff_rad = np.radians(frame[["dropoff_longitude", "dropoff_latitude"]].values)
    frame["distance"] = haversine(pickup_rad, dropoff_rad)

# Histogram of log-distance; the small offset keeps log() finite for zero distances.
log_distance = np.log(train["distance"] + 1e-5)
pyplot.hist(log_distance, bins=50)

In [None]:
# Earliest pickup in the training frame; `pickup_dt` in BOTH frames is measured
# in seconds from this same instant.
first_train_pickup = train['pickup_datetime'].min()

# Derive the same calendar/time features on the training and test frames.
for frame in (train, test):
    stamps = frame['pickup_datetime']
    frame.loc[:, 'pickup_weekday'] = stamps.dt.weekday
    frame.loc[:, 'pickup_weekofyear'] = stamps.dt.isocalendar().week
    frame.loc[:, 'pickup_hour'] = stamps.dt.hour
    frame.loc[:, 'pickup_minute'] = stamps.dt.minute
    frame.loc[:, 'pickup_dt'] = (stamps - first_train_pickup).dt.total_seconds()
    # Hour-of-week index: weekday * 24 + hour.
    frame.loc[:, 'pickup_week_hour'] = frame['pickup_weekday'] * 24 + frame['pickup_hour']

Since we have (in the training set) `distance` and `trip_duration`, we can simply calculate an `avg_speed` variable. Here, we are computing it in units of m/s (...since `distance` is in kilometers, and `trip_duration` is in seconds).

In [None]:
# Average speed per trip: `distance` scaled by 1000, divided by `trip_duration`.
distance_scaled = 1000 * train['distance']
train.loc[:, 'avg_speed'] = distance_scaled / train['trip_duration']

One idea for an approach would be to compute `predicted_trip_duration = distance / predicted_speed` (with consistent units) on the test set, and learn to predict the residual `residual = trip_duration - predicted_trip_duration`. That way, we can predict `predicted_trip_duration + predicted_residual` as our final submission prediction.

To do this, we need to predict the speed, and do so in the same way both for test and training set.

In [None]:
from sklearn import neighbors
# Nearest-neighbour regression (2 neighbours) of average speed on the single
# feature `pickup_dt`.
speed_features = train[['pickup_dt']]
speed_target = train['avg_speed']

speed_model = neighbors.KNeighborsRegressor(n_neighbors=2)
speed_model.fit(speed_features, speed_target)

In [None]:
# Score both frames with the fitted speed model, using the same single feature.
for frame in (train, test):
    frame['pred_speed'] = speed_model.predict(frame[['pickup_dt']])

In [None]:
# Predicted duration [s] = distance [m] / predicted speed [m/s].
# `distance` is stored in km, hence the factor 1000; this is the inverse of
# how `avg_speed` was derived (1000 * distance / trip_duration).
for frame in (train, test):
    # A non-positive predicted speed has no meaningful travel time. Mask it so
    # the division yields NaN rather than inf, then fall back to a 0 s baseline
    # (the residual model then has to account for the whole duration).
    usable_speed = frame['pred_speed'].where(frame['pred_speed'] > 0)
    frame['pred_duration'] = (1000 * frame['distance'] / usable_speed).fillna(0)

In [None]:
# Residual = observed trip duration minus the speed-based duration estimate.
observed_duration = train['trip_duration']
train['residuals'] = observed_duration - train['pred_duration']

There are some clear outliers in the dataset.
Let's get rid of taxicab trips that are longer than 20 hours, as well as trips shorter than 1 minute.

In [None]:
print(f"Shape before dropping outliers: {train.shape}")

# Flag trips longer than 20*60*60 seconds or shorter than 60 seconds, then
# remove all flagged rows from `train` in place.
is_too_long = train["trip_duration"] > 20*60*60
is_too_short = train["trip_duration"] < 60
train.drop(train.index[is_too_long | is_too_short], inplace=True)

print(f"Shape after dropping outliers: {train.shape}")

In [None]:
# Columns that must not be fed to the model (targets, raw timestamps, derived
# prediction columns, ...). Names that are absent from `train` are simply ignored.
do_not_use_for_training = [
    'id',
    'pickup_datetime',
    'dropoff_datetime',
    'trip_duration',
    'check_trip_duration',
    'pickup_date',
    'avg_speed',
    'pickup_lat_bin',
    'pickup_long_bin',
    'center_lat_bin',
    'center_long_bin',
    'pickup_dt_bin',
    'pickup_datetime_group',
    'store_and_fwd_flag',
    'residuals',
    'pred_speed',
    'pred_duration',
]

# Every remaining training column is a feature, in its original column order.
feature_names = [
    column for column in train.columns
    if column not in do_not_use_for_training
]

# The regression target is the residual, not the raw trip duration.
X = train[feature_names]
y = train["residuals"]
Xtest = test[feature_names]

print(f"Shape of training data: X {X.shape} y {y.shape}")
print(f"Shape of test features: X {Xtest.shape}")
print(f"Features used: {feature_names}")

In [None]:
from sklearn import linear_model, model_selection, metrics, pipeline
from sklearn import preprocessing, svm, compose, feature_selection
import xgboost

def rmsle_score(yt, yp):
    """Return the root of sklearn's mean squared log error for `yt` vs `yp`.

    `yt` are the true values and `yp` the predictions, passed in that order
    to `metrics.mean_squared_log_error`.
    """
    msle = metrics.mean_squared_log_error(yt, yp)
    return np.sqrt(msle)
                   
# Scorer wrapper around `rmsle_score`, declared as lower-is-better.
# It is not passed to the search below (see the `scoring` note).
rmsle = metrics.make_scorer(rmsle_score, greater_is_better=False)

# Search space, keyed by `<pipeline step>__<parameter>`.
params = {
    "poly__degree": [2],
    "features__score_func": [feature_selection.f_regression],
    "features__k": [3, 5, 7, 10, 13],
    # Alternative for a Lasso final step: "lm__alpha": np.logspace(-3,3,7)
    "xgb__tree_method": ["gpu_hist"],
}

# Feature selection -> standardisation -> polynomial expansion -> XGBoost.
# (A Lasso final step, ("lm", linear_model.Lasso()), was the alternative tried.)
pipeline_steps = [
    ("features", feature_selection.SelectKBest()),
    ("scaler", preprocessing.StandardScaler()),
    ("poly", preprocessing.PolynomialFeatures()),
    ("xgb", xgboost.XGBRegressor()),
]

# Randomised search with 5-fold CV on all cores. `scoring=rmsle` is left
# disabled, so the search uses the estimator's default score.
model = model_selection.RandomizedSearchCV(
    estimator=pipeline.Pipeline(pipeline_steps),
    param_distributions=params,
    cv=5,
    n_jobs=-1,
)
model.get_params()

In [None]:
%%time
# Run the hyper-parameter search: fits the pipeline on the training features
# `X` against the residual target `y`.
model.fit(X, y)

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
print(
    f"Model score: {model.best_score_}",
    f"Model chosen parameters: {model.best_params_}",
    sep="\n",
)

In [None]:
# Rebuild the duration prediction on the training rows (speed-based estimate
# plus predicted residual) and floor it at zero before scoring.
y_val = (train['pred_duration'] + model.predict(X)).clip(lower=0)
train_rmsle = rmsle_score(train['trip_duration'], y_val)
print(f"Model RMSLE score: {train_rmsle}")

In [None]:
%%time
# Final prediction = speed-based duration estimate + predicted residual,
# floored at zero. `test[[]]` keeps only the index of `test`.
predicted_duration = test['pred_duration'] + model.predict(Xtest)
submission = test[[]].assign(trip_duration=predicted_duration.clip(lower=0))
submission.to_csv("submission.csv")