In [None]:
%matplotlib inline
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile

# Both archives are read with the same options: zip-compressed CSV, indexed by "id".
csv_options = dict(compression="zip", index_col="id")
train = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/train.zip", **csv_options)
test = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/test.zip", **csv_options)

train.head()

In [None]:
# Keep only the rows whose `store_and_fwd_flag` equals "Y".
# `.copy()` makes the result an independent frame: the following cells add and
# overwrite columns on `train`, which pandas would otherwise flag with a
# SettingWithCopyWarning because `train` would be a slice of the loaded data.
train = train.loc[train["store_and_fwd_flag"] == "Y", :].copy()
train.shape

Let's include some sanity checks for the data.

In [None]:
# Parse the pickup timestamps of both frames and derive the calendar date of each pickup.
for frame in (train, test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
    frame.loc[:, 'pickup_date'] = frame['pickup_datetime'].dt.date

# Only `train` has its `dropoff_datetime` column converted here.
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

In [None]:
EARTH_RADIUS = 6378.137  ## km


def haversine(xy1, xy2):
    """Great-circle (haversine) distance between paired points.

    Parameters
    ----------
    xy1, xy2 : array-like of shape (n, 2)
        Point coordinates in radians, one row per point, ordered
        ``(longitude, latitude)`` -- the order in which this notebook
        passes them.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance between row ``i`` of ``xy1`` and row ``i`` of ``xy2``,
        in the units of ``EARTH_RADIUS`` (kilometres).
    """
    xy1 = np.asarray(xy1, dtype=float)
    xy2 = np.asarray(xy2, dtype=float)
    lon1, lat1 = xy1[:, 0], xy1[:, 1]
    lon2, lat2 = xy2[:, 0], xy2[:, 1]
    # a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Rounding can push `a` marginally outside [0, 1], which would turn the
    # square root / arcsin into NaN; clip before inverting.
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
# Straight-line (haversine) distance from pickup to drop-off for both frames.
# Coordinates are converted to radians and passed as (longitude, latitude) pairs.
pickup_columns = ["pickup_longitude", "pickup_latitude"]
dropoff_columns = ["dropoff_longitude", "dropoff_latitude"]
for frame in (train, test):
    pickup_rad = np.radians(frame[pickup_columns].values)
    dropoff_rad = np.radians(frame[dropoff_columns].values)
    frame["distance"] = haversine(pickup_rad, dropoff_rad)

# Histogram of log-distance; the small offset keeps log() finite for zero distances.
pyplot.hist(np.log(train["distance"]+1e-5), bins=50)

In [None]:
def add_pickup_time_features(frame, origin):
    """Add pickup-time features to ``frame`` in place.

    Columns written: ``pickup_weekday``, ``pickup_weekofyear``, ``pickup_hour``,
    ``pickup_minute``, ``pickup_dt`` (seconds elapsed since ``origin``) and
    ``pickup_week_hour`` (``weekday * 24 + hour``).
    """
    pickup = frame['pickup_datetime']
    frame.loc[:, 'pickup_weekday'] = pickup.dt.weekday
    frame.loc[:, 'pickup_weekofyear'] = pickup.dt.isocalendar().week
    frame.loc[:, 'pickup_hour'] = pickup.dt.hour
    frame.loc[:, 'pickup_minute'] = pickup.dt.minute
    frame.loc[:, 'pickup_dt'] = (pickup - origin).dt.total_seconds()
    frame.loc[:, 'pickup_week_hour'] = frame['pickup_weekday'] * 24 + frame['pickup_hour']


# Both frames measure `pickup_dt` from the earliest *training* pickup so the
# values are on a common time axis.
earliest_train_pickup = train['pickup_datetime'].min()
add_pickup_time_features(train, earliest_train_pickup)
add_pickup_time_features(test, earliest_train_pickup)

Since we have (in the training set) `distance` and `trip_duration`, we can simply calculate an `avg_speed` variable. Here, we are computing it in units of m/s (...since `distance` is in kilometers, and `trip_duration` is in seconds).

In [None]:
# Average speed in m/s: kilometres -> metres, divided by the trip duration in seconds.
train['avg_speed'] = (1000 * train['distance']).div(train['trip_duration'])

One idea for an approach would be to compute `predicted_trip_duration = distance / predicted_speed` on both the training and the test set, and learn to predict the residual `residual = trip_duration - predicted_trip_duration`. That way, we can predict `predicted_trip_duration + predicted_residual` as our final submission prediction.

To do this, we need to predict the speed, and do so in the same way both for test and training set.

In [None]:
from sklearn import neighbors

# Regress the observed average speed on the pickup time offset using the
# two nearest (in time) training trips.
speed_features = train[['pickup_dt']]
speed_target = train['avg_speed']
speed_model = neighbors.KNeighborsRegressor(n_neighbors=2)

speed_model.fit(speed_features, speed_target)

In [None]:
# Predict a speed for every trip in both frames from its pickup time offset.
for frame in (train, test):
    frame['pred_speed'] = speed_model.predict(frame[['pickup_dt']])

In [None]:
# duration [s] = distance [m] / speed [m/s].  `distance` is stored in km, hence
# the factor 1000 (the same conversion used when `avg_speed` is computed).
# Multiplying speed by distance does not yield a time.
# Where the predicted speed is not positive there is no usable estimate; fall
# back to 0 s so that the value (and the residual built from it) stays finite.
for frame in (train, test):
    predicted_speed = frame['pred_speed']
    frame['pred_duration'] = (
        1000 * frame['distance'] / predicted_speed
    ).where(predicted_speed > 0, 0.0)

In [None]:
# Residual = what the speed-based duration estimate fails to explain.
train['residuals'] = train['trip_duration'].sub(train['pred_duration'])

Let's just check for high correlations with our target quantities in our dataset...

In [None]:
# `train` also holds string, date and datetime columns. Recent pandas versions
# raise when DataFrame.corr() meets non-numeric data, so restrict to numeric
# columns explicitly before correlating.
numeric_corr = train.select_dtypes(include="number").corr()
numeric_corr[['trip_duration','pred_duration','residuals']].sort_values("trip_duration",key=abs,ascending=False)

There are some clear outliers in the dataset.
Let's get rid of taxicab trips that are longer than 3 hours, as well as trips shorter than 1 minute.

In [None]:
print(f"Shape before dropping outliers: {train.shape}")
# Trips longer than 3 hours or shorter than 60 seconds are treated as outliers.
duration = train["trip_duration"]
outlier_ids = train.index[(duration > 3*60*60) | (duration < 60)]
train.drop(outlier_ids, inplace=True)
print(f"Shape after dropping outliers: {train.shape}")

# External data

Let's merge in extra data sets. Two of the award-winning datasets generated for this competition are `knycmetars2016`, with extensive hour-by-hour weather data, and `new-york-city-taxi-with-osrm`, with the fastest and second-fastest routes, including both distances and step-by-step route descriptions.

To merge them, first do "+ Add data" in the sidebar, and then read in the CSV files using the Copy File Path link in the sidebar. For the weather data, we would have to interpolate - so we train a battery of simplistic kNN models to fill in the relevant data. For the trip data, the CSV file shares the same ID column as our original data, so we can use these row IDs to merge the data together. Pandas provides `df.merge`, which allows us to use the index column as the merge key if we so choose.

In [None]:
def _read_routes(csv_path):
    """Read one fastest-routes CSV, using its "id" column as the index."""
    return pd.read_csv(csv_path, index_col="id")


distance_matrix_data_train_1 = _read_routes("../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_1.csv")
distance_matrix_data_train_2 = _read_routes("../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_2.csv")
distance_matrix_data_test = _read_routes("../input/new-york-city-taxi-with-osrm/fastest_routes_test.csv")


In [None]:
# Stack the two training route files row-wise into a single frame.
train_route_parts = [distance_matrix_data_train_1, distance_matrix_data_train_2]
distance_matrix_data_train = pd.concat(train_route_parts, axis=0)
distance_matrix_data_train.head()

In [None]:
%%time
# Left-join the route data onto the training trips by their shared index,
# keeping every training row and its original order.
train = train.merge(
    distance_matrix_data_train,
    how="left",
    left_index=True,
    right_index=True,
    sort=False,
)
train.head()

In [None]:
# Same index-based left join for the test trips, then report how many
# missing values the merged frame contains.
test = test.merge(
    distance_matrix_data_test,
    how="left",
    left_index=True,
    right_index=True,
    sort=False,
)
missing_total = test.isna().sum().sum()
print(f"# NA in the result: {missing_total}")
test.head()

In [None]:
knyc_metars = pd.read_csv("../input/knycmetars2016/KNYC_Metars.csv")
knyc_metars["Time"] = pd.to_datetime(knyc_metars["Time"])

# The weather timestamps must sit on the same time axis as `pickup_dt`.
# Re-taking train['pickup_datetime'].min() here is unsafe: rows have been dropped
# from `train` since `pickup_dt` was computed, so the minimum may have moved.
# Recover the origin actually used from any remaining row instead:
#   origin = pickup_datetime - pickup_dt seconds.
pickup_origin = (
    train['pickup_datetime'] - pd.to_timedelta(train['pickup_dt'], unit="s")
).iloc[0]
knyc_metars["dt"] = (knyc_metars["Time"] - pickup_origin).dt.total_seconds()
knyc_metars.head()

In [None]:
# Remove every weather column that contains at least one missing value,
# then show the remaining per-column NA counts together with the new shape.
knyc_metars.dropna(axis="columns", inplace=True)
(knyc_metars.isna().sum(), knyc_metars.shape)

In [None]:
categorical_weathers = ["Wind Dir", "Events", "Conditions"]
numeric_weathers = ["Temp.", "Humidity", "Dew Point", "Wind Speed", "Gust Speed", "Precip"]

# Show which weather columns are in neither group (in their original order).
grouped_weathers = set(categorical_weathers) | set(numeric_weathers)
[column for column in knyc_metars.columns if column not in grouped_weathers]

In [None]:
%%time
from sklearn import neighbors, multioutput

# One 2-nearest-neighbour model per weather column, wrapped so that all
# categorical (resp. numeric) columns are predicted in a single call.
categorical_weather_model = multioutput.MultiOutputClassifier(
    neighbors.KNeighborsClassifier(n_neighbors=2), n_jobs=-1)
numeric_weather_model = multioutput.MultiOutputRegressor(
    neighbors.KNeighborsRegressor(n_neighbors=2), n_jobs=-1)

# The models are fitted on the weather column "dt" but queried with the trip
# column "pickup_dt". Estimators fitted on a DataFrame remember its column names
# and recent scikit-learn versions reject a differently named frame at predict
# time, so plain arrays are passed on both sides.
weather_times = knyc_metars[["dt"]].to_numpy()
train_times = train[["pickup_dt"]].to_numpy()
test_times = test[["pickup_dt"]].to_numpy()

categorical_weather_model.fit(weather_times, knyc_metars[categorical_weathers])
train[categorical_weathers] = categorical_weather_model.predict(train_times)
test[categorical_weathers] = categorical_weather_model.predict(test_times)

numeric_weather_model.fit(weather_times, knyc_metars[numeric_weathers])
train[numeric_weathers] = numeric_weather_model.predict(train_times)
test[numeric_weathers] = numeric_weather_model.predict(test_times)

We now have better distance estimates, so we should recalculate and repredict our average speed, and thus also our predicted durations and residuals.

In [None]:
# m/s  (assumes OSRM `total_distance` is in metres -- TODO confirm against the dataset description)
train['osrm_speed'] = train["total_distance"] / train['trip_duration']

# NOTE: this refits `speed_model` in place; from here on it predicts the
# route-based speed, no longer the haversine-based one.
speed_model.fit(train[['pickup_dt']], train['osrm_speed'])
train['pred_osrm_speed'] = speed_model.predict(train[['pickup_dt']])
test['pred_osrm_speed'] = speed_model.predict(test[['pickup_dt']])

# duration [s] = route distance / predicted route speed. Use the speed that was
# just predicted from the route distances (`pred_osrm_speed`), and divide
# rather than multiply. Non-positive predicted speeds give no usable estimate;
# fall back to 0 s so the values stay finite.
for frame in (train, test):
    predicted_speed = frame['pred_osrm_speed']
    frame['osrm_duration'] = (
        frame["total_distance"] / predicted_speed
    ).where(predicted_speed > 0, 0.0)

# Residual relative to the route-based estimate (not the haversine-based one).
train['osrm_residuals'] = train['trip_duration'] - train['osrm_duration']

In [None]:
# Columns that must not be fed to the model: identifiers, raw timestamps,
# targets and quantities derived from them, route descriptions, and most of
# the weather columns ('Precip' is deliberately left usable).
do_not_use_for_training = [
    'id', 'pickup_datetime', 'dropoff_datetime',
    'trip_duration', 'check_trip_duration',
    'pickup_date', 'avg_speed', "osrm_speed",
    'pred_speed', 'pred_duration',
    "pred_osrm_speed", "osrm_duration",
    'residuals', "osrm_residuals",
    'store_and_fwd_flag',
    'starting_street', 'end_street',
    'total_distance', 'number_of_steps',
    'street_for_each_step', 'distance_per_step',
    'travel_time_per_step', 'step_maneuvers',
    'step_direction', 'step_location_list',
    'Wind Dir', 'Events', 'Conditions',
    'total_travel_time', 'Temp.', 'Humidity',
    'Dew Point', 'Wind Speed', 'Gust Speed',
    #'Precip'
]
excluded_columns = set(do_not_use_for_training)
feature_names = [column for column in train.columns if column not in excluded_columns]

# Design matrices for both frames; the training target is the haversine-based residual.
X = train[feature_names]
y = train["residuals"]
Xtest = test[feature_names]

print(f"Total feature set: {list(train.columns)}")
print(f"Shape of training data: X {X.shape} y {y.shape}")
print(f"Shape of test features: X {Xtest.shape}")
print(f"Features used: {feature_names}")

In [None]:
from sklearn import linear_model, model_selection, metrics, pipeline
from sklearn import preprocessing, svm, compose, feature_selection
from sklearn import ensemble, decomposition
import xgboost

def rmsle_score(yt, yp):
    """Root mean squared logarithmic error between true values ``yt`` and predictions ``yp``."""
    return np.sqrt(metrics.mean_squared_log_error(yt, yp))

# Scorer wrapper around rmsle_score; greater_is_better=False marks it as a loss.
rmsle = metrics.make_scorer(rmsle_score,greater_is_better=False)

# Search space, keyed by "<pipeline step>__<parameter>".
params = {
    "pca__n_components": [3,5,7,10,15],
    "features__score_func": [feature_selection.f_regression],
    "features__k": [3,5,7,10,'all'],
    "lm__alpha": np.logspace(-3,3,7),
}
# Pipeline: PCA -> univariate feature selection -> standardisation -> ridge regression.
# random_state pins which parameter combinations the randomized search samples,
# so the chosen model is reproducible across "Restart & Run All".
# NOTE(review): scoring=rmsle is left disabled -- presumably because the
# residual target can be negative, which the log-error metric does not accept.
model = model_selection.RandomizedSearchCV(
        pipeline.Pipeline([
          ("pca", decomposition.PCA()),
          ("features",feature_selection.SelectKBest()),
          ("scaler", preprocessing.StandardScaler()),
          ("lm", linear_model.Ridge())
        ]),
    params, #scoring=rmsle, 
    cv=5, n_jobs=-1,
    random_state=42)
model.get_params()

In [None]:
%%time
# Run the randomized hyper-parameter search on the training features X
# against the residual target y.
model.fit(X, y)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
search_summary = (
    f"Model score: {model.best_score_}",
    f"Model chosen parameters: {model.best_params_}",
)
print(*search_summary, sep="\n")

In [None]:
# In-sample duration estimate: speed-based duration plus the predicted residual,
# with negative results replaced by 0 before scoring.
raw_duration = train['pred_duration'] + model.predict(X)
dur_val = raw_duration.mask(raw_duration < 0, 0)
in_sample_rmsle = rmsle_score(train['trip_duration'], dur_val)
print(f"Model RMSLE score: {in_sample_rmsle}")

In [None]:
import yellowbrick as yb
import yellowbrick.regressor as ybr
import yellowbrick.features as ybf

# Seed the shuffle so the diagnostic plots built from this split are
# reproducible from one run to the next.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, random_state=42)
X_train.shape, X_val.shape

In [None]:
# Residuals plot
# NOTE(review): `model` was fitted on all of X before X was split (hence
# is_fitted=True), so X_val is not unseen data for this estimator.
ybr.residuals_plot(model.best_estimator_, X_train, y_train, X_val, y_val, is_fitted=True)

In [None]:
# Prediction-error plot for the already fitted best estimator (is_fitted=True
# tells yellowbrick not to refit it).
ybr.prediction_error(model.best_estimator_, X_train, y_train, X_val, y_val, is_fitted=True)

In [None]:
import seaborn

# Draw one seeded 10% sample up front so every panel shows the same trips and
# the figures are reproducible.
plot_sample = train.sample(frac=0.1, random_state=42)

# Only numeric columns can be binned against trip_duration; the target itself
# is skipped, as are the free-text / timestamp columns.
numeric_features = [
    column
    for column in plot_sample.select_dtypes(include="number").columns
    if column != "trip_duration"
]

for f in numeric_features:
    # displot is a figure-level function and creates its own figure, so no
    # pyplot.figure() call is made here (it would only emit a blank figure).
    seaborn.displot(data=plot_sample, x=f, y="trip_duration")
    pyplot.title(f"Plot of {f} against trip_duration")
    pyplot.show()
    

In [None]:
%%time
# Let's use the base_model to predict both kinds of residuals, then
# compute and feed both sets of predicted durations into a meta-model
from sklearn import base

# Extra-trees draws random splits; seed it so the submission is reproducible.
meta_model = ensemble.ExtraTreesRegressor(random_state=42)

# Two unfitted copies of the best pipeline found by the search, one per residual target.
avg_model = base.clone(model.best_estimator_)
osrm_model = base.clone(model.best_estimator_)

avg_model.fit(X, train["residuals"])
osrm_model.fit(X, train["osrm_residuals"])

pred_avg_residuals = avg_model.predict(X)
pred_osrm_residuals = osrm_model.predict(X)

# Meta-features: both duration estimates plus both predicted residuals.
meta_predictors = train[["pred_duration", "osrm_duration"]].assign(
    pred_avg_residuals=pred_avg_residuals,pred_osrm_residuals=pred_osrm_residuals)

meta_model.fit(meta_predictors, train["trip_duration"])

In [None]:
%%time
# Predict both residuals for the test trips and assemble the meta-model inputs
# with the same column names that were used for training.
test_avg_residuals = avg_model.predict(Xtest)
test_osrm_residuals = osrm_model.predict(Xtest)
test_meta = test[["pred_duration", "osrm_duration"]].assign(
    pred_avg_residuals=test_avg_residuals,
    pred_osrm_residuals=test_osrm_residuals,
)

# Final durations, with negative predictions replaced by 0, written out
# indexed by the test frame's index.
predicted_duration = meta_model.predict(test_meta)
submission = test[[]].assign(
    trip_duration=np.where(predicted_duration < 0, 0, predicted_duration))
submission.to_csv("submission.csv")