In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import os

# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the three competition files directly from their zip archives:
# the training set, the test set and the sample submission.
df_train = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/train.zip", compression="zip")
df_test = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/test.zip", compression="zip")
df_sub = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip", compression="zip")

# General Information about the dataset

__First of all, it is important to get to know the data itself a little better. It would be nice to know:__
  * how many datasets are relevant
  * which columns are present in the data sets
  * which datatypes are present 
  * how many rows are in the datasets
  * how heavy the datasets are in terms of memory usage

In [None]:
# Print a structural summary (columns, dtypes, row count, memory usage)
# for each of the three dataframes.
for name, ds in zip(["df_train","df_test","df_sub"],[df_train, df_test, df_sub]):

    print("---------------")
    print("{}\n".format(name))
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    ds.info()
    print("\n")

* the main goal of the competition is to predict the trip duration of a taxi ride based on several attributes describing the ride
* three datasets are provided: train set, test set and submission file
* train set/test set
    * time features: pickup and dropoff datetime
    * geographical features: pickup and drop off longitude/latitude
    * others: store-and fwd Flag, passenger count
* submission file:
    * structure of the submission 

#### Another important question when getting to know the data is missingness: are many values missing, or is the dataset complete?

In [None]:
# Report the number of missing values per column for each dataframe.
datasets = {"df_train": df_train, "df_test": df_test, "df_sub": df_sub}

for label, frame in datasets.items():
    print("---------------")
    print("{}\n".format(label))
    print(frame.isnull().sum())
    print("\n")

* no missing values in the data set

For closing this first description of the datasets, it's maybe also interesting to have a look on the raw data

In [None]:
# Show the first and the last five rows of the training set in one table.
pd.concat((df_train.head(), df_train.tail()), axis="index")

In [None]:
# Show the first and the last five rows of the test set in one table.
pd.concat((df_test.head(), df_test.tail()), axis="index")

## Data exploration

In [None]:
# Split the training columns into non-numeric and numeric ones and list both.
train_cat_cols = df_train.select_dtypes(exclude="number").columns
train_num_cols = df_train.select_dtypes(include="number").columns

print("cat features train set: {}".format(train_cat_cols))
print("\n")
print("numeric features train set: {}".format(train_num_cols))

In [None]:
# Split the test columns into non-numeric and numeric ones and list both.
test_cat_cols = df_test.select_dtypes(exclude="number").columns
test_num_cols = df_test.select_dtypes(include="number").columns

print("cat features test set: {}".format(test_cat_cols))
print("\n")
print("numeric features test set: {}".format(test_num_cols))

In [None]:
# Convert the timestamp strings into real datetime columns.
# The training set has pickup and dropoff timestamps; only the pickup
# timestamp is converted for the test set.
for ts_col in ("pickup_datetime", "dropoff_datetime"):
    df_train[ts_col] = pd.to_datetime(df_train[ts_col], format="%Y-%m-%d %H:%M:%S")

df_test["pickup_datetime"] = pd.to_datetime(df_test["pickup_datetime"], format="%Y-%m-%d %H:%M:%S")

In [None]:
from tqdm.auto import tqdm

# Registers `progress_apply` on pandas objects for any cell that uses it.
tqdm.pandas()

# Derive the calendar features from the pickup timestamp with the vectorised
# `.dt` accessor instead of calling a Python lambda once per row; the same
# five columns are added to both the training and the test frame.
for frame in (df_train, df_test):
    pickup = frame["pickup_datetime"].dt
    frame["year"] = pickup.year
    frame["month"] = pickup.month
    frame["day"] = pickup.day
    frame["hour"] = pickup.hour
    frame["minute"] = pickup.minute

## Which time frame is considered in the dataset?

In [None]:
# List the distinct pickup years found in each dataset.
train_years = df_train["year"].unique()
test_years = df_test["year"].unique()

print("df_train: {}".format(train_years))
print("df_test: {}".format(test_years))

* the datasets cover only the year 2016

In [None]:
# Ride counts per month / day / hour: top row = training set, bottom row = test set.
fig, sub = plt.subplots(2, 3, figsize=(25, 6))

panels = zip(sub, (df_train, df_test), ("Train set {}", "Test set {}"))

for axes_row, frame, title_template in panels:
    for feat, subplot in zip(["month", "day", "hour"], axes_row):
        # Count once and reuse the result for both axes of the bar chart.
        counts = frame[feat].value_counts()
        sns.barplot(x=counts.index, y=counts.values, ax=subplot, palette="CMRmap")
        subplot.grid()
        subplot.set_title(title_template.format(feat))

fig.tight_layout()

* all records are from the year 2016 and regarding the months the rides took place between January and June 2016
* the rides are approx. equally distributed to the days per months. however at the end of the month there are fewer rides
* regarding the time of the day, most of the clients have been driven between 18 - 23 h and the least has been transported between 0 and 6 h 

### To verify these observations, we'll conduct some hypothesis tests

But first of all, we have to know which test we can use.
To use ANOVA or t-tests, the data needs to be (approximately) normally distributed <br>
--> to check whether this prerequisite is met, we'll have a look at the distribution and the Q-Q plots of the feature "trip_duration"

In [None]:
# Top row: distribution of the trip duration (raw, log1p-transformed, and
# restricted to trips shorter than 2 h). Bottom row: the matching normal
# probability plots.
fig, sub = plt.subplots(2, 3, figsize=(12, 6))

duration = df_train["trip_duration"]
variants = [duration, np.log1p(duration), duration[duration < 60*120]]

for col, series in enumerate(variants):
    sns.distplot(series, hist_kws={"edgecolor":"black"}, ax=sub[0][col])
    stats.probplot(series, dist=stats.norm, plot=sub[1][col])

panel_titles = [
    "duration [raw data]", "duration [log]", "duration [< 2 h]",
    "probPlot duration [raw data]", "probPlot duration [log]", "probPlot duration [< 2 h]",
]

for position, (name, subplot) in enumerate(zip(panel_titles, sub.flatten())):
    subplot.set_title("{}".format(name))
    subplot.grid()

    # Only the top-row panels get a custom x label.
    if position < 3:
        if name == "duration [log]":
            subplot.set_xlabel("trip duration in log(sec)")
        else:
            subplot.set_xlabel("trip duration in sec")

fig.tight_layout()

* the log of duration seems to be the most suitable to use anova and t-test

### Are there many outliers present in the dataset?

In [None]:
def get_outlier(df, column="trip_duration", factor=1.5):
    """Return the index labels of the rows that are outliers by the IQR fence rule.

    A row is an outlier when its value in `column` lies below
    `Q1 - factor * IQR` or above `Q3 + factor * IQR`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column`.
    column : str, default "trip_duration"
        Name of the numeric column to inspect.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    list
        Index *labels* (not positions) of the outlying rows, in frame order.
    """
    values = df[column]

    q1, q3 = np.quantile(values, [0.25, 0.75])
    outlier_step = factor * (q3 - q1)

    lower_fence = q1 - outlier_step
    upper_fence = q3 + outlier_step

    is_outlier = (values < lower_fence) | (values > upper_fence)

    return df.index[is_outlier.to_numpy()].tolist()

In [None]:
# get_outlier() returns index labels, so the rows must be selected with the
# label-based .loc; positional .iloc is only equivalent while the frame still
# has its default 0..n-1 index.
print("tuckey outlier df_train: {}".format(df_train.loc[get_outlier(df_train)].shape))

In [None]:
print("min duration: {} sec ".format(df_train["trip_duration"].min()))
# The number of decimals belongs inside np.round(); previously the closing
# parenthesis was misplaced, so the `2` was handed to str.format() (and
# ignored) and the value was rounded to whole hours.
print("max duration: {} hour ".format(np.round(df_train["trip_duration"].max()/(60**2), 2)))

* right-skewed distribution: the main part of the data lies between 0 and 58 min (2092 s)
* some heavy outliers (in total 74.2 k outliers)
* max duration 980 h

## Which distances are travelled?

In [None]:
from math import sin, cos, sqrt, atan2, radians

def get_distance(lon_1, lon_2, lat_1, lat_2):
    """Great-circle distance in km between two points (haversine formula).

    Note the argument order: both longitudes first, then both latitudes.
    All four coordinates are given in degrees.
    """
    # approximate radius of earth in km
    earth_radius_km = 6373.0

    phi_start, phi_end = radians(lat_1), radians(lat_2)
    lam_start, lam_end = radians(lon_1), radians(lon_2)

    delta_lam = lam_end - lam_start
    delta_phi = phi_end - phi_start

    # Haversine of the central angle, split into its latitude and longitude parts.
    lat_term = sin(delta_phi / 2)**2
    lon_term = cos(phi_start) * cos(phi_end) * sin(delta_lam / 2)**2
    haversine = lat_term + lon_term

    central_angle = 2 * atan2(sqrt(haversine), sqrt(1 - haversine))

    return earth_radius_km * central_angle

In [None]:
def _haversine_km(lon_1, lon_2, lat_1, lat_2, radius_km=6373.0):
    """Vectorised haversine distance in km; same formula and argument order as get_distance().

    Accepts array-likes / Series of coordinates in degrees and returns the
    element-wise great-circle distance.
    """
    lon1, lon2, lat1, lat2 = (np.radians(v) for v in (lon_1, lon_2, lat_1, lat_2))
    a = np.sin((lat2 - lat1) / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2)**2
    return radius_km * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))


# Compute the pickup -> dropoff distance for every ride on whole columns at
# once instead of calling a Python function once per row (apply(axis=1)).
for frame in (df_train, df_test):
    frame["distance"] = _haversine_km(
        frame["pickup_longitude"], frame["dropoff_longitude"],
        frame["pickup_latitude"], frame["dropoff_latitude"],
    )

In [None]:
# Top row: distribution of the ride distance (raw, log1p-transformed, and
# restricted to rides shorter than 15 km). Bottom row: the matching normal
# probability plots.
fig,sub = plt.subplots(2,3,figsize=(12,6))

sns.distplot(df_train["distance"], hist_kws={"edgecolor":"black"}, ax=sub[0][0])
sns.distplot(np.log1p(df_train["distance"]), hist_kws={"edgecolor":"black"}, ax=sub[0][1])
sns.distplot(df_train[df_train["distance"]<15]["distance"], hist_kws={"edgecolor":"black"}, ax=sub[0][2])

prob = stats.probplot(df_train["distance"], dist=stats.norm, plot=sub[1][0])
prob = stats.probplot(np.log1p(df_train["distance"]), dist=stats.norm, plot=sub[1][1])
prob = stats.probplot(df_train[df_train["distance"]<15]["distance"], dist=stats.norm, plot=sub[1][2])


counter = 0

# The last panel plots the "< 15 km" subset, so its title must say so
# (it previously carried the "[< 2 h]" label copied from the duration plots).
for name, subplot in zip(["distance [raw data]","distance [log]", "distance [< 15 km]","probPlot distance [raw data]","probPlot distance [log]", "probPlot distance [< 15 km]"], sub.flatten()):
    subplot.set_title("{}".format(name))
    subplot.grid()

    # Only the top-row panels get a custom x label.
    if counter < 3:
        # Compare against the real panel title; the old check for "km [log]"
        # never matched, so the log panel was labelled as plain km.
        if name == "distance [log]":
            subplot.set_xlabel("distance in log(km)")
        else:
            subplot.set_xlabel("distance in km")

    counter += 1

fig.tight_layout()

## Where do the rides take exactly place?

In [None]:
# Peek at the first rows of the training frame as it stands at this point.
df_train.head()

Map where the passengers have been picked up (blue) and dropped off (red)

In [None]:
import folium

# Interactive map with one blue marker per pickup and one red marker per
# dropoff, for a random sample of 1000 training rides.
f = folium.Figure(width=1500, height=500)
mapa = folium.Map(location=(40.7679, -73.9822), zoom_start=11)
mapa.add_to(f)

sampled_rides = df_train.sample(1000)

for _, ride in sampled_rides.iterrows():
    pickup_point = [ride["pickup_latitude"], ride["pickup_longitude"]]
    dropoff_point = [ride["dropoff_latitude"], ride["dropoff_longitude"]]
    folium.Marker(pickup_point, icon=folium.Icon(color="blue")).add_to(mapa)
    folium.Marker(dropoff_point, icon=folium.Icon(color="red")).add_to(mapa)

display(mapa)

* the rides take place in the New York region, with a focus on Manhattan
* some passengers are dropped off in areas outside New York or at the airports

In [None]:
import folium

# Same map as for the random sample, but restricted to rides longer than 20 km:
# blue marker = pickup, red marker = dropoff.
f = folium.Figure(width=1500, height=500)
mapa = folium.Map(location = (40.7679, -73.9822), zoom_start=11).add_to(f)

long_rides = df_train[df_train["distance"]>20]

# DataFrame.sample(n) raises when n exceeds the number of rows, so cap the
# sample size at the number of long rides actually available.
for index, row in long_rides.sample(min(200, len(long_rides))).iterrows():
    folium.Marker([row["pickup_latitude"], row["pickup_longitude"]], icon=folium.Icon(color="blue")).add_to(mapa)
    folium.Marker([row["dropoff_latitude"], row["dropoff_longitude"]], icon=folium.Icon(color="red")).add_to(mapa)


display(mapa)

* many of the rides > 20 km are rides to or from the airports

In [None]:
from tqdm.auto import tqdm

# Kept so that `progress_apply` stays registered for any cell that uses it.
tqdm.pandas()

# Pair latitude and longitude into (lat, lon) tuples per ride. Zipping the two
# columns builds the same tuples without a Python-level apply over every row.
df_train["pickUp_coordinates"] = list(zip(df_train["pickup_latitude"], df_train["pickup_longitude"]))
df_train["dropOff_coordinates"] = list(zip(df_train["dropoff_latitude"], df_train["dropoff_longitude"]))

In [None]:
# Number of rides per passenger count.
fig, sub = plt.subplots(1, 1, figsize=(12, 4))

rides_per_count = df_train["passenger_count"].value_counts()

sns.barplot(x=rides_per_count.index, y=rides_per_count.values, ax=sub, palette="PuBu_r")
sub.grid()
sub.set_xlabel("Passenger per ride");

* most of the rides are people taking a taxi alone

# Multivariate Analysis

In [None]:
# Pairwise correlation of the numeric features; the mask hides the upper
# triangle (including the diagonal) so every pair is drawn only once.
corr_columns = ["trip_duration","month","day","hour","distance"]
correlation = df_train[corr_columns].corr()

mask = np.triu(np.ones_like(correlation, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(correlation, mask=mask, cmap=cmap, linecolor="black", lw=0.09);

* no linear relationship observable between the features

## is the trip duration time-dependent?

In [None]:
# Mean trip duration per month, per day of month and per hour of day.
fig, sub = plt.subplots(1, 3, figsize=(25, 5))

for subplot, time_feat in zip(sub.flatten(), ["month", "day", "hour"]):
    mean_duration = df_train.groupby(time_feat)["trip_duration"].mean()
    sns.barplot(x=mean_duration.index, y=mean_duration.values, ax=subplot, palette="CMRmap")
    subplot.grid(color="lightgrey")

fig.tight_layout()

#### Anova

In [None]:
def _log_duration_groups(column):
    """Return one log1p(trip_duration) series per distinct value of `column` in df_train."""
    return [
        np.log1p(df_train[df_train[column]==level]["trip_duration"])
        for level in df_train[column].unique()
    ]


# One-way ANOVA of the log trip duration across months, days and hours.
month_anova = stats.f_oneway(*_log_duration_groups("month"))
day_anova = stats.f_oneway(*_log_duration_groups("day"))
hour_anova = stats.f_oneway(*_log_duration_groups("hour"))

print("ANOVA month/trip-duration: {}".format(month_anova))
print("ANOVA day/trip-duration: {}".format(day_anova))
print("ANOVA hour/trip-duration: {}".format(hour_anova))

* in all three time categories at least two means are significantly different from each other

In [None]:
from itertools import combinations

# Pairwise t-tests of the log trip duration between all months; only the
# pairs that are NOT significantly different (p > 0.01) are printed.
month_values = df_train["month"].unique()
log_duration = np.log1p(df_train["trip_duration"])

for first, second in combinations(month_values, 2):
    first_sample = log_duration[df_train["month"]==first]
    second_sample = log_duration[df_train["month"]==second]
    t, p = stats.ttest_ind(first_sample, second_sample)
    if p > 0.01:
        print("p-value of t-Test between {} and {}: {}".format(first, second, np.round(p,2)))

* only the months January and February are not significantly different from each other in terms of trip duration

In [None]:
from itertools import combinations

# Pairwise t-tests of the log trip duration between all hours of the day;
# only the pairs that are NOT significantly different (p > 0.01) are printed.
hour_list = df_train["hour"].unique()
log_duration = np.log1p(df_train["trip_duration"])

for first, second in combinations(hour_list, 2):
    first_sample = log_duration[df_train["hour"]==first]
    second_sample = log_duration[df_train["hour"]==second]
    t, p = stats.ttest_ind(first_sample, second_sample)
    if p > 0.01:
        print("p-value of t-Test between {} and {}: {}".format(first, second, np.round(p,2)))

by night and in the morning, the trip duration is signific. lower than in the time frame between 14-18 h

In [None]:
# Mean trip duration for each passenger count.
fig, sub = plt.subplots(1, 1, figsize=(12, 6))

# Aggregate once and feed both axes of the bar chart from the same result.
mean_by_passengers = df_train.groupby("passenger_count")["trip_duration"].mean()

sns.barplot(x=mean_by_passengers.index, y=mean_by_passengers, ax=sub)
sub.grid()

In [None]:
# One-way ANOVA of the log trip duration across the passenger counts.
passenger_groups = [
    np.log1p(df_train[df_train["passenger_count"]==level]["trip_duration"])
    for level in df_train["passenger_count"].unique()
]
passenger_anova = stats.f_oneway(*passenger_groups)

print("ANOVA passenger count/trip-duration: {}".format(passenger_anova))

In [None]:
from itertools import combinations
import warnings

# Pairwise t-tests of the log trip duration between all passenger counts;
# only the pairs that are NOT significantly different (p > 0.01) are printed.
passenger_list = df_train["passenger_count"].unique()

# Silence warnings for these tests only. A bare warnings.filterwarnings("ignore")
# would stay active for every later cell and hide unrelated warnings.
# NOTE(review): presumably the warnings come from ttest_ind on very small groups - confirm.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for feat1, feat2 in combinations(passenger_list,2):

        feat1_data = np.log1p(df_train[df_train["passenger_count"]==feat1]["trip_duration"])
        feat2_data = np.log1p(df_train[df_train["passenger_count"]==feat2]["trip_duration"])

        t, p = stats.ttest_ind(feat1_data, feat2_data)
        if p > 0.01:
            print("p-value of t-Test between {} and {}: {}".format(feat1,feat2, np.round(p,2)))

* only the combinations above are not signific. different from each other

# First Model

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix and target for the first model. The explicit .copy() makes X
# an independent frame, so the column assignment below does not write into a
# slice of df_train (which triggers pandas' SettingWithCopyWarning).
X = df_train[["passenger_count","pickup_longitude","pickup_latitude","month","day","hour", "distance","store_and_fwd_flag"]].copy()
y = df_train["trip_duration"]

# Encode the store-and-forward flag as integers.
enc = LabelEncoder()
X["store_and_fwd_flag"] = enc.fit_transform(X["store_and_fwd_flag"])

In [None]:
from sklearn.model_selection import KFold

# Splitter for 5-fold cross-validation; the last line displays the fold count.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS)
kf.get_n_splits(X, y)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error

# Hyper-parameters of the first XGBoost regressor.
xgboost_params = dict(
    objective="reg:squarederror",
    n_estimators=40,
    booster="gbtree",
    learning_rate=0.1,
    subsample=0.75,
    colsample_bytree=0.68,
    max_depth=7,
)

In [None]:
# Cross-validate the regressor: fit on each training fold and store the
# mean squared log error of the held-out fold in fold_dict.
fold_dict = {}
X_values, y_values = X.values, y.values

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    fold_no = fold + 1

    print("Training the fold {}".format(fold_no))
    reg = xgb.XGBRegressor(**xgboost_params)
    reg.fit(X_values[train_index], y_values[train_index])

    # The absolute value keeps predictions non-negative, as the log-error metric requires.
    xgb_preds = abs(reg.predict(X_values[test_index]))
    fold_dict[fold] = mean_squared_log_error(y_values[test_index], xgb_preds)
    print("Result for fold {}: {}".format(fold_no, fold_dict[fold]))

In [None]:
# Feature importance (booster f-score) of the last fitted regressor. The model
# was fitted on plain arrays, so the booster names features f0, f1, ... by
# column position; map them back to the column names of X.
booster = reg.get_booster()
importance = booster.get_fscore()

imp_dict = {
    column: float(importance.get('f'+str(position), 0.))
    for position, column in enumerate(X.columns)
}
# Ascending by score so the bars grow from left to right.
sorted_importance = dict(sorted(imp_dict.items(), key=lambda item: item[1]))

fig, sub = plt.subplots(1, 1, figsize=(12, 4))
sns.barplot(x=list(sorted_importance.keys()), y=list(sorted_importance.values()), ax=sub);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rides for validation. A fixed random_state makes the
# split, and the learning/validation curves built on it, reproducible across
# kernel restarts.
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Learning curve: refit the regressor on growing prefixes of the training data
# (steps of 200 000 rows, starting with a single row) and record the training
# and validation error for each size.
train_errors, val_errors = [], []

for n_rows in range(1, X_train.shape[0], 2*10**5):
    print("Round training from sample 1 to sample {}".format(n_rows))

    X_part, y_part = X_train.iloc[:n_rows], y_train.iloc[:n_rows]
    reg.fit(X_part, y_part)

    train_preds = abs(reg.predict(X_part))
    val_preds = abs(reg.predict(X_val))

    train_errors.append(mean_squared_log_error(y_part, train_preds))
    val_errors.append(mean_squared_log_error(y_val, val_preds))

In [None]:
# Plot training and validation error against the number of training rows.
fig, sub = plt.subplots(1, 1, figsize=(12, 6))

sample_sizes = range(1, X_train.shape[0], 2*10**5)

for errors, label in ((train_errors, "Training Error"), (val_errors, "Validation Error")):
    sns.lineplot(x=sample_sizes, y=errors, label=label, ax=sub)

sub.grid()

Analysis of the learning curve:
   * Training error and validation error are converging --> adding more training data doesn't improve the model
   * with respect to the ranking, the error level of ~ 0.33 is not terrible
   * concerning variance, the model generalizes well on the validation set (the gap between training and validation error is small and gets narrower as more training data is added)

### Validation curve Learning Rate 

In [None]:
# Base hyper-parameters for the learning-rate sweep; "learning_rate" is left
# out on purpose because the sweep sets it on every iteration.
xgboost_params = dict(
    objective="reg:squarederror",
    n_estimators=40,
    booster="gbtree",
    subsample=0.75,
    colsample_bytree=0.68,
    max_depth=7,
)

In [None]:
# Validation curve: fit one model per learning rate (0.05, 0.15, ..., 0.95)
# and record the training and validation error of each.
train_errors, val_errors = [], []

for rate in np.arange(0.05, 1.0, 0.1):
    print("Learning rate {}".format(rate))

    xgboost_params["learning_rate"] = rate

    reg = xgb.XGBRegressor(**xgboost_params)
    reg.fit(X_train, y_train)

    train_preds = abs(reg.predict(X_train))
    val_preds = abs(reg.predict(X_val))

    train_errors.append(mean_squared_log_error(y_train, train_preds))
    val_errors.append(mean_squared_log_error(y_val, val_preds))

In [None]:
# Plot training and validation loss against the learning rate.
fig, sub = plt.subplots(1, 1, figsize=(12, 5))

learning_rates = np.arange(0.05, 1.0, 0.1)

sns.lineplot(x=learning_rates, y=train_errors, label="Training loss", color="blue", ax=sub)
sns.lineplot(x=learning_rates, y=val_errors, label="validation loss", color="dimgrey", ax=sub)

sub.set_xticks(np.arange(0, 1.1, 0.1))
sub.grid()

In [None]:
# Final training data: all training rides. The explicit .copy() makes X_train
# an independent frame, so the column assignment below does not write into a
# slice of df_train (which triggers pandas' SettingWithCopyWarning).
X_train = df_train[["passenger_count","pickup_longitude","pickup_latitude","month","day","hour", "distance","store_and_fwd_flag"]].copy()
y_train = df_train["trip_duration"]

# Encode the store-and-forward flag as integers.
enc = LabelEncoder()
X_train["store_and_fwd_flag"] = enc.fit_transform(X_train["store_and_fwd_flag"])

In [None]:
# Test features, in the same column order as the training matrix. The explicit
# .copy() avoids assigning into a slice of df_test.
X_test = df_test[["passenger_count","pickup_longitude","pickup_latitude","month","day","hour", "distance","store_and_fwd_flag"]].copy()

# Reuse the encoder fitted on the training flags (`enc`, previous cell) instead
# of fitting a new one on the test set. This guarantees the same label ->
# integer mapping for both sets, and transform() raises on a label that was
# never seen in training rather than silently re-numbering.
X_test["store_and_fwd_flag"] = enc.transform(X_test["store_and_fwd_flag"])

In [None]:
# Hyper-parameters of the final model used for the submission.
xgboost_params = dict(
    objective="reg:squarederror",
    n_estimators=40,
    booster="gbtree",
    learning_rate=0.1,
    subsample=0.75,
    colsample_bytree=0.68,
    max_depth=7,
)

In [None]:
# Fit the final model on the full training data and predict the test set;
# the absolute value keeps the predicted durations non-negative.
reg = xgb.XGBRegressor(**xgboost_params)
reg.fit(X_train, y_train)

test_predictions = reg.predict(X_test)
xgb_preds = abs(test_predictions)

In [None]:
# Build the submission table (ids from the sample submission, durations from
# the model) and write it as CSV without the index column.
submission = pd.DataFrame({"id": df_sub["id"], "trip_duration": xgb_preds})
submission.to_csv("submission.csv", index=False)