# NYC taxi trip duration - EDA + Predictive Modeling

## Data Loading

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import plot
from matplotlib.colors import LogNorm

%matplotlib inline
# Apply seaborn's default theme and override the default figure size.
# The rcParams overrides must go through the `rc` keyword: the first positional
# parameter of `sns.set` is the plotting *context*, not an rcParams dictionary.
sns.set(rc={'figure.figsize': (15, 8)})

import os

In [None]:
# Read the training and testing CSV files into two separate DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

## Data Exploration & Cleaning

In [None]:
# Preview the first 5 rows of the training dataset
df_train.head(n=5)

In [None]:
# Preview the first 5 rows of the testing dataset
df_test.head(n=5)

In [None]:
# Print a concise summary of the training dataset: its columns, their dtypes
# and the per-column non-null counts reported by pandas
df_train.info()

In [None]:
# Print a concise summary of the testing dataset: its columns, their dtypes
# and the per-column non-null counts reported by pandas
df_test.info()

We can see that the training dataset has 11 features versus 9 for the testing dataset. The two extra features in the training dataset are "dropoff_datetime" (a date-time feature, not a numerical one) and **the target feature: "trip_duration"**. The training dataset also has more rows: 1,458,644 versus 625,134 for the testing dataset.

In [None]:
# Report, for every column of the training dataset, how many values are missing
train_missing = df_train.isnull().sum()
for feature, n_missing in train_missing.items():
    print(f"The feature {feature} counts {n_missing} missing value")

In [None]:
# Report, for every column of the testing dataset, how many values are missing
test_missing = df_test.isnull().sum()
for feature, n_missing in test_missing.items():
    print(f"The feature {feature} counts {n_missing} missing value")

In [None]:
# Count the fully duplicated rows of each dataset and report both counts
train_mv, test_mv = (frame.duplicated().sum() for frame in (df_train, df_test))
print(f'The training dataset counts {train_mv} duplicated value and the testing dataset counts {test_mv} as well.')

#### It is not convenient to have both the date and the timing in the same cell, for the features "pickup_datetime" and "dropoff_datetime". Hence, we will split them accordingly.

In [None]:
def _split_datetime(raw_datetimes):
    """Split a column of date-time values into numeric day and time-of-day parts.

    The column is parsed only once. Both parts are returned as plain numbers
    rather than ``datetime.date`` / ``datetime.time`` objects, so that the same
    instant always gets the same value in the training and in the testing
    frame. Object columns are later replaced by per-frame category codes, which
    do not line up across frames when their sets of observed values differ.

    Parameters
    ----------
    raw_datetimes : pandas.Series
        Values accepted by ``pd.to_datetime``.
        NOTE(review): assumed to be timezone-naive - confirm.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``day``  - whole days elapsed since 1970-01-01.
        ``time`` - seconds elapsed since midnight.
    """
    parsed = pd.to_datetime(raw_datetimes)
    # normalize() resets the time to midnight, so the difference is a whole number of days
    day = (parsed.dt.normalize() - pd.Timestamp('1970-01-01')).dt.days
    time = parsed.dt.hour * 3600 + parsed.dt.minute * 60 + parsed.dt.second
    return day, time


# Split the pickup date-time into two new features: pickup_day and pickup_time
df_train['pickup_day'], df_train['pickup_time'] = _split_datetime(df_train['pickup_datetime'])

# We apply this same logic to the testing set
df_test['pickup_day'], df_test['pickup_time'] = _split_datetime(df_test['pickup_datetime'])

# Then we do the same for the dropoff feature (only present in the training set)
df_train['dropoff_day'], df_train['dropoff_time'] = _split_datetime(df_train['dropoff_datetime'])

In [None]:
# The raw date-time columns have been split into day/time features, so drop them.
# Re-binding the result (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run: the in-place drop raised a
# KeyError on a second execution because the columns were already gone.
df_train = df_train.drop(columns=['pickup_datetime', 'dropoff_datetime'], errors='ignore')

df_train.shape

In [None]:
# Drop the raw pickup date-time from the testing frame too, to stay consistent
# with the training frame. Re-binding with `errors='ignore'` (instead of an
# in-place drop) keeps the cell safe to re-run once the column is already gone.
df_test = df_test.drop(columns=['pickup_datetime'], errors='ignore')
df_test.shape

#### Ok, now we can take a closer look. First, let's examine our most important feature: the target one, "trip_duration". 

In [None]:
# Summary statistics of the target feature (count, mean, std, min, quartiles, max)
df_train['trip_duration'].describe()

In [None]:
# Now that we know the mean and median, let's look for outliers with a box plot
df_train.boxplot(column=['trip_duration']);

As we can see, given how extreme the outliers are, we need to handle them. Let's zoom in on the box to get a better look.

In [None]:
# Same box plot without the outlier points (fliers), with a notched box
df_train.boxplot(column=['trip_duration'], notch=True, showfliers=False);

In [None]:
# Number of trips whose duration is strictly above 4000
int((df_train['trip_duration'] > 4000).sum())

In [None]:
# Number of trips whose duration is strictly below 10
int((df_train['trip_duration'] < 10).sum())

We can see that most trips last between 0 and roughly 2500 seconds (approximately 42 minutes), with the highest concentration between around 800 and 1100 seconds, and with fewer than 10,000 trips lasting more than 4000 seconds (approximately 67 minutes). As a result, **we will from now on treat trip durations of 4000 seconds or more as outliers** and won't take them into account in our modeling process.

Now, let's look at the latitude and longitude features, to see if they have outliers too.

In [None]:
# Box plots of the four coordinate features, drawn side by side
coordinate_columns = [
    'pickup_longitude',
    'dropoff_longitude',
    'pickup_latitude',
    'dropoff_latitude',
]
df_train.boxplot(column=coordinate_columns);

Seems like we have outliers here too. Let's verify.

In [None]:
# Number of pickups with a longitude strictly below -80
int((df_train['pickup_longitude'] < -80).sum())

In [None]:
# Number of pickups with a longitude strictly above -50
int((df_train['pickup_longitude'] > -50).sum())

In [None]:
# Number of pickups with a latitude strictly below 25
int((df_train['pickup_latitude'] < 25).sum())

In [None]:
# Number of pickups with a latitude strictly above 50
int((df_train['pickup_latitude'] > 50).sum())

In [None]:
# Build the cleaned training set: keep the trips with a duration below 4000
# whose pickup point lies strictly inside the longitude (-80, -50) and
# latitude (25, 50) bounds.
keep_rows = (
    (df_train.trip_duration < 4000)
    & (df_train.pickup_longitude > -80)
    & (df_train.pickup_longitude < -50)
    & (df_train.pickup_latitude > 25)
    & (df_train.pickup_latitude < 50)
)

# `.copy()` makes the result an independent frame: columns of `df2_train` are
# assigned to later in the notebook, and doing that on a frame obtained by
# boolean filtering makes pandas emit SettingWithCopyWarning.
df2_train = df_train[keep_rows].copy()

df2_train.shape

In [None]:
# Preview the first 5 rows of the cleaned training dataset
df2_train.head(n=5)

In [None]:
# Compare the shape of the testing set with the shape of the cleaned training set
df_test.shape, df2_train.shape

## Feature Engineering

Now that we have removed the outliers from our target feature, let's plot its values to see what their distribution looks like.

In [None]:
# Histogram of the cleaned target feature. Axis labels are set and the title
# typo ("Ditribution") is fixed so that the figure can be read on its own.
ax = df2_train['trip_duration'].hist(bins=100, histtype='stepfilled')
ax.set_xlabel('trip_duration')
ax.set_ylabel('Number of trips')
ax.set_title("Distribution of the trip_duration feature points");

This appears to be a **bell-shaped distribution with a right (positive) skew**: most of the mass sits on the left, and a long tail extends towards the longer durations.

Let's see if we can visualize some particularly high concentrations of points in specific places.

In [None]:
# Longitude / latitude bounds of the area shown on the pickup-location plot
min_long, max_long = -80, -50
min_lat, max_lat = 25, 50

In [None]:
# 2-D histogram of the pickup coordinates, with bin counts colored on a log scale
lon_lat_range = [[min_long, max_long], [min_lat, max_lat]]

fig = plt.figure(1, figsize=(10, 5))
hist = plt.hist2d(
    df2_train.pickup_longitude,
    df2_train.pickup_latitude,
    bins=50,
    range=lon_lat_range,
    norm=LogNorm(),
)
plt.title('NYC pickup locations')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.colorbar(label='Value counts')
plt.show()

One last thing, let's store the features according to their type. 

In [None]:
# Names of the int64 / float64 columns of the cleaned training set, target excluded
num_col = [
    name
    for name, dtype in df2_train.dtypes.items()
    if name != 'trip_duration' and dtype in ('int64', 'float64')
]

num_col

In [None]:
# Names of the object-typed columns of the cleaned training set, target excluded
cat_col = [
    name
    for name, dtype in df2_train.dtypes.items()
    if name != 'trip_duration' and dtype == 'object'
]

cat_col

In [None]:
# Replace every column listed in `cat_col` by its integer category codes.
# `assign` builds a new frame that is re-bound to `df2_train`, instead of
# writing column by column into a frame obtained by boolean filtering, which
# makes pandas emit SettingWithCopyWarning.
# NOTE(review): the codes are derived from the values observed in this frame
# only, so they are not guaranteed to match codes computed on another frame.
df2_train = df2_train.assign(
    **{col: df2_train[col].astype('category').cat.codes for col in cat_col}
)

df2_train.head()

In [None]:
# Same bookkeeping for the testing set: names of its int64 / float64 columns
test_num_col = [
    name
    for name, dtype in df_test.dtypes.items()
    if dtype in ('int64', 'float64')
]

test_num_col

In [None]:
# Names of the object-typed columns of the testing set; the 'id' column is left out
test_cat_col = [
    name
    for name, dtype in df_test.dtypes.items()
    if name != 'id' and dtype == 'object'
]

test_cat_col

In [None]:
# Replace every column listed in `test_cat_col` by its integer category codes.
# NOTE(review): the codes are derived from the values observed in the testing
# frame only, independently of the training frame - verify that this is intended.
df_test = df_test.assign(
    **{name: df_test[name].astype('category').cat.codes for name in test_cat_col}
)

df_test.head()

In [None]:
# Finally, we keep the target feature under a constant-style name
TARGET = df2_train['trip_duration']

## Feature Selection

In [None]:
# Model inputs: the eight selected feature columns; model output: the trip duration
selected_features = [
    'vendor_id',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'pickup_day',
    'pickup_time',
]
X_train = df2_train[selected_features]
y_train = df2_train.trip_duration
X_train.shape, y_train.shape

## Model Selection

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# `stratify` is deliberately not used: it interprets its argument as class
# labels, which does not suit a continuous regression target (every distinct
# duration would be one class, and a duration occurring once makes the split fail).
# NOTE: this cell re-binds X_train / y_train to the 80% portion, so running it
# again shrinks the training data once more.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=.2, random_state=42
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 20 trees of at most 50000 leaves each, using all CPU cores.
# `random_state` is fixed so that the fitted model, its scores and the
# predictions derived from it are reproducible across runs.
m = RandomForestRegressor(
    n_estimators=20,
    max_leaf_nodes=50000,
    n_jobs=-1,
    random_state=42,
)
m.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out validation split
# (for a scikit-learn regressor, `score` returns the R^2 coefficient of determination)
m.score(X_valid, y_valid)

## Model Validation & Training

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Second hold-out split, carved out of the current training portion.
# The outputs get their own names so that the existing X_valid / y_valid
# hold-out is not overwritten with rows the model `m` was fitted on, and the
# shapes displayed are those of the objects this split actually returns.
# `stratify` is not used because the target is a continuous regression value.
X_train_new, X_valid_new, y_train_new, y_valid_new = train_test_split(
    X_train, y_train, test_size=.2, random_state=42
)
X_train_new.shape, y_train_new.shape, X_valid_new.shape, y_valid_new.shape

Good! Now we can cross-validate our model and start making predictions.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `m` on the training portion; each score is the
# negated mean squared logarithmic error of one fold
cv_scores = cross_val_score(
    estimator=m,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_log_error',
    cv=5,
)
cv_scores

In [None]:
# Select from the testing set the eight feature columns listed below,
# then predict the trip duration of every testing row
test_features = [
    'vendor_id',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'pickup_day',
    'pickup_time',
]
X_test = df_test[test_features]
y_test_pred = m.predict(X_test)
y_test_pred

In [None]:
# Assemble the submission frame: one row per testing id with its predicted duration
submission = pd.DataFrame({'id': df_test['id'], 'trip_duration': y_test_pred})
print(submission.shape)
submission.head()

In [None]:
# Write the submission frame to 'submission.csv', leaving out the index column
submission.to_csv('submission.csv', index=False)