# NYC taxi trip duration - EDA + Modeling

## Data Loading

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import plot
from matplotlib.colors import LogNorm

%matplotlib inline
# Apply seaborn's default theme and set the default figure size through the
# `rc` keyword; the first positional argument of sns.set is `context`, not `rc`.
sns.set(rc={'figure.figsize': (15, 8)})

import os

In [None]:
# Read the training and testing CSV files into two separate DataFrames
train_path, test_path = '../input/train.csv', '../input/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## Data Exploration & Cleaning

In [None]:
# Print the first 5 rows of the training dataset
df_train.head()

In [None]:
# Print the first 5 rows of the testing dataset, to compare its columns with the training preview
df_test.head()

In [None]:
# Summarize the training dataset: column names, non-null counts and dtypes
df_train.info()

In [None]:
# Summarize the testing dataset: column names, non-null counts and dtypes
df_test.info()

We can see that the training dataset counts 11 features versus 9 for the testing dataset. The two extra features in the training dataset are "dropoff_datetime" (a timestamp) and **the target feature: "trip_duration"**. The training dataset also has more rows: 1,458,644 versus 625,134 for the testing dataset.

In [None]:
# Count the missing values per column of the training dataset (computed once, then reported)
train_missing = df_train.isnull().sum()
for i, x in zip(train_missing.index, train_missing.values):
    print(f"The feature {i} counts {x} missing value")

In [None]:
# Same check on the testing dataset: missing values per column, computed once
test_missing = df_test.isnull().sum()
for i, x in zip(test_missing.index, test_missing.values):
    print(f"The feature {i} counts {x} missing value")

In [None]:
# Count the fully duplicated rows in each dataset
train_mv, test_mv = (frame.duplicated().sum() for frame in (df_train, df_test))
print(f'The training dataset counts {train_mv} duplicated value and the testing dataset counts {test_mv} as well.')

It is not convenient to have both the date and the time in the same cell for the features "pickup_datetime" and "dropoff_datetime". Hence, we will split each of them into a day column and a time column.

In [None]:
# Split the datetime strings into two new features each: a day and a time.
# Each column is parsed only once and the parsed Series is reused for both
# the date and the time parts.
train_pickup = pd.to_datetime(df_train['pickup_datetime'])
df_train['pickup_day'] = train_pickup.dt.date
df_train['pickup_time'] = train_pickup.dt.time

# We apply this same logic to the testing set
test_pickup = pd.to_datetime(df_test['pickup_datetime'])
df_test['pickup_day'] = test_pickup.dt.date
df_test['pickup_time'] = test_pickup.dt.time

# Then we do the same for the dropoff feature (only the training frame is given dropoff columns here)
train_dropoff = pd.to_datetime(df_train['dropoff_datetime'])
df_train['dropoff_day'] = train_dropoff.dt.date
df_train['dropoff_time'] = train_dropoff.dt.time

In [None]:
# Check that the new pickup/dropoff day and time columns were added
df_train.head()

Ok, now we can take a closer look. First, let's examine our most important feature: the target one, "trip_duration". 

In [None]:
# Summary statistics of the target feature
df_train['trip_duration'].describe()

In [None]:
# Now that we know the mean and median, let's see if there are outliers
df_train.boxplot(['trip_duration']);

As we can see, we need to handle the outliers considering their extent. Let's zoom on the box to get a better look.

In [None]:
# Same boxplot with the outlier points hidden (showfliers=False) and a notched box
df_train.boxplot(['trip_duration'], showfliers=False, notch=True);

In [None]:
# Number of trips lasting more than 4000 seconds
int((df_train.trip_duration > 4000).sum())

In [None]:
# Number of trips lasting less than 10 seconds
int((df_train.trip_duration < 10).sum())

We can see that most of the trip durations last between 0 and roughly 2500 seconds (approximately 42 minutes), with the highest concentration between around 800 and 1100 seconds, and with fewer than 10,000 trips lasting over 4000 seconds (approximately 67 minutes). As a result, **we will from now on consider the trip durations over 4000 seconds as outliers** and won't take them into account in our modeling process.

## Features Engineering

In [None]:
# Let's create a new dataframe, without the outliers (trips of 4000 seconds or more).
# .copy() makes df2_train an independent frame, so the column assignments done on it
# later do not raise SettingWithCopyWarning.
df2_train = df_train[df_train.trip_duration < 4000].copy()
df2_train.info()

In [None]:
# Let's see how the summary statistics changed once the long trips were removed
df2_train['trip_duration'].describe()

Now that we have removed the outliers from our target feature, let's plot its values to see what their distribution looks like.

In [None]:
# Histogram of the filtered target; axis labels are set so the figure can be read on its own
ax = df2_train['trip_duration'].hist(bins=100, histtype='stepfilled')
ax.set_xlabel('trip_duration (seconds)')
ax.set_ylabel('Number of trips')
plt.title("Distribution of the trip_duration feature points");

This appears to be a **right-skewed (positively skewed) distribution**: most of the trips are concentrated on the left, with a long tail toward the longer durations.

Now, we should handle the latitudes and longitudes. First, let's see if we can visualize some particularly high concentrations of points in specific places, in order to get a first insight.

In [None]:
# Boxplots of the four coordinate columns, to spot values far away from the bulk of the data
df2_train.boxplot(['pickup_longitude', 'dropoff_longitude', 'pickup_latitude', 'dropoff_latitude']);

Seems like we have outliers. Let's verify.

In [None]:
# Number of pickups with a longitude below -80
int((df2_train.pickup_longitude < -80).sum())

In [None]:
# Number of pickups with a longitude above -50
int((df2_train.pickup_longitude > -50).sum())

In [None]:
# Number of pickups with a latitude below 25
int((df2_train.pickup_latitude < 25).sum())

In [None]:
# Number of pickups with a latitude above 50
int((df2_train.pickup_latitude > 50).sum())

Now we can set maximum and minimum ranges for our data visualization of the highest concentrations of pickup and dropoff locations.

In [None]:
# Longitude / latitude window used as the plotting range of the 2D histogram
min_long, max_long = -80, -50
min_lat, max_lat = 25, 50

In [None]:
# 2D histogram of the pickup coordinates inside the plotting window, on a log colour scale
fig, ax = plt.subplots(num=1, figsize=(10, 5))
plot_window = [[min_long, max_long], [min_lat, max_lat]]
hist = ax.hist2d(
    df2_train.pickup_longitude,
    df2_train.pickup_latitude,
    bins=50,
    range=plot_window,
    norm=LogNorm(),
)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
# hist[3] is the image returned by hist2d; it drives the colour bar
fig.colorbar(hist[3], ax=ax, label='Value counts')
ax.set_title('NYC pickup locations')
plt.show()

One last thing, let's store the features according to their type. 

In [None]:
# Names of the int64 / float64 columns, leaving the target out
num_col = [
    name for name, dtype in df2_train.dtypes.items()
    if name != 'trip_duration' and dtype in ('int64', 'float64')
]

num_col

In [None]:
# Names of the object-typed columns, leaving the target out
cat_col = [
    name for name, dtype in df2_train.dtypes.items()
    if name != 'trip_duration' and dtype == 'object'
]

cat_col

In [None]:
# Replace every column listed in cat_col by its integer category codes.
# A single assign() builds a new frame instead of writing column by column into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
# Only the training frame is encoded here; the codes come from the values
# present in df2_train.
df2_train = df2_train.assign(
    **{col: df2_train[col].astype('category').cat.codes for col in cat_col}
)

df2_train.head()

In [None]:
# Finally, we lock the target feature in a constant one
# NOTE(review): TARGET is not referenced by any later cell visible here;
# y_train is built from df2_train.trip_duration directly.
TARGET = df2_train.trip_duration

In [None]:
# Preview the encoded training frame before building the model inputs
df2_train.head()

## Model Validation & Training

In [None]:
# Columns used as model inputs
feature_names = [
    'vendor_id', 'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'pickup_day', 'pickup_time',
]
X_train = df2_train[feature_names]
X_train.shape

In [None]:
# Target vector: the trip duration of every remaining training row
y_train = df2_train['trip_duration']
y_train.shape

Let's split our data so we can validate the training before testing.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation.
# NOTE(review): `stratify` is meant for class labels; here every distinct duration is
# treated as its own class - confirm this is intended for a regression target.
X_train_new, X_valid, y_train_new, y_valid = train_test_split(
    X_train, y_train, test_size=.2, random_state=42, stratify=y_train)
# Show the shapes of the two resulting partitions (not of the unsplit data)
X_train_new.shape, y_train_new.shape, X_valid.shape, y_valid.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline forest. Fit on the training split only: X_train / y_train still contain
# the validation rows, so fitting on them would leak into the score below.
m1 = RandomForestRegressor(n_estimators=10, random_state=6)
m1.fit(X_train_new, y_train_new)
m1.score(X_valid, y_valid)

Now that's a great score. Maybe too great: **our model is probably overfitting**. Let's try to reduce the number of trees.

In [None]:
# Forest with fewer trees. Fit on the training split only, so the validation
# rows stay unseen when the model is scored.
m2 = RandomForestRegressor(n_estimators=5, random_state=42)
m2.fit(X_train_new, y_train_new)
m2.score(X_valid, y_valid)

Not working. Let's try limiting the number of leaves our trees can produce.

In [None]:
# Forest whose trees are capped at 100 leaves. Fit on the training split only,
# so the validation rows stay unseen when the model is scored.
m3 = RandomForestRegressor(n_estimators=8, random_state=42, max_leaf_nodes=100)
m3.fit(X_train_new, y_train_new)
m3.score(X_valid, y_valid)

Looks like we can reduce the risk of overfitting this way. However, 0.61 is a rather low score, so we may have been too restrictive on the number of leaves; let's raise it back a little.

In [None]:
# Forest whose trees are capped at 750 leaves. Fit on the training split only,
# so the validation rows stay unseen when the model is scored.
m4 = RandomForestRegressor(n_estimators=10, random_state=42, max_leaf_nodes=750)
m4.fit(X_train_new, y_train_new)
m4.score(X_valid, y_valid)

In [None]:
# Forest whose trees are capped at 50000 leaves; this is the model used by the
# later cells. Fit on the training split only, so the validation rows stay
# unseen when the model is scored.
m5 = RandomForestRegressor(n_estimators=10, random_state=42, max_leaf_nodes=50000)
m5.fit(X_train_new, y_train_new)
m5.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predict the trip durations of the validation rows with the m5 forest
y_valid_pred = m5.predict(X_valid)
y_valid_pred

In [None]:
# R² between the true validation durations and the m5 predictions
r2_score(y_valid, y_valid_pred)

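Last thing, let's check the **mean squared error** (MSE). Note that the evaluation metric for this competition is the Root Mean Squared Logarithmic Error (RMSLE), which is computed on log-transformed durations, so the MSE is only a rough proxy for it. The lower the MSE, the more accurate the predictions.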

In [None]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import SGDRegressor

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SGD is sensitive to feature scale, and the inputs mix coordinates with integer
# category codes, so the features are standardised before reaching the regressor.
# The pipeline still exposes fit() / predict(), so the later cells using `sgd` work as before.
# random_state makes the run reproducible.
sgd = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))

In [None]:
# Fit on the training split only: X_train / y_train still contain the validation rows
sgd.fit(X_train_new, y_train_new)

In [None]:
# Validation MSE of the m5 random forest
MSE(y_valid, m5.predict(X_valid))

In [None]:
# Validation MSE of the SGD regressor, kept in `loss` and displayed
loss = MSE(y_valid, sgd.predict(X_valid))
loss

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression for comparison. Fit on the training split only, so the
# validation MSE below is measured on rows the model has not seen.
lr = LinearRegression()
lr.fit(X_train_new, y_train_new)
MSE(y_valid, lr.predict(X_valid))

Good! Now we can cross-validate our scores and start the predictions.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of m5 on the full training data.
# The scorer is the *negated* mean squared log error, so values closer to 0 are better.
cv_scores = cross_val_score(m5, X_train, y_train, cv=5, scoring='neg_mean_squared_log_error')
cv_scores

In [None]:
# Build the test feature matrix with the same columns the models were trained on.
feature_cols = ['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude', 'pickup_day', 'pickup_time']
X_test = df_test[feature_cols].copy()

# In the training frame 'pickup_day' and 'pickup_time' were replaced by category codes,
# whereas df_test still holds the raw date / time objects, which predict() cannot consume.
# Rebuild the training categories from df_train (same `< 4000` filter that produced
# df2_train) and map the test values onto the same codes.
train_rows = df_train.trip_duration < 4000
for col in ('pickup_day', 'pickup_time'):
    train_categories = df_train.loc[train_rows, col].astype('category').cat.categories
    # Values never seen in the training rows get the code -1
    X_test[col] = pd.Categorical(X_test[col], categories=train_categories).codes

y_test_pred = m5.predict(X_test)
y_test_pred[:10]

In [None]:
# Assemble the submission frame: one row per test id with its predicted duration
submission = pd.DataFrame({'id': df_test['id'], 'trip_duration': y_test_pred})
print(submission.shape)
submission.head()

In [None]:
# Write the submission to a CSV file in the working directory, without the index column
submission.to_csv("submit_file.csv", index=False)