## Import Python Packages and Environment Setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import os
import zipfile

from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

%matplotlib inline

## Importing and Analyzing the Train, Test, and Sample Submission CSVs

In [None]:
# Print the full path of every file under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read train.csv straight out of the zipped archive. The context managers
# close both the archive and the member handle once the frame is loaded.
with zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/train.zip') as zf:
    with zf.open('train.csv') as csv_file:
        train = pd.read_csv(csv_file)
train.head()

In [None]:
# Read test.csv straight out of the zipped archive. The context managers
# close both the archive and the member handle once the frame is loaded.
with zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/test.zip') as zf:
    with zf.open('test.csv') as csv_file:
        test = pd.read_csv(csv_file)
test.head()

In [None]:
# Read sample_submission.csv straight out of the zipped archive. The context
# managers close both the archive and the member handle after loading.
with zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip') as zf:
    with zf.open('sample_submission.csv') as csv_file:
        sample_submission = pd.read_csv(csv_file)
sample_submission.head()

In [None]:
# Summarise the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# Summarise the test frame: columns, dtypes and non-null counts.
test.info()

In [None]:
# Summarise the sample submission to see the expected output columns.
sample_submission.info()

## Feature Engineering - Total Distance Travelled
Calculating Distance (in Miles) between the Pick up and Drop off Coordinates and saving it as a separate column in the Train and Test DataFrames.

In [None]:
def get_distance(source_lat, source_long, dest_lat, dest_long):
    """Return the geodesic distance in miles between two coordinates.

    Args:
        source_lat: Latitude of the starting point.
        source_long: Longitude of the starting point.
        dest_lat: Latitude of the end point.
        dest_long: Longitude of the end point.

    Returns:
        The `.miles` value of geopy's `geodesic` between the two points.
    """
    source = (source_lat, source_long)
    destination = (dest_lat, dest_long)
    return geodesic(source, destination).miles

In [None]:
# Row-wise pickup -> drop-off distance (miles), stored as a new column.
train['distance'] = train.apply(
    lambda row: get_distance(row['pickup_latitude'], row['pickup_longitude'],
                             row['dropoff_latitude'], row['dropoff_longitude']),
    axis=1,
)

train.head()

In [None]:
# Same distance feature for the test set, computed row by row.
test['distance'] = test.apply(
    lambda row: get_distance(row['pickup_latitude'], row['pickup_longitude'],
                             row['dropoff_latitude'], row['dropoff_longitude']),
    axis=1,
)

test.head()

## Data Visualizations

In [None]:
# Distribution of the raw trip_duration values across all training trips.
sns.set(rc={'figure.figsize': (15, 7)})
trip_durations = train['trip_duration'].values
sns.distplot(trip_durations, bins=500, axlabel='trip_duration')

In [None]:
# Same distribution with trip_duration divided by 3600 to express it in hours.
sns.set(rc={'figure.figsize': (15, 5)})
trip_duration_hours = train['trip_duration'].values / 3600
sns.distplot(trip_duration_hours, bins=75, axlabel='Trip Duration (in Hours)')

In [None]:
# Training trips of at most 100 miles, so extreme distances don't flatten the plot.
df_filtered = train.loc[train['distance'] <= 100].copy()
sns.distplot(df_filtered['distance'].values, bins=20, axlabel='Trip Distance')

In [None]:
# Test trips of at most 100 miles, for comparison with the training distribution.
df_filtered = test.loc[test['distance'] <= 100].copy()
sns.distplot(df_filtered['distance'].values, bins=20, axlabel='Trip Distance')

In [None]:
# Zoom in on training trips of at most 20 miles.
df_filtered = train.loc[train['distance'] <= 20].copy()
sns.distplot(df_filtered['distance'].values, bins=50,
             axlabel='Trip Distance in Miles (Filtered)')

In [None]:
# Zoom in on test trips of at most 20 miles.
df_filtered = test.loc[test['distance'] <= 20].copy()
sns.distplot(df_filtered['distance'].values, bins=50,
             axlabel='Trip Distance in Miles (Filtered)')

## Anomaly Detection and Removal
Detecting anomalous trips from the Distance and Trip Duration columns with scikit-learn's Isolation Forest, and removing them from the training set.

In [None]:
# Distance against trip duration for every training trip, before anomaly removal.
sns.scatterplot(x="distance", y="trip_duration", data=train)

In [None]:
# Flag trips whose (distance, trip_duration) pair is unusual.
# The fit_predict result is stored per row in the 'Anomaly' column;
# later cells treat -1 as an outlier and 1 as a normal trip.
anomaly_features = train[['distance', 'trip_duration']]
clf = IsolationForest(contamination=0.01, random_state=42)
train['Anomaly'] = clf.fit_predict(anomaly_features)
train.head()

In [None]:
# Number of trips per anomaly label.
train['Anomaly'].value_counts()

In [None]:
# Set the figure size BEFORE the first pyplot call: plt.title() implicitly
# creates the figure, so changing rcParams afterwards would not resize it.
plt.rcParams['figure.figsize'] = [15, 7]
plt.title("Outlier vs. Normal Trips")

# Split once by the anomaly label (-1 = outlier, 1 = normal).
outlier_trips = train.loc[train.Anomaly == -1]
normal_trips = train.loc[train.Anomaly == 1]

plt.scatter(outlier_trips['distance'], outlier_trips['trip_duration'],
            c='red', label='Outlier')
plt.scatter(normal_trips['distance'], normal_trips['trip_duration'],
            c='green', label='Normal')
plt.xlabel('distance')
plt.ylabel('trip_duration')
plt.legend()
plt.show()

In [None]:
# Keep only the trips labelled normal (Anomaly == 1), then re-plot to confirm.
is_normal_trip = train['Anomaly'] == 1
train = train.loc[is_normal_trip].copy()
sns.scatterplot(x="distance", y="trip_duration", data=train)

## Trip Duration and Distance Forecasting using Facebook Prophet

In [None]:
# Convert both training timestamp columns to pandas datetimes.
for datetime_col in ('pickup_datetime', 'dropoff_datetime'):
    train[datetime_col] = pd.to_datetime(train[datetime_col])
train.info()

In [None]:
# Parse the test set's OWN pickup timestamps. The earlier version copied them
# from `train`, which index-aligned training timestamps onto test rows and so
# produced a wrong 'date' column for the test set.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

if 'dropoff_datetime' in test.columns:
    test['dropoff_datetime'] = pd.to_datetime(test['dropoff_datetime'])
else:
    # No drop-off timestamp in the test data. Keep an all-NaT placeholder
    # (instead of leaking training values) so the later cell that drops
    # 'dropoff_datetime' from `test` still runs unchanged.
    test['dropoff_datetime'] = pd.NaT
test.info()

In [None]:
# Calendar date of each pickup, used below for per-day grouping.
train['date'] = train.pickup_datetime.dt.date
train.head()

In [None]:
# Calendar date of each pickup in the test set.
test['date'] = test.pickup_datetime.dt.date
test.head()

In [None]:
# Trip distance over pickup date for the training set.
sns.set(rc={'figure.figsize': (15, 7)})
sns.lineplot(data=train, x="date", y="distance")

In [None]:
# Trip duration over pickup date for the training set.
sns.lineplot(data=train, x="date", y="trip_duration")

Total Distance Travelled Forecasting for the next 30 days using Facebook Prophet

In [None]:
# Total distance travelled per day, as a two-column frame (date, distance).
data = train.groupby('date')['distance'].sum().reset_index()
data['date'] = pd.to_datetime(data['date'])
data.head()

In [None]:
# Daily total distance over time.
sns.set(rc={'figure.figsize': (15, 7)})
sns.lineplot(data=data, x="date", y="distance")

In [None]:
# Rename to the 'ds' / 'y' columns passed to Prophet, fit an additive-seasonality
# model, and forecast 30 periods beyond the training data.
data = data.rename(columns={'date': 'ds', 'distance': 'y'})
m = Prophet(seasonality_mode='additive')
m.fit(data)
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
fig = m.plot(forecast)

Total Trip Duration Forecasting for the next 30 days using Facebook Prophet

In [None]:
# Total trip duration per day, as a two-column frame (date, trip_duration).
data = train.groupby('date')['trip_duration'].sum().reset_index()
data['date'] = pd.to_datetime(data['date'])
data.head()

In [None]:
# Daily total trip duration over time.
sns.set(rc={'figure.figsize': (15, 7)})
sns.lineplot(data=data, x="date", y="trip_duration")

In [None]:
# Rename to the 'ds' / 'y' columns passed to Prophet, fit an additive-seasonality
# model, and forecast 30 periods beyond the training data.
data = data.rename(columns={'date': 'ds', 'trip_duration': 'y'})
m = Prophet(seasonality_mode='additive')
m.fit(data)
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)
fig = m.plot(forecast)

## Feature Engineering - Location-based Clustering
Creating separate clusters based on Pick Up location and Drop Off location.

In [None]:
# Preview the training frame before adding the location-cluster features.
train.head()

In [None]:
# Preview the test frame before adding the location-cluster features.
test.head()

In [None]:
# Fit a 5-cluster KMeans on the training pickup coordinates and label each trip.
pickup_coords = train[['pickup_longitude','pickup_latitude']]
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(pickup_coords)
pickup_clusters = kmeans.predict(pickup_coords)
pickup_clusters

In [None]:
# Label the test pickups with the model currently bound to `kmeans`.
# NOTE: this relies on running the cells in order, so that `kmeans` is still
# the model fitted on the training PICKUP coordinates (it is re-fitted on
# drop-off coordinates further down).
test_pickup_coords = test[['pickup_longitude','pickup_latitude']]
pickup_clusters_test = kmeans.predict(test_pickup_coords)
pickup_clusters_test

In [None]:
# Re-fit a 5-cluster KMeans, this time on the training drop-off coordinates.
# This rebinds `kmeans`, replacing the pickup model.
dropoff_coords = train[['dropoff_longitude','dropoff_latitude']]
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(dropoff_coords)
dropoff_clusters = kmeans.predict(dropoff_coords)
dropoff_clusters

In [None]:
# Label the test drop-offs with the model currently bound to `kmeans`.
# NOTE: this relies on running the cells in order, so that `kmeans` is the
# model fitted on the training DROP-OFF coordinates in the previous cell.
test_dropoff_coords = test[['dropoff_longitude','dropoff_latitude']]
dropoff_clusters_test = kmeans.predict(test_dropoff_coords)
dropoff_clusters_test

In [None]:
# Attach the pickup / drop-off cluster labels to the training frame.
train = train.assign(pickup_clusters=pickup_clusters,
                     dropoff_clusters=dropoff_clusters)
train.head()

In [None]:
# Attach the pickup / drop-off cluster labels to the test frame.
test = test.assign(pickup_clusters=pickup_clusters_test,
                   dropoff_clusters=dropoff_clusters_test)
test.head()

## Creating Backup Before Model Training

In [None]:
# Independent copy of the training frame as it stands at this point
# (cluster labels attached, not yet one-hot encoded).
train_backup = train.copy()

In [None]:
# Independent copy of the test frame as it stands at this point
# (cluster labels attached, not yet one-hot encoded).
test_backup = test.copy()

## One-Hot Encoding 

In [None]:
# One indicator frame per categorical training feature; the prefix becomes the
# leading part of every generated column name.
vendor_id_encoded = pd.get_dummies(train.vendor_id, prefix='vendor_id')
passenger_count_encoded = pd.get_dummies(train.passenger_count, prefix='passenger_count')
store_and_fwd_flag_encoded = pd.get_dummies(train.store_and_fwd_flag, prefix='store_and_fwd_flag')
dropoff_clusters_encoded = pd.get_dummies(train.dropoff_clusters, prefix='dropoff_cluster')
pickup_clusters_encoded = pd.get_dummies(train.pickup_clusters, prefix='pickup_cluster')

pickup_clusters_encoded.head()

In [None]:
# Same one-hot encoding for the test set. The encoding is computed independently
# of train, so the resulting column sets can differ between the two frames.
test_vendor_id_encoded = pd.get_dummies(test.vendor_id, prefix='vendor_id')
test_passenger_count_encoded = pd.get_dummies(test.passenger_count, prefix='passenger_count')
test_store_and_fwd_flag_encoded = pd.get_dummies(test.store_and_fwd_flag, prefix='store_and_fwd_flag')
test_dropoff_clusters_encoded = pd.get_dummies(test.dropoff_clusters, prefix='dropoff_cluster')
test_pickup_clusters_encoded = pd.get_dummies(test.pickup_clusters, prefix='pickup_cluster')

In [None]:
# Remove the raw categorical columns now that their one-hot versions exist.
train.drop(
    columns=['pickup_clusters', 'dropoff_clusters', 'store_and_fwd_flag',
             'passenger_count', 'vendor_id'],
    inplace=True,
)
train.head()

In [None]:
# Remove the raw categorical columns from the test frame as well.
test.drop(
    columns=['pickup_clusters', 'dropoff_clusters', 'store_and_fwd_flag',
             'passenger_count', 'vendor_id'],
    inplace=True,
)
test.head()

In [None]:
# Append the indicator columns to the training frame (index-aligned joins).
train = (
    train
    .join(pickup_clusters_encoded)
    .join(dropoff_clusters_encoded)
    .join(store_and_fwd_flag_encoded)
    .join(passenger_count_encoded)
    .join(vendor_id_encoded)
)
train.head()

In [None]:
# Append the indicator columns to the test frame (index-aligned joins).
test = (
    test
    .join(test_pickup_clusters_encoded)
    .join(test_dropoff_clusters_encoded)
    .join(test_store_and_fwd_flag_encoded)
    .join(test_passenger_count_encoded)
    .join(test_vendor_id_encoded)
)
test.head()

## Data Preparation for Model Training

In [None]:
# Column names of the training frame after one-hot encoding.
train.columns

In [None]:
# Column names of the test frame after one-hot encoding.
test.columns

In [None]:
# List the columns that exist in train but not in test.
train_cols = train.columns
test_cols = test.columns
missing_in_test = [col for col in train_cols if col not in test_cols]
print(missing_in_test)

In [None]:
# Add every one-hot column that train has and test lacks, filled with 0
# (a category never seen in test simply never fires). The names are derived
# from train rather than hard-coded, so this stays correct if the clustering
# or the data changes, and it never overwrites a column test already has.
dummy_prefixes = ('pickup_cluster_', 'dropoff_cluster_', 'store_and_fwd_flag_',
                  'passenger_count_', 'vendor_id_')
missing_dummy_cols = [col for col in train.columns
                      if col.startswith(dummy_prefixes) and col not in test.columns]
for col in missing_dummy_cols:
    test[col] = 0

# Whatever is still listed here exists in train only (non-indicator columns).
print([x for x in train.columns if x not in test.columns])
test.head()

In [None]:
# Feature matrix: everything except identifiers, raw timestamps/coordinates,
# the date, the anomaly flag and the target itself.
non_feature_cols = [
    'id', 'pickup_datetime', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'date', 'trip_duration', 'Anomaly',
]
X = train.drop(columns=non_feature_cols).copy()
X.head()

In [None]:
# Regression target: the trip_duration column of the training frame.
y = train['trip_duration']
y.head()

In [None]:
# Train an XGBoost regressor with default hyper-parameters. Plain arrays are
# passed, so the model sees features by position, not by column name.
train_features, train_target = X.values, y.values
reg = xgb.XGBRegressor()
reg.fit(train_features, train_target)

In [None]:
# Test feature matrix: drop identifiers, raw timestamps/coordinates and the date.
test_non_feature_cols = [
    'id', 'pickup_datetime', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'date',
]
X_test = test.drop(columns=test_non_feature_cols).copy()

In [None]:
# Feature columns (and their order) used for training.
X.columns

In [None]:
# Feature columns (and their order) currently in the test matrix.
X_test.columns

In [None]:
# The model was fitted on X.values, i.e. purely by column position, so the
# test matrix must have exactly the training columns in exactly the same order.
# Select by X.columns instead of a hand-copied list, which could silently drift
# from the training layout; a missing column raises a KeyError here.
X_test = X_test[list(X.columns)]
X_test.columns

In [None]:
# Predict trip durations for the test set. Like the fit call, this passes a
# plain array, so X_test's column order must match the X used for training.
pred = reg.predict(X_test.values)
pred

In [None]:
# Re-check the sample submission layout before building our own.
sample_submission.info()

In [None]:
# Pair each test id with its predicted trip duration. The earlier
# `submission = test['id']` assignment was dead code, overwritten at once.
submission = pd.DataFrame({"id": test["id"], "trip_duration": pred})
submission.info()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv("submission.csv",index=False)