#                               Modelingfor  NYC Taxi trip duration 

### Variable Identification, Range of Variables

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('nyc_taxi_trip_duration.csv')
data.head(2)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100


In [3]:
# Checking number of rows and columns
data.shape

(729322, 11)

In [4]:
data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [5]:
data.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [6]:
# Checking for missing values
data.isna().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

#### INT Data type

In [7]:
data.select_dtypes('int64').head(5)

Unnamed: 0,vendor_id,passenger_count,trip_duration
0,2,1,400
1,1,2,1100
2,2,2,1635
3,2,6,1141
4,1,1,848


#### Float data type

In [8]:
data.select_dtypes('float64').head(5)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,-73.953918,40.778873,-73.963875,40.771164
1,-73.988312,40.731743,-73.994751,40.694931
2,-73.997314,40.721458,-73.948029,40.774918
3,-73.96167,40.75972,-73.956779,40.780628
4,-74.01712,40.708469,-73.988182,40.740631


In [9]:
# since there are only two vendors this can be comsidered as a category
data.vendor_id = data.vendor_id.astype('category')

#### Object data type

In [10]:
data.select_dtypes('object').head(5)

Unnamed: 0,id,pickup_datetime,dropoff_datetime,store_and_fwd_flag
0,id1080784,2016-02-29 16:40:21,2016-02-29 16:47:01,N
1,id0889885,2016-03-11 23:35:37,2016-03-11 23:53:57,N
2,id0857912,2016-02-21 17:59:33,2016-02-21 18:26:48,N
3,id3744273,2016-01-05 09:44:31,2016-01-05 10:03:32,N
4,id0232939,2016-02-17 06:42:23,2016-02-17 06:56:31,N


In [11]:
# pickup_datetime and dropoff_datetime 
data.pickup_datetime = pd.to_datetime(data.pickup_datetime)
data.dropoff_datetime = pd.to_datetime(data.dropoff_datetime)

In [12]:
# store_and_fwd_flag is a category data type
data.store_and_fwd_flag = data.store_and_fwd_flag.astype('category')

In [13]:
# adding an additional column called trip_duration_in_hrs 
data['trip_duration_in_hrs'] = (data.trip_duration/(60*60)).round(2)

In [14]:
data.describe()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,trip_duration_in_hrs
count,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0
mean,1.662055,-73.973513,40.750919,-73.973422,40.751775,952.2291,0.264504
std,1.312446,0.069754,0.033594,0.069588,0.036037,3864.626,1.073519
min,0.0,-121.933342,34.712234,-121.933304,32.181141,1.0,0.0
25%,1.0,-73.991859,40.737335,-73.991318,40.735931,397.0,0.11
50%,1.0,-73.981758,40.75407,-73.979759,40.754509,663.0,0.18
75%,2.0,-73.967361,40.768314,-73.963036,40.769741,1075.0,0.3
max,9.0,-65.897385,51.881084,-65.897385,43.921028,1939736.0,538.82


## Summary of the dataset
###  * Numerical Variables are 
pickup_datetime,          
dropoff_datetime,         
passenger_count,           
pickup_longitude,        
pickup_latitude,         
dropoff_longitude,      
dropoff_latitude,       
trip_duration,             
trip_duration_in_hrs.          
### * Categorical Variables are 
Vendor_id,
store_and_fwd_flag.
### 1) Vendors
There are two vendors namely Vendor 1 and Vendor 2

Vendor 2 has the highest number of trips
### 2) Passengers

The minimum passenger count is 0 and maximum is 9
### 3) Trip Duration

The Mimimum trip duration is 0 and maximum is 538.82 hours

The mean trip duration is 0.26 hours which is 16 minutes

There are a lot of outliers in trip duration


### Feature Engineering

In [15]:
# Generating detetime related features 
import datetime
data['pickup_hour'] = data.pickup_datetime.dt.hour
data['dropoff_hour'] = data.dropoff_datetime.dt.hour
data['pickup_minute'] = data.pickup_datetime.dt.minute
data['dropoff_minute'] = data.dropoff_datetime.dt.minute
data['pickup_week'] = data.pickup_datetime.dt.week  
data['dropoff_week'] = data.dropoff_datetime.dt.week  
data['day_of_year'] =data.pickup_datetime.dt.dayofyear
data.head(2)

  data['pickup_week'] = data.pickup_datetime.dt.week
  data['dropoff_week'] = data.dropoff_datetime.dt.week


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_duration_in_hrs,pickup_hour,dropoff_hour,pickup_minute,dropoff_minute,pickup_week,dropoff_week,day_of_year
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,0.11,16,16,40,47,9,9,60
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,0.31,23,23,35,53,10,10,71


In [16]:
# Calculating Distance
def Haversine_distance(lat1,lon1,lat2,lon2): 
    latitude = lat1-lat2
    long= lon1-lon2

    d = 2*5574.8*(np.sqrt((np.sin(latitude)**2)+np.cos(lat1)*np.cos(lat2)*np.sin(long/2)))
    return d

In [17]:
# Adding a fewture of distance between pickup and dropoff location
data['distance'] = Haversine_distance(data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [18]:
# binning hour
bins=[0,7,12,16,19,23]
group=['Late night','Morning','Afternoon','Evening', 'Night']
data["Hour_bin"]=pd.cut(data['pickup_hour'], bins, labels=group)

### Model Building
##### 1) Impute missing values
##### 2) Remove categorical variables
##### 3) Treat outliers
##### 4) Feature scaling and variable transformation
##### 5) Model 

##### 1) Imputing missing values



In [19]:
print("Before imputing\n\n",data.isna().sum())
data.distance = data.distance.fillna(data.distance.median())
data.Hour_bin = data.Hour_bin.fillna(data.Hour_bin.mode()[0])
print("\nAfter imputing\n\n",data.isna().sum())

Before imputing

 id                           0
vendor_id                    0
pickup_datetime              0
dropoff_datetime             0
passenger_count              0
pickup_longitude             0
pickup_latitude              0
dropoff_longitude            0
dropoff_latitude             0
store_and_fwd_flag           0
trip_duration                0
trip_duration_in_hrs         0
pickup_hour                  0
dropoff_hour                 0
pickup_minute                0
dropoff_minute               0
pickup_week                  0
dropoff_week                 0
day_of_year                  0
distance                374501
Hour_bin                 26726
dtype: int64

After imputing

 id                      0
vendor_id               0
pickup_datetime         0
dropoff_datetime        0
passenger_count         0
pickup_longitude        0
pickup_latitude         0
dropoff_longitude       0
dropoff_latitude        0
store_and_fwd_flag      0
trip_duration           0
trip_duration_

##### 2) Remove categorical variables

In [20]:
data.dtypes

id                              object
vendor_id                     category
pickup_datetime         datetime64[ns]
dropoff_datetime        datetime64[ns]
passenger_count                  int64
pickup_longitude               float64
pickup_latitude                float64
dropoff_longitude              float64
dropoff_latitude               float64
store_and_fwd_flag            category
trip_duration                    int64
trip_duration_in_hrs           float64
pickup_hour                      int64
dropoff_hour                     int64
pickup_minute                    int64
dropoff_minute                   int64
pickup_week                      int64
dropoff_week                     int64
day_of_year                      int64
distance                       float64
Hour_bin                      category
dtype: object

In [21]:
data = data.drop(['id','vendor_id','store_and_fwd_flag'],axis = 1)
data.dtypes

pickup_datetime         datetime64[ns]
dropoff_datetime        datetime64[ns]
passenger_count                  int64
pickup_longitude               float64
pickup_latitude                float64
dropoff_longitude              float64
dropoff_latitude               float64
trip_duration                    int64
trip_duration_in_hrs           float64
pickup_hour                      int64
dropoff_hour                     int64
pickup_minute                    int64
dropoff_minute                   int64
pickup_week                      int64
dropoff_week                     int64
day_of_year                      int64
distance                       float64
Hour_bin                      category
dtype: object

##### 3) Outlier Treatement

In [22]:
data.describe()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,trip_duration_in_hrs,pickup_hour,dropoff_hour,pickup_minute,dropoff_minute,pickup_week,dropoff_week,day_of_year,distance
count,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0
mean,1.662055,-73.973513,40.750919,-73.973422,40.751775,952.2291,0.264504,13.611698,13.60616,29.597785,29.631301,13.843884,13.842964,91.882577,960.314852
std,1.312446,0.069754,0.033594,0.069588,0.036037,3864.626,1.073519,6.402853,6.48637,17.3324,17.420089,8.558386,8.555423,51.555655,530.222983
min,0.0,-121.933342,34.712234,-121.933304,32.181141,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,1.0,-73.991859,40.737335,-73.991318,40.735931,397.0,0.11,9.0,9.0,15.0,15.0,7.0,7.0,48.0,874.96538
50%,1.0,-73.981758,40.75407,-73.979759,40.754509,663.0,0.18,14.0,14.0,30.0,30.0,13.0,13.0,92.0,874.96538
75%,2.0,-73.967361,40.768314,-73.963036,40.769741,1075.0,0.3,19.0,19.0,45.0,45.0,20.0,20.0,136.0,874.96538
max,9.0,-65.897385,51.881084,-65.897385,43.921028,1939736.0,538.82,23.0,23.0,59.0,59.0,53.0,53.0,182.0,11186.791798


In [23]:
# Outlier treatement for trip duration

Q1 = data.trip_duration_in_hrs.quantile(0.25)
Q3 = data.trip_duration_in_hrs.quantile(0.75)
IQR = Q3 - Q1
whis_low = abs(Q1 - 1.5*IQR)
whis_high =1.5*IQR +Q3

data.trip_duration_in_hrs.loc[data.trip_duration_in_hrs < whis_low] = whis_low
data.trip_duration_in_hrs.loc[data.trip_duration_in_hrs > whis_high] = whis_high

data.trip_duration_in_hrs.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.trip_duration_in_hrs.loc[data.trip_duration_in_hrs < whis_low] = whis_low
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.trip_duration_in_hrs.loc[data.trip_duration_in_hrs > whis_high] = whis_high


count    729322.000000
mean          0.255783
std           0.119874
min           0.175000
25%           0.175000
50%           0.180000
75%           0.300000
max           0.585000
Name: trip_duration_in_hrs, dtype: float64

#### 4) Scaling

In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [25]:
numerical_columns = data.select_dtypes(include=['int64', 'float64'])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_columns)
scaled_data = pd.DataFrame(scaled_data, columns=numerical_columns.columns)
scaled_data.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,trip_duration_in_hrs,pickup_hour,dropoff_hour,pickup_minute,dropoff_minute,pickup_week,dropoff_week,day_of_year,distance
0,-0.504444,0.280911,0.832127,0.137198,0.538014,-0.142893,-0.6739,0.373006,0.369057,0.600161,0.997051,-0.565981,-0.56607,-0.618411,-0.321851
1,0.257493,-0.212156,-0.570815,-0.3065,-1.577382,0.038237,0.452285,1.466269,1.448244,0.311683,1.341481,-0.449137,-0.449185,-0.405049,-0.397246
2,0.257493,-0.34122,-0.876953,0.364913,0.642175,0.176672,1.62018,0.529187,0.677396,1.696374,-0.208455,-0.79967,-0.79984,-0.773584,-0.160969
3,3.30524,0.169785,0.26198,0.23916,0.800639,0.048846,0.535706,-0.720257,-0.55596,0.830942,-1.528771,-1.500738,-1.50115,-1.68522,-0.160969
4,-0.504444,-0.62516,-1.2636,-0.212103,-0.309245,-0.02697,-0.131663,-1.188799,-1.172638,0.715551,1.513696,-0.79967,-0.79984,-0.85117,-0.160969


In [26]:
x = scaled_data.drop(['trip_duration','trip_duration_in_hrs'],axis = 1)
y = data.trip_duration_in_hrs
x.shape,y.shape

((729322, 13), (729322,))

In [27]:
x.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'pickup_hour', 'dropoff_hour',
       'pickup_minute', 'dropoff_minute', 'pickup_week', 'dropoff_week',
       'day_of_year', 'distance'],
      dtype='object')

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [29]:
# normal split
train_x,test_x,train_y,test_y = train_test_split(x,y,random_state = 101)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((546991, 13), (546991,), (182331, 13), (182331,))

#####  5)  Model

In [30]:
# Ridge linear reression model
from sklearn.linear_model import Ridge
alpha = 1.0  
ridge_model = Ridge(alpha=alpha)

In [31]:
ridge_model.fit(train_x, train_y)
pred_test = ridge_model.predict(test_x)

In [32]:
# r2 score
r2_score(test_y, pred_test).round(2)

0.38

In [33]:
# Decision tree regression model
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor (random_state = 10)

In [34]:
dt.fit(train_x,train_y)

In [35]:
# r2 score
r2_score(dt.predict(test_x),test_y).round(2)

0.54

####  Overall decision tree is a better model for this problem