# Project: Building Basic predictive models over the NYC Taxi Trip dataset.


In [1]:
#importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from haversine import haversine, Unit
from scipy.stats import skew, kurtosis
sns.set() 
%matplotlib inline

In [2]:
# Importing the provided dataset from the data folder.
df = pd.read_csv('nyc_taxi_trip_duration Dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [4]:
df=df[df.trip_duration!=df.trip_duration.max()]
df=df[df.trip_duration!=df.trip_duration.min()]

In [5]:
df['trip_duration_hour']=df['trip_duration']/3600

In [6]:
df.passenger_count.value_counts()

1    517403
2    105096
5     38926
3     29692
6     24107
4     14050
0        32
7         1
9         1
Name: passenger_count, dtype: int64

In [7]:
df=df[df.passenger_count<=6]
df=df[df.passenger_count!=0]

In [8]:
# Changing the pickup_datetime and dropoff_datetime from object to datetime datatype
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime']=pd.to_datetime(df['dropoff_datetime'])

In [9]:
# Creating features based on month
df['pickup_by_month'] = df['pickup_datetime'].dt.month
df['dropoff_by_month'] = df['dropoff_datetime'].dt.month

In [10]:
# Creating features based on month
df['pickup_by_month'] = df['pickup_datetime'].dt.month
df['dropoff_by_month'] = df['dropoff_datetime'].dt.month

In [11]:
df['pickup_day']=df['pickup_datetime'].dt.day
df['dropoff_day']=df['dropoff_datetime'].dt.day
df['pickup_month']=df['pickup_datetime'].dt.month
df['dropoff_month']=df['dropoff_datetime'].dt.month

In [12]:
print(df[df.dropoff_month==7].pickup_datetime.dt.month.value_counts())
print(df[df.dropoff_month==7].pickup_datetime.dt.day.value_counts())

6    53
Name: pickup_datetime, dtype: int64
30    53
Name: pickup_datetime, dtype: int64


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729274 entries, 0 to 729321
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  729274 non-null  object        
 1   vendor_id           729274 non-null  int64         
 2   pickup_datetime     729274 non-null  datetime64[ns]
 3   dropoff_datetime    729274 non-null  datetime64[ns]
 4   passenger_count     729274 non-null  int64         
 5   pickup_longitude    729274 non-null  float64       
 6   pickup_latitude     729274 non-null  float64       
 7   dropoff_longitude   729274 non-null  float64       
 8   dropoff_latitude    729274 non-null  float64       
 9   store_and_fwd_flag  729274 non-null  object        
 10  trip_duration       729274 non-null  int64         
 11  trip_duration_hour  729274 non-null  float64       
 12  pickup_by_month     729274 non-null  int64         
 13  dropoff_by_month    729274 no

In [14]:
df['id'] = df['id'].str.replace('id', '')

In [15]:
df=df.drop(['pickup_datetime', 'dropoff_datetime'],axis=1)

In [16]:
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'N': 0,
                               'Y': 1})

In [17]:
df.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_duration_hour,pickup_by_month,dropoff_by_month,pickup_day,dropoff_day,pickup_month,dropoff_month
0,1080784,2,1,-73.953918,40.778873,-73.963875,40.771164,0,400,0.111111,2,2,29,29,2,2
1,889885,1,2,-73.988312,40.731743,-73.994751,40.694931,0,1100,0.305556,3,3,11,11,3,3
2,857912,2,2,-73.997314,40.721458,-73.948029,40.774918,0,1635,0.454167,2,2,21,21,2,2
3,3744273,2,6,-73.96167,40.75972,-73.956779,40.780628,0,1141,0.316944,1,1,5,5,1,1
4,232939,1,1,-74.01712,40.708469,-73.988182,40.740631,0,848,0.235556,2,2,17,17,2,2


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729274 entries, 0 to 729321
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  729274 non-null  object 
 1   vendor_id           729274 non-null  int64  
 2   passenger_count     729274 non-null  int64  
 3   pickup_longitude    729274 non-null  float64
 4   pickup_latitude     729274 non-null  float64
 5   dropoff_longitude   729274 non-null  float64
 6   dropoff_latitude    729274 non-null  float64
 7   store_and_fwd_flag  729274 non-null  int64  
 8   trip_duration       729274 non-null  int64  
 9   trip_duration_hour  729274 non-null  float64
 10  pickup_by_month     729274 non-null  int64  
 11  dropoff_by_month    729274 non-null  int64  
 12  pickup_day          729274 non-null  int64  
 13  dropoff_day         729274 non-null  int64  
 14  pickup_month        729274 non-null  int64  
 15  dropoff_month       729274 non-nul

In [19]:
enc = LabelEncoder()

In [20]:
cat_col = ['id']

In [21]:
# Encode categorical columns

encoder = LabelEncoder()

for col in cat_col:
    df[col] = encoder.fit_transform(df[col])

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729274 entries, 0 to 729321
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  729274 non-null  int32  
 1   vendor_id           729274 non-null  int64  
 2   passenger_count     729274 non-null  int64  
 3   pickup_longitude    729274 non-null  float64
 4   pickup_latitude     729274 non-null  float64
 5   dropoff_longitude   729274 non-null  float64
 6   dropoff_latitude    729274 non-null  float64
 7   store_and_fwd_flag  729274 non-null  int64  
 8   trip_duration       729274 non-null  int64  
 9   trip_duration_hour  729274 non-null  float64
 10  pickup_by_month     729274 non-null  int64  
 11  dropoff_by_month    729274 non-null  int64  
 12  pickup_day          729274 non-null  int64  
 13  dropoff_day         729274 non-null  int64  
 14  pickup_month        729274 non-null  int64  
 15  dropoff_month       729274 non-nul

# 1. Build a K-Nearest neighbours model for the given dataset and find the best value of K

In [23]:
#seperating independent and dependent variables
x1 = df.drop(['trip_duration_hour'], axis=1)
y1 = df['trip_duration_hour']

In [24]:
from sklearn.model_selection import train_test_split
train_x1,valid_x1,train_y1,valid_y1 = train_test_split(x1,y1, random_state = 101, shuffle=False)

In [25]:
from sklearn.neighbors import KNeighborsRegressor

In [26]:
model2 = KNeighborsRegressor(n_neighbors=5)
model2.fit(train_x1,train_y1)
pred2=model2.predict(valid_x1)
pred2[:10], model2.score(valid_x1, valid_y1)

(array([0.24738889, 0.19577778, 0.26261111, 0.057     , 0.2795    ,
        0.55238889, 0.07466667, 0.69755556, 0.06694444, 0.17805556]),
 0.9979063440475711)

# 2. Build a Linear model for the given dataset with regularisation.


In [27]:
#seperating independent and dependent variables
x2 = df.drop(['trip_duration_hour'], axis=1)
y2 = df['trip_duration_hour']

In [28]:
from sklearn.model_selection import train_test_split
train_x2,valid_x2,train_y2,valid_y2 = train_test_split(x2,y2, random_state = 101, shuffle=False)

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
model1 = LinearRegression()
model1.fit(train_x2,train_y2)
pred1=model1.predict(valid_x2)
pred1[:10], model1.score(valid_x2, valid_y2)

(array([0.24083333, 0.19444444, 0.26111111, 0.05333333, 0.28027778,
        0.56638889, 0.07111111, 0.69722222, 0.06722222, 0.18111111]),
 1.0)

# 3. Build a Random Forest model for the given dataset.


In [31]:
#seperating independent and dependent variables
x3 = df.drop(['trip_duration_hour'], axis=1)
y3 = df['trip_duration_hour']

In [32]:
from sklearn.model_selection import train_test_split
train_x3,test_x3,train_y3,test_y3 = train_test_split(x3,y3, random_state = 101, shuffle=False)

In [33]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(train_x3, train_y3)
y_pred = regressor.predict(test_x3)

In [34]:
regressor.score(train_x3, train_y3)

0.9999957595710197

# 4. Build a Gradient Boosting model for the given dataset.


In [35]:
#seperating independent and dependent variables
X4= df.drop(['trip_duration_hour'], axis=1)
y4 = df['trip_duration_hour']

In [36]:
from sklearn.model_selection import train_test_split

X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=0)

In [37]:
#Importing GBDT Regressor 
from sklearn.ensemble import GradientBoostingRegressor

In [38]:
#creating an Gradient boosting instance
regressor = GradientBoostingRegressor(random_state=96)

In [39]:
regressor.fit(X4_train, y4_train)

GradientBoostingRegressor(random_state=96)

In [40]:
regressor.score(X4_train, y4_train)

0.9999795372274976