In [1]:
# importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# loading the taxi trip records from the CSV file into a DataFrame
csv_path = 'TaxiTripDuration.csv'
data = pd.read_csv(csv_path)

In [3]:
# preview of the first five rows of the data
data.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [4]:
# columns in the dataset (the column labels of the frame as loaded from the CSV)
data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [5]:
# shape of the data as (number of rows, number of columns)
data.shape

(729322, 11)

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) of the numerical features in the data
data.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0
mean,1.535403,1.662055,-73.973513,40.750919,-73.973422,40.751775,952.2291
std,0.498745,1.312446,0.069754,0.033594,0.069588,0.036037,3864.626
min,1.0,0.0,-121.933342,34.712234,-121.933304,32.181141,1.0
25%,1.0,1.0,-73.991859,40.737335,-73.991318,40.735931,397.0
50%,2.0,1.0,-73.981758,40.75407,-73.979759,40.754509,663.0
75%,2.0,2.0,-73.967361,40.768314,-73.963036,40.769741,1075.0
max,2.0,9.0,-65.897385,51.881084,-65.897385,43.921028,1939736.0


In [7]:
# overview of the data type of each column
# (the two datetime columns are still plain object/string columns at this point)
data.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [8]:
# counting the missing values in each column of the dataset
data.isna().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [9]:
# inspecting the extremes of trip_duration: the five longest trips and the number of 1-second trips
longest_trips = data['trip_duration'].nlargest(5)
one_second_trips = data[data['trip_duration'] == 1]
print('Longest 5 trip duration: \n {} '.format(longest_trips))
print('\nThe number of rows with 1 as their trip duration values is {}'.format(len(one_second_trips)))

Longest 5 trip duration: 
 21813     1939736
259437      86391
119185      86387
177225      86378
496391      86377
Name: trip_duration, dtype: int64 

The number of rows with 1 as their trip duration values is 13


In [10]:
# There is 1 record with an extremely large value of 1939736 seconds and 13 records with a duration of 1 second each. We can drop these rows.

In [11]:
# dropping the outliers
# The exact durations found during exploration are filtered out: the single
# 1939736-second record (the maximum) and the 1-second records (the minimum).
# Filtering on fixed values instead of the *current* max()/min() keeps this
# cell idempotent: running it again does not strip the next-largest and
# next-smallest trips as well.
outlier_durations = [1939736, 1]
# .copy() makes the filtered frame independent of the original, so adding or
# overwriting columns on it later does not raise SettingWithCopyWarning.
data = data[~data['trip_duration'].isin(outlier_durations)].copy()

In [12]:
# converting the pickup and dropoff timestamp columns to datetime format
for datetime_col in ('pickup_datetime', 'dropoff_datetime'):
    data[datetime_col] = pd.to_datetime(data[datetime_col])

In [13]:
# deriving weekday-name and month features from the pickup and dropoff timestamps
# NOTE(review): the dropoff_* features come from the dropoff timestamp, which is
# only known once a trip has ended - confirm they are acceptable predictors of
# trip duration.
pickup_ts = data['pickup_datetime']
dropoff_ts = data['dropoff_datetime']

data['pickup_day'] = pickup_ts.dt.day_name()
data['dropoff_day'] = dropoff_ts.dt.day_name()
data['pickup_month'] = pickup_ts.dt.month
data['dropoff_month'] = dropoff_ts.dt.month

In [14]:
# columns of the data, now including the derived pickup/dropoff day and month features
data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_day', 'dropoff_day', 'pickup_month',
       'dropoff_month'],
      dtype='object')

In [15]:
# removing the identifier, vendor, raw timestamp and coordinate columns, which
# are treated here as not helpful for predicting the trip duration
# NOTE(review): the coordinates are dropped without deriving a distance
# feature from them - confirm that is intended.
unused_columns = [
    'id',
    'vendor_id',
    'pickup_datetime',
    'dropoff_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
data = data.drop(columns=unused_columns)

In [16]:
# one-hot encoding the remaining categorical (string) columns; numeric columns pass through unchanged
data = pd.get_dummies(data=data)

In [17]:
# separating dependent and independent variables
# X holds every column except the target; y is the target (trip_duration).
# The columns to drop are named with the `columns=` keyword: passing the axis
# positionally (`data.drop([...], 1)`) is deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 onwards.
X = data.drop(columns=['trip_duration'])
y = data['trip_duration']

In [18]:
# holding out 25% of the rows as a validation set; the fixed random_state keeps the split reproducible
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [19]:
# instantiating an ordinary least squares linear regression model (intercept fitted, the default)
lreg = LinearRegression(fit_intercept=True)

In [20]:
# fitting the linear regression model on the training set
lreg.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [21]:
# root mean squared error of the model's predictions on the training set
pred_train = lreg.predict(x_train)
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
rmse_train = np.sqrt(mse_train)
rmse_train

3107.261767998848

In [22]:
# root mean squared error of the model's predictions on the held-out validation set
pred_val = lreg.predict(x_valid)
mse_val = mean_squared_error(y_true=y_valid, y_pred=pred_val)
rmse = np.sqrt(mse_val)
rmse

3114.5711014235185