In [None]:
#import library
import zipfile
import warnings
import datetime as dt
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime as dt
import calendar
from math import  atan2, radians, sin, cos, sqrt
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
#import input file and create dataframe

# NOTE(review): absolute, machine-specific location. It is kept as a plain string with a
# trailing separator because later cells build paths with `file_path + '<name>'`.
file_path ='C:\\Users\\pgulhare\\OneDrive - Capgemini\\ds code\\nyc-taxi-trip-duration\\'
f_zip_train = file_path + 'train.zip'
# Context managers close both the archive and the member stream once the CSV is parsed.
with zipfile.ZipFile(f_zip_train) as zip_train:
    with zip_train.open('train.csv') as train_csv:
        df_train = pd.read_csv(train_csv)

In [None]:
# Preview the first five rows of the raw training frame.
df_train.head(n=5)

In [None]:
# Parse the pickup and dropoff timestamp columns into pandas datetimes,
# using the explicit 'YYYY-MM-DD HH:MM:SS' layout.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df_train[ts_col] = pd.to_datetime(df_train[ts_col], format='%Y-%m-%d %H:%M:%S')

In [None]:
#check for null values
# isnull() gives a boolean frame (True = missing); summing it counts the
# missing cells in each column.
df_train.isnull().sum()

In [None]:
#check the time span covered by the data: earliest pickup and latest dropoff
# Series.min()/max() are vectorised; the builtin min()/max() would walk the
# column one element at a time in Python.
print('Minimum datetime is ', df_train['pickup_datetime'].min())
print('Maximum datetime is ', df_train['dropoff_datetime'].max())

# Feature Extraction

In [None]:
#extract calendar features from the pickup timestamp
#helpful in doing EDA and understanding the data

# Use the vectorised .dt accessor throughout instead of a per-element apply().
pickup_ts = df_train['pickup_datetime'].dt
df_train['month'] = pickup_ts.month
df_train['date'] = pickup_ts.date
df_train['weekday'] = pickup_ts.weekday  # 0 = Monday ... 6 = Sunday
df_train['day'] = pickup_ts.day
df_train['hour'] = pickup_ts.hour

In [None]:
df_train.head()

In [None]:
#using the latitude, longitude, calculate the trip distance

def cal_distance(row, radius=6373.0):
    """Return the haversine (great-circle) distance between pickup and dropoff.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude', holding coordinates in
        degrees. A single record (e.g. one DataFrame row) yields a scalar; a
        whole DataFrame, or a dict of equally shaped arrays, yields one
        distance per record because every step is an element-wise NumPy call.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6373.0, the value this notebook has always used.
        NOTE(review): the commonly quoted mean Earth radius is ~6371 km --
        confirm whether 6373 is intentional.

    Returns
    -------
    float or array-like
        ``radius`` multiplied by the central angle (radians) between the points.
    """
    pickup_latitude = np.radians(row['pickup_latitude'])
    pickup_longitude = np.radians(row['pickup_longitude'])
    dropoff_latitude = np.radians(row['dropoff_latitude'])
    dropoff_longitude = np.radians(row['dropoff_longitude'])

    lon = dropoff_longitude - pickup_longitude
    lat = dropoff_latitude - pickup_latitude
    a = np.sin(lat / 2) ** 2 + np.cos(pickup_latitude) * np.cos(dropoff_latitude) * np.sin(lon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) invalid; clamp before taking roots.
    a = np.clip(a, 0.0, 1.0)
    dist = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * dist
# Row-wise application: each record's pickup/dropoff coordinates become one distance value.
df_train['trip_distance'] = df_train.apply(cal_distance, axis=1)

In [None]:
# Preview the frame now that the calendar and distance columns exist.
df_train.head(n=5)

# Exploratory Data Analysis (EDA)

In [None]:
#distance outlier identification: box plot of the computed trip distance
df_train.boxplot(column=['trip_distance'])

In [None]:
#visualise trip-duration outliers: durations sorted ascending, plotted against their rank
sorted_durations = np.sort(df_train['trip_duration'])
plt.scatter(range(len(sorted_durations)), sorted_durations)

In [None]:
#visualise trip count per day of week (weekday: 0 = Monday ... 6 = Sunday)
# Pass the column by keyword: current seaborn releases no longer accept the data
# vector as a positional `x` argument.
sns.countplot(data=df_train, x='weekday')
#observation: Thursday, Friday and Saturday have the highest number of trips

In [None]:
# Trip count per pickup hour; `x` is passed by keyword because current seaborn
# releases no longer accept the data vector positionally.
sns.countplot(data=df_train, x='hour')

In [None]:
# Trip count per day of month; `x` is passed by keyword because current seaborn
# releases no longer accept the data vector positionally.
sns.countplot(data=df_train, x='day')

In [None]:
# Trip count per month; `x` is passed by keyword because current seaborn
# releases no longer accept the data vector positionally.
sns.countplot(data=df_train, x='month')

In [None]:
# Mean trip duration for each passenger count.
pc = df_train.groupby('passenger_count')['trip_duration'].mean()
# x/y must be keywords: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x=pc.index, y=pc.values)

In [None]:
# Count trips per exact pickup coordinate and render the counts as a heat map.
pickup = (
    df_train.groupby(['pickup_latitude', 'pickup_longitude'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'Num_Trips'})
)
pickup['Num_Trips'] = pickup['Num_Trips'].astype('float64')
pickup_map = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# One (latitude, longitude, weight) triple per distinct pickup point.
heat_points = list(
    zip(
        pickup['pickup_latitude'].values,
        pickup['pickup_longitude'].values,
        pickup['Num_Trips'].values,
    )
)
hm_wide = HeatMap(heat_points, min_opacity=0.2, radius=5, blur=15, max_zoom=1)
pickup_map.add_child(hm_wide)

In [None]:
#checking the correlation
# Correlate numeric columns only: the frame still holds datetime and object columns
# at this point (e.g. 'pickup_datetime', 'date'), which pandas >= 2.0 refuses to
# correlate instead of silently skipping them.
cor = df_train.select_dtypes(include='number').corr()
# Explicit boolean mask (True = hidden): hide the strict upper triangle so each pair
# is shown once, together with the diagonal.
mask = np.triu(np.ones(cor.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(cor, mask=mask, square=True, annot=True, ax=ax)

In [None]:
#encoding the feature columns
# One-hot encode both categorical columns in a single call. The encoded source
# columns are removed and the indicator columns are appended at the end, the
# 'store_*' group first and the 'vendor_*' group second.
df_train = pd.get_dummies(
    df_train,
    columns=['store_and_fwd_flag', 'vendor_id'],
    prefix=['store', 'vendor'],
)

In [None]:
#drop identifier, raw coordinate, timestamp and EDA-only calendar columns
unused_columns = [
    'id',
    'dropoff_latitude', 'pickup_latitude',
    'dropoff_longitude', 'pickup_longitude',
    'pickup_datetime', 'dropoff_datetime',
    'month', 'day', 'date', 'weekday',
]
df_train = df_train.drop(columns=unused_columns)

In [None]:
# Column names, dtypes and non-null counts of the frame that goes into modelling.
df_train.info(verbose=None)

In [None]:
#create X and y
# y: independent copy of the target as an (n, 1) column vector; X: every other column.
y = df_train["trip_duration"].to_numpy(copy=True).reshape(-1, 1)
X = df_train.drop(columns=["trip_duration"])


In [None]:
# Preview the feature matrix.
X.head(n=5)

In [None]:
#hold out 30% of the training data for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Only a cell's last bare expression is rendered, so show both previews explicitly.
display(X_train.head(), X_val.head())

In [None]:
#train an ordinary least-squares linear regression on the training split
lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [None]:
#validation: predictions of the linear model on both the training and validation splits
y_train_pred, y_val_pred = (lr_model.predict(features) for features in (X_train, X_val))

In [None]:
#calculate mean absolute error on the training and validation splits
mae, val_mae = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)
print(f"mae: {mae}, val_mae: {val_mae}")

In [None]:
#using Linear Regression SGD
# random_state pins the shuffling so repeated runs give the same fit
# (same seed as the train/validation split).
# NOTE(review): the features are not standardised before SGD; scikit-learn
# recommends scaling for SGD estimators -- consider adding a scaler.
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

In [None]:
# Preview the validation features.
X_val.head(n=5)

In [None]:
#fit and validate the SGD model

# SGDRegressor expects a 1-D target; y_train is an (n, 1) column vector, so flatten it.
sgd_model.fit(X_train, y_train.ravel())
y_train_pred = sgd_model.predict(X_train)
y_val_pred = sgd_model.predict(X_val)

In [None]:
#calculate mean absolute error of the SGD model on both splits
val_mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
print(f"mae: {mae}, val_mae: {val_mae}")

# Prediction 

In [None]:
#input file

f_zip_test = file_path + 'test.zip'
# Context managers close both the archive and the member stream once the CSV is parsed.
with zipfile.ZipFile(f_zip_test) as zip_test:
    with zip_test.open('test.csv') as test_csv:
        df_test = pd.read_csv(test_csv)

# Keep the ids aside: the 'id' column is dropped from the feature frame before prediction.
df_predict = df_test[['id']].copy()

In [None]:
# Preview the raw test frame.
df_test.head(n=5)

In [None]:
#data manipulation: repeat the training-time feature engineering on the test set

df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'], format = '%Y-%m-%d %H:%M:%S')
# Vectorised accessor instead of a per-element apply().
df_test['hour'] = df_test['pickup_datetime'].dt.hour

df_test['trip_distance'] = df_test.apply(cal_distance, axis = 1)

#encoding the feature columns
df_test = pd.concat([df_test, pd.get_dummies(df_test['store_and_fwd_flag'],prefix = 'store')], axis=1)
df_test = df_test.drop(columns=['store_and_fwd_flag'])

df_test = pd.concat([df_test, pd.get_dummies(df_test['vendor_id'],prefix = 'vendor')], axis=1)
df_test = df_test.drop(columns=['vendor_id'])
df_test = df_test.drop(columns=['id','dropoff_latitude', 'pickup_latitude', 'dropoff_longitude', 'pickup_longitude','pickup_datetime'])

# Align with the training feature matrix: same columns in the same order. An indicator
# column whose category never occurs in the test data is added as all zeros.
df_test = df_test.reindex(columns=X.columns, fill_value=0)


In [None]:
#prediction on test data
# Predict from the feature columns only, so re-running this cell does not feed the
# previously written prediction column back into the model.
feature_cols = [col for col in df_test.columns if col != 'trip_duration_in_sec']
# The model was fitted on a column-vector target, so predict() can return shape (n, 1);
# ravel() flattens it to one value per row before it is stored.
df_test['trip_duration_in_sec'] = lr_model.predict(df_test[feature_cols]).ravel().astype(int)

In [None]:
# Attach the predictions to the ids (aligned on the row index). Taking just the one
# column, rather than concatenating the whole feature frame, keeps the cell safe to
# re-run without producing duplicate columns.
df_predict = df_predict.assign(trip_duration_in_sec=df_test['trip_duration_in_sec'])
df_predict = df_predict[['id', 'trip_duration_in_sec']]

In [None]:
# Date-stamped output file (NYC_Taxi_time_pred<YYYYMMDD>.csv) written next to the input data.
NYC_Taxi_time_pred_path = file_path + 'NYC_Taxi_time_pred' + dt.datetime.now().strftime("%Y%m%d") + '.csv'
# index=False: the row index would otherwise be written as an extra unnamed first column.
df_predict.to_csv(NYC_Taxi_time_pred_path, index=False)
# Preview last: a bare expression is only rendered when it is the final line of the cell.
df_predict.head()