In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns

In [177]:
def haversine_distance(df, lat1, lat2, long1, long2):
    
    r = 6371 #average radius of earth in kilometers
    
    phi1 = np.radians(df[lat1])
    
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2] - df[lat1])
    
    delta_lambda = np.radians(df[long2] - df[long1])
    
    a = np.sin(delta_phi/2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    d = r * c
    
    return d
    
    
    

In [178]:
df = pd.read_csv(r'C:\Users\Asus\OneDrive\Desktop\nyc_taxi\Group5-project\train.csv')

In [179]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [180]:
len(df)

1458644

In [181]:
df.duplicated().sum()

0

In [182]:
df['dist_km'] = haversine_distance(df, 'pickup_latitude', 'dropoff_latitude', 'pickup_longitude', 'dropoff_longitude')

In [183]:
df[df['dist_km'] == 0]

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_km
246,id3487442,2,2016-02-29 18:39:12,2016-02-29 18:42:59,1,-73.981819,40.768963,-73.981819,40.768963,N,227,0.0
291,id0924324,2,2016-05-10 18:07:52,2016-05-10 18:26:21,2,-73.959068,40.775661,-73.959068,40.775661,N,1109,0.0
407,id1145809,2,2016-05-16 23:15:13,2016-05-16 23:31:00,6,-73.808487,40.687336,-73.808487,40.687336,N,947,0.0
702,id3499387,1,2016-01-25 19:45:12,2016-01-25 19:54:52,1,-73.786720,40.647041,-73.786720,40.647041,N,580,0.0
1620,id0753277,1,2016-01-27 22:29:31,2016-01-27 22:29:58,1,-74.209854,40.816853,-74.209854,40.816853,N,27,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1456399,id0618137,2,2016-02-17 08:57:04,2016-02-17 09:18:28,1,-73.978859,40.756721,-73.978859,40.756721,N,1284,0.0
1456627,id1048767,2,2016-02-06 22:01:40,2016-02-06 22:03:47,1,-73.963554,40.768517,-73.963554,40.768517,N,127,0.0
1456989,id2595295,2,2016-03-12 07:37:16,2016-03-12 07:46:42,1,-73.984848,40.728222,-73.984848,40.728222,N,566,0.0
1457114,id0051314,2,2016-04-09 20:34:21,2016-04-09 20:44:53,1,-73.965981,40.770988,-73.965981,40.770988,N,632,0.0


In [184]:
df = df.loc[df['dist_km'] != 0]


In [185]:
len(df)

1452747

In [186]:
#finding upper and lower bounds using IQR method - any data point which is beyond (lower limit - 1.5IQR) and (upper limit + 1.5IQR) will be termed as an outlier

#IQR = difference between the 75th quantile and the 25th quantile

In [187]:
def find_limits(data, variable, fold):
    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)
    lower_limit = df[variable].quantile(0.25) - (IQR * fold)
    upper_limit = df[variable].quantile(0.75) + (IQR * fold)
    return lower_limit, upper_limit

In [188]:
#CHECKING IF THERE ANY ROWS WITH 0 TRIP DURATION

In [189]:
df[df.trip_duration == 0]  # there arent any

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_km


In [190]:
#removing data points where we have 0 passenger count

In [191]:
len(df[df['passenger_count'] == 0])

52

In [192]:
df = df.loc[df['passenger_count'] != 0]

In [193]:
df[df.passenger_count == 0]

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist_km


In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1452695 entries, 0 to 1458643
Data columns (total 12 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1452695 non-null  object 
 1   vendor_id           1452695 non-null  int64  
 2   pickup_datetime     1452695 non-null  object 
 3   dropoff_datetime    1452695 non-null  object 
 4   passenger_count     1452695 non-null  int64  
 5   pickup_longitude    1452695 non-null  float64
 6   pickup_latitude     1452695 non-null  float64
 7   dropoff_longitude   1452695 non-null  float64
 8   dropoff_latitude    1452695 non-null  float64
 9   store_and_fwd_flag  1452695 non-null  object 
 10  trip_duration       1452695 non-null  int64  
 11  dist_km             1452695 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 144.1+ MB


In [195]:
df.pickup_latitude.min(), df.pickup_latitude.max()

(34.71223449707031, 51.88108444213867)

In [196]:
lower_limit_pickup_latitude, upper_limit_pickup_latitude = find_limits(df, 'pickup_latitude', 1.5)

In [197]:
lower_limit_pickup_latitude, upper_limit_pickup_latitude

(40.69091796875, 40.8148193359375)

In [198]:
df.pickup_longitude.min(), df.pickup_longitude.max()

(-121.93334197998048, -66.97216033935547)

In [199]:
lower_limit_pickup_longitude, upper_limit_pickup_longitude = find_limits(df, 'pickup_longitude', 1.5)

In [200]:
lower_limit_pickup_longitude, upper_limit_pickup_longitude

(-74.0285758972168, -73.93070602416992)

In [201]:
df.dropoff_latitude.min(), df.dropoff_latitude.max()

(32.1811408996582, 43.92102813720703)

In [202]:
lower_limit_drop_latitude, upper_limit_drop_latitude = find_limits(df, 'dropoff_latitude', 1.5)

In [203]:
lower_limit_drop_latitude, upper_limit_drop_latitude

(40.68507957458496, 40.82065391540527)

In [204]:
df.dropoff_longitude.min(), df.dropoff_longitude.max()

(-121.9333038330078, -69.04801940917969)

In [205]:
lower_limit_drop_longitude, upper_limit_drop_longitude = find_limits(df, 'dropoff_longitude', 1.5)

In [206]:
lower_limit_drop_longitude, upper_limit_drop_longitude

(-74.03371047973633, -73.92070388793945)

In [207]:
df.dist_km.min(), df.dist_km.max()

(0.0004241749810421957, 1240.9086766508526)

In [208]:
lower_limit_dist_km, upper_limit_dist_km = find_limits(df, 'dist_km', 1.5)

In [209]:
lower_limit_dist_km, upper_limit_dist_km

(-2.730769788637214, 7.859490350047735)

In [210]:
df.trip_duration.min(), df.trip_duration.max()

(1, 3526282)

In [211]:
lower_limit_trip_duration, upper_limit_trip_duration = find_limits(df, 'trip_duration', 1.5)

In [212]:
lower_limit_trip_duration, upper_limit_trip_duration

(-619.0, 2093.0)

In [213]:
df['trip_duration'].clip(lower=lower_limit_trip_duration, upper=upper_limit_trip_duration, inplace=True)

In [214]:
df['pickup_latitude'].clip(lower=lower_limit_pickup_latitude, upper=upper_limit_pickup_latitude, inplace=True)

In [215]:
df['pickup_longitude'].clip(lower=lower_limit_pickup_longitude, upper = upper_limit_pickup_longitude, inplace=True)

In [216]:
df['dropoff_latitude'].clip(lower = lower_limit_drop_latitude, upper = upper_limit_drop_latitude, inplace=True)

In [217]:
df['dropoff_longitude'].clip(lower=lower_limit_drop_longitude, upper = upper_limit_drop_longitude, inplace=True)

In [218]:
df['dist_km'].clip(lower=lower_limit_dist_km, upper = upper_limit_dist_km, inplace=True)

In [219]:
df = pd.concat([df, pd.get_dummies(df['store_and_fwd_flag'],dtype=int)], axis=1)
df.drop(['store_and_fwd_flag'], axis=1, inplace=True)
df = pd.concat([df, pd.get_dummies(df['vendor_id'],dtype=int)], axis=1)
df.drop(['vendor_id'], axis=1, inplace=True)


In [220]:
df.head()

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,dist_km,N,Y,1,2
0,id2875421,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,455,1.498521,1,0,0,1
1,id2377394,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,663,1.805507,1,0,1,0
2,id3858529,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,2093,6.385098,1,0,0,1
3,id3504673,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,429,1.485498,1,0,0,1
4,id2181028,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,435,1.188588,1,0,0,1


In [221]:
df['pickup_datetime'] = pd.to_datetime(df.pickup_datetime)
df['dropoff_datetime'] = pd.to_datetime(df.dropoff_datetime)
df['month'] = df.pickup_datetime.dt.month
df['week'] = df['pickup_datetime'].dt.isocalendar().week
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour
df['minute'] = df.pickup_datetime.dt.minute
df['minute_oftheday'] = df['hour'] * 60 + df['minute']
df.drop(['minute'], axis=1, inplace=True)

In [222]:
df.head()

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,dist_km,N,Y,1,2,month,week,weekday,hour,minute_oftheday
0,id2875421,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,455,1.498521,1,0,0,1,3,11,0,17,1044
1,id2377394,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,663,1.805507,1,0,1,0,6,23,6,0,43
2,id3858529,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,2093,6.385098,1,0,0,1,1,3,1,11,695
3,id3504673,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,429,1.485498,1,0,0,1,4,14,2,19,1172
4,id2181028,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,435,1.188588,1,0,0,1,3,12,5,13,810


In [223]:
y = df["trip_duration"]
df.drop(["trip_duration"], axis=1, inplace=True)
df.drop(['id'], axis=1, inplace=True)
X = df

In [224]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((973305, 17), (973305,), (479390, 17), (479390,))

In [171]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl (101.0 MB)
                                              0.0/101.0 MB ? eta -:--:--
                                             0.3/101.0 MB 10.6 MB/s eta 0:00:10
                                             1.5/101.0 MB 19.0 MB/s eta 0:00:06
     -                                       3.4/101.0 MB 24.4 MB/s eta 0:00:05
     --                                      5.5/101.0 MB 29.4 MB/s eta 0:00:04
     ---                                     7.8/101.0 MB 33.2 MB/s eta 0:00:03
     ---                                    10.0/101.0 MB 37.7 MB/s eta 0:00:03
     ----                                   13.0/101.0 MB 54.7 MB/s eta 0:00:02
     -----                                  15.7/101.0 MB 59.5 MB/s eta 0:00:02
     ------                                 18.1/101.0 MB 59.5 MB/s eta 0:00:02
     -------                                20.7/101.0 MB 59.5 MB/s eta 0:00:02
     --------                            

In [225]:
from catboost import CatBoostRegressor, CatBoostClassifier


In [226]:
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6)

# Fit the model
model.fit(X_train, y_train)

# Generate predictions
predictions = model.predict(X_test)

0:	learn: 495.4554156	total: 308ms	remaining: 30.5s
1:	learn: 466.1586086	total: 399ms	remaining: 19.6s
2:	learn: 440.6320434	total: 486ms	remaining: 15.7s
3:	learn: 418.3469310	total: 591ms	remaining: 14.2s
4:	learn: 399.3155136	total: 699ms	remaining: 13.3s
5:	learn: 382.7056972	total: 798ms	remaining: 12.5s
6:	learn: 368.5253686	total: 900ms	remaining: 12s
7:	learn: 356.4022155	total: 1s	remaining: 11.5s
8:	learn: 345.8804102	total: 1.12s	remaining: 11.3s
9:	learn: 337.0310708	total: 1.25s	remaining: 11.2s
10:	learn: 329.1192458	total: 1.33s	remaining: 10.8s
11:	learn: 322.6888077	total: 1.45s	remaining: 10.6s
12:	learn: 317.2662131	total: 1.5s	remaining: 10s
13:	learn: 312.4724635	total: 1.55s	remaining: 9.53s
14:	learn: 308.4578116	total: 1.61s	remaining: 9.13s
15:	learn: 304.9834715	total: 1.67s	remaining: 8.74s
16:	learn: 302.0336267	total: 1.72s	remaining: 8.4s
17:	learn: 299.4492723	total: 1.77s	remaining: 8.09s
18:	learn: 296.9610501	total: 1.83s	remaining: 7.82s
19:	learn: 2

In [227]:
predictions

array([ 960.56951428,  751.96748053,  626.86084334, ...,  343.28002371,
       1987.77499829,  601.30122637])

In [232]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error



In [229]:
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 67345.13598971754


In [230]:
rmse = np.sqrt(mean_squared_error(y_test, predictions))


In [231]:
print(rmse)

259.5094140676163


In [233]:
predictions = np.maximum(predictions, 0)
y_test = np.maximum(y_test, 0)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_test + 1, predictions + 1))

In [234]:
rmsle

0.40278754195757094