<a href="https://colab.research.google.com/github/rautaditya2606/ACME-Insurance-Premium-Predictor/blob/main/NYC_Taxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# !pip install opendatasets --quiet
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import matplotlib
%matplotlib inline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import random

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [7]:
database_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction'
od.download(database_url)

Skipping, found downloaded files in "./new-york-city-taxi-fare-prediction" (use force=True to force download)


In [8]:
data_dir = '/content/new-york-city-taxi-fare-prediction/'

In [9]:
!head {data_dir + '/train.csv'}

key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
2012-12-03 13:10:00.000000125,9,2012-12-03 13:10:00 UTC,-74.006462,40.7267

In [10]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')

In [11]:
selected_cols
dtypes = {
 'fare_amount': 'float32',
 'pickup_longitude':'float32',
 'pickup_latitude':'float32',
 'dropoff_longitude':'float32',
 'dropoff_latitude':'float32',
 'passenger_count': 'uint8'
}

In [12]:
def skip_rows(row_idx):
    if row_idx == 0:
        return False
    return random.random() > 0.01
df = pd.read_csv(
    data_dir + '/train.csv',
    usecols=selected_cols,
    dtype=dtypes,
    parse_dates=['pickup_datetime'],
    skiprows=skip_rows
)


In [13]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,5.7,2010-11-16 19:40:00+00:00,-74.000549,40.727493,-74.005035,40.719124,1
1,5.0,2015-02-01 17:24:59+00:00,-73.983711,40.749619,-73.992615,40.742867,1
2,5.5,2013-09-25 14:47:00+00:00,-74.003639,40.747478,-73.999893,40.740650,1
3,9.7,2012-03-29 18:39:00+00:00,-73.980110,40.751820,-73.950905,40.775948,1
4,7.3,2011-12-31 21:05:00+00:00,-73.988548,40.748764,-74.000816,40.718128,1
...,...,...,...,...,...,...,...
553688,7.7,2009-01-19 11:48:47+00:00,-73.995087,40.739735,-73.974739,40.755974,1
553689,6.0,2013-02-24 21:25:24+00:00,-73.993835,40.729855,-73.993988,40.720776,1
553690,4.5,2009-06-09 10:24:59+00:00,-73.983200,40.734810,-73.991753,40.726433,1
553691,7.0,2015-05-03 11:05:29+00:00,-73.960236,40.758030,-73.981026,40.763073,1


In [14]:
test_df = pd.read_csv(data_dir + '/test.csv', parse_dates=['pickup_datetime'])

In [15]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6


In [16]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,553693.0,553693.0,553693.0,553688.0,553688.0,553693.0
mean,11.339559,-72.51767,39.929035,-72.522392,39.909504,1.687074
std,9.767504,13.446193,8.455087,12.228749,9.802295,1.310125
min,-57.330002,-3337.128174,-2485.697266,-2256.421143,-3472.655518,0.0
25%,6.0,-73.992065,40.734928,-73.991379,40.734081,1.0
50%,8.5,-73.981758,40.752689,-73.980133,40.75317,1.0
75%,12.5,-73.967102,40.767105,-73.963737,40.768089,2.0
max,465.0,2553.027588,1703.092773,2832.669189,2023.273804,34.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553693 entries, 0 to 553692
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        553693 non-null  float32            
 1   pickup_datetime    553693 non-null  datetime64[ns, UTC]
 2   pickup_longitude   553693 non-null  float32            
 3   pickup_latitude    553693 non-null  float32            
 4   dropoff_longitude  553688 non-null  float32            
 5   dropoff_latitude   553688 non-null  float32            
 6   passenger_count    553693 non-null  uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 15.3 MB


In [18]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [19]:
train_df.shape, val_df.shape

((442954, 7), (110739, 7))

In [20]:
train_df = train_df.dropna()
val_df = val_df.dropna()

In [21]:
list(train_df.columns)

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [22]:
input_cols = [
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']
target_cols = 'fare_amount'

In [23]:
train_inputs = train_df[input_cols]
train_targets = train_df[target_cols]
val_inputs = val_df[input_cols]
val_targets = val_df[target_cols]

In [24]:
def add_dateparts(df, col):
  df[col+'_year'] = df[col].dt.year
  df[col+'_month'] = df[col].dt.month
  df[col+'_day'] = df[col].dt.day
  df[col+'_weekday'] = df[col].dt.weekday
  df[col+'_hour'] = df[col].dt.hour

In [25]:
add_dateparts(train_df, 'pickup_datetime')
add_dateparts(val_df, 'pickup_datetime')
add_dateparts(test_df, 'pickup_datetime')

In [26]:
train_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
121432,8.1,2010-09-08 09:46:06+00:00,-73.874496,40.774097,-73.985855,40.741676,1,2010,9,8,2,9
498657,11.3,2011-11-14 06:44:19+00:00,-73.926598,40.765701,-73.958199,40.768700,1,2011,11,14,0,6
286115,16.5,2013-02-06 21:16:30+00:00,-74.009521,40.705925,-73.962799,40.719734,1,2013,2,6,2,21
17908,11.0,2014-06-16 20:32:37+00:00,0.000000,0.000000,0.000000,0.000000,1,2014,6,16,0,20
18842,10.1,2012-06-30 19:28:00+00:00,-73.955223,40.764717,-73.988647,40.750027,1,2012,6,30,5,19
...,...,...,...,...,...,...,...,...,...,...,...,...
110268,6.1,2010-06-28 11:30:00+00:00,-73.789490,40.655788,-73.789001,40.654480,5,2010,6,28,0,11
259178,7.3,2011-03-22 09:00:23+00:00,-73.984894,40.742836,-73.977356,40.752766,1,2011,3,22,1,9
365838,7.0,2013-10-19 18:24:13+00:00,-73.977051,40.743366,-73.961365,40.758675,1,2013,10,19,5,18
131932,6.5,2009-09-09 23:22:14+00:00,-74.001251,40.737549,-73.992218,40.758480,1,2009,9,9,2,23


In [27]:
val_df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
53296,9.3,2011-05-17 20:39:35+00:00,-73.983658,40.749035,-73.994003,40.722694,1,2011,5,17,1,20
349746,7.7,2011-12-11 21:54:58+00:00,-73.976685,40.765263,-73.945694,40.773602,1,2011,12,11,6,21
125418,6.0,2013-05-15 21:40:00+00:00,-73.975082,40.750896,-73.985847,40.758083,1,2013,5,15,2,21
289289,9.7,2009-05-18 12:07:00+00:00,-73.986732,40.745678,-74.007828,40.740215,1,2009,5,18,0,12
371713,16.0,2014-06-01 12:48:38+00:00,-73.998405,40.716888,-73.985497,40.737564,1,2014,6,1,6,12
...,...,...,...,...,...,...,...,...,...,...,...,...
468086,5.5,2015-02-07 12:18:58+00:00,-73.982216,40.767849,-73.969048,40.761757,1,2015,2,7,5,12
386563,4.1,2010-07-08 14:32:41+00:00,-73.991562,40.747311,-73.998116,40.745304,1,2010,7,8,3,14
487538,10.0,2014-07-16 19:41:00+00:00,-74.006615,40.731117,-73.981941,40.737461,1,2014,7,16,2,19
448442,7.3,2012-01-01 01:52:11+00:00,-73.936394,40.814602,-73.957809,40.801144,4,2012,1,1,6,1


In [28]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1,2015,1,27,1,13
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,1,13
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1,2011,10,8,5,11
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1,2012,12,1,5,21
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,5,21
...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6,2015,5,10,6,12
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6,2015,1,12,0,17
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6,2015,4,19,6,20
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6,2015,1,31,5,1


In [54]:
def haversine_vectorized(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c
    return km

def add_trip_dist(df):
    df['trip_distance'] = haversine_vectorized(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude']
    )


In [55]:
add_trip_dist(train_df)
add_trip_dist(val_df)
add_trip_dist(test_df)

In [64]:
val_df.head(1)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
53296,9.3,2011-05-17 20:39:35+00:00,-73.983658,40.749035,-73.994003,40.722694,1,2011,5,17,1,20,3.055787,24.138298,13.447196,20.096348,3.841302,1.794937


In [65]:
train_df.head(1)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
121432,8.1,2010-09-08 09:46:06+00:00,-73.874496,40.774097,-73.985855,40.741676,1,2010,9,8,2,9,10.048539,23.308048,12.484447,21.037176,2.771911,2.818758


In [66]:
test_df.head(1)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,27,1,13,2.32326,22.829806,11.988963,21.532282,2.30245,3.307377


In [57]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

In [58]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
  lon, lat = landmark_lonlat
  df[landmark_name+'_drop_distance'] = haversine_vectorized(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])

In [59]:
def add_landmarks(a_df):
  landmarks = [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]
  for name, lonlat in landmarks:
    add_landmark_dropoff_distance(a_df, name, lonlat)

In [60]:
add_landmarks(train_df)
add_landmarks(val_df)
add_landmarks(test_df)

In [61]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) &
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) &
              (df['pickup_longitude'] <= -72) &
              (df['dropoff_longitude'] >= -75) &
              (df['dropoff_longitude'] <= -72) &
              (df['pickup_latitude'] >= 40) &
              (df['pickup_latitude'] <= 42) &
              (df['dropoff_latitude'] >=40) &
              (df['dropoff_latitude'] <= 42) &
              (df['passenger_count'] >= 1) &
              (df['passenger_count'] <= 6)]

In [68]:
remove_outliers(train_df)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
121432,8.1,2010-09-08 09:46:06+00:00,-73.874496,40.774097,-73.985855,40.741676,1,2010,9,8,2,9,10.048539,23.308048,12.484447,21.037176,2.771911,2.818758
498657,11.3,2011-11-14 06:44:19+00:00,-73.926598,40.765701,-73.958199,40.768700,1,2011,11,14,0,6,2.681721,20.409166,9.365546,24.173241,0.646507,6.001367
286115,16.5,2013-02-06 21:16:30+00:00,-74.009521,40.705925,-73.962799,40.719734,1,2013,2,6,2,21,4.226365,20.680344,10.029797,23.558235,1.833370,5.241961
18842,10.1,2012-06-30 19:28:00+00:00,-73.955223,40.764717,-73.988647,40.750027,1,2012,6,30,5,19,3.254616,23.651114,12.774714,20.748545,2.969262,2.627185
475058,10.5,2012-08-24 22:34:06+00:00,-73.985214,40.719093,-73.988693,40.748459,3,2012,8,24,4,22,3.278507,23.649561,12.783232,20.739050,2.989264,2.601744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,6.1,2010-06-28 11:30:00+00:00,-73.789490,40.655788,-73.789001,40.654480,5,2010,6,28,0,11,0.151242,1.279389,10.183492,42.879410,19.750950,24.628471
259178,7.3,2011-03-22 09:00:23+00:00,-73.984894,40.742836,-73.977356,40.752766,1,2011,3,22,1,9,1.273621,22.422098,11.516483,22.006498,1.773463,3.822734
365838,7.0,2013-10-19 18:24:13+00:00,-73.977051,40.743366,-73.961365,40.758675,1,2013,10,19,5,18,2.154794,20.698509,9.730742,23.793514,0.668580,5.578905
131932,6.5,2009-09-09 23:22:14+00:00,-74.001251,40.737549,-73.992218,40.758480,1,2009,9,9,2,23,2.448577,24.082129,13.157333,20.377920,3.289533,2.417360


In [69]:
remove_outliers(val_df)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
53296,9.3,2011-05-17 20:39:35+00:00,-73.983658,40.749035,-73.994003,40.722694,1,2011,5,17,1,20,3.055787,24.138298,13.447196,20.096348,3.841302,1.794937
349746,7.7,2011-12-11 21:54:58+00:00,-73.976685,40.765263,-73.945694,40.773602,1,2011,12,11,6,21,2.769913,19.078747,7.972175,25.571941,1.955452,7.381263
125418,6.0,2013-05-15 21:40:00+00:00,-73.975082,40.750896,-73.985847,40.758083,1,2013,5,15,2,21,1.208499,23.380011,12.450004,21.081688,2.601333,3.016427
289289,9.7,2009-05-18 12:07:00+00:00,-73.986732,40.745678,-74.007828,40.740215,1,2009,5,18,0,12,1.877899,25.726032,14.923263,18.597994,5.105231,0.876855
371713,16.0,2014-06-01 12:48:38+00:00,-73.998405,40.716888,-73.985497,40.737564,1,2014,6,1,6,12,2.543322,23.252350,12.456769,21.067459,2.791848,2.819468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468086,5.5,2015-02-07 12:18:58+00:00,-73.982216,40.767849,-73.969048,40.761757,1,2015,2,7,5,12,1.299586,21.556307,10.579240,22.951385,0.846314,4.786108
386563,4.1,2010-07-08 14:32:41+00:00,-73.991562,40.747311,-73.998116,40.745304,1,2010,7,8,3,14,0.595264,24.674047,13.834753,19.687281,4.020246,1.650013
487538,10.0,2014-07-16 19:41:00+00:00,-74.006615,40.731117,-73.981941,40.737461,1,2014,7,16,2,19,2.195705,22.859591,12.063345,21.462006,2.449008,3.201377
448442,7.3,2012-01-01 01:52:11+00:00,-73.936394,40.814602,-73.957809,40.801144,4,2012,1,1,6,1,2.342546,20.583920,9.348730,24.335146,0.897925,6.397895


In [71]:
train_df.to_parquet('train.parquet')

In [72]:
val_df.to_parquet('val.parquet')

In [73]:
test_df.to_parquet('test.parquet')