# NYC Taxi Fare Prediction

In [14]:
import os
os.environ["MODIN_ENGINE"] = "ray"
import numpy as np
import modin.pandas as pd
from scipy import stats
import geopy.distance
from scipy.stats import norm, skew
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import xgboost as xgb

In [15]:
df = pd.read_csv('../data/train.csv')
df = df.sample(frac=0.1)
test_df = pd.read_csv('../data/test.csv')
test_df = df.sample(frac=0.1)
df.shape

(55423856, 8)

In [3]:
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2014-11-29 02:25:34.0000003,8.0,2014-11-29 02:25:34 UTC,-73.997652,40.720614,-73.997613,40.739611,1
1,2009-03-18 20:31:12.0000003,15.0,2009-03-18 20:31:12 UTC,-74.00514,40.73344,-73.979652,40.787983,1
2,2011-11-17 18:28:20.0000005,11.7,2011-11-17 18:28:20 UTC,-73.996736,40.763419,-73.960357,40.769577,1
3,2014-02-06 11:12:00.000000144,6.0,2014-02-06 11:12:00 UTC,-73.980562,40.755187,-73.98787,40.743105,1
4,2015-02-11 09:17:31.0000001,13.0,2015-02-11 09:17:31 UTC,-73.969574,40.785194,-73.982025,40.754986,2


In [4]:
df.dropna(subset=['dropoff_latitude', 'dropoff_longitude'], inplace = True)

In [5]:
df.drop(df[df['fare_amount'] < 2.5].index, axis=0, inplace = True)
df.drop(df[df['fare_amount'] > 500].index, axis=0, inplace = True)
df.drop(df[df['pickup_longitude'] == 0].index, axis=0, inplace = True)
df.drop(df[df['pickup_latitude'] == 0].index, axis=0, inplace = True)
df.drop(df[df['dropoff_longitude'] == 0].index, axis=0, inplace = True)
df.drop(df[df['dropoff_latitude'] == 0].index, axis=0, inplace = True)
df.drop(df[df['passenger_count'] == 208].index, axis=0, inplace = True)
df.drop(df[df['passenger_count'] > 5].index, axis=0, inplace = True)
df.drop(df[df['passenger_count'] == 0].index, axis=0, inplace = True)

In [6]:
df['key'] = pd.to_datetime(df['key'])
key = test_df.key
test_df['key'] = pd.to_datetime(test_df['key'])
df['pickup_datetime']  = pd.to_datetime(df['pickup_datetime'])
test_df['pickup_datetime']  = pd.to_datetime(test_df['pickup_datetime'])

In [7]:
df['Year'] = df['pickup_datetime'].dt.year
df['Month'] = df['pickup_datetime'].dt.month
df['Date'] = df['pickup_datetime'].dt.day
df['Day of Week'] = df['pickup_datetime'].dt.dayofweek
df['Hour'] = df['pickup_datetime'].dt.hour
df.drop('pickup_datetime', axis = 1, inplace = True)
df.drop('key', axis = 1, inplace = True)

test_df['Year'] = test_df['pickup_datetime'].dt.year
test_df['Month'] = test_df['pickup_datetime'].dt.month
test_df['Date'] = test_df['pickup_datetime'].dt.day
test_df['Day of Week'] = test_df['pickup_datetime'].dt.dayofweek
test_df['Hour'] = test_df['pickup_datetime'].dt.hour
test_df.drop('pickup_datetime', axis = 1, inplace = True)
test_df.drop('key', axis = 1, inplace = True)

In [8]:
df.dropna(inplace=True)

df.drop(df.index[(df.pickup_longitude < -75) | 
           (df.pickup_longitude > -72) | 
           (df.pickup_latitude < 40) | 
           (df.pickup_latitude > 42)],inplace=True)
df.drop(df.index[(df.dropoff_longitude < -75) | 
           (df.dropoff_longitude > -72) | 
           (df.dropoff_latitude < 40) | 
           (df.dropoff_latitude > 42)],inplace=True)

In [9]:
def jfk_dist(trip):
    jfk_lat = 40.6413
    jfk_long = -73.7781
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    jfk_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (jfk_lat, jfk_long)).miles
    return jfk_distance

def lga_dist(trip):
    lga_lat = 40.7769
    lga_long = -73.8740
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    lga_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (lga_lat, lga_long)).miles
    return lga_distance

def ewr_dist(trip):
    ewr_lat = 40.6895
    ewr_long = -74.1745
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    ewr_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (ewr_lat, ewr_long)).miles
    return ewr_distance

def tsq_dist(trip):
    tsq_lat = 40.7580
    tsq_long = -73.9855
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    tsq_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (tsq_lat, tsq_long)).miles
    return tsq_distance

def cpk_dist(trip):
    cpk_lat = 40.7812
    cpk_long = -73.9665
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    cpk_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (cpk_lat, cpk_long)).miles
    return cpk_distance

def lib_dist(trip):
    lib_lat = 40.6892
    lib_long = -74.0445
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    lib_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (lib_lat, lib_long)).miles
    return lib_distance

def gct_dist(trip):
    gct_lat = 40.7527
    gct_long = -73.9772
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    gct_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (gct_lat, gct_long)).miles
    return gct_distance

def met_dist(trip):
    met_lat = 40.7794
    met_long = -73.9632
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    met_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (met_lat, met_long)).miles
    return met_distance

def wtc_dist(trip):
    wtc_lat = 40.7126
    wtc_long = -74.0099
    dropoff_lat = trip['dropoff_latitude']
    dropoff_long = trip['dropoff_longitude']
    wtc_distance = geopy.distance.geodesic((dropoff_lat, dropoff_long), (wtc_lat, wtc_long)).miles
    return wtc_distance

In [10]:
def optimize_floats(df):
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df


def optimize_ints(df):
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df

def optimize(df):
    return optimize_floats(optimize_ints(df))

In [11]:
df = optimize(df)
test_df = optimize(test_df)

To request implementation, send an email to feature_requests@modin.org.


In [12]:
def calc_dists(df):
    df['jfk'] = df.apply(lambda x: jfk_dist(x), axis = 1 )
    df['lga'] = df.apply(lambda x: lga_dist(x), axis = 1 )
    df['ewr'] = df.apply(lambda x: ewr_dist(x), axis = 1 )
    df['tsq'] = df.apply(lambda x: tsq_dist(x), axis = 1 )
    df['met'] = df.apply(lambda x: met_dist(x), axis = 1 )
    df['wtc'] = df.apply(lambda x: wtc_dist(x), axis = 1 )
    return df

In [13]:
df = calc_dists(df)
test_df = calc_dists(test_df)

2021-11-01 16:10:31,598	ERROR worker.py:79 -- Unhandled error (suppress with RAY_IGNORE_UNHANDLED_ERRORS=1): [36mray::apply_func()[39m (pid=340899, ip=10.1.0.7)
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: [36mray::deploy_ray_func()[39m (pid=340899, ip=10.1.0.7)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/ray/pandas_on_ray/frame/axis_partition.py", line 207, in deploy_ray_func
    result = func(*args)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/base/frame/axis_partition.py", line 303, in deploy_axis_func
    result = func(dataframe, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/base/frame/data.py", line 1153, in _map_reduce_func
    series_result = func(df, *args, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/backends/pandas/query_compiler

RayTaskError(ValueError): [36mray::apply_func()[39m (pid=340899, ip=10.1.0.7)
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: [36mray::deploy_ray_func()[39m (pid=340899, ip=10.1.0.7)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/ray/pandas_on_ray/frame/axis_partition.py", line 207, in deploy_ray_func
    result = func(*args)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/base/frame/axis_partition.py", line 303, in deploy_axis_func
    result = func(dataframe, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/engines/base/frame/data.py", line 1153, in _map_reduce_func
    series_result = func(df, *args, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/backends/pandas/query_compiler.py", line 2478, in <lambda>
    axis, lambda df: df.apply(func, axis=axis, *args, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/pandas/core/frame.py", line 8740, in apply
    return op.apply()
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/pandas/core/apply.py", line 688, in apply
    return self.apply_standard()
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/pandas/core/apply.py", line 812, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/pandas/core/apply.py", line 828, in apply_series_generator
    results[i] = self.f(v)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/modin/utils.py", line 500, in wrapper
    result = func(*args, **kwargs)
  File "/tmp/ipykernel_340742/3149247693.py", line 2, in <lambda>
  File "/tmp/ipykernel_340742/2034051946.py", line 6, in jfk_dist
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/distance.py", line 522, in __init__
    super().__init__(*args, **kwargs)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/distance.py", line 276, in __init__
    kilometers += self.measure(a, b)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/distance.py", line 538, in measure
    a, b = Point(a), Point(b)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/point.py", line 175, in __new__
    return cls.from_sequence(seq)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/point.py", line 472, in from_sequence
    return cls(*args)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/point.py", line 188, in __new__
    _normalize_coordinates(latitude, longitude, altitude)
  File "/home/nvidiatest/miniconda3/envs/mlops/lib/python3.8/site-packages/geopy/point.py", line 74, in _normalize_coordinates
    raise ValueError('Latitude must be in the [-90; 90] range.')
ValueError: Latitude must be in the [-90; 90] range.

In [None]:
# df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1,inplace=True)
# test_df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1,inplace=True)

In [None]:
df.describe()

In [None]:
df = optimize(df)
test_df = optimize(test_df)

In [None]:
X, y = df.drop('fare_amount', axis = 1), df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [None]:
%%timeit
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(test_df)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

"""xgb_params = {
    'min_child_weight': 1, 
    'learning_rate': 0.05, 
    'colsample_bytree': 0.7, 
    'max_depth': 10,
    'subsample': 0.7,
    'n_estimators': 5000,
    'n_jobs': -1, 
    'booster' : 'gbtree', 
    'eval_metric': 'rmse'}"""

xgb_params = {
    'n_jobs': -1, 
    'booster' : 'gbtree', 
    'eval_metric': 'rmse'}

#model = xgb.train(xgb_params, dtrain, 700, watchlist, early_stopping_rounds=100, maximize=False, verbose_eval=50)
model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=watchlist, early_stopping_rounds=10, verbose_eval=100)
print(model)

In [None]:
y_train_pred = model.predict(dtrain)
y_pred = model.predict(dvalid)
print('Train r2 score: ', r2_score(y_train_pred, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))
train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Train RMSE: {train_rmse:.4f}')
print(f'Test RMSE: {test_rmse:.4f}')

In [None]:
test_preds = model.predict(dtest)

In [None]:
test_preds = model.predict(dtest)

submission = pd.DataFrame(
    {'key': key, 'fare_amount': test_preds},
    columns = ['key', 'fare_amount'])
submission.to_csv('submission1.csv', index = False)