# NYC Taxi Fare Prediction using CPU

## 1. Importing necessary modules

In [1]:
import numpy as np
import modin.pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import geopy.distance
import time
import ray
# Reference timestamp for the run: the first stage duration and the overall
# total reported at the end of the notebook are both measured from here.
start = time.perf_counter()

## 2. Create Ray Execution Environment

In [2]:
# Start Ray explicitly so that its start-up cost is timed as its own stage.
# ignore_reinit_error=True lets this cell be re-run on a live kernel: a plain
# ray.init() raises when Ray has already been initialised in the process.
ray.init(ignore_reinit_error=True)
# Checkpoint: end of the "create execution environment" stage.
env_creation_time = time.perf_counter()

2021-11-08 21:58:42,690	INFO services.py:1250 -- View the Ray dashboard at http://127.0.0.1:8265


## 3. Reading Data

In [3]:
# Load the training set. Note that `pd` is modin.pandas in this notebook, not
# plain pandas, so the resulting frame is a Modin DataFrame.
df = pd.read_csv('../data/train.csv')
# Checkpoint: end of the read stage.
read_time = time.perf_counter()

## 4. Dtype Conversion

In [4]:
def dtype_conversion(df):
    """Cast the raw taxi columns to typed, compact representations.

    The columns are overwritten on ``df`` itself and the same object is
    returned, so callers may either rebind the result or keep using ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``key``, ``pickup_datetime``, ``fare_amount``,
        ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude``,
        ``dropoff_latitude`` and ``passenger_count``.

    Returns
    -------
    DataFrame
        ``df`` with ``key``/``pickup_datetime`` as timezone-naive datetimes
        (UTC clock), the fare and coordinates as ``float32`` and
        ``passenger_count`` as ``uint8``.
    """
    # A unit-less astype('datetime64') is rejected by pandas >= 2.0. Parsing
    # with utc=True puts naive and offset-bearing strings on the same UTC
    # clock; dropping the timezone afterwards yields plain datetime64 values.
    for column in ('key', 'pickup_datetime'):
        df[column] = pd.to_datetime(df[column], utc=True).dt.tz_localize(None)

    # Single precision halves the memory of the numeric feature columns.
    for column in ('fare_amount', 'pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude'):
        df[column] = df[column].astype('float32')

    # NOTE(review): uint8 cannot represent missing values — this cast assumes
    # passenger_count has no NaNs; confirm against the input data.
    df['passenger_count'] = df['passenger_count'].astype('uint8')
    return df

# Apply the casts to the training frame.
df = dtype_conversion(df)
# Checkpoint: end of the dtype-conversion stage.
dtype_conversion_time = time.perf_counter()

## 5. Applying Constraints for NYC

In [5]:
def apply_constraints(df):
    """Keep only the rows whose fare, passenger count and coordinates are in range.

    Every rule has the shape ``low (op) column < high``; the rules are joined
    with ``and`` into one expression and evaluated with ``DataFrame.query``.
    Returns the filtered frame.
    """
    # (column, lower-bound operator, lower bound, exclusive upper bound)
    limits = (
        ('fare_amount', '>=', 2.5, 500),
        ('passenger_count', '>', 0, 6),
        ('pickup_longitude', '>', -75, -73),
        ('dropoff_longitude', '>', -75, -73),
        ('pickup_latitude', '>', 40, 42),
        ('dropoff_latitude', '>', 40, 42),
    )
    clauses = [
        f'{name} {low_op} {low} and {name} < {high}'
        for name, low_op, low, high in limits
    ]
    return df.query(' and '.join(clauses))

# Drop the out-of-range rows from the training frame.
df = apply_constraints(df)
# Checkpoint: end of the constraint-filtering stage.
apply_constraints_time = time.perf_counter()



## 6. Feature Engineering

In [6]:
# Reference points for the dropoff-distance features, as
# (output column, latitude, longitude) in output-column order.
# NOTE(review): the names look like JFK/LGA/EWR airport codes plus three
# Manhattan landmarks — confirm the coordinates against an authoritative source.
NYC_LANDMARKS = (
    ('jfk', 40.6413, -73.7781),
    ('lga', 40.7769, -73.8740),
    ('ewr', 40.6895, -74.1745),
    ('tsq', 40.7580, -73.9855),
    ('met', 40.7794, -73.9632),
    ('wtc', 40.7126, -74.0099),
)


def feature_engg(df, landmarks=NYC_LANDMARKS):
    """Derive calendar and dropoff-distance features from a typed trip frame.

    The new columns are added to ``df`` itself; the returned frame is a copy
    of it without ``pickup_datetime`` and ``key``.

    Parameters
    ----------
    df : DataFrame
        Needs a datetime ``pickup_datetime`` column plus ``key``,
        ``dropoff_latitude`` and ``dropoff_longitude``.
    landmarks : iterable of (str, float, float), optional
        ``(column, latitude, longitude)`` triples. One column of geodesic
        distances in miles (dropoff -> landmark) is added per triple, in the
        given order. Defaults to ``NYC_LANDMARKS``.

    Returns
    -------
    DataFrame
        ``df`` with ``hour``, ``year``, ``month``, ``day``, ``weekday`` and
        one distance column per landmark, minus ``pickup_datetime``/``key``.
    """
    # Calendar parts of the pickup timestamp, one column per attribute.
    for part in ('hour', 'year', 'month', 'day', 'weekday'):
        df[part] = getattr(df['pickup_datetime'].dt, part)

    def distance_to(lat, lon):
        """Return a row function giving geodesic miles from the dropoff to (lat, lon)."""
        # Bind the target now so each row function keeps its own landmark even
        # if the engine runs it after the loop below has moved on.
        target = (lat, lon)

        def measure(trip):
            dropoff = (trip['dropoff_latitude'], trip['dropoff_longitude'])
            return geopy.distance.geodesic(dropoff, target).miles

        return measure

    # Row-wise apply: geodesic() works on one coordinate pair at a time.
    for column, lat, lon in landmarks:
        df[column] = df.apply(distance_to(lat, lon), axis=1)

    return df.drop(['pickup_datetime', 'key'], axis=1)

# Add the calendar and distance features to the training frame.
df = feature_engg(df)
# Checkpoint: end of the feature-engineering stage.
feature_engg_time = time.perf_counter()

## 7. Splitting Data

In [7]:
# Separate the target (fare_amount) from the feature columns.
X, y = df.drop('fare_amount', axis = 1), df['fare_amount']
# Hold out 10% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Checkpoint: end of the split stage.
split_data_time = time.perf_counter()

## 8. Training

In [8]:
# Wrap both splits in XGBoost's DMatrix container, labels attached.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
# Both sets are scored during training; the names appear in the progress log.
# Note that the 10% hold-out (X_test) doubles as the validation set here.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Booster configuration: squared-error regression with gradient-boosted trees,
# sampling 70% of rows and 70% of columns per tree, evaluated by RMSE.
xgb_params = {
    'min_child_weight': 1, 
    'learning_rate': 0.05, 
    'colsample_bytree': 0.7, 
    'subsample': 0.7,
    'booster' : 'gbtree',
    'objective': 'reg:squarederror',
    'n_jobs' : -1,
    'eval_metric': 'rmse'}

# Up to 1000 boosting rounds, logging every 100; early_stopping_rounds=100
# halts training if the watched metric stops improving for 100 rounds.
model = xgb.train(xgb_params, dtrain, 1000, watchlist, early_stopping_rounds=100, verbose_eval=100)
# Checkpoint: end of the training stage.
training_time = time.perf_counter()

[2m[36m(apply_func pid=1510486)[0m 
[0]	train-rmse:13.91973	valid-rmse:13.92266
[100]	train-rmse:4.38640	valid-rmse:4.43147
[200]	train-rmse:4.11563	valid-rmse:4.16624
[300]	train-rmse:3.99647	valid-rmse:4.04973
[400]	train-rmse:3.93042	valid-rmse:3.98586
[500]	train-rmse:3.88584	valid-rmse:3.94280
[600]	train-rmse:3.85965	valid-rmse:3.91793
[700]	train-rmse:3.84029	valid-rmse:3.89994
[800]	train-rmse:3.82041	valid-rmse:3.88115
[900]	train-rmse:3.80722	valid-rmse:3.86908
[999]	train-rmse:3.79582	valid-rmse:3.85911


## 9. Testing

In [9]:
def eval_metrics(actual, pred):
    """Score predictions against the true values.

    Returns a ``(rmse, mae, r2)`` tuple: the square root of the mean squared
    error, the mean absolute error and the R-squared score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def test(model, path='../data/test.csv'):
    """Evaluate a trained booster on a CSV of labelled trips.

    The file goes through the same pipeline as the training data
    (``dtype_conversion`` -> ``apply_constraints`` -> ``feature_engg``), the
    ``fare_amount`` column is split off as the ground truth, and the model's
    predictions are scored with ``eval_metrics``.

    Parameters
    ----------
    model : object
        Trained model exposing ``predict(DMatrix)``.
    path : str, optional
        CSV to evaluate on; must contain ``fare_amount``. Defaults to the
        notebook's test file.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``. The three values are also printed.
    """
    frame = pd.read_csv(path)
    frame = dtype_conversion(frame)
    frame = apply_constraints(frame)
    frame = feature_engg(frame)
    features, actual = frame.drop('fare_amount', axis = 1), frame['fare_amount']

    # Keep the DMatrix under its own name so `frame` stays a DataFrame.
    pred = model.predict(xgb.DMatrix(features))
    rmse, mae, r2 = eval_metrics(actual, pred)
    print("Root Mean Squared Error : ", rmse)
    print("Mean Absolute Error : ", mae)
    print("R-squared Score : ", r2)
    return rmse, mae, r2

# Score the trained model on the test CSV.
test(model)
# test_time and stop are taken back to back, so they mark effectively the same
# instant: the end of the testing stage and the end of the whole run.
test_time = time.perf_counter()
stop = time.perf_counter()

Root Mean Squared Error :  3.8050902
Mean Absolute Error :  1.7102177
R-squared Score :  0.8453670447259642


## 10. Time Taken

In [1]:
# Each entry is (label, checkpoint reached, checkpoint it is measured from);
# every stage is timed from the end of the previous one, the total from `start`.
stage_timings = [
    ("Creating Execution Environment : ", env_creation_time, start),
    ("Reading : ", read_time, env_creation_time),
    ("Data Type Conversion : ", dtype_conversion_time, read_time),
    ("Applying Constraints : ", apply_constraints_time, dtype_conversion_time),
    ("Feature Engineering : ", feature_engg_time, apply_constraints_time),
    ("Splitting Data : ", split_data_time, feature_engg_time),
    ("Training : ", training_time, split_data_time),
    ("Testing : ", test_time, training_time),
    ("Total : ", stop, start),
]
for label, finished, began in stage_timings:
    print(label, finished - began, "s")

Creating Execution Environment :  2.429901456926018s
Reading :  13.60694275307469s
Data Type Conversion :  0.42991262290161103s
Applying Constraints :  265.9359644301003s
Feature Engineering :  2613.31897444895s
Splitting Data :  2613.31897444895s
Training :  16987.63809621299s
Testing :  329.0208089299267s
Total :  20572.9191167939s
