# CMPE-258: Deep Learning
# New York Taxi Fare Prediction - XGBoost
# Spring 2021, Vijay Eranti
# Samer Baslan

Reference: https://github.com/haleyhfeng/cmpe258-deep_learning/blob/main/HW5/hw5.4_MLOps_XGBoost.ipynb

In [1]:
# Numerical and tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd

# Plotting libraries; %matplotlib inline renders figures in the cell output.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NOTE(review): the seaborn style names were renamed in newer matplotlib
# releases (e.g. 'seaborn-v0_8-whitegrid') -- confirm this name resolves in
# the target environment.
plt.style.use('seaborn-whitegrid')

# filterwarnings('ignore') silences every Python warning for the rest of the
# session, including pandas' SettingWithCopyWarning and library deprecation
# notices, so problems flagged by those warnings will not be visible below.
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
!pip install --upgrade --force-reinstall --no-deps kaggle
%env KAGGLE_USERNAME=samerb
%env KAGGLE_KEY=a527e9226f4a16c1953611848fe1a088
NUM_ROWS = 15000

!kaggle competitions download -c new-york-city-taxi-fare-prediction
!unzip *.zip

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/3a/e7/3bac01547d2ed3d308ac92a0878fbdb0ed0f3d41fb1906c319ccbba1bfbc/kaggle-1.5.12.tar.gz (58kB)
[K     |█████▋                          | 10kB 20.8MB/s eta 0:00:01[K     |███████████▏                    | 20kB 28.5MB/s eta 0:00:01[K     |████████████████▊               | 30kB 23.3MB/s eta 0:00:01[K     |██████████████████████▎         | 40kB 27.1MB/s eta 0:00:01[K     |███████████████████████████▉    | 51kB 26.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 7.3MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-cp37-none-any.whl size=73053 sha256=ddc1f579eab19102c179507b2399bbce4326962fd9809ec22a14d76246c8eb67
  Stored in directory: /root/.cache/pip/wheels/a1/6a/26/d30b7499ff85a4a4593377a87ecf55f7d08af42f0de9b60303
Successfully built kaggle
Installing collected packa

In [3]:
# Read only the first NUM_ROWS rows of the training CSV, then print a
# summary of its columns, dtypes and non-null counts.
train_df = pd.read_csv(filepath_or_buffer="train.csv", nrows=NUM_ROWS)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                15000 non-null  object 
 1   fare_amount        15000 non-null  float64
 2   pickup_datetime    15000 non-null  object 
 3   pickup_longitude   15000 non-null  float64
 4   pickup_latitude    15000 non-null  float64
 5   dropoff_longitude  15000 non-null  float64
 6   dropoff_latitude   15000 non-null  float64
 7   passenger_count    15000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 937.6+ KB


In [4]:
print("old size: %d" % len(train_df))
# Keep only rows with a non-negative fare. The explicit .copy() makes the
# result an independent frame, so the column assignments performed by later
# cells do not operate on a slice of the original (SettingWithCopyWarning).
train_df = train_df[train_df.fare_amount >= 0].copy()
print("New size: %d" % len(train_df))

old size: 15000
New size: 14997


## Feature Engineering Functions

In [5]:
def prepare_time_features(df):
    """Parse ``pickup_datetime`` and add calendar features to ``df``.

    The frame is modified in place and also returned. Four integer columns
    are added: ``hour_of_day``, ``month``, ``year`` and ``weekday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column, either as strings whose
        first 16 characters look like ``YYYY-MM-DD HH:MM`` or as an already
        parsed datetime column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    pickup = df['pickup_datetime']
    # Only parse when the column still holds raw strings. This keeps the
    # function safe to call again on an already-processed frame (e.g. when a
    # notebook cell is re-run); the .str accessor would otherwise raise.
    if not pd.api.types.is_datetime64_any_dtype(pickup):
        # Truncate to minute precision so one fixed format parses every row.
        pickup = pd.to_datetime(pickup.str.slice(0, 16), utc=True, format='%Y-%m-%d %H:%M')
    df['pickup_datetime'] = pickup

    df['hour_of_day'] = df.pickup_datetime.dt.hour
    df['month'] = df.pickup_datetime.dt.month
    df["year"] = df.pickup_datetime.dt.year
    df["weekday"] = df.pickup_datetime.dt.weekday

    return df

In [6]:
def distance(lat1, lon1, lat2, lon2):
    """Haversine distance between two (latitude, longitude) points.

    Coordinates are given in degrees. Arguments may be scalars or array-likes
    supported by NumPy; the arithmetic is element-wise. The result is scaled
    by 12742 (2*R, with R in kilometres) and then by 0.6213712 (km to miles).
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    lat_term = np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = (np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)
                * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2)
    haversine = 0.5 - lat_term + lon_term
    half_angle = np.arcsin(np.sqrt(haversine))
    return 0.6213712 * 12742 * half_angle  # 2*R*asin(sqrt(a)), in miles

In [7]:
def transform(data):
    """Add pickup/dropoff distances to three nearby airports.

    For each airport code (``jfk``, ``ewr``, ``lgr``) two columns are written
    to ``data`` in place: ``pickup_distance_to_<code>`` and
    ``dropoff_distance_to_<code>``, computed with ``distance`` from the
    frame's ``*_latitude`` / ``*_longitude`` columns. The same frame is
    returned.
    """
    # Airport coordinates stored as (longitude, latitude).
    airports = {
        'jfk': (-73.7781, 40.6413),
        'ewr': (-74.1745, 40.6895),
        'lgr': (-73.8740, 40.7769),
    }

    for code, (airport_lon, airport_lat) in airports.items():
        for endpoint in ('pickup', 'dropoff'):
            data[f'{endpoint}_distance_to_{code}'] = distance(
                airport_lat, airport_lon,
                data[f'{endpoint}_latitude'], data[f'{endpoint}_longitude'])

    return data

## Apply Feature Engineering

In [8]:
# 1) calendar features derived from the pickup timestamp
train_df = prepare_time_features(train_df)
# 2) trip length between pickup and dropoff points
train_df['distance_miles'] = distance(
    train_df['pickup_latitude'],
    train_df['pickup_longitude'],
    train_df['dropoff_latitude'],
    train_df['dropoff_longitude'],
)
# 3) distances from both trip endpoints to the airports
train_df = transform(train_df)

In [9]:
print(len(train_df))
# Remove rows where both the travelled distance and the fare are zero.
# The original cell only evaluated this filter and discarded the result, so
# no rows were ever removed and both printed sizes were always identical.
zero_trip_mask = (train_df['distance_miles'] == 0) & (train_df['fare_amount'] == 0)
train_df = train_df[~zero_trip_mask].copy()
print(len(train_df))

14997
14997


In [10]:
# Remove the row identifier and the raw timestamp column before modelling,
# then preview the remaining columns.
train_df = train_df.drop(['key', 'pickup_datetime'], axis=1)
train_df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,pickup_distance_to_jfk,dropoff_distance_to_jfk,pickup_distance_to_ewr,dropoff_distance_to_ewr,pickup_distance_to_lgr,dropoff_distance_to_lgr
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,17,6,2009,0,0.640487,6.527098,5.926672,17.433727,17.508086,4.14279,4.776039
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,16,1,2010,1,5.25067,13.373879,14.335733,8.435586,12.054959,8.708241,5.518847
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,0,8,2011,3,0.863411,13.550258,13.478114,11.198778,10.482997,5.79158,6.399678
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,4,4,2012,5,1.739386,12.657169,13.789559,10.266,10.687245,6.64828,6.287901
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,7,3,2010,1,1.242218,13.254398,13.577,12.092157,13.133498,4.961705,4.350323


In [11]:
# Separate the target (fare_amount) from the feature columns, then hold out
# 20% of the rows with a fixed seed so the split is reproducible.
features = train_df.drop(columns='fare_amount')
target = train_df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

# Report the shape of each of the four pieces.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(11997, 16)
(3000, 16)
(11997,)
(3000,)


## XGBoost Model

In [12]:
# Booster parameters passed to xgb.train.
# 'reg:linear' and 'silent' are deprecated XGBoost names; their replacements
# 'reg:squarederror' (same squared-error regression objective) and
# 'verbosity' are used instead.
params = {
    'max_depth': 8,
    'gamma': 0,
    'eta': 0.05,
    'subsample': 1,
    'colsample_bytree': 0.9,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'verbosity': 1,
}

In [13]:
def XGBmodel(X_train, X_test, y_train, y_test, params,
             num_boost_round=5000, early_stopping_rounds=10, verbose_eval=True):
    """Train an XGBoost booster with early stopping.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    X_test, y_test
        Evaluation features and labels. They form the only entry in
        ``evals`` (named ``'test'``), so this split drives early stopping.
        NOTE(review): because the stopping point is chosen on this split,
        metrics reported on it are optimistic; consider a separate
        validation split for early stopping.
    params : dict
        Booster parameters forwarded to ``xgb.train``.
    num_boost_round : int, default 5000
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int, default 10
        Training stops once the evaluation metric has not improved for this
        many consecutive rounds.
    verbose_eval : bool or int, default True
        Forwarded to ``xgb.train`` to control per-round metric logging.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    matrix_test = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(params=params,
                      dtrain=matrix_train,
                      num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds,
                      evals=[(matrix_test, 'test')],
                      verbose_eval=verbose_eval)
    return model

model = XGBmodel(X_train, X_test, y_train, y_test, params)

[0]	test-rmse:13.649
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:13.0679
[2]	test-rmse:12.552
[3]	test-rmse:12.0301
[4]	test-rmse:11.5406
[5]	test-rmse:11.0788
[6]	test-rmse:10.6472
[7]	test-rmse:10.2435
[8]	test-rmse:9.86466
[9]	test-rmse:9.50789
[10]	test-rmse:9.18857
[11]	test-rmse:8.87486
[12]	test-rmse:8.57942
[13]	test-rmse:8.30236
[14]	test-rmse:8.04573
[15]	test-rmse:7.80676
[16]	test-rmse:7.58384
[17]	test-rmse:7.36418
[18]	test-rmse:7.17167
[19]	test-rmse:6.995
[20]	test-rmse:6.82872
[21]	test-rmse:6.67804
[22]	test-rmse:6.54785
[23]	test-rmse:6.4045
[24]	test-rmse:6.28175
[25]	test-rmse:6.16855
[26]	test-rmse:6.06007
[27]	test-rmse:5.96602
[28]	test-rmse:5.86694
[29]	test-rmse:5.78704
[30]	test-rmse:5.71696
[31]	test-rmse:5.65165
[32]	test-rmse:5.57573
[33]	test-rmse:5.51747
[34]	test-rmse:5.45281
[35]	test-rmse:5.3936
[36]	test-rmse:5.34539
[37]	test-rmse:5.30154
[38]	test-rmse:5.26617
[39]	test-rmse:5.23251
[40]	test-rmse:5.20003
[41]	test-rmse:5