In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

plt.style.use('seaborn-whitegrid')

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Colab-only: mount Google Drive at /content/drive so the CSV files read
# in the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load only the first 5,000,000 rows of the training file.
train_df = pd.read_csv(
    "/content/drive/MyDrive/258/Assignment 5/train.csv",
    nrows=5_000_000,
)


In [4]:
# (rows, columns) of the loaded training sample.
train_df.shape

(5000000, 8)

In [5]:
# Summary statistics of the numeric columns; the recorded output shows
# negative fares, coordinates far outside valid ranges and missing drop-off values.
train_df.describe()


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5000000.0,5000000.0,5000000.0,4999964.0,4999964.0,5000000.0
mean,11.3408,-72.50678,39.91974,-72.50652,39.91725,1.684695
std,9.820175,12.8097,8.963509,12.84777,9.486767,1.331854
min,-100.0,-3426.609,-3488.08,-3412.653,-3488.08,0.0
25%,6.0,-73.99206,40.73491,-73.99139,40.73404,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75315,1.0
75%,12.5,-73.96711,40.76712,-73.96367,40.76811,2.0
max,1273.31,3439.426,3310.364,3457.622,3345.917,208.0


In [22]:
# Remove rows with a negative fare and rows with any missing value
# (the summary statistics show a minimum fare of -100 and fewer drop-off
# coordinates than rows). Report the size before and after.
print("old size: %d" % len(train_df))
train = train_df[train_df.fare_amount >= 0]
train = train.dropna(how='any', axis=0)
print("new size: %d" % len(train))

old size: 5000000
new size: 4999753


In [7]:
# Load the test file and preview it.
test = pd.read_csv(
    "/content/drive/MyDrive/258/Assignment 5/test.csv"
)
print("shape of test data", test.shape)
test.head()

shape of test data (9914, 7)


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [23]:
def select_boundary(df, Bound):
    """Return a boolean mask of trips that start and end inside a box.

    Parameters
    ----------
    df : DataFrame with pickup_/dropoff_ longitude and latitude columns.
    Bound : sequence ``(lon_min, lon_max, lat_min, lat_max)``; limits are inclusive.

    Returns
    -------
    Boolean Series aligned with ``df``; True where both endpoints lie in the box.
    """
    lon_min, lon_max, lat_min, lat_max = Bound[0], Bound[1], Bound[2], Bound[3]

    def within(values, low, high):
        # Inclusive range check, element-wise.
        return (values >= low) & (values <= high)

    lon_ok = within(df['pickup_longitude'], lon_min, lon_max) & \
        within(df['dropoff_longitude'], lon_min, lon_max)
    lat_ok = within(df['pickup_latitude'], lat_min, lat_max) & \
        within(df['dropoff_latitude'], lat_min, lat_max)
    return lon_ok & lat_ok

In [24]:
# Bounding box as (lon_min, lon_max, lat_min, lat_max).
# The previous box (-73.5, -71.8, 39.5, 40.8) excluded the bulk of the data
# (median pickup longitude is about -73.98) and contradicted the recorded
# output, which keeps 4,893,706 rows including points at longitude -74.04
# and latitude 40.87.
Bound = (-74.5, -72.8, 40.5, 41.8)
train = train[select_boundary(train, Bound)]
print('New size: %d' % len(train))

New size: 4893706


In [11]:
def prepare_time_features(df):
    """Parse ``pickup_datetime`` and add calendar feature columns in place.

    The function mutates and returns ``df``. It is safe to call more than
    once: if ``pickup_datetime`` has already been converted to a datetime
    dtype, the string parsing step is skipped instead of raising.

    Added columns: ``hour_of_day``, ``month``, ``year``, ``weekday``
    (Monday is 0).
    """
    if not pd.api.types.is_datetime64_any_dtype(df['pickup_datetime']):
        # Raw values look like '2015-01-27 13:08:24 UTC'. Keeping only the
        # first 16 characters ('YYYY-MM-DD HH:MM') drops seconds and the
        # zone suffix so one explicit format can be used for parsing.
        minute_text = df['pickup_datetime'].str.slice(0, 16)
        df['pickup_datetime'] = pd.to_datetime(minute_text, utc=True, format='%Y-%m-%d %H:%M')

    stamps = df['pickup_datetime'].dt
    df['hour_of_day'] = stamps.hour
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['weekday'] = stamps.weekday
    return df

In [26]:
def distTo(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in miles between two points.

    Arguments are latitudes/longitudes in degrees and may be scalars,
    NumPy arrays or pandas Series (element-wise, with broadcasting).

    The degree-to-radian and kilometre-to-mile factors are exact rather than
    the rounded 0.017 / 0.62, which distorted every distance by a few percent.
    """
    deg_to_rad = np.pi / 180
    earth_diameter_km = 12742
    km_to_miles = 0.621371

    # Haversine term: a = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2),
    # written with cosines via sin^2(x/2) = (1 - cos(x)) / 2.
    a = (0.5 - np.cos((lat2 - lat1) * deg_to_rad) / 2
         + np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)
         * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2)
    return km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(a))

In [13]:
# Add calendar features, then the pickup-to-dropoff great-circle distance.
# The distance helper in this notebook is `distTo`; the former call to
# `distance` referenced a name that is never defined and failed on a fresh kernel.
train = prepare_time_features(train)
test = prepare_time_features(test)
train['distance_miles'] = distTo(train.pickup_latitude, train.pickup_longitude,
                                 train.dropoff_latitude, train.dropoff_longitude)
test['distance_miles'] = distTo(test.pickup_latitude, test.pickup_longitude,
                                test.dropoff_latitude, test.dropoff_longitude)

In [27]:
# Add distance-to-airport features for both ends of every trip.
def transform(data):
    """Append six columns holding the distance from each trip endpoint to
    three fixed reference points (keys ``jfk``, ``ewr``, ``lgr``).

    Columns are named ``<pickup|dropoff>_to_<key>`` and are written into
    ``data`` in place; the same object is returned. Distances come from
    ``distTo`` (presumably miles - confirm against that helper).
    """
    # (key, longitude, latitude) of each reference point.
    airports = (
        ('jfk', -73.7781, 40.6413),
        ('ewr', -74.1745, 40.6895),
        ('lgr', -73.8740, 40.7769),
    )

    for code, airport_lon, airport_lat in airports:
        for endpoint in ('pickup', 'dropoff'):
            data['%s_to_%s' % (endpoint, code)] = distTo(
                airport_lat, airport_lon,
                data[endpoint + '_latitude'], data[endpoint + '_longitude'])

    return data

train, test = transform(train), transform(test)

In [16]:
# Show trips whose computed distance and fare are both exactly zero.
zero_distance = train['distance_miles'].eq(0)
zero_fare = train['fare_amount'].eq(0)
train[zero_distance & zero_fare]


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,pickup_distance_to_jfk,dropoff_distance_to_jfk,pickup_distance_to_ewr,dropoff_distance_to_ewr,pickup_distance_to_lgr,dropoff_distance_to_lgr
331597,2013-08-05 06:45:54.0000005,0.0,2013-08-05 06:45:00+00:00,-73.982354,40.679971,-73.982354,40.679971,1,6,8,2013,0,0.0,11.033947,11.033947,10.088806,10.088806,8.777184,8.777184
436658,2015-03-04 22:33:39.0000008,0.0,2015-03-04 22:33:00+00:00,-74.043442,40.788208,-74.043442,40.788208,1,22,3,2015,2,0.0,17.208424,17.208424,9.674078,9.674078,8.899057,8.899057
689250,2015-05-27 23:10:03.0000004,0.0,2015-05-27 23:10:00+00:00,-73.994125,40.741131,-73.994125,40.741131,1,23,5,2015,2,0.0,13.253658,13.253658,10.09737,10.09737,6.755154,6.755154
1662545,2010-03-13 02:39:29.0000001,0.0,2010-03-13 02:39:00+00:00,-74.035278,40.738683,-74.035278,40.738683,2,2,3,2010,5,0.0,15.060166,15.060166,8.044207,8.044207,8.844117,8.844117
2214498,2010-02-03 11:44:07.0000002,0.0,2010-02-03 11:44:00+00:00,-74.014147,40.710546,-74.014147,40.710546,1,11,2,2010,2,0.0,13.262164,13.262164,8.524551,8.524551,8.651064,8.651064
2329503,2015-02-19 09:43:11.0000007,0.0,2015-02-19 09:43:00+00:00,-73.953857,40.813251,-73.953857,40.813251,1,9,2,2015,3,0.0,15.028026,15.028026,14.369537,14.369537,4.874081,4.874081
2581417,2015-05-07 19:12:43.0000005,0.0,2015-05-07 19:12:00+00:00,-73.913727,40.779018,-73.913727,40.779018,1,19,5,2015,3,0.0,11.874358,11.874358,14.988477,14.988477,2.08368,2.08368
3070651,2015-01-25 20:54:02.0000004,0.0,2015-01-25 20:54:00+00:00,-73.981293,40.769012,-73.981293,40.769012,1,20,1,2015,6,0.0,13.825115,13.825115,11.511639,11.511639,5.640439,5.640439
3252539,2015-01-14 10:15:51.0000002,0.0,2015-01-14 10:15:00+00:00,-73.937508,40.758129,-73.937508,40.758129,1,10,1,2015,2,0.0,11.613939,11.613939,13.284777,13.284777,3.567391,3.567391
3712453,2010-03-29 12:47:22.0000005,0.0,2010-03-29 12:47:00+00:00,-73.836776,40.869879,-73.836776,40.869879,1,12,3,2010,0,0.0,16.089089,16.089089,21.622569,21.622569,6.712571,6.712571


In [19]:
from sklearn.model_selection import train_test_split

# `df_train` was never defined in the cells above. Build it here from `train`,
# dropping the string id and the timestamp, which are not numeric model inputs.
# That leaves the 16 feature columns shown in the recorded output.
# NOTE(review): the recorded shapes sum to 4,893,540 rows, 166 fewer than the
# 4,893,706 left after the bounding-box filter, so an extra row filter from a
# deleted cell may be missing - confirm.
df_train = train.drop(columns=['key', 'pickup_datetime'])

# The recorded shapes (3,914,832 / 978,708) are an exact 80/20 split, so the
# hold-out fraction is 0.2, not 0.3.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop('fare_amount', axis=1),
    df_train['fare_amount'],
    test_size=0.2,
    random_state=40,
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3914832, 16)
(978708, 16)
(3914832,)
(978708,)


In [20]:
# XGBoost booster settings for the fare regression.
params = {
    'max_depth': 7,
    'gamma': 0,
    'eta': .03,
    'subsample': 1,
    'colsample_bytree': 0.9,
    # 'reg:linear' is the deprecated alias of squared-error regression.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # 'silent' is deprecated in favour of 'verbosity'; 1 prints warnings only.
    'verbosity': 1
}

In [21]:
def XGBmodel(X_train, X_test, y_train, y_test, params,
             num_boost_round=500, early_stopping_rounds=10):
    """Train an XGBoost booster with early stopping on a held-out set.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the booster.
    X_test, y_test : held-out features and target; evaluated after every
        round under the name 'test' (hence 'test-rmse' in the training log).
    params : dict of booster parameters passed straight to ``xgb.train``.
    num_boost_round : maximum number of boosting rounds (default 500).
    early_stopping_rounds : stop when the held-out metric has not improved
        for this many consecutive rounds (default 10).

    Returns
    -------
    The trained booster returned by ``xgb.train``.
    NOTE(review): per the XGBoost docs this is the model from the last round,
    not necessarily the best one - confirm how predictions pick the iteration.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    return xgb.train(params=params,
                     dtrain=dtrain,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     evals=[(dvalid, 'test')])

model = XGBmodel(X_train,X_test,y_train,y_test,params)

[0]	test-rmse:14.125
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:13.7389
[2]	test-rmse:13.3895
[3]	test-rmse:13.0275
[4]	test-rmse:12.6785
[5]	test-rmse:12.3408
[6]	test-rmse:12.0147
[7]	test-rmse:11.6985
[8]	test-rmse:11.3925
[9]	test-rmse:11.0971
[10]	test-rmse:10.8283
[11]	test-rmse:10.5525
[12]	test-rmse:10.2869
[13]	test-rmse:10.028
[14]	test-rmse:9.77928
[15]	test-rmse:9.53946
[16]	test-rmse:9.30771
[17]	test-rmse:9.08474
[18]	test-rmse:8.86859
[19]	test-rmse:8.66108
[20]	test-rmse:8.46012
[21]	test-rmse:8.2664
[22]	test-rmse:8.09036
[23]	test-rmse:7.91179
[24]	test-rmse:7.73926
[25]	test-rmse:7.57358
[26]	test-rmse:7.41307
[27]	test-rmse:7.25895
[28]	test-rmse:7.11081
[29]	test-rmse:6.96905
[30]	test-rmse:6.83142
[31]	test-rmse:6.69941
[32]	test-rmse:6.5729
[33]	test-rmse:6.45134
[34]	test-rmse:6.33484
[35]	test-rmse:6.22878
[36]	test-rmse:6.12134
[37]	test-rmse:6.01789
[38]	test-rmse:5.91912
[39]	test-rmse:5.82416
[40]	test-rmse:5.73329
[41]	test-rmse