In [1]:
import xgboost
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
import seaborn as sns
%matplotlib inline

In [2]:
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

In [3]:
train_raw.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [4]:
train_raw.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
train_raw.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [6]:
train_raw.drop(['id', 'store_and_fwd_flag'], inplace=True, axis=1)

In [7]:
test_raw.drop(['id', 'store_and_fwd_flag'], inplace=True, axis=1)

In [8]:
test_raw.columns

Index(['vendor_id', 'pickup_datetime', 'passenger_count', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'],
      dtype='object')

In [9]:
train_raw.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'trip_duration'],
      dtype='object')

In [10]:
train_raw.pickup_datetime = pd.to_datetime(train_raw.pickup_datetime)
train_raw.dropoff_datetime = pd.to_datetime(train_raw.dropoff_datetime)

test_raw.pickup_datetime = pd.to_datetime(test_raw.pickup_datetime)

In [11]:
train_raw['pickup_hour'] = train_raw.pickup_datetime.dt.hour
train_raw['pickup_day'] = train_raw.pickup_datetime.dt.day
train_raw['pickup_week'] = train_raw.pickup_datetime.dt.week
train_raw['pickup_month'] = train_raw.pickup_datetime.dt.month

In [12]:
train_raw['dropoff_hour'] = train_raw.dropoff_datetime.dt.hour
train_raw['dropoff_day'] = train_raw.dropoff_datetime.dt.day
train_raw['dropoff_week'] = train_raw.dropoff_datetime.dt.week
train_raw['dropoff_month'] = train_raw.dropoff_datetime.dt.month

In [13]:
test_raw['pickup_hour'] = test_raw.pickup_datetime.dt.hour
test_raw['pickup_day'] = test_raw.pickup_datetime.dt.day
test_raw['pickup_week'] = test_raw.pickup_datetime.dt.week
test_raw['pickup_month'] = test_raw.pickup_datetime.dt.month

In [14]:
import math
def haversine(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a)) 
    km = 6371 * c
    return km

In [15]:
train_raw.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_hour,pickup_day,pickup_week,pickup_month,dropoff_hour,dropoff_day,dropoff_week,dropoff_month
0,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,455,17,14,11,3,17,14,11,3
1,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,663,0,12,23,6,0,12,23,6
2,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,2124,11,19,3,1,12,19,3,1
3,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,429,19,6,14,4,19,6,14,4
4,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,435,13,26,12,3,13,26,12,3


In [16]:

print (train_raw.pickup_datetime.min())
print (train_raw.pickup_datetime.max())

2016-01-01 00:00:17
2016-06-30 23:59:39


In [17]:
train_raw['dist'] = test_raw.apply(lambda x: haversine(
    x['pickup_longitude'], x['pickup_latitude'], x['dropoff_longitude'], x['dropoff_latitude']), axis=1)

In [18]:
test_raw['dist'] = test_raw.apply(lambda x: haversine(
    x['pickup_longitude'], x['pickup_latitude'], x['dropoff_longitude'], x['dropoff_latitude']), axis=1)

In [19]:
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()

holidays_train = cal.holidays(start=train_raw['pickup_datetime'].min(), end=train_raw['pickup_datetime'].max())
train_raw['is_holiday']  = train_raw['pickup_datetime'].isin(holidays_train).astype(int)

In [20]:
holidays_test = cal.holidays(start=test_raw['pickup_datetime'].min(), end=test_raw['pickup_datetime'].max())
test_raw['is_holiday']  = test_raw['pickup_datetime'].isin(holidays_test).astype(int)

In [21]:
train_raw['is_holiday']

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
1458614    0
1458615    0
1458616    0
1458617    0
1458618    0
1458619    0
1458620    0
1458621    0
1458622    0
1458623    0
1458624    0
1458625    0
1458626    0
1458627    0
1458628    0
1458629    0
1458630    0
1458631    0
1458632    0
1458633    0
1458634    0
1458635    0
1458636    0
1458637    0
1458638    0
1458639    0
1458640    0
1458641    0
1458642    0
1458643    0
Name: is_holiday, Length: 1458644, dtype: int64

In [22]:
train_raw.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'trip_duration', 'pickup_hour', 'pickup_day',
       'pickup_week', 'pickup_month', 'dropoff_hour', 'dropoff_day',
       'dropoff_week', 'dropoff_month', 'dist', 'is_holiday'],
      dtype='object')

In [23]:
train_feats = train_raw[['vendor_id', 'pickup_hour', 'pickup_day', 'pickup_week', 
'pickup_month', 'passenger_count', 'pickup_longitude', 
'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'dist', 'is_holiday']]
target = train_raw[['trip_duration']]

In [24]:
test_raw.drop('pickup_datetime', axis=1, inplace=True)
test_raw.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_hour,pickup_day,pickup_week,pickup_month,dist,is_holiday
0,1,1,-73.988129,40.732029,-73.990173,40.75668,23,30,26,6,2.746426,0
1,1,1,-73.964203,40.679993,-73.959808,40.655403,23,30,26,6,2.759239,0
2,1,1,-73.997437,40.737583,-73.98616,40.729523,23,30,26,6,1.306155,0
3,2,1,-73.95607,40.7719,-73.986427,40.730469,23,30,26,6,5.269088,0
4,1,1,-73.970215,40.761475,-73.96151,40.75589,23,30,26,6,0.960842,0


In [25]:
target.head()

Unnamed: 0,trip_duration
0,455
1,663
2,2124
3,429
4,435


In [26]:
def  rmsle(evaluator,X,real):
    sum = 0.0
    predicted = evaluator.predict(X)
    print("Number predicted less than 0: {}".format(np.where(predicted < 0)[0].shape))

    predicted[predicted < 0] = 0
    for x in range(len(predicted)):
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p-r)**2
    return (sum/len(predicted))**0.5

In [27]:
from sklearn.metrics import mean_squared_log_error

def  rmsle_new(evaluator,X,real):
    predictions = evaluator.predict(X)
    predictions[predictions < 0] = 0
    return np.sqrt(mean_squared_log_error(real, predictions))

In [28]:
reg = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [29]:
cv = ShuffleSplit(n_splits=4, test_size=0.1, random_state=0)

In [30]:
print(cross_val_score(reg, train_feats, np.ravel(target), cv=cv,scoring=rmsle_new, n_jobs=-1))

[0.6405754  0.66028028 0.65756845 0.65068162]
