In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import time
import xgboost as xgb
import datetime as dt
import datetime



In [7]:
# Time the read of the training sample and report wall-clock start/end.
start = time.time()
started_at = datetime.datetime.now()
print('Fun started at: ' + str(started_at))

# Only the first 10000 rows of the training file are read; the pickup
# timestamp is parsed into a datetime column at load time.
train = pd.read_csv(
    'train.csv',
    nrows=10000,
    parse_dates=['pickup_datetime'],
    low_memory=False,
)

done = time.time()
elapsed = done - start
print(datetime.datetime.now())
print('Everything done in ' + str(elapsed) + ' seconds')

# The remaining inputs are loaded after the timing report above.
test = pd.read_csv('test.csv', parse_dates=['pickup_datetime'], low_memory=False)
sample_submission, holiday, weather = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('sample_submission.csv', 'usaholidayint.csv', 'central_park_weather.csv')
)

Fun started at: 2018-08-20 16:03:07.821011
2018-08-20 16:03:09.846733
Everything done in 2.025721311569214 seconds


In [8]:
# Cleaning training set: keep rows whose passenger count is strictly between
# 0 and 10 and whose fare is non-negative, then drop rows with any missing value.
passengers = train['passenger_count']
plausible = (passengers < 10) & (passengers > 0) & (train['fare_amount'] >= 0)
train = train[plausible].dropna(axis='index', how='any')

#TODO: pick just NYC locations (no location filter is applied here yet)

In [9]:
# Combining all the data sets: tag each frame with its origin so the rows can
# be separated again after the shared feature engineering.
train['dataset'] = 'train'
# The test set has no target; a constant placeholder fare keeps the schemas aligned.
test['fare_amount'] = 11.35
test['dataset'] = 'test'

# Stack train on top of test and fix the column order.
data = pd.concat([train, test], axis=0)[[
    'dataset', 'key', 'fare_amount', 'pickup_datetime',
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
    'passenger_count',
]]

In [10]:
# Adding calendar and USA holiday variables.
# Calendar parts are derived from the parsed pickup timestamp.
pickup_dt = data['pickup_datetime'].dt
data['year'] = pickup_dt.year
data['month'] = pickup_dt.month
data['day'] = pickup_dt.day
data['day_of_week'] = pickup_dt.weekday + 1  # pandas weekday is 0-based (Monday=0); shift to 1..7
data['hour'] = pickup_dt.hour

# Build the holiday lookup in a NEW frame instead of mutating `holiday` in place.
# Dropping DATE from `holiday` itself made this cell fail on a second run,
# because `holiday.DATE` no longer existed; this version can be re-run safely.
holiday_d = pd.to_datetime(holiday['DATE'])
holiday_by_day = holiday.drop(columns='DATE').assign(
    year=holiday_d.dt.year,
    month=holiday_d.dt.month,
    day=holiday_d.dt.day,
)

# Left join keeps every trip; trips on dates absent from the holiday table get NaN
# in the holiday columns.
data_t = data.merge(holiday_by_day, on=['year', 'month', 'day'], how='left')

In [11]:
# Adding weather conditions: strip the station identifiers, replace DATE with
# year/month/day key columns, then left-join the weather rows onto the trips.
weather.drop(columns=['STATION', 'NAME'], inplace=True)

weather_d = pd.to_datetime(weather.DATE)
for date_part in ('year', 'month', 'day'):
    weather[date_part] = getattr(weather_d.dt, date_part)
weather.drop(columns='DATE', inplace=True)

data_t = data_t.merge(weather, how='left', on=['year', 'month', 'day'])

In [12]:
#Creating function for calculating trip_distance
def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
        Inputs are combined element-wise, so they must be broadcastable
        against each other (equal length, or scalars).
    radius_km : float, optional
        Sphere radius in kilometres. Defaults to 6367, the value this
        function has always used.

    Returns
    -------
    Distance(s) in kilometres, with the broadcast shape of the inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at 1. NaN inputs still
    # propagate as NaN.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Great-circle distance in km between pickup and dropoff for every trip.
data_t['trip_distance'] = haversine_np(
    data_t['pickup_longitude'],
    data_t['pickup_latitude'],
    data_t['dropoff_longitude'],
    data_t['dropoff_latitude'],
)

In [13]:
# Additional surcharges, flagged only on matching rows (other rows are left unset).
pickup_hour = data_t['hour']

# Pickups from 20:00 up to (but not including) 06:00.
is_night = (pickup_hour >= 20) | (pickup_hour < 6)
# Pickups from 16:00 up to (but not including) 20:00 on days with day_of_week < 6.
is_weekday_rush = (pickup_hour >= 16) & (pickup_hour < 20) & (data_t['day_of_week'] < 6)

data_t.loc[is_night, 'surcharge'] = 0.5
data_t.loc[is_weekday_rush, 'rush_hour_surcharge'] = 1

In [14]:
# FILL NA: replace every remaining missing value in the merged frame with 0.
data_t.fillna(value=0, inplace=True)

In [15]:
# The raw timestamp is removed from the frame before modelling.
data_t.drop(columns=['pickup_datetime'], inplace=True)

In [16]:
# Split the combined frame back into its two origins and renumber rows from 0.
train = data_t.loc[data_t['dataset'] == 'train'].reset_index(drop=True)
valid = data_t.loc[data_t['dataset'] == 'test'].reset_index(drop=True)

In [18]:
start = time.time()
print('Fun started at: ' + str(datetime.datetime.now()))

# Positional split: column 2 is used as the target and columns 3+ as features.
# Assumes the first three columns are (dataset, key, fare_amount) - confirm if
# the column order upstream ever changes.
train_x, train_y = train.iloc[:, 3:], train.iloc[:, 2]
valid_x, valid_y = valid.iloc[:, 3:], valid.iloc[:, 2]

train_xgb = xgb.DMatrix(train_x, label=train_y)
valid_xgb = xgb.DMatrix(valid_x)

# NOTE(review): newer XGBoost releases call this objective 'reg:squarederror';
# 'reg:linear' is kept unchanged here.
param = dict(objective='reg:linear', seed=12345, eval_metric='rmse')

model = xgb.train(param, train_xgb, num_boost_round=100)

prediction = model.predict(valid_xgb)
# Predictions below 0.001 (including negative ones) are floored to zero.
prediction[prediction < 0.001] = 0

results = DataFrame(
    {'key': valid.key, 'fare_amount': prediction},
    columns=['key', 'fare_amount'],
)

done = time.time()
elapsed = done - start
print(datetime.datetime.now())
print('Everything done in ' + str(elapsed) + ' seconds')

print(results.head(15))

#results.to_csv('wyniki.csv')

Fun started at: 2018-08-20 16:03:56.301605
2018-08-20 16:03:57.363142
Everything done in 1.0615367889404297 seconds
                            key  fare_amount
0   2015-01-27 13:08:24.0000002     9.886044
1   2015-01-27 13:08:24.0000003     9.796896
2   2011-10-08 11:53:44.0000002     4.683535
3   2012-12-01 21:12:12.0000002     9.330774
4   2012-12-01 21:12:12.0000003    13.742714
5   2012-12-01 21:12:12.0000005    10.258887
6   2011-10-06 12:10:20.0000001     5.247399
7   2011-10-06 12:10:20.0000003    54.283623
8   2011-10-06 12:10:20.0000002    10.354189
9   2014-02-18 15:22:20.0000002     7.002374
10  2014-02-18 15:22:20.0000003    10.966789
11  2014-02-18 15:22:20.0000001    17.550972
12  2010-03-29 20:20:32.0000002     4.122919
13  2010-03-29 20:20:32.0000001     6.200359
14  2011-10-06 03:59:12.0000002     7.064167


In [19]:
# Display the (rows, columns) dimensions of the prediction table.
results.shape

(9914, 2)