**IMPORTS**

In [29]:
import numpy as np
import pandas as pd 
import os
import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


FUNCTIONS

In [30]:
def haversine_array(lat1, lng1, lat2, lng2): 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    AVG_EARTH_RADIUS = 6371 # in km 
    lat = lat2 - lat1 
    lng = lng2 - lng1 
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2 
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d)) 
    return h

def bearing_array(lat1, lng1, lat2, lng2): 
    AVG_EARTH_RADIUS = 6371 # in km 
    lng_delta_rad = np.radians(lng2 - lng1) 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    y = np.sin(lng_delta_rad) * np.cos(lat2) 
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad) 
    return np.degrees(np.arctan2(y, x))

In [31]:
def preprocess(df):
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['pickup_year'] = df['pickup_datetime'].dt.year
    df['pickup_month'] = df['pickup_datetime'].dt.month
    df['pickup_day'] = df['pickup_datetime'].dt.day
    df['pickup_hour'] = df['pickup_datetime'].dt.hour + 1
    df['pickup_weekday'] = df['pickup_datetime'].dt.weekday + 1
    df['pickup_minute'] = df['pickup_datetime'].dt.minute
    df['pickup_seconde'] = df['pickup_datetime'].dt.minute * 60
    df['store_and_fwd_flag'] = pd.get_dummies(df['store_and_fwd_flag'], drop_first=True)
    df['pickup_datetime'] = pd.to_numeric(df['pickup_datetime'], errors='coerce')
    
    df['bearing'] = df.apply(lambda x: bearing_array(x['pickup_latitude'], x['pickup_longitude'], x['dropoff_latitude'], x['dropoff_longitude']),axis=1)
    df.loc[:, 'center_latitude'] = (df['pickup_latitude'].values + df['dropoff_latitude'].values) / 2 
    df.loc[:, 'center_longitude'] = (df['pickup_longitude'].values + df['dropoff_longitude'].values) / 2
    df['distance'] = df.apply(lambda x: haversine_array(x['pickup_latitude'], x['pickup_longitude'], x['dropoff_latitude'], x['dropoff_longitude']),axis=1)
    
    

In [32]:
def get_columns_selected(df, excludes):
    preprocess(df)
    columns = df.columns.tolist()
    return [c for c in columns if c not in excludes]

**DATA LOADING**

In [33]:
dataDir = '../input/'
df_train = pd.read_csv(dataDir + 'train.csv', index_col='id')
df_test = pd.read_csv(dataDir + 'test.csv', index_col='id')

**FILTERING DATA
**

In [34]:
df_train = df_train[df_train.trip_duration < 3600]

In [35]:
EXCLUDES = ['trip_duration', 'dropoff_datetime']
df_train['passenger_count'] = df_train.passenger_count.map(lambda x: 1 if x == 0 else x)
df_train = df_train[df_train.passenger_count <= 6]
X = df_train[get_columns_selected(df_train, EXCLUDES)]
y = df_train.trip_duration
X.shape, y.shape

((1446305, 19), (1446305,))

**MODELING / CROSS-VALIDATION**

In [36]:
cv = ShuffleSplit(4, test_size=0.01, train_size=0.02, random_state=0)
rf = RandomForestRegressor()
losses = -cross_val_score(rf, X, y, cv=cv, scoring='neg_mean_squared_log_error')
losses = [np.sqrt(l) for l in losses]
np.mean(losses)



0.43915264105028223

In [37]:
rf.fit(X, y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [39]:
df_test.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


**PREDICT / SUBMISSION**

In [40]:
X_test = df_test[get_columns_selected(df_test, [])]
y_pred = rf.predict(X_test)
y_pred.mean()

835.4206926834888

In [41]:
submission = pd.read_csv(dataDir + 'sample_submission.csv') 
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [42]:
submission['trip_duration'] = y_pred
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,821.9
1,id3505355,708.8
2,id1217141,524.8
3,id2150126,1282.2
4,id1598245,476.7


In [43]:
submission.describe()

Unnamed: 0,trip_duration
count,625134.0
mean,835.420693
std,557.007286
min,3.8
25%,437.6
50%,687.4
75%,1069.9
max,3521.6


In [44]:
submission.to_csv('submission.csv', index=False)