 **KAGGLE NYC TAXI**

In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit

# Any results you write to the current directory are saved as output.

from sklearn.model_selection import cross_val_score

['train.csv', 'sample_submission.csv', 'test.csv']


**CONFIG**

In [2]:
TARGET = 'trip_duration'

> > > TEST ON TRAIN DATASET

In [35]:
df = pd.read_csv('../input/train.csv');
df = df[df.trip_duration < 3600]
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [36]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1446310.0,1446310.0,1446310.0,1446310.0,1446310.0,1446310.0,1446310.0
mean,1.534107,1.663428,-73.97416,40.75126,-73.97372,40.75201,811.6869
std,0.4988355,1.31355,0.07026399,0.03239597,0.07022386,0.03548116,584.8416
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99191,40.73757,-73.99133,40.73611,395.0
50%,2.0,1.0,-73.9818,40.75421,-73.97977,40.75459,657.0
75%,2.0,2.0,-73.9676,40.76839,-73.96313,40.76988,1060.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3599.0


PREPARATION OF TRAIN DATASET

In [37]:
# function that calculates the distance between the pickup point and the dropoff 
def haversine_array(lat1, lng1, lat2, lng2): 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    AVG_EARTH_RADIUS = 6371 # in km 
    lat = lat2 - lat1 
    lng = lng2 - lng1 
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2 
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d)) 
    return h

In [38]:
# function that calculates in radius the direction of one point according to another one
def bearing_array(lat1, lng1, lat2, lng2): 
    AVG_EARTH_RADIUS = 6371 # in km 
    lng_delta_rad = np.radians(lng2 - lng1) 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    y = np.sin(lng_delta_rad) * np.cos(lat2) 
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad) 
    return np.degrees(np.arctan2(y, x))

In [39]:
# function drops all values of dataset that are too abnormal
def drop_odd_values(dataframe):
    dataframe['passenger_count'] = dataframe.passenger_count.map(lambda x: 1 if x == 0 else x)
    dataframe = dataframe[dataframe.passenger_count <= 6]

    dataframe = dataframe[dataframe.haversine_distance <= 90]


In [40]:
def prep_dataset(dataframe, target='trip_duration'):
    
    # split pickup_datetime
    dataframe['datetime'] = pd.to_datetime(dataframe['pickup_datetime'])
    dataframe['year'] = dataframe['datetime'].dt.year
    dataframe['month'] = dataframe['datetime'].dt.month
    dataframe['day'] = dataframe['datetime'].dt.day
    dataframe['weekday'] = dataframe['datetime'].dt.weekday + 1
    dataframe['hour'] = dataframe['datetime'].dt.hour
    
    # calcul of haversine_distance and bearing of latitudes and longitudes from pickup to dropoff
    dataframe['haversine_distance'] = dataframe.apply(lambda x: haversine_array(x['pickup_latitude'], x['pickup_longitude'], x['dropoff_latitude'], x['dropoff_longitude']), axis=1)
    dataframe['bearing'] = dataframe.apply(lambda x: bearing_array(x['pickup_latitude'], x['pickup_longitude'], x['dropoff_latitude'], x['dropoff_longitude']),axis=1)

    dataframe.loc[:, 'center_latitude'] = (dataframe['pickup_latitude'].values + dataframe['dropoff_latitude'].values) / 2 
    dataframe.loc[:, 'center_longitude'] = (dataframe['pickup_longitude'].values + dataframe['dropoff_longitude'].values) / 2
    
    # calcul of speed for trips 
    # dataframe['speed'] = (dataframe.haversine_distance/(dataframe.trip_duration/3600))

    # drop_odd_values(dataframe)
    
    #drop odd values

    # all selected_columns
    selected_columns = ['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'day', 'weekday', 'hour', 'month', 'haversine_distance', 'center_latitude', 'center_longitude', 'bearing']
    return selected_columns

In [42]:
X_train = df[prep_dataset(df)]
y_train = df[TARGET]

df['passenger_count'] = df.passenger_count.map(lambda x: 1 if x == 0 else x)
df = df[df.passenger_count <= 6]
df = df[df.haversine_distance <= 90]
    
X_train.shape, y_train.shape

((1446310, 14), (1446310,))

In [43]:
df.haversine_distance.describe()
# df.head()

count    1.446292e+06
mean     3.332722e+00
std      3.716423e+00
min      0.000000e+00
25%      1.227045e+00
50%      2.078471e+00
75%      3.812857e+00
max      8.588488e+01
Name: haversine_distance, dtype: float64

In [45]:
cv = ShuffleSplit(n_splits=4, test_size=0.1, train_size=0.2, random_state=0)

In [46]:
rf = RandomForestRegressor()
times = -cross_val_score(rf, X_train, y_train, cv=cv, scoring='neg_mean_squared_log_error')
time = [np.sqrt(l) for l in times]
# np.mean(loses)
time[:5]
np.mean(time)
#times.mean()



0.40470683973181676

In [47]:
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

TEST ON TEST DATASET

In [48]:
df_test = pd.read_csv('../input/test.csv')
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [49]:
df_test.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,625134.0,625134.0,625134.0,625134.0,625134.0,625134.0
mean,1.534884,1.661765,-73.973614,40.750927,-73.973458,40.751816
std,0.498782,1.311293,0.073389,0.029848,0.072565,0.035824
min,1.0,0.0,-121.933128,37.389587,-121.933327,36.601322
25%,1.0,1.0,-73.991852,40.737392,-73.991318,40.736
50%,2.0,1.0,-73.981743,40.754093,-73.979774,40.754543
75%,2.0,2.0,-73.9674,40.768394,-73.963013,40.769852
max,2.0,9.0,-69.248917,42.814938,-67.496796,48.857597


In [50]:
X_test = df_test[prep_dataset(df_test)]

In [52]:
y_pred = rf.predict(X_test)
y_pred.mean()

835.1467054183585

In [53]:
submission = pd.read_csv('../input/sample_submission.csv', index_col=0)
submission.head()

Unnamed: 0_level_0,trip_duration
id,Unnamed: 1_level_1
id3004672,959
id3505355,959
id1217141,959
id2150126,959
id1598245,959


In [54]:
submission['trip_duration'] = y_pred
submission.head()

Unnamed: 0_level_0,trip_duration
id,Unnamed: 1_level_1
id3004672,783.7
id3505355,592.9
id1217141,535.9
id2150126,1150.7
id1598245,377.2


In [55]:
submission.describe()

Unnamed: 0,trip_duration
count,625134.0
mean,835.146705
std,557.318547
min,5.7
25%,437.6
50%,686.8
75%,1070.4
max,3495.5


In [56]:
submission.to_csv('submission_taxi.csv')