## Import

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from os import path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from math import acos, cos, radians, sin
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


## Data Loading

In [2]:
PATH = "../input"
df = pd.read_csv(PATH + '/train.csv')
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 122.4+ MB


> ## Form calcul distance

In [5]:
def distance(latitude1, longitude1, latitude2, longitude2):
    return (6366 * acos(
    cos(radians(latitude1)) *
    cos(radians(latitude2)) *
    cos(radians(longitude2) - radians(longitude1)) +
    sin(radians(latitude1)) *
    sin(radians(latitude2))   
    ))

## Split

In [6]:
pickup_latitude = df['pickup_latitude']
pickup_longitude = df['pickup_longitude']
dropoff_latitude = df['dropoff_latitude']
dropoff_longitude = df['dropoff_longitude']
test = []

for la1, lo1, la2, lo2 in zip(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    test = distance(la1, lo1, la2, lo2)

ValueError: math domain error

In [36]:
df['year_pickup_datetime'] = df['pickup_datetime'].dt.year
df['month_pickup_datetime'] = df['pickup_datetime'].dt.month
df['day_pickup_datetime'] = df['pickup_datetime'].dt.day
df['hour_pickup_datetime'] = df['pickup_datetime'].dt.hour
df['minute_pickup_datetime'] = df['pickup_datetime'].dt.minute
df['seconde_pickup_datetime'] = df['pickup_datetime'].dt.minute * 60
df['weekday_pickup_datetime'] = df['pickup_datetime'].dt.weekday
df['distance'] = test

df_filter = (df['passenger_count'] > 0) & (df['trip_duration'] > 120) & (df['trip_duration'] < 10800)
df_result = df.loc[df_filter]
df_result.shape

(1428201, 19)

In [37]:
columns = ['passenger_count', 'vendor_id', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'year_pickup_datetime', 'month_pickup_datetime', 
          'day_pickup_datetime', 'hour_pickup_datetime', 'minute_pickup_datetime', 'seconde_pickup_datetime', 'weekday_pickup_datetime','distance']
X = df_result[columns]
y = df_result['trip_duration']
X.shape, y.shape

((1428201, 14), (1428201,))

## Modeling

In [38]:
rf = RandomForestRegressor()
rs = ShuffleSplit(n_splits=3, test_size=.7, random_state=42)
#rs.get_n_splits(rf, X, y)
score = -cross_val_score(rf, X, y, cv=rs, scoring='neg_mean_squared_log_error')
np.sqrt(score.mean())



0.35667318578205653

In [39]:
rf.fit(X, y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## Data_test

In [40]:
df_test = pd.read_csv(PATH + '/test.csv')
df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'])
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [41]:
colomns_test = ['passenger_count', 'vendor_id', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'year_pickup_datetime', 'month_pickup_datetime', 
          'day_pickup_datetime', 'hour_pickup_datetime', 'minute_pickup_datetime', 'seconde_pickup_datetime', 'weekday_pickup_datetime','distance']
df_test['year_pickup_datetime'] = df_test['pickup_datetime'].dt.year
df_test['month_pickup_datetime'] = df_test['pickup_datetime'].dt.month
df_test['day_pickup_datetime'] = df_test['pickup_datetime'].dt.day
df_test['hour_pickup_datetime'] = df_test['pickup_datetime'].dt.hour
df_test['minute_pickup_datetime'] = df_test['pickup_datetime'].dt.minute
df_test['seconde_pickup_datetime'] = df_test['pickup_datetime'].dt.minute * 60
df_test['weekday_pickup_datetime'] = df_test['pickup_datetime'].dt.weekday
df_test['distance'] = df['distance']
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,year_pickup_datetime,month_pickup_datetime,day_pickup_datetime,hour_pickup_datetime,minute_pickup_datetime,seconde_pickup_datetime,weekday_pickup_datetime,distance
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,2016,6,30,23,59,3540,3,0.281799
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,2016,6,30,23,59,3540,3,0.281799
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,2016,6,30,23,59,3540,3,0.281799
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,2016,6,30,23,59,3540,3,0.281799
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,2016,6,30,23,59,3540,3,0.281799


In [42]:
X_test = df_test[colomns_test]

In [43]:
y_pred = rf.predict(X_test)
y_pred.mean()

853.5750698250293

## Data_Submission

In [44]:
submission = pd.read_csv(PATH + '/sample_submission.csv')
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [45]:
submission['trip_duration'] = y_pred
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,733.1
1,id3505355,572.7
2,id1217141,449.7
3,id2150126,1317.2
4,id1598245,273.6


In [46]:
submission.describe()

Unnamed: 0,trip_duration
count,625134.0
mean,853.57507
std,594.694252
min,132.7
25%,446.3
50%,689.0
75%,1071.1
max,6394.9


## Post_Data

In [48]:
submission.to_csv('result.csv', index=False)