## Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import pathlib as Path
import matplotlib.pyplot as plt
#import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import os
# List the contents of the input directory to confirm the data files are present.
print(os.listdir("../input"))

['test.csv', 'test', 'sample_submission', 'train', 'train.csv', 'sample_submission.csv']


## Load Data

In [2]:
# Read the three CSV inputs in order: the training set, the test set, and the
# sample submission that is later filled with predictions and written out.
df_train, df_test, submission = map(
    pd.read_csv,
    (
        '../input/train.csv',
        '../input/test.csv',
        '../input/sample_submission.csv',
    ),
)

## File structure and content

In [3]:
# Preview the first rows of the training data (head() defaults to five rows).
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [None]:
# Show a single row of the test set to inspect which columns it provides.
df_test.head(1)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training data.
df_train.describe()

In [None]:
# Print a concise summary of the training frame: columns, dtypes, memory usage.
df_train.info()

In [5]:
# Number of trips for each passenger_count value, most frequent first.
df_train['passenger_count'].value_counts()

1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: passenger_count, dtype: int64

In [11]:
# Sort trips by duration, longest first, to inspect the extreme values.
# trip_duration is in seconds (it equals dropoff_datetime - pickup_datetime).
df_train.sort_values(by='trip_duration', ascending=False)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
978383,id0053347,1,2016-02-13 22:46:52,2016-03-25 18:18:14,1,-73.783905,40.648632,-73.978271,40.750202,N,3526282
924150,id1325766,1,2016-01-05 06:14:15,2016-01-31 01:01:07,1,-73.983788,40.742325,-73.985489,40.727676,N,2227612
680594,id0369307,1,2016-02-13 22:38:00,2016-03-08 15:57:38,2,-73.921677,40.735252,-73.984749,40.759979,N,2049578
355003,id1864733,1,2016-01-05 00:19:42,2016-01-27 11:08:38,1,-73.789650,40.643559,-73.956810,40.773087,N,1939736
1234291,id1942836,2,2016-02-15 23:18:06,2016-02-16 23:17:58,2,-73.794525,40.644825,-73.991051,40.755573,N,86392
295382,id0593332,2,2016-05-31 13:00:39,2016-06-01 13:00:30,1,-73.781952,40.644688,-73.993874,40.745926,N,86391
73816,id0953667,2,2016-05-06 00:00:10,2016-05-07 00:00:00,1,-73.996010,40.753220,-73.979027,40.740601,N,86390
59891,id2837671,2,2016-06-30 16:37:52,2016-07-01 16:37:39,1,-73.992279,40.749729,-73.962524,40.800770,N,86387
1360439,id1358458,2,2016-06-23 16:01:45,2016-06-24 16:01:30,1,-73.782089,40.644806,-73.985016,40.666828,N,86385
753765,id2589925,2,2016-05-17 22:22:56,2016-05-18 22:22:35,4,-74.006111,40.734680,-73.958809,40.815449,N,86379


## DATA CLEANING

In [None]:
# Drop implausible records before modelling:
#   * trips reported with zero passengers, and
#   * trips lasting MAX_TRIP_DURATION seconds or more (8000 s is roughly 2.2 hours).
# `.copy()` makes the filtered frame an independent object, because the
# following cells add and overwrite columns on it.
MAX_TRIP_DURATION = 8000  # seconds
df_train = df_train[
    (df_train['passenger_count'] > 0)
    & (df_train['trip_duration'] < MAX_TRIP_DURATION)
].copy()

## Reformatting features

In [None]:
# Parse the pickup timestamp once, store it back, then expand it into one
# column per calendar/time component. No 'year' feature is derived.
pickup_time = pd.to_datetime(df_train['pickup_datetime'])
df_train['pickup_datetime'] = pickup_time
for part in ('month', 'day', 'hour', 'minute', 'second', 'weekday'):
    # Each name is both the new column label and the `.dt` accessor attribute.
    df_train[part] = getattr(pickup_time.dt, part)
df_train.head(1)

In [None]:
# Encode the store_and_fwd_flag labels in place: 'N' -> 0, 'Y' -> 1.
# Rows holding any other value are left untouched.
for label, code in (('N', 0), ('Y', 1)):
    is_label = df_train['store_and_fwd_flag'] == label
    df_train.loc[is_label, 'store_and_fwd_flag'] = code
df_train.head(2)

## Split train data into features and target

In [None]:
def split_dataset(df, features, target='trip_duration'):
    """Separate a frame into model inputs and the target column.

    Parameters
    ----------
    df
        Object indexable by column label (e.g. a pandas DataFrame).
    features
        Column label(s) to use as predictors; passed to ``df[...]`` as-is.
    target
        Column label of the value to predict (default ``'trip_duration'``).

    Returns
    -------
    tuple
        ``(df[features], df[target])``.
    """
    return df[features], df[target]

In [None]:
# Predictor features, grouped by origin: trip metadata, pickup/dropoff
# coordinates, components derived from the pickup timestamp, and the encoded
# store-and-forward flag. The resulting order is what the model is trained on.
selected_columns = (
    ['vendor_id', 'passenger_count']
    + ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    + ['month', 'day', 'hour', 'minute', 'second', 'weekday']
    + ['store_and_fwd_flag']
)
X_train, y_train = split_dataset(df_train, selected_columns)
# Display both shapes to check that rows line up between inputs and target.
(X_train.shape, y_train.shape)

## Modeling & Cross-Validation

In [None]:
# Fix the forest's seed so that repeated runs give the same cross-validation
# score and, later, the same fitted model and predictions.
RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RANDOM_STATE)
# scikit-learn reports this metric negated (so that higher is better); flip the
# sign to obtain the per-fold mean squared log error, then average the 5 folds.
score = -cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_log_error')
score.mean()

In [None]:
# Train the model on the full training set; it is used for the test predictions below.
rf.fit(X_train, y_train)

In [None]:
# Pair each importance with the name of the feature it belongs to and rank
# them, so the result is readable without counting positions in a bare array.
pd.Series(
    rf.feature_importances_,
    index=X_train.columns,
    name='importance',
).sort_values(ascending=False)

## Test Prediction

In [None]:
# Build the same derived columns on the test set that the model was trained
# on: pickup timestamp components plus the 0/1 store_and_fwd_flag encoding.
# No 'year' feature is derived.
pickup_time = pd.to_datetime(df_test['pickup_datetime'])
df_test['pickup_datetime'] = pickup_time
for part in ('month', 'day', 'hour', 'minute', 'second', 'weekday'):
    # Each name is both the new column label and the `.dt` accessor attribute.
    df_test[part] = getattr(pickup_time.dt, part)
# 'N' -> 0, 'Y' -> 1; rows holding any other value are left untouched.
for label, code in (('N', 0), ('Y', 1)):
    is_label = df_test['store_and_fwd_flag'] == label
    df_test.loc[is_label, 'store_and_fwd_flag'] = code
df_test.head()

In [None]:
# Keep only the predictor columns, in the same order that was used for training.
X_test = df_test[selected_columns]

In [None]:
# Predict a trip duration for every test row with the fitted forest.
y_test_pred = rf.predict(X_test)
# Display the mean prediction as a quick sanity check of its magnitude.
y_test_pred.mean()

## SUBMISSION

In [None]:
# Inspect the sample submission layout before filling it in.
submission.head()

In [None]:
# Set the trip_duration column of the submission frame to the model predictions.
# NOTE(review): this assigns by position, so it assumes sample_submission.csv
# lists rows in the same order as test.csv — confirm.
submission['trip_duration'] = y_test_pred
submission.head()

In [None]:
# Summary statistics of the submission, to check the range of predicted values.
submission.describe()

In [None]:
# Write the submission to the working directory, omitting the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [None]:
# List the working directory, e.g. to check that submission.csv is there.
!ls