In [258]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import pandas as pd
import seaborn as sns
import pathlib as Path
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split, ShuffleSplit

In [259]:
df = pd.read_csv('../input/train.csv', index_col='id')

In [260]:
def split_datetime(df, column_name):
    df[column_name] = pd.to_datetime(df[column_name])
    df['year_' + column_name] = df[column_name].dt.year
    df['month_' + column_name] = df[column_name].dt.month
    df['day_' + column_name] = df[column_name].dt.day
    df['weekday_' + column_name] = df[column_name].dt.weekday
    df['hour_' + column_name] = df[column_name].dt.hour
    df['minute_' + column_name] = df[column_name].dt.minute
    return df

In [261]:
df.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


**PREPROCESSING**

Splitting a datetime

In [262]:
new_df = split_datetime(df, 'pickup_datetime')
new_df.shape

(1458644, 16)

Applying some filters

1st filter

In [263]:
new_df = new_df[new_df['passenger_count'] >= 1]
new_df.shape

(1458584, 16)

2nd filter

In [264]:
new_df = new_df[new_df['trip_duration'] <= 7200]
new_df.shape

(1456332, 16)

In [265]:
new_df = new_df[new_df['trip_duration'] >= 300]
new_df.shape

(1235894, 16)

Selecting columns to use

In [266]:
selected_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                   'dropoff_latitude', 'day_pickup_datetime',
                   'hour_pickup_datetime', 'minute_pickup_datetime']

**Defining target & features**

In [267]:
X_full = new_df[selected_columns]
y_full = new_df['trip_duration']
X_full.shape, y_full.shape

((1235894, 7), (1235894,))

Splitting my dataset

In [268]:
X_train_used, X_train_unused, y_train_used, y_train_unused = train_test_split(
            X_full, y_full, test_size=0.60, random_state=50)
X_train_used.shape, X_train_unused.shape, y_train_used.shape, y_train_unused.shape

((494357, 7), (741537, 7), (494357,), (741537,))

In [269]:
X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_used, y_train_used, test_size=0.33, random_state=50)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((331219, 7), (163138, 7), (331219,), (163138,))

Creating a RandomForestRegressor

In [270]:
rf = RandomForestRegressor()

In [271]:
params_grid = {
    'max_depth': [1, 3, 5, 10, 15],
    'min_samples_leaf': [1, 3, 8, 12]
}

In [272]:
# kf = KFold(n_splits=5, random_state=1)

In [273]:
# gsc = GridSearchCV(rf, params_grid, n_jobs=-1, cv=kf, verbose=3, scoring='neg_mean_squared_log_error')#

In [274]:
# gsc.fit(X_train, y_train)

In [275]:
# gsc.best_estimator_

In [276]:
# gsc.best_index_

In [277]:
cv = ShuffleSplit(1, test_size=0.01, train_size=0.5, random_state=0)

Finding validation score

In [278]:
losses = -cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_log_error')
losses.mean()



0.10661804610543768

Real score value

In [279]:
losses = [np.sqrt(l) for l in losses]
np.mean(losses)

0.32652323371792114

In [280]:
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [281]:
rf.feature_importances_

array([0.29529035, 0.16092182, 0.23530646, 0.17025576, 0.02958381,
       0.074545  , 0.03409679])

In [282]:
y_pred = rf.predict(X_valid)

In [283]:
y_pred.mean()

959.7082525224043

In [284]:
np.mean(y_valid)

948.2804312912994

In [285]:
df_test = pd.read_csv('../input/test.csv', index_col='id')

In [286]:
df_test.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [287]:
df_test = split_datetime(df_test, 'pickup_datetime')

In [288]:
X_test = df_test[selected_columns]

In [289]:
y_pred_test = rf.predict(X_test)

In [290]:
y_pred_test.mean()

889.1532607408969

In [291]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='id') 
submission.head()

Unnamed: 0_level_0,trip_duration
id,Unnamed: 1_level_1
id3004672,959
id3505355,959
id1217141,959
id2150126,959
id1598245,959


In [292]:
submission['trip_duration'] = y_pred_test

In [293]:
submission.describe()

Unnamed: 0,trip_duration
count,625134.0
mean,889.153261
std,553.344054
min,302.6
25%,515.4
50%,707.3
75%,1071.7
max,5981.2


In [294]:
submission.to_csv('submission.csv', index=False)

In [295]:
!ls

__notebook_source__.ipynb  submission.csv
