In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, GridSearchCV

%matplotlib inline

# Read the training and test CSV files into DataFrames (train first, then test).
train, test = map(
    pd.read_csv,
    ('./data/input/train.csv', './data/input/test.csv'),
)

In [3]:
def rmsle(y, y_):
    """Return the root mean squared logarithmic error between ``y`` and ``y_``.

    Computes ``sqrt(mean((log(1 + y) - log(1 + y_)) ** 2))`` element-wise over
    the two inputs, compared by position.

    Parameters
    ----------
    y : array-like
        Reference values (NumPy array, pandas Series, list or scalar).
    y_ : array-like
        Values to compare against ``y``; must broadcast against it.

    Returns
    -------
    numpy.float64
        The RMSLE of the two inputs.

    Notes
    -----
    Entries below ``-1`` have no real logarithm; ``np.nan_to_num`` turns the
    resulting NaN into 0 (and an infinite result into a very large finite
    number) instead of propagating it, so such entries do not make the
    score NaN.
    """
    # asarray lets plain Python sequences through; `list + 1` would raise.
    actual = np.asarray(y)
    predicted = np.asarray(y_)

    # log1p(x) equals log(1 + x) but keeps its precision when x is close to 0,
    # where `1 + x` would round to exactly 1.
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))

    squared_error = (log_actual - log_predicted) ** 2
    return np.sqrt(np.mean(squared_error))

In [4]:
def add_datetime_parts(frame):
    """Add integer ``year``, ``month``, ``day`` and ``hour`` columns to ``frame``.

    The values are taken from the ``datetime`` column, which is parsed once
    with ``pd.to_datetime`` instead of being split as a string four times.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``datetime`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with the four columns added in the order
        year, month, day, hour.
    """
    stamps = pd.to_datetime(frame['datetime'])
    for part in ('year', 'month', 'day', 'hour'):
        # astype('int') keeps the dtype the string-based version produced.
        frame[part] = getattr(stamps.dt, part).astype('int')
    return frame


# Both frames get exactly the same derived columns.
train = add_datetime_parts(train)
test = add_datetime_parts(test)

In [5]:
# Columns excluded from the feature matrix; 'count' is used as the target below.
drop_cols = ['datetime', 'count', 'casual', 'registered']
# Every other train column, in its original order.
test_use_cols = list(filter(lambda name: name not in drop_cols, train.columns))

# Hold out 30% of the rows for validation; the seed keeps the split repeatable.
all_X = train.drop(columns=drop_cols)
all_y = train.loc[:, 'count']
train_X, valid_X, train_y, valid_y = train_test_split(
    all_X, all_y, test_size=0.3, random_state=0)

In [6]:
# Baseline: random forest with default hyper-parameters and a fixed seed.
model_rf = RFR(random_state=0).fit(train_X, train_y)

# Predict on the rows the model was fitted on and on the held-out rows.
train_pred = model_rf.predict(train_X)
valid_pred = model_rf.predict(valid_X)

# Report RMSLE for each split, training split first.
for label, truth, guess in (
        ('train score: ', train_y, train_pred),
        ('valid score: ', valid_y, valid_pred)):
    print(label, rmsle(truth, guess))

train score:  0.15852735466708315
valid score:  0.35234408097265035


In [None]:
# GridSearchCV
# Hyper-parameter search for the random forest. The grid has 3 * 3 * 3 = 27
# candidates and each is fitted on 3 folds (81 fits), so the search only runs
# when RUN_GRID_SEARCH is switched on.
RUN_GRID_SEARCH = False
if RUN_GRID_SEARCH:
    params = [{
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [5, 10, 15]
    }]

    gscv = GridSearchCV(
        RFR(random_state=0, max_features='log2'),
        param_grid=params,
        # Rank candidates by (negated) mean squared log error so the search
        # optimises the same quantity rmsle() reports; without `scoring` the
        # regressor's default score (R^2) would be used instead.
        # NOTE(review): this scorer rejects negative targets/predictions -
        # assumes 'count' is non-negative; confirm.
        scoring='neg_mean_squared_log_error',
        cv=3,
        verbose=2
        )

    gscv.fit(train_X, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=100; total time=   0.5s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=100; total time=   0.6s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=100; total time=   0.6s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=200; total time=   1.1s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=200; total time=   1.1s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=200; total time=   1.1s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=300; total time=   1.6s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=300; total time=   1.5s
[CV] END .max_depth=10, min_samples_leaf=5, n_estimators=300; total time=   1.5s
[CV] END max_depth=10, min_samples_leaf=10, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=10, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=10, n_es

In [8]:
# GridSearchCV check params
# Disabled by default: needs `gscv` from the grid-search cell.
if False:
    best_params_found = gscv.best_params_
    model_rf_best = gscv.best_estimator_
    print(best_params_found)

    # RMSLE of the best estimator on the training and validation splits.
    train_pred = model_rf_best.predict(train_X)
    print('train score: ', rmsle(train_y, train_pred))
    valid_pred = model_rf_best.predict(valid_X)
    print('valid score: ', rmsle(valid_y, valid_pred))

    # Full cross-validation table, best-ranked candidate first, transposed so
    # that each candidate becomes a column.
    cv_table = pd.DataFrame(gscv.cv_results_)
    ranked_table = cv_table.sort_values('rank_test_score')
    print(ranked_table.T)

{'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 200}
train score:  0.566182482821103
valid score:  0.6407919575221911


In [11]:
# Refit on the training split with fixed hyper-parameters.
# NOTE(review): these values match the best_params_ printed by the grid-search
# check, but that search used max_features='log2' while this model keeps the
# default max_features - confirm this is intended.
best_params = {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 200}
model_rf_best = RFR(random_state=0, **best_params).fit(train_X, train_y)

# Score the training split, then the validation split; `pred` ends up holding
# the validation predictions.
for label, split_X, split_y in (
        ('train score: ', train_X, train_y),
        ('valid score: ', valid_X, valid_y)):
    pred = model_rf_best.predict(split_X)
    print(label, rmsle(split_y, pred))

train score:  0.28304786806763416
valid score:  0.3629093510948012


In [8]:
# Final model: fitted on every row of `train` (no validation hold-out) and
# used to predict the test set.
model_rf_test = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_test.fit(train.drop(columns=drop_cols), train.loc[:, 'count'])

# Select the test columns by name so they match the training feature order.
pred = model_rf_test.predict(test.loc[:, test_use_cols])
df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred})

# Create the output directory first; to_csv does not create missing folders
# and would otherwise fail on a fresh checkout.
# The file name's spelling is kept unchanged so existing consumers still find it.
submit_path = Path('./data/output/submittion.csv')
submit_path.parent.mkdir(parents=True, exist_ok=True)
df_submit.to_csv(submit_path, index=False)