In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import StandardScaler

train = pd.read_csv('./data/input/train.csv', parse_dates=['datetime'])
test = pd.read_csv('./data/input/test.csv', parse_dates=['datetime'])

In [2]:
def rmsle(y, y_):
    log1 = np.log(y + 1)
    log2 = np.log(y_ + 1)
    mean = np.square((log1- log2)).mean()
    return np.sqrt(mean)

In [3]:
scores_cols = ['conditions', 'descriptions', 'train', 'valid', 'submissions']
scores = pd.DataFrame(columns=scores_cols)

# データ前処理

## datetime の処理

In [4]:
train['year'] = train.loc[:, 'datetime'].dt.year
train['month'] = train.loc[:, 'datetime'].dt.month
train['day'] = train.loc[:, 'datetime'].dt.day
train['hour'] = train.loc[:, 'datetime'].dt.hour
train['weekday'] = train.loc[:, 'datetime'].dt.weekday

test['year'] = test.loc[:, 'datetime'].dt.year
test['month'] = test.loc[:, 'datetime'].dt.month
test['day'] = test.loc[:, 'datetime'].dt.day
test['hour'] = test.loc[:, 'datetime'].dt.hour
test['weekday'] = test.loc[:, 'datetime'].dt.weekday

## 検証用データの作成

In [5]:
drop_cols = ['datetime', 'count', 'casual', 'registered']
test_use_cols = [col for col in train.columns if col not in drop_cols]

train_X, valid_X, train_y, valid_y = train_test_split(
    train.drop(columns=drop_cols),
    train.loc[:, ['registered', 'casual', 'count']],
    test_size=0.3, random_state=0)

## RF 実装
count を直接予測する．

In [6]:
model_rf = RFR(random_state=0)
model_rf.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

valid_pred = model_rf.predict(valid_X)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred)

score = pd.DataFrame([['First', '', train_score, valid_score, None]], columns=scores_cols)
if score.values[0][0] not in scores.values:
    scores = pd.concat([scores, score], axis=0)
print(scores)

  conditions descriptions     train     valid submissions
0      First               0.152028  0.339617        None


### Highper parameter Tuning 1

In [7]:
# RF GridSearchCV
if False:
    params = [{
        'n_estimators': [20, 30],
        'max_depth': [15, 40],
        'min_samples_leaf': [1],
        'max_features': ['sqrt', 'log2','auto', None]
    }]

    gscv = GridSearchCV(
        RFR(random_state=0, n_jobs=-1),
        param_grid=params,
        cv=5,
        verbose=2
        )

    gscv.fit(train_X, train_y.loc[:, 'count'])

In [8]:
# RF GridSearchCV check params
if False:
    model_rf_best = gscv.best_estimator_
    train_pred = model_rf_best.predict(train_X)
    train_score = rmsle(train_y.loc[:, 'count'], train_pred)

    valid_pred = model_rf_best.predict(valid_X)
    valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred)

    print('train_score: {}, valid_score: {}'.format(train_score, valid_score))

    print(pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score').loc[:, ['params', 'mean_test_score']].head(3).values)

In [9]:
model_rf_best = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_best.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf_best.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

test_pred = model_rf_best.predict(valid_X)
test_score = rmsle(valid_y.loc[:, 'count'], test_pred)

sc = pd.DataFrame([['0-1', 'Hparam Tuning 1', train_score, test_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.152028  0.339617        None
1        0-1  Hparam Tuning 1  0.270274  0.351236        None


In [10]:
# submission
if False:
    model_rf_test = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
    model_rf_test.fit(train.drop(columns=drop_cols), train.loc[:, 'count'])

    pred = model_rf_test.predict(test.loc[:, test_use_cols])
    df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred})
    df_submit.to_csv('./data/output/submittion_RFR.csv', index=False)

In [11]:
scores.loc[scores['conditions'] == '0-1', 'submissions'] = 0.48367
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.152028  0.339617        None
1        0-1  Hparam Tuning 1  0.270274  0.351236     0.48367


### Highper parameter Tuning 2

In [12]:
model_rf = RFR(random_state=0, max_depth=20, min_samples_split=5, n_estimators=700, n_jobs=-1)
model_rf.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

test_pred = model_rf.predict(valid_X)
test_score = rmsle(valid_y.loc[:, 'count'], test_pred)

sc = pd.DataFrame([['0-2', 'Hparam Tuning 2', train_score, test_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.152028  0.339617        None
1        0-1  Hparam Tuning 1  0.270274  0.351236     0.48367
2        0-2  Hparam Tuning 2  0.186266  0.339103        None


## 仮説検証

### 仮説 1
count を registered と casual の和として予測する．

In [13]:
rf_1_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
rf_1_registered.fit(train_X, train_y.loc[:, 'registered'])

rf_1_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
rf_1_casual.fit(train_X, train_y.loc[:, 'casual'])

In [14]:
train_pred_registered = rf_1_registered.predict(train_X)
train_pred_casual = rf_1_casual.predict(train_X)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = rf_1_registered.predict(valid_X)
valid_pred_casual = rf_1_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['1', 'split obj var', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.152028  0.339617        None
1        0-1  Hparam Tuning 1  0.270274  0.351236     0.48367
2        0-2  Hparam Tuning 2  0.186266  0.339103        None
3          1    split obj var  0.156426  0.328699        None


**仮説 1 の下で submit**

In [15]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train.drop(drop_cols, axis=1), train.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train.drop(drop_cols, axis=1), train.loc[:, 'casual'])

pred_registered = model_registered.predict(test.loc[:, test_use_cols])
pred_casual = model_casual.predict(test.loc[:, test_use_cols])
pred_count = pred_registered + pred_casual

df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred_count})
df_submit.to_csv('./data/output/submission_rf_1.csv', index=False)

### 仮説 2-1
風速 0 を削除

In [16]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X.loc[train_X.loc[:, 'windspeed'] != 0, :], train_y.loc[train_X.loc[:, 'windspeed'] != 0, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X.loc[train_X.loc[:, 'windspeed'] != 0, :], train_y.loc[train_X.loc[:, 'windspeed'] != 0, 'casual'])

train_pred_registered = model_registered.predict(train_X)
train_pred_casual = model_casual.predict(train_X)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X)
valid_pred_casual = model_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-1', 'windspeed == 0 -> drop', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions            descriptions     train     valid submissions
0      First                          0.152028  0.339617        None
1        0-1         Hparam Tuning 1  0.270274  0.351236     0.48367
2        0-2         Hparam Tuning 2  0.186266  0.339103        None
3          1           split obj var  0.156426  0.328699        None
4        2-1  windspeed == 0 -> drop  0.190929  0.333091        None


### 仮説 2-2
風速 0 を平均値に置き換え

In [17]:
train_X_2 = train_X.copy()
train_X_2.loc[train_X_2.loc[:, 'windspeed'] == 0, 'windspeed'] = train.loc[train.loc[:, 'windspeed'] != 0, 'windspeed'].mean()

model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_2, train_y.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_2, train_y.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_2)
train_pred_casual = model_casual.predict(train_X_2)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X)
valid_pred_casual = model_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-2', 'windspeed == 0 -> mean', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions            descriptions     train     valid submissions
0      First                          0.152028  0.339617        None
1        0-1         Hparam Tuning 1  0.270274  0.351236     0.48367
2        0-2         Hparam Tuning 2  0.186266  0.339103        None
3          1           split obj var  0.156426  0.328699        None
4        2-1  windspeed == 0 -> drop  0.190929  0.333091        None
5        2-2  windspeed == 0 -> mean  0.156678  0.329988        None


### 仮説 2-3
風速 0 を線形補完

In [18]:
train_2 = train.copy()
train_2.loc[:, 'datetime'] = pd.to_datetime(train_2.loc[:, 'datetime'])
train_2.set_index('datetime', inplace=True)
train_2.replace({'windspeed': 0}, np.nan, inplace=True)
train_2.interpolate(method='time', inplace=True, limit_direction='both')

test_2 = test.copy()
test_2.loc[:, 'datetime'] = pd.to_datetime(test_2.loc[:, 'datetime'])
test_2.set_index('datetime', inplace=True)
test_2.replace({'windspeed': 0}, np.nan, inplace=True)
test_2.interpolate(method='time', inplace=True, limit_direction='both')

In [19]:
train_X_2, valid_X_2, train_y_2, valid_y_2 = train_test_split(
    train_2.drop(columns=['registered', 'casual', 'count']),
    train_2.loc[:, ['registered', 'casual', 'count']],
    test_size=0.3, random_state=0)

In [20]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_2, train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_2, train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_2)
train_pred_casual = model_casual.predict(train_X_2)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2)
valid_pred_casual = model_casual.predict(valid_X_2)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-3', 'windspeed == 0 -> linear replacement', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.152028  0.339617  \
1        0-1                       Hparam Tuning 1  0.270274  0.351236   
2        0-2                       Hparam Tuning 2  0.186266  0.339103   
3          1                         split obj var  0.156426  0.328699   
4        2-1                windspeed == 0 -> drop  0.190929  0.333091   
5        2-2                windspeed == 0 -> mean  0.156678  0.329988   
6        2-3  windspeed == 0 -> linear replacement  0.156353  0.329178   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  


### 仮説 3
天候 4 を 3 とする．  
仮説 1, 仮説 2-3 の手法を用いる．

In [21]:
print(len(train_X_2[train_X_2.weather == 4]), len(test_2[test_2.weather == 4]))

1 2


In [22]:
train_X_3 = train_X_2.replace({'weather': 4}, 3, inplace=False)
test_3 = test_2.replace({'weather': 4}, 3, inplace=False)

In [23]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_3, train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_3, train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_3)
train_pred_casual = model_casual.predict(train_X_3)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2)
valid_pred_casual = model_casual.predict(valid_X_2)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['3', 'weather == 4 -> 3', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.152028  0.339617  \
1        0-1                       Hparam Tuning 1  0.270274  0.351236   
2        0-2                       Hparam Tuning 2  0.186266  0.339103   
3          1                         split obj var  0.156426  0.328699   
4        2-1                windspeed == 0 -> drop  0.190929  0.333091   
5        2-2                windspeed == 0 -> mean  0.156678  0.329988   
6        2-3  windspeed == 0 -> linear replacement  0.156353  0.329178   
7          3                     weather == 4 -> 3  0.156367  0.329180   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  
7        None  


### 仮説 4
atemp を使用しない．  
仮説 3 の下で行う．

In [24]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_3.drop('atemp', axis=1), train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_3.drop('atemp', axis=1), train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_3.drop('atemp', axis=1))
train_pred_casual = model_casual.predict(train_X_3.drop('atemp', axis=1))
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2.drop('atemp', axis=1))
valid_pred_casual = model_casual.predict(valid_X_2.drop('atemp', axis=1))
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['4', 'atemp -> drop', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.152028  0.339617  \
1        0-1                       Hparam Tuning 1  0.270274  0.351236   
2        0-2                       Hparam Tuning 2  0.186266  0.339103   
3          1                         split obj var  0.156426  0.328699   
4        2-1                windspeed == 0 -> drop  0.190929  0.333091   
5        2-2                windspeed == 0 -> mean  0.156678  0.329988   
6        2-3  windspeed == 0 -> linear replacement  0.156353  0.329178   
7          3                     weather == 4 -> 3  0.156367  0.329180   
8          4                         atemp -> drop  0.156701  0.330585   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  
7        None  
8        None  


**仮説 3 の下で submit**

In [25]:
if False:
    model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
    model_registered.fit(train_2.drop(['casual', 'registered', 'count'], axis=1), train_2.loc[:, 'registered'])

    model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
    model_casual.fit(train_2.drop(['casual', 'registered', 'count'], axis=1), train_2.loc[:, 'casual'])

    test_pred_registered = model_registered.predict(test_2)
    test_pred_casual = model_casual.predict(test_2)
    test_pred_count = test_pred_registered + test_pred_casual

    test_submittion = pd.DataFrame({'datetime': test_2.index, 'count': test_pred_count})
    test_submittion.to_csv('./data/output/submission_rf_3.csv', index=False)

### 仮説 5
目的変数を対数変換する．  
仮説 3 の下で行う．

In [27]:
train_y_log = np.log1p(train_y.loc[:, ['casual', 'registered']])
valid_y_log = np.log1p(valid_y.loc[:, ['casual', 'registered']])

model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_3, train_y_log.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_3, train_y_log.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_3)
train_pred_casual = model_casual.predict(train_X_3)
train_pred_count = np.expm1(train_pred_registered) + np.expm1(train_pred_casual)

valid_pred_registered = model_registered.predict(valid_X_2)
valid_pred_casual = model_casual.predict(valid_X_2)
valid_pred_count = np.expm1(valid_pred_registered) + np.expm1(valid_pred_casual)

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['5', 'log', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.152028  0.339617  \
1        0-1                       Hparam Tuning 1  0.270274  0.351236   
2        0-2                       Hparam Tuning 2  0.186266  0.339103   
3          1                         split obj var  0.156426  0.328699   
4        2-1                windspeed == 0 -> drop  0.190929  0.333091   
5        2-2                windspeed == 0 -> mean  0.156678  0.329988   
6        2-3  windspeed == 0 -> linear replacement  0.156353  0.329178   
7          3                     weather == 4 -> 3  0.156367  0.329180   
8          4                         atemp -> drop  0.156701  0.330585   
9          5                                   log  0.119066  0.304063   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  
7        None  
8        None  
9        None  


### 仮説 6
連続変数を標準化する．  
仮説 5 の下で行う．

In [39]:
train_6 = train_2.copy()
test_6 = test_2.copy()
cols = ['temp', 'atemp', 'humidity', 'windspeed']
obj_cols = ['registered', 'casual', 'count', 'casual_log', 'registered_log']

scaler = StandardScaler()
train_6.loc[:, cols] = scaler.fit_transform(train_6.loc[:, cols])
test_6.loc[:, cols] = scaler.transform(test_6.loc[:, cols])

train_6['registered_log'] = np.log1p(train_6.loc[:, 'registered'])
train_6['casual_log'] = np.log1p(train_6.loc[:, 'casual'])

train_X_6, valid_X_6, train_y_6, valid_y_6 = train_test_split(
    train_6.drop(obj_cols, axis=1), 
    train_6.loc[:, obj_cols])

In [38]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_6, train_y_6.loc[:, 'registered_log'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_6, train_y_6.loc[:, 'casual_log'])

train_pred_registered = model_registered.predict(train_X_6)
train_pred_casual = model_casual.predict(train_X_6)
train_pred_count = np.expm1(train_pred_registered) + np.expm1(train_pred_casual)

valid_pred_registered = model_registered.predict(valid_X_6)
valid_pred_casual = model_casual.predict(valid_X_6)
valid_pred_count = np.expm1(valid_pred_registered) + np.expm1(valid_pred_casual)

train_score = rmsle(train_y_6.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_6.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['6', 'log + scaler', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

   conditions                          descriptions     train     valid   
0       First                                        0.152028  0.339617  \
1         0-1                       Hparam Tuning 1  0.270274  0.351236   
2         0-2                       Hparam Tuning 2  0.186266  0.339103   
3           1                         split obj var  0.156426  0.328699   
4         2-1                windspeed == 0 -> drop  0.190929  0.333091   
5         2-2                windspeed == 0 -> mean  0.156678  0.329988   
6         2-3  windspeed == 0 -> linear replacement  0.156353  0.329178   
7           3                     weather == 4 -> 3  0.156367  0.329180   
8           4                         atemp -> drop  0.156701  0.330585   
9           5                                   log  0.119066  0.304063   
10          6                          log + scaler  0.120785  0.288149   

   submissions  
0         None  
1      0.48367  
2         None  
3         None  
4         None

**仮説 6 の下で submit**

In [36]:
if True:
    model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
    model_registered.fit(train_6.drop(obj_cols, axis=1), train_6.loc[:, 'registered_log'])

    model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
    model_casual.fit(train_6.drop(obj_cols, axis=1), train_6.loc[:, 'casual_log'])

    test_pred_registered = model_registered.predict(test_6)
    test_pred_casual = model_casual.predict(test_6)
    test_pred_count = np.expm1(test_pred_registered) + np.expm1(test_pred_casual)

    test_submittion = pd.DataFrame({'datetime': test_6.index, 'count': test_pred_count})
    test_submittion.to_csv('./data/output/submission_rf_6.csv', index=False)

KeyError: 'registered_log'