In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

train = pd.read_csv('./data/input/train.csv', parse_dates=['datetime'], index_col='datetime')
test = pd.read_csv('./data/input/test.csv', parse_dates=['datetime'], index_col='datetime')
data = pd.concat([train, test], axis=0).sort_index()

# データ前処理

In [2]:
data_0 = data.copy()
data_0['year'] = data_0.index.year
data_0['month'] = data_0.index.month
# data['day'] = data.index.day
data_0['hour'] = data_0.index.hour
data_0['weekday'] = data_0.index.weekday

train_0 = data_0.loc[train.index, :]
test_0 = data_0.loc[test.index, :].drop(columns=['count', 'casual', 'registered'])

# 仮説検証

## 仮説 1
count を registered と casual の和として予測する．

### t_1_1 : count

In [3]:
rf = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'count'])
pred = rf.predict(test_0)

In [4]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/t_1_1.csv', index=False)

### t_1_2 : registered + casual

In [5]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'registered'])
pred_r = rf_r.predict(test_0)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'casual'])
pred_c = rf_c.predict(test_0)

pred = pred_r + pred_c

In [6]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/t_1_2.csv', index=False)

### R: 1_2

## 仮説 2 : windspeed
風速 0 について

### 仮説 2-1 : delete
風速 を削除

In [7]:
# delete windspeed
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_0.drop(columns=['casual', 'registered', 'count', 'windspeed']), train_0.loc[:, 'registered'])
pred_r = rf_r.predict(test_0.drop(columns=['windspeed']))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_0.drop(columns=['casual', 'registered', 'count', 'windspeed']), train_0.loc[:, 'casual'])
pred_c = rf_c.predict(test_0.drop(columns=['windspeed']))

pred = pred_r + pred_c

In [10]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_1.csv', index=False)

### 仮説 2-2 : mean
風速 0 を平均値に置き換え

In [9]:
data_2_2 = data_0.copy()
data_2_2.loc[data_2_2.loc[:, 'windspeed'] == 0, 'windspeed'] = data_2_2.loc[data_2_2.loc[:, 'windspeed'] != 0, 'windspeed'].mean()

train_2_2 = data_2_2.loc[train.index, :]
test_2_2 = data_2_2.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [11]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_2.drop(columns=['casual', 'registered', 'count']), train_2_2.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_2)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_2.drop(columns=['casual', 'registered', 'count']), train_2_2.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_2)

pred = pred_r + pred_c

In [12]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_2.csv', index=False)

### 仮説 2-3 : interpolate
風速 0 を線形補完

In [13]:
data_2_3 = data_0.copy()
data_2_3.replace({'windspeed': {0: np.nan}}, inplace=True)
data_2_3.interpolate(method='time', inplace=True, limit_direction='both')

train_2_3 = data_2_3.loc[train.index, :]
test_2_3 = data_2_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [14]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_3.drop(columns=['casual', 'registered', 'count']), train_2_3.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_3)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_3.drop(columns=['casual', 'registered', 'count']), train_2_3.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_3)

pred = pred_r + pred_c

In [15]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_3.csv', index=False)

### 仮説 2-4 : predict by RF

In [16]:
data_2_4 = data_0.copy()

rf_w = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_w.fit(data_2_4.loc[data_2_4.loc[:, 'windspeed'] != 0, :].drop(columns=['casual', 'registered', 'count', 'windspeed']),
         data_2_4.loc[data_2_4.loc[:, 'windspeed'] != 0, 'windspeed'])
pred_w = rf_w.predict(data_2_4.loc[data_2_4.loc[:, 'windspeed'] == 0, :].drop(columns=['casual', 'registered', 'count', 'windspeed']))

data_2_4.loc[data_2_4.loc[:, 'windspeed'] == 0, 'windspeed'] = pred_w

train_2_4 = data_2_4.loc[train.index, :]
test_2_4 = data_2_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [17]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_4.drop(columns=['casual', 'registered', 'count']), train_2_4.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_4)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_4.drop(columns=['casual', 'registered', 'count']), train_2_4.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_4)

pred = pred_r + pred_c

In [18]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_4.csv', index=False)

### R: 2-1

## 仮説 3 : weather
天候 4 を 3 とする．  
仮説 1, 仮説 2-1 の手法を用いる．

In [22]:
data_3 = data_0.copy()
data_3.drop(columns=['windspeed'], inplace=True)
data_3.replace({'weather': {4: 3}}, inplace=True)

train_3 = data_3.loc[train.index, :]
test_3 = data_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [23]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_3.drop(columns=['casual', 'registered', 'count']), train_3.loc[:, 'registered'])
pred_r = rf_r.predict(test_3)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_3.drop(columns=['casual', 'registered', 'count']), train_3.loc[:, 'casual'])
pred_c = rf_c.predict(test_3)

pred = pred_r + pred_c

In [24]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_3.csv', index=False)

## 仮説 4 : atemp
atemp を使用しない．  
仮説 3 の下で行う．

In [25]:
data_4 = data_0.copy()
data_4.drop(columns=['windspeed', 'atemp'], inplace=True)
data_4.replace({'weather': {4: 3}}, inplace=True)

train_4 = data_4.loc[train.index, :]
test_4 = data_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [26]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_4.drop(columns=['casual', 'registered', 'count']), train_4.loc[:, 'registered'])
pred_r = rf_r.predict(test_4)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_4.drop(columns=['casual', 'registered', 'count']), train_4.loc[:, 'casual'])
pred_c = rf_c.predict(test_4)

pred = pred_r + pred_c

In [27]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_4.csv', index=False)

## 仮説 5 : log
目的変数を対数変換する．log(x + 1)  
仮説 3 の下で行う．

In [28]:
data_5 = data_0.copy()
data_5.drop(columns=['windspeed'], inplace=True)
data_5.replace({'weather': {4: 3}}, inplace=True)

train_5 = data_5.loc[train.index, :]
test_5 = data_5.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [29]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_5.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_5))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_5.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_5))

pred = pred_r + pred_c

In [30]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_5.csv', index=False)

## 仮説 6 : scaler
連続変数を標準化する．  
仮説 5 の下で行う．

In [31]:
data_6 = data_0.copy()
data_6.drop(columns=['windspeed'], inplace=True)
data_6.replace({'weather': {4: 3}}, inplace=True)

scaler = StandardScaler()
data_6.loc[:, ['temp', 'atemp', 'humidity']] = scaler.fit_transform(data_6.loc[:, ['temp', 'atemp', 'humidity']])

train_6 = data_6.loc[train.index, :]
test_6 = data_6.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [32]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_6.drop(columns=['casual', 'registered', 'count']), np.log1p(train_6.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_6))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_6.drop(columns=['casual', 'registered', 'count']), np.log1p(train_6.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_6))

pred = pred_r + pred_c

In [33]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_6.csv', index=False)

## 仮説 7 : datetime
datetime について  
仮説 6 の下で行う

### 仮説 7-1 : month

In [34]:
data_7_1 = data_0.copy()
data_7_1.drop(columns=['windspeed', 'month'], inplace=True)
data_7_1.replace({'weather': {4: 3}}, inplace=True)

train_7_1 = data_7_1.loc[train.index, :]
test_7_1 = data_7_1.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [35]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_1.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_1))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_1.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_1))

In [36]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred_r + pred_c})
submission.to_csv('./data/output/t_7_1.csv', index=False)

### 仮説 7-2 : season

In [37]:
data_7_2 = data_0.copy()
data_7_2.drop(columns=['windspeed', 'season'], inplace=True)
data_7_2.replace({'weather': {4: 3}}, inplace=True)

train_7_2 = data_7_2.loc[train.index, :]
test_7_2 = data_7_2.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_2.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_2))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_2.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_2))

pred = pred_r + pred_c

: 

: 

In [None]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_7_2.csv', index=False)

### 仮説 7-3 : weekday

In [None]:
# Delete weekday
sc = RF_Model_Sum_Logex(train_X_6.drop(columns=['weekday'], axis=1), train_y, valid_X_6.drop(columns=['weekday'], axis=1), valid_y)
scores = add_score('73', '[weekday] : deleted', sc, scores)

   number                       descriptions     rmsle
11     40               [atemp] : Do nothing  0.324384
12     41                  [atemp] : Deleted  0.325791
13     50  [registered, casual] : Do nothing  0.324384
14     51         [registered, casual] : Log  0.302610
15     60                         Do nothing  0.302610
16     61                     StandardScaler  0.302298
17     70            [datetime] : Do nothing  0.302298
18     71                  [month] : Deleted  0.310272
19     72                 [season] : Deleted  0.302249
20     73                [weekday] : deleted  0.326193


### 仮説 7-4 : day

In [None]:
# Add day
train_X_6_day = pd.DataFrame(train_X_6.index.day, index=train_X_6.index).rename(columns={'datetime': 'day'})
valid_X_6_day = pd.DataFrame(valid_X_6.index.day, index=valid_X_6.index).rename(columns={'datetime': 'day'})

In [None]:
sc = RF_Model_Sum_Logex(
    pd.concat([train_X_6, train_X_6_day], axis=1),
    train_y,
    pd.concat([valid_X_6, valid_X_6_day], axis=1),
    valid_y
)
scores = add_score('74', '[day] : Added', sc, scores)

   number                       descriptions     rmsle
12     41                  [atemp] : Deleted  0.325791
13     50  [registered, casual] : Do nothing  0.324384
14     51         [registered, casual] : Log  0.302610
15     60                         Do nothing  0.302610
16     61                     StandardScaler  0.302298
17     70            [datetime] : Do nothing  0.302298
18     71                  [month] : Deleted  0.310272
19     72                 [season] : Deleted  0.302249
20     73                [weekday] : deleted  0.326193
21     74                      [day] : Added  0.300317


## 仮説 8 : datetime, SIO

In [None]:
# saparate in order
train_8 = pd.concat([pd.concat([train_X_6, train_y], axis=1), pd.concat([valid_X_6, valid_y], axis=1)], axis=0).sort_index()

train_X_8 = train_8.loc[train_8.index.day <= 14, :].drop(columns=train_y.columns)
valid_X_8 = train_8.loc[train_8.index.day > 14, :].drop(columns=train_y.columns)

train_y_8 = train_8.loc[train_X_8.index, train_y.columns]
valid_y_8 = train_8.loc[valid_X_8.index, train_y.columns]

In [None]:
print(train_X_6.shape, train_X_8.shape)

(7620, 11) (8026, 11)


In [None]:
sc = RF_Model_Sum_Logex(train_X_8, train_y_8, valid_X_8, valid_y_8)
scores = add_score('80', 'separate in order, sorted', sc, scores, False)

In [None]:
# shuffle
shuffle_index = np.random.permutation(train_X_8.index)
sc = RF_Model_Sum_Logex(train_X_8.loc[shuffle_index, :], train_y_8.loc[shuffle_index, :], valid_X_8, valid_y_8)
scores = add_score('81', 'separate in order, shuffle', sc, scores)

   number                descriptions     rmsle
14     51  [registered, casual] : Log  0.302610
15     60                  Do nothing  0.302610
16     61              StandardScaler  0.302298
17     70     [datetime] : Do nothing  0.302298
18     71           [month] : Deleted  0.310272
19     72          [season] : Deleted  0.302249
20     73         [weekday] : deleted  0.326193
21     74               [day] : Added  0.300317
22     80   separate in order, sorted  0.320048
23     81  separate in order, shuffle  0.320231


In [None]:
# SIO sorted, delete month
sc = RF_Model_Sum_Logex(train_X_8.drop(columns=['month'], axis=1), train_y_8, valid_X_8.drop(columns=['month'], axis=1), valid_y_8)
scores = add_score('82', 'SIO sorted, [month] : deleted', sc, scores, False)

#SIO sorted, delete season
sc = RF_Model_Sum_Logex(train_X_8.drop(columns=['season'], axis=1), train_y_8, valid_X_8.drop(columns=['season'], axis=1), valid_y_8)
scores = add_score('83', 'SIO sorted, [season] : deleted', sc, scores, False)

# SIO sorted, delete weekday
sc = RF_Model_Sum_Logex(train_X_8.drop(columns=['weekday'], axis=1), train_y_8, valid_X_8.drop(columns=['weekday'], axis=1), valid_y_8)
scores = add_score('84', 'SIO sorted, [weekday] : deleted', sc, scores)

   number                     descriptions     rmsle
17     70          [datetime] : Do nothing  0.302298
18     71                [month] : Deleted  0.310272
19     72               [season] : Deleted  0.302249
20     73              [weekday] : deleted  0.326193
21     74                    [day] : Added  0.300317
22     80        separate in order, sorted  0.320048
23     81       separate in order, shuffle  0.320231
24     82    SIO sorted, [month] : deleted  0.342248
25     83   SIO sorted, [season] : deleted  0.319927
26     84  SIO sorted, [weekday] : deleted  0.350749


In [None]:
# SIO sorted, add day
train_X_8_day = pd.DataFrame(train_X_8.index.day, index=train_X_8.index).rename(columns={'datetime': 'day'})
valid_X_8_day = pd.DataFrame(valid_X_8.index.day, index=valid_X_8.index).rename(columns={'datetime': 'day'})

sc = RF_Model_Sum_Logex(
    pd.concat([train_X_8, train_X_8_day], axis=1),
    train_y_8,
    pd.concat([valid_X_8, valid_X_8_day], axis=1),
    valid_y_8
)
scores = add_score('85', 'SIO sorted, [day] : Added', sc, scores, False)

# SIO shuffle, add day
sc = RF_Model_Sum_Logex(
    pd.concat([train_X_8, train_X_8_day], axis=1).loc[shuffle_index, :],
    train_y_8.loc[shuffle_index, :],
    pd.concat([valid_X_8, valid_X_8_day], axis=1),
    valid_y_8
)
scores = add_score('86', 'SIO shuffle, [day] : Added', sc, scores)

   number                     descriptions     rmsle
19     72               [season] : Deleted  0.302249
20     73              [weekday] : deleted  0.326193
21     74                    [day] : Added  0.300317
22     80        separate in order, sorted  0.320048
23     81       separate in order, shuffle  0.320231
24     82    SIO sorted, [month] : deleted  0.342248
25     83   SIO sorted, [season] : deleted  0.319927
26     84  SIO sorted, [weekday] : deleted  0.350749
27     85        SIO sorted, [day] : Added  0.319094
28     86       SIO shuffle, [day] : Added  0.318503


## 仮説 9 : humidity

In [None]:
# check the number of zero in humidity
print('The number of zero in humidity: ', train_0.loc[train_0.loc[:, 'humidity'] == 0, 'humidity'].shape[0])

The number of zero in humidity:  22


In [None]:
train_X_9 = pd.concat([train_X_8, train_X_8_day], axis=1).sample(frac=1, random_state=0)
valid_X_9 = pd.concat([valid_X_8, valid_X_8_day], axis=1)

train_y_9 = train_y_8.loc[train_X_9.index, :]
valid_y_9 = valid_y_8.loc[valid_X_9.index, :]

In [None]:
# Do nothing
sc = RF_Model_Sum_Logex(train_X_9, train_y_9, valid_X_9, valid_y_9)
scores = add_score('90', '[humidity] : Do nothing', sc, scores)

   number                     descriptions     rmsle
20     73              [weekday] : deleted  0.326193
21     74                    [day] : Added  0.300317
22     80        separate in order, sorted  0.320048
23     81       separate in order, shuffle  0.320231
24     82    SIO sorted, [month] : deleted  0.342248
25     83   SIO sorted, [season] : deleted  0.319927
26     84  SIO sorted, [weekday] : deleted  0.350749
27     85        SIO sorted, [day] : Added  0.319094
28     86       SIO shuffle, [day] : Added  0.318503
29     90          [humidity] : Do nothing  0.319291


### 仮説 9-1 : delete

In [None]:
# Delete humidity
sc = RF_Model_Sum_Logex(train_X_9.drop(columns=['humidity'], axis=1), train_y_9, valid_X_9.drop(columns=['humidity'], axis=1), valid_y_9)
scores = add_score('91', '[humidity] : Deleted', sc, scores, False)

### 仮説 9-2: mean

In [None]:
train_9_2 = train_0.copy()
train_9_2.drop(columns=['season', 'windspeed'], axis=1)
train_9_2.replace({'weather': 4}, 3).replace({'humidity': 0}, train_9_2.loc[train_9_2.loc[:, 'humidity'] != 0, 'humidity'].mean(), inplace=True)
train_9_2.loc[:, 'day'] = train_9_2.index.day

scaler = StandardScaler()
train_9_2.loc[:, ['temp', 'atemp', 'humidity']] = scaler.fit_transform(train_9_2.loc[:, ['temp', 'atemp', 'humidity']])

train_X_9_2 = train_9_2.loc[train_X_9.index, train_X_9.columns]
valid_X_9_2 = train_9_2.loc[valid_X_9.index, train_X_9.columns]

In [None]:
# Replace zero with mean
sc = RF_Model_Sum_Logex(train_X_9_2, train_y_9, valid_X_9_2, valid_y_9)
scores = add_score('92', '[humidity] : Replaced zero with mean', sc, scores)

   number                          descriptions     rmsle
22     80             separate in order, sorted  0.320048
23     81            separate in order, shuffle  0.320231
24     82         SIO sorted, [month] : deleted  0.342248
25     83        SIO sorted, [season] : deleted  0.319927
26     84       SIO sorted, [weekday] : deleted  0.350749
27     85             SIO sorted, [day] : Added  0.319094
28     86            SIO shuffle, [day] : Added  0.318503
29     90               [humidity] : Do nothing  0.319291
30     91                  [humidity] : Deleted  0.333830
31     92  [humidity] : Replaced zero with mean  0.319134


### 仮説 9-3 : interpolate

In [None]:
train_9_3 = train_0.copy()
train_9_3.drop(columns=['season', 'windspeed'], axis=1)
train_9_3.replace({'weather': 4}, 3).replace({'humidity': 0}, np.nan, inplace=True)
train_9_3.interpolate(method='time', inplace=True, limit_direction='both')
train_9_3.loc[:, 'day'] = train_9_3.index.day

train_X_9_3 = train_9_3.loc[train_X_9.index, train_X_9.columns]
valid_X_9_3 = train_9_3.loc[valid_X_9.index, train_X_9.columns]

In [None]:
sc = RF_Model_Sum_Logex(train_X_9_3, train_y_9, valid_X_9_3, valid_y_9)
scores = add_score('93', '[humidity] : Replaced zero with interpolate', sc, scores)

   number                                 descriptions     rmsle
23     81                   separate in order, shuffle  0.320231
24     82                SIO sorted, [month] : deleted  0.342248
25     83               SIO sorted, [season] : deleted  0.319927
26     84              SIO sorted, [weekday] : deleted  0.350749
27     85                    SIO sorted, [day] : Added  0.319094
28     86                   SIO shuffle, [day] : Added  0.318503
29     90                      [humidity] : Do nothing  0.319291
30     91                         [humidity] : Deleted  0.333830
31     92         [humidity] : Replaced zero with mean  0.319134
32     93  [humidity] : Replaced zero with interpolate  0.319005


### 仮説 9-4 : RF

In [None]:
train_9_4 = train_9_3.copy()
train_9_4.loc[:, 'humidity'] = train_0.loc[train_9_4.index, 'humidity']

model = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
model.fit(train_9_4.loc[train_9_4.loc[:, 'humidity'] != 0, :].drop(columns=['registered', 'casual', 'count', 'humidity'], axis=1),
            train_9_4.loc[train_9_4.loc[:, 'humidity'] != 0, 'humidity'])
pred = model.predict(train_9_4.loc[train_9_4.loc[:, 'humidity'] == 0, :].drop(columns=['registered', 'casual', 'count', 'humidity'], axis=1))

train_9_4.loc[train_9_4.loc[:, 'humidity'] == 0, 'humidity'] = pred

scaler = StandardScaler()
train_9_4.loc[:, 'humidity'] = scaler.fit_transform(train_9_4.loc[:, ['humidity']])

train_X_9_4 = train_9_4.loc[train_X_9.index, train_X_9.columns]
valid_X_9_4 = train_9_4.loc[valid_X_9.index, train_X_9.columns]

In [None]:
sc = RF_Model_Sum_Logex(train_X_9_4, train_y_9, valid_X_9_4, valid_y_9)
scores = add_score('94', '[humidity] : Replaced zero with RF', sc, scores)

   number                                 descriptions     rmsle
24     82                SIO sorted, [month] : deleted  0.342248
25     83               SIO sorted, [season] : deleted  0.319927
26     84              SIO sorted, [weekday] : deleted  0.350749
27     85                    SIO sorted, [day] : Added  0.319094
28     86                   SIO shuffle, [day] : Added  0.318503
29     90                      [humidity] : Do nothing  0.319291
30     91                         [humidity] : Deleted  0.333830
31     92         [humidity] : Replaced zero with mean  0.319134
32     93  [humidity] : Replaced zero with interpolate  0.319005
33     94           [humidity] : Replaced zero with RF  0.320194


# 提出

In [None]:
print(train_0.columns.values)
print(test_0.columns.values)

['season' 'holiday' 'workingday' 'weather' 'temp' 'atemp' 'humidity'
 'windspeed' 'casual' 'registered' 'count' 'year' 'month' 'hour' 'weekday']
['season' 'holiday' 'workingday' 'weather' 'temp' 'atemp' 'humidity'
 'windspeed' 'year' 'month' 'hour' 'weekday']


## Submission 1
- predict count by sum of registered and casual
- log convertion and scaler
- datetime: year, month, day, weekday
- delete: windspeed, season
- weather: 4 -> 3
- humidity: interpolate

In [None]:
data_s_1 = pd.concat([train_0, test_0], axis=0)
data_s_1.loc[:, 'day'] = data_s_1.index.day
data_s_1.drop(columns=['season', 'windspeed'], axis=1, inplace=True)
data_s_1.replace({'weather': 4}, 3).replace({'humidity': 0}, np.nan, inplace=True)
data_s_1.interpolate(method='time', inplace=True, limit_direction='both')

numerical_cols = ['temp', 'atemp', 'humidity']
scaler = StandardScaler()
data_s_1.loc[:, numerical_cols] = scaler.fit_transform(data_s_1.loc[:, numerical_cols])

train_s_1 = data_s_1.loc[train_0.index, :]
train_s_1.loc[:, 'registered'] = np.log1p(train_s_1.loc[:, 'registered'])
train_s_1.loc[:, 'casual'] = np.log1p(train_s_1.loc[:, 'casual'])

test_s_1 = data_s_1.loc[test_0.index, :].drop(columns=['registered', 'casual', 'count'], axis=1)

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_s_1.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_1.loc[:, 'registered'])
pred_r = rf_r.predict(test_s_1)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_s_1.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_1.loc[:, 'casual'])
pred_c = rf_c.predict(test_s_1)

pred = np.expm1(pred_r) + np.expm1(pred_c)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/submission_s_1.csv', index=False)

## Submission 2
**1->deleted day**
- predict count by sum of registered and casual
- log convertion and scaler
- datetime: year, month, weekday
- delete: windspeed, season
- weather: 4 -> 3
- humidity: interpolate

In [None]:
train_s_2 = train_s_1.drop(columns=['day'], axis=1)
test_s_2 = test_s_1.drop(columns=['day'], axis=1)

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_s_2.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_2.loc[:, 'registered'])
pred_r = rf_r.predict(test_s_2)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_s_2.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_2.loc[:, 'casual'])
pred_c = rf_c.predict(test_s_2)

pred = np.expm1(pred_r) + np.expm1(pred_c)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/submission_s_2.csv', index=False)

## Submission 3
**2->using windspeed interpolated**
- predict count by sum of registered and casual
- log convertion and scaler
- datetime: year, month, weekday
- delete: season
- weather: 4 -> 3
- humidity, windspeed: interpolate

In [None]:
train_s_3 = train_s_2.copy()
test_s_3 = test_s_2.copy()

wind = pd.DataFrame(pd.concat([train_0.loc[:, 'windspeed'], test_0.loc[:, 'windspeed']], axis=0), columns=['windspeed'])
wind.replace({'windspeed': 0}, np.nan, inplace=True)
wind.interpolate(method='time', inplace=True, limit_direction='both')

scaler = StandardScaler()
wind.loc[:, 'windspeed'] = scaler.fit_transform(wind.loc[:, ['windspeed']])

train_s_3.loc[:, 'windspeed'] = wind.loc[train_s_3.index]
test_s_3.loc[:, 'windspeed'] = wind.loc[test_s_3.index]

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_s_3.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_3.loc[:, 'registered'])
pred_r = rf_r.predict(test_s_3)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_s_3.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_3.loc[:, 'casual'])
pred_c = rf_c.predict(test_s_3)

pred = np.expm1(pred_r) + np.expm1(pred_c)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/submission_s_3.csv', index=False)

: 

## Submission 4
**3->wind, humidity: no convertion**
- predict count by sum of registered and casual
- log convertion and scaler
- datetime: year, month, weekday
- delete: season
- weather: 4 -> 3

In [None]:
train_s_4 = train_s_3.copy()
test_s_4 = test_s_3.copy()

train_s_4.loc[:, 'windspeed'] = train_0.loc[:, 'windspeed']
train_s_4.loc[:, 'humidity'] = train_0.loc[:, 'humidity']
test_s_4.loc[:, 'windspeed'] = test_0.loc[:, 'windspeed']
test_s_4.loc[:, 'humidity'] = test_0.loc[:, 'humidity']

scaler = StandardScaler()
train_s_4.loc[:, ['windspeed', 'humidity']] = scaler.fit_transform(train_s_4.loc[:, ['windspeed', 'humidity']])
test_s_4.loc[:, ['windspeed', 'humidity']] = scaler.transform(test_s_4.loc[:, ['windspeed', 'humidity']])

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_s_4.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_4.loc[:, 'registered'])
pred_r = rf_r.predict(test_s_4)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_s_4.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_4.loc[:, 'casual'])
pred_c = rf_c.predict(test_s_4)

pred = np.expm1(pred_r) + np.expm1(pred_c)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/submission_s_4.csv', index=False)

## Submission 5
**3->atemp: delete**
- predict count by sum of registered and casual
- log convertion and scaler
- datetime: year, month, weekday
- delete: season, atemp
- weather: 4 -> 3
- humidity, windspeed: interpolate

In [None]:
train_s_5 = train_s_3.copy()
test_s_5 = test_s_3.copy()

train_s_5.drop(columns=['atemp'], axis=1, inplace=True)
test_s_5.drop(columns=['atemp'], axis=1, inplace=True)

In [None]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_s_5.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_5.loc[:, 'registered'])
pred_r = rf_r.predict(test_s_5)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_s_5.drop(columns=['registered', 'casual', 'count'], axis=1), train_s_5.loc[:, 'casual'])
pred_c = rf_c.predict(test_s_5)

pred = np.expm1(pred_r) + np.expm1(pred_c)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/submission_s_5.csv', index=False)