In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

train = pd.read_csv('./data/input/train.csv', parse_dates=['datetime'], index_col='datetime')
test = pd.read_csv('./data/input/test.csv', parse_dates=['datetime'], index_col='datetime')
data = pd.concat([train, test], axis=0).sort_index()

# データ前処理

In [2]:
data_0 = data.copy()
data_0['year'] = data_0.index.year
data_0['month'] = data_0.index.month
# data['day'] = data.index.day
data_0['hour'] = data_0.index.hour
data_0['weekday'] = data_0.index.weekday

train_0 = data_0.loc[train.index, :]
test_0 = data_0.loc[test.index, :].drop(columns=['count', 'casual', 'registered'])

# 仮説検証

In [25]:
rf = RFR()
rf.fit(train_0.drop(columns=['count', 'casual', 'registered']), train_0['count'])
pred = rf.predict(test_0)

In [None]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/t_0.csv', index=False)

## 仮説 1
count を registered と casual の和として予測する．

### t_1_1 : count

In [3]:
rf = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'count'])
pred = rf.predict(test_0)

In [4]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/t_1_1.csv', index=False)

### t_1_2 : registered + casual

In [5]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'registered'])
pred_r = rf_r.predict(test_0)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_0.drop(columns=['casual', 'registered', 'count']), train_0.loc[:, 'casual'])
pred_c = rf_c.predict(test_0)

pred = pred_r + pred_c

In [6]:
submission = pd.DataFrame({'datetime': test_0.index, 'count': pred})
submission.to_csv('./data/output/t_1_2.csv', index=False)

### R: 1_2

## 仮説 2 : windspeed
風速 0 について

### 仮説 2-1 : delete
風速 を削除

In [7]:
# delete windspeed
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_0.drop(columns=['casual', 'registered', 'count', 'windspeed']), train_0.loc[:, 'registered'])
pred_r = rf_r.predict(test_0.drop(columns=['windspeed']))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_0.drop(columns=['casual', 'registered', 'count', 'windspeed']), train_0.loc[:, 'casual'])
pred_c = rf_c.predict(test_0.drop(columns=['windspeed']))

pred = pred_r + pred_c

In [10]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_1.csv', index=False)

### 仮説 2-2 : mean
風速 0 を平均値に置き換え

In [9]:
data_2_2 = data_0.copy()
data_2_2.loc[data_2_2.loc[:, 'windspeed'] == 0, 'windspeed'] = data_2_2.loc[data_2_2.loc[:, 'windspeed'] != 0, 'windspeed'].mean()

train_2_2 = data_2_2.loc[train.index, :]
test_2_2 = data_2_2.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [11]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_2.drop(columns=['casual', 'registered', 'count']), train_2_2.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_2)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_2.drop(columns=['casual', 'registered', 'count']), train_2_2.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_2)

pred = pred_r + pred_c

In [12]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_2.csv', index=False)

### 仮説 2-3 : interpolate
風速 0 を線形補完

In [13]:
data_2_3 = data_0.copy()
data_2_3.replace({'windspeed': {0: np.nan}}, inplace=True)
data_2_3.interpolate(method='time', inplace=True, limit_direction='both')

train_2_3 = data_2_3.loc[train.index, :]
test_2_3 = data_2_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [14]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_3.drop(columns=['casual', 'registered', 'count']), train_2_3.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_3)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_3.drop(columns=['casual', 'registered', 'count']), train_2_3.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_3)

pred = pred_r + pred_c

In [15]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_3.csv', index=False)

### 仮説 2-4 : predict by RF

In [16]:
data_2_4 = data_0.copy()

rf_w = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_w.fit(data_2_4.loc[data_2_4.loc[:, 'windspeed'] != 0, :].drop(columns=['casual', 'registered', 'count', 'windspeed']),
         data_2_4.loc[data_2_4.loc[:, 'windspeed'] != 0, 'windspeed'])
pred_w = rf_w.predict(data_2_4.loc[data_2_4.loc[:, 'windspeed'] == 0, :].drop(columns=['casual', 'registered', 'count', 'windspeed']))

data_2_4.loc[data_2_4.loc[:, 'windspeed'] == 0, 'windspeed'] = pred_w

train_2_4 = data_2_4.loc[train.index, :]
test_2_4 = data_2_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [17]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_2_4.drop(columns=['casual', 'registered', 'count']), train_2_4.loc[:, 'registered'])
pred_r = rf_r.predict(test_2_4)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_2_4.drop(columns=['casual', 'registered', 'count']), train_2_4.loc[:, 'casual'])
pred_c = rf_c.predict(test_2_4)

pred = pred_r + pred_c

In [18]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_2_4.csv', index=False)

### R: 2-1

## 仮説 3 : weather
天候 4 を 3 とする．  
仮説 1, 仮説 2-1 の手法を用いる．

In [22]:
data_3 = data_0.copy()
data_3.drop(columns=['windspeed'], inplace=True)
data_3.replace({'weather': {4: 3}}, inplace=True)

train_3 = data_3.loc[train.index, :]
test_3 = data_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [23]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_3.drop(columns=['casual', 'registered', 'count']), train_3.loc[:, 'registered'])
pred_r = rf_r.predict(test_3)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_3.drop(columns=['casual', 'registered', 'count']), train_3.loc[:, 'casual'])
pred_c = rf_c.predict(test_3)

pred = pred_r + pred_c

In [24]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_3.csv', index=False)

## 仮説 4 : atemp
atemp を使用しない．  
仮説 3 の下で行う．

In [25]:
data_4 = data_0.copy()
data_4.drop(columns=['windspeed', 'atemp'], inplace=True)
data_4.replace({'weather': {4: 3}}, inplace=True)

train_4 = data_4.loc[train.index, :]
test_4 = data_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [26]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_4.drop(columns=['casual', 'registered', 'count']), train_4.loc[:, 'registered'])
pred_r = rf_r.predict(test_4)

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_4.drop(columns=['casual', 'registered', 'count']), train_4.loc[:, 'casual'])
pred_c = rf_c.predict(test_4)

pred = pred_r + pred_c

In [27]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_4.csv', index=False)

## 仮説 5 : log
目的変数を対数変換する．log(x + 1)  
仮説 3 の下で行う．

In [28]:
data_5 = data_0.copy()
data_5.drop(columns=['windspeed'], inplace=True)
data_5.replace({'weather': {4: 3}}, inplace=True)

train_5 = data_5.loc[train.index, :]
test_5 = data_5.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [29]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_5.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_5))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_5.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_5))

pred = pred_r + pred_c

In [30]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_5.csv', index=False)

## 仮説 6 : scaler
連続変数を標準化する．  
仮説 5 の下で行う．

In [31]:
data_6 = data_0.copy()
data_6.drop(columns=['windspeed'], inplace=True)
data_6.replace({'weather': {4: 3}}, inplace=True)

scaler = StandardScaler()
data_6.loc[:, ['temp', 'atemp', 'humidity']] = scaler.fit_transform(data_6.loc[:, ['temp', 'atemp', 'humidity']])

train_6 = data_6.loc[train.index, :]
test_6 = data_6.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [32]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_6.drop(columns=['casual', 'registered', 'count']), np.log1p(train_6.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_6))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_6.drop(columns=['casual', 'registered', 'count']), np.log1p(train_6.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_6))

pred = pred_r + pred_c

In [33]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_6.csv', index=False)

## 仮説 7 : datetime
datetime について  
仮説 6 の下で行う

### 仮説 7-1 : month

In [34]:
data_7_1 = data_0.copy()
data_7_1.drop(columns=['windspeed', 'month'], inplace=True)
data_7_1.replace({'weather': {4: 3}}, inplace=True)

train_7_1 = data_7_1.loc[train.index, :]
test_7_1 = data_7_1.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [35]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_1.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_1))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_1.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_1))

In [36]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred_r + pred_c})
submission.to_csv('./data/output/t_7_1.csv', index=False)

### 仮説 7-2 : season

In [3]:
data_7_2 = data_0.copy()
data_7_2.drop(columns=['windspeed', 'season'], inplace=True)
data_7_2.replace({'weather': {4: 3}}, inplace=True)

train_7_2 = data_7_2.loc[train.index, :]
test_7_2 = data_7_2.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [4]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_2.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_2))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_2.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_2))

pred = pred_r + pred_c

In [5]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_7_2.csv', index=False)

### 仮説 7-3 : weekday

In [6]:
data_7_3 = data_0.copy()
data_7_3.drop(columns=['windspeed', 'weekday'], inplace=True)
data_7_3.replace({'weather': {4: 3}}, inplace=True)

train_7_3 = data_7_3.loc[train.index, :]
test_7_3 = data_7_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [7]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_3.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_3.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_3))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_3.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_3.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_3))

pred = pred_r + pred_c

In [8]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_7_3.csv', index=False)

### 仮説 7-4 : day

In [10]:
data_7_4 = data_0.copy()
data_7_4.drop(columns=['windspeed'], inplace=True)
data_7_4.replace({'weather': {4: 3}}, inplace=True)
data_7_4.loc[:, 'data'] = data_7_4.index.day

train_7_4 = data_7_4.loc[train.index, :]
test_7_4 = data_7_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [11]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_4.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_4.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_4))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_4.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_4.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_4))

pred = pred_r + pred_c

In [12]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_7_4.csv', index=False)

### R: 7-1

### t_7_5 : 7-1 + weather 4

In [25]:
data_7_5 = data_0.copy()
data_7_5.drop(columns=['windspeed', 'month'], inplace=True)

train_7_5 = data_7_5.loc[train.index, :]
test_7_5 = data_7_5.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [26]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_7_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_5.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_7_5))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_7_5.drop(columns=['casual', 'registered', 'count']), np.log1p(train_7_5.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_7_5))

pred = pred_r + pred_c

In [27]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_7_5.csv', index=False)

: 

## 仮説 8 : humidity

### 仮説 8-1 : delete

In [13]:
data_8_1 = data_0.copy()
data_8_1.drop(columns=['windspeed', 'month', 'humidity'], inplace=True)
data_8_1.replace({'weather': {4: 3}}, inplace=True)

train_8_1 = data_8_1.loc[train.index, :]
test_8_1 = data_8_1.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [14]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_8_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_1.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_8_1))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_8_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_1.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_8_1))

pred = pred_r + pred_c

In [15]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_8_1.csv', index=False)

### 仮説 8-2: mean

In [16]:
data_8_2 = data_0.copy()
data_8_2.drop(columns=['windspeed', 'month'], inplace=True)
data_8_2.replace({'weather': {4: 3}}, inplace=True)
data_8_2.replace({'humidity': {0: data_8_2.loc[data_8_2.loc[:, 'humidity'] != 0, 'humidity'].mean()}}, inplace=True)

train_8_2 = data_8_2.loc[train.index, :]
test_8_2 = data_8_2.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [17]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_8_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_2.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_8_2))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_8_2.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_2.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_8_2))

pred = pred_r + pred_c

In [18]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_8_2.csv', index=False)

### 仮説 8-3 : interpolate

In [19]:
data_8_3 = data_0.copy()
data_8_3.drop(columns=['windspeed', 'month'], inplace=True)
data_8_3.replace({'weather': {4: 3}}, inplace=True)

data_8_3.replace({'humidity': 0}, np.nan, inplace=True)
data_8_3.interpolate(method='time', inplace=True, limit_direction='both')

train_8_3 = data_8_3.loc[train.index, :]
test_8_3 = data_8_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [20]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_8_3.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_3.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_8_3))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_8_3.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_3.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_8_3))

pred = pred_r + pred_c

In [21]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_8_3.csv', index=False)

### 仮説 8-4 : RF

In [22]:
data_8_4 = data_0.copy()
data_8_4.drop(columns=['windspeed', 'month'], inplace=True)
data_8_4.replace({'weather': {4: 3}}, inplace=True)

rf_h = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_h.fit(data_8_4.loc[data_8_4.loc[:, 'humidity'] != 0, :].drop(columns=['casual', 'registered', 'count', 'humidity']),
            data_8_4.loc[data_8_4.loc[:, 'humidity'] != 0, 'humidity'])
pred_h = rf_h.predict(data_8_4.loc[data_8_4.loc[:, 'humidity'] == 0, :].drop(columns=['casual', 'registered', 'count', 'humidity']))

data_8_4.loc[data_8_4.loc[:, 'humidity'] == 0, 'humidity'] = pred_h

train_8_4 = data_8_4.loc[train.index, :]
test_8_4 = data_8_4.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [23]:
rf_r = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_r.fit(train_8_4.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_4.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_8_4))

rf_c = RFR(random_state=0, n_jobs=-1, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400)
rf_c.fit(train_8_4.drop(columns=['casual', 'registered', 'count']), np.log1p(train_8_4.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_8_4))

pred = pred_r + pred_c

In [24]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_8_4.csv', index=False)

### R: 7-1

## 99 : Highperparameter tuning
using Optuna

In [3]:
from sklearn.model_selection import cross_val_score
import optuna

In [4]:
data_99 = data_0.copy()
data_99.drop(columns=['windspeed', 'month'], inplace=True)

train_99 = data_99.loc[train.index, :]
test_99 = data_99.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [5]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 1, 1000)
    max_features = trial.suggest_categorical('max_features', ['sqrt','log2', None])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 1,1000)
    n_estimators =  trial.suggest_int('n_estimators', 1, 1000)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 5)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    regr = RFR(max_depth = max_depth,
               max_features = max_features,
               max_leaf_nodes = max_leaf_nodes,
               n_estimators = n_estimators,
               min_samples_split = min_samples_split,
               min_samples_leaf = min_samples_leaf,
               n_jobs = 2)

    score = cross_val_score(regr,
                            train_99.drop(columns=['casual', 'registered', 'count']),
                            train_99.loc[:, 'count'],
                            cv=5, scoring="r2")
    r2_mean = score.mean()
    print(r2_mean)

    return r2_mean

In [6]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

optimised_rf = RFR(max_depth = study.best_params['max_depth'], 
                   max_features = study.best_params['max_features'],
                   max_leaf_nodes = study.best_params['max_leaf_nodes'],
                   n_estimators = study.best_params['n_estimators'],
                   min_samples_split = study.best_params['min_samples_split'],
                   min_samples_leaf = study.best_params['min_samples_leaf'],
                   n_jobs=2)

if False:
    optimised_rf.fit(train_99.drop(columns=['casual', 'registered', 'count']) ,train_99.loc[:, 'count'])

[I 2023-06-13 21:46:59,551] A new study created in memory with name: no-name-84255b15-cda3-4887-8c47-1a5363fe4eec
[I 2023-06-13 21:47:05,823] Trial 0 finished with value: 0.6205811513204242 and parameters: {'max_depth': 268, 'max_features': 'log2', 'max_leaf_nodes': 216, 'n_estimators': 420, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.6205811513204242.


0.6205811513204242


[I 2023-06-13 21:47:12,124] Trial 1 finished with value: 0.5449165057283483 and parameters: {'max_depth': 8, 'max_features': 'sqrt', 'max_leaf_nodes': 392, 'n_estimators': 554, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.6205811513204242.


0.5449165057283483


[I 2023-06-13 21:47:12,644] Trial 2 finished with value: 0.6534210648188995 and parameters: {'max_depth': 502, 'max_features': None, 'max_leaf_nodes': 162, 'n_estimators': 16, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 2 with value: 0.6534210648188995.


0.6534210648188995


[I 2023-06-13 21:47:26,109] Trial 3 finished with value: 0.6608413955812134 and parameters: {'max_depth': 350, 'max_features': 'log2', 'max_leaf_nodes': 821, 'n_estimators': 858, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.6608413955812134.


0.6608413955812134


[I 2023-06-13 21:47:37,082] Trial 4 finished with value: 0.6407622151249772 and parameters: {'max_depth': 385, 'max_features': 'sqrt', 'max_leaf_nodes': 309, 'n_estimators': 858, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.6608413955812134.


0.6407622151249772


[I 2023-06-13 21:47:44,560] Trial 5 finished with value: 0.6236495607500288 and parameters: {'max_depth': 548, 'max_features': 'log2', 'max_leaf_nodes': 209, 'n_estimators': 544, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.6608413955812134.


0.6236495607500288


[I 2023-06-13 21:47:53,807] Trial 6 finished with value: 0.6038726717465781 and parameters: {'max_depth': 913, 'max_features': 'sqrt', 'max_leaf_nodes': 152, 'n_estimators': 741, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.6608413955812134.


0.6038726717465781


[I 2023-06-13 21:47:58,258] Trial 7 finished with value: 0.6116172788672306 and parameters: {'max_depth': 528, 'max_features': 'sqrt', 'max_leaf_nodes': 303, 'n_estimators': 328, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.6608413955812134.


0.6116172788672306


[I 2023-06-13 21:48:00,717] Trial 8 finished with value: 0.6150434622078192 and parameters: {'max_depth': 778, 'max_features': 'sqrt', 'max_leaf_nodes': 191, 'n_estimators': 189, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.6608413955812134.


0.6150434622078192


[I 2023-06-13 21:48:03,004] Trial 9 finished with value: 0.6060313212505517 and parameters: {'max_depth': 903, 'max_features': 'log2', 'max_leaf_nodes': 856, 'n_estimators': 151, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 3 with value: 0.6608413955812134.


0.6060313212505517


[I 2023-06-13 21:48:36,872] Trial 10 finished with value: 0.7074657989905517 and parameters: {'max_depth': 119, 'max_features': None, 'max_leaf_nodes': 732, 'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 10 with value: 0.7074657989905517.


0.7074657989905517


[I 2023-06-13 21:49:10,839] Trial 11 finished with value: 0.7080901550241296 and parameters: {'max_depth': 109, 'max_features': None, 'max_leaf_nodes': 741, 'n_estimators': 992, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 11 with value: 0.7080901550241296.


0.7080901550241296


[I 2023-06-13 21:49:44,492] Trial 12 finished with value: 0.7070489291491107 and parameters: {'max_depth': 18, 'max_features': None, 'max_leaf_nodes': 678, 'n_estimators': 998, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 11 with value: 0.7080901550241296.


0.7070489291491107


[I 2023-06-13 21:50:18,104] Trial 13 finished with value: 0.7098530085127578 and parameters: {'max_depth': 158, 'max_features': None, 'max_leaf_nodes': 613, 'n_estimators': 992, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 13 with value: 0.7098530085127578.


0.7098530085127578


[I 2023-06-13 21:50:45,279] Trial 14 finished with value: 0.7158161390483946 and parameters: {'max_depth': 191, 'max_features': None, 'max_leaf_nodes': 989, 'n_estimators': 722, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 14 with value: 0.7158161390483946.


0.7158161390483946


[I 2023-06-13 21:51:10,666] Trial 15 finished with value: 0.7148267415576981 and parameters: {'max_depth': 224, 'max_features': None, 'max_leaf_nodes': 997, 'n_estimators': 677, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 14 with value: 0.7158161390483946.


0.7148267415576981


[I 2023-06-13 21:51:36,177] Trial 16 finished with value: 0.7153354512981716 and parameters: {'max_depth': 232, 'max_features': None, 'max_leaf_nodes': 984, 'n_estimators': 683, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 14 with value: 0.7158161390483946.


0.7153354512981716


[I 2023-06-13 21:51:46,264] Trial 17 finished with value: 0.18924557795822267 and parameters: {'max_depth': 369, 'max_features': None, 'max_leaf_nodes': 8, 'n_estimators': 671, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.7158161390483946.


0.18924557795822267


[I 2023-06-13 21:52:14,065] Trial 18 finished with value: 0.6971470559904663 and parameters: {'max_depth': 669, 'max_features': None, 'max_leaf_nodes': 990, 'n_estimators': 775, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 14 with value: 0.7158161390483946.


0.6971470559904663


[I 2023-06-13 21:52:30,352] Trial 19 finished with value: 0.7009149268887358 and parameters: {'max_depth': 234, 'max_features': None, 'max_leaf_nodes': 537, 'n_estimators': 431, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 14 with value: 0.7158161390483946.


0.7009149268887358


[I 2023-06-13 21:53:25,616] Trial 20 finished with value: 0.7123123878843315 and parameters: {'max_depth': 428, 'max_features': None, 'max_leaf_nodes': 886, 'n_estimators': 638, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 14 with value: 0.7158161390483946.


0.7123123878843315


[I 2023-06-13 21:53:56,238] Trial 21 finished with value: 0.7158590564473585 and parameters: {'max_depth': 257, 'max_features': None, 'max_leaf_nodes': 976, 'n_estimators': 644, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.7158590564473585


[I 2023-06-13 21:54:30,841] Trial 22 finished with value: 0.7122271526062527 and parameters: {'max_depth': 294, 'max_features': None, 'max_leaf_nodes': 932, 'n_estimators': 814, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.7122271526062527


[I 2023-06-13 21:54:56,406] Trial 23 finished with value: 0.714873693579008 and parameters: {'max_depth': 177, 'max_features': None, 'max_leaf_nodes': 824, 'n_estimators': 619, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.714873693579008


[I 2023-06-13 21:55:15,138] Trial 24 finished with value: 0.714136229437781 and parameters: {'max_depth': 76, 'max_features': None, 'max_leaf_nodes': 937, 'n_estimators': 448, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.714136229437781


[I 2023-06-13 21:55:43,506] Trial 25 finished with value: 0.701899307925281 and parameters: {'max_depth': 314, 'max_features': None, 'max_leaf_nodes': 778, 'n_estimators': 724, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 21 with value: 0.7158590564473585.


0.701899307925281


[I 2023-06-13 21:56:07,466] Trial 26 finished with value: 0.7151567230075973 and parameters: {'max_depth': 448, 'max_features': None, 'max_leaf_nodes': 935, 'n_estimators': 577, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.7151567230075973


[I 2023-06-13 21:56:40,972] Trial 27 finished with value: 0.7018885619739481 and parameters: {'max_depth': 622, 'max_features': None, 'max_leaf_nodes': 658, 'n_estimators': 896, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 21 with value: 0.7158590564473585.


0.7018885619739481


[I 2023-06-13 21:56:59,842] Trial 28 finished with value: 0.707777486038695 and parameters: {'max_depth': 196, 'max_features': None, 'max_leaf_nodes': 565, 'n_estimators': 496, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.707777486038695


[I 2023-06-13 21:57:12,815] Trial 29 finished with value: 0.6646562713776272 and parameters: {'max_depth': 264, 'max_features': 'log2', 'max_leaf_nodes': 882, 'n_estimators': 708, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.6646562713776272


[I 2023-06-13 21:57:19,391] Trial 30 finished with value: 0.6541311634326925 and parameters: {'max_depth': 75, 'max_features': 'log2', 'max_leaf_nodes': 443, 'n_estimators': 368, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.6541311634326925


[I 2023-06-13 21:57:45,511] Trial 31 finished with value: 0.713024527319989 and parameters: {'max_depth': 439, 'max_features': None, 'max_leaf_nodes': 944, 'n_estimators': 600, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.713024527319989


[I 2023-06-13 21:58:13,919] Trial 32 finished with value: 0.7146809424439388 and parameters: {'max_depth': 430, 'max_features': None, 'max_leaf_nodes': 982, 'n_estimators': 569, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.7146809424439388


[I 2023-06-13 21:58:35,710] Trial 33 finished with value: 0.7125145538824561 and parameters: {'max_depth': 287, 'max_features': None, 'max_leaf_nodes': 924, 'n_estimators': 526, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.7125145538824561


[I 2023-06-13 21:59:04,343] Trial 34 finished with value: 0.6859473636711345 and parameters: {'max_depth': 599, 'max_features': None, 'max_leaf_nodes': 793, 'n_estimators': 780, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 21 with value: 0.7158590564473585.


0.6859473636711345


[I 2023-06-13 21:59:31,924] Trial 35 finished with value: 0.7137125521056934 and parameters: {'max_depth': 330, 'max_features': None, 'max_leaf_nodes': 884, 'n_estimators': 637, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.7137125521056934


[I 2023-06-13 21:59:40,938] Trial 36 finished with value: 0.6653003507340622 and parameters: {'max_depth': 141, 'max_features': 'sqrt', 'max_leaf_nodes': 837, 'n_estimators': 489, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 21 with value: 0.7158590564473585.


0.6653003507340622


[I 2023-06-13 22:00:17,180] Trial 37 finished with value: 0.7087564663412582 and parameters: {'max_depth': 485, 'max_features': None, 'max_leaf_nodes': 941, 'n_estimators': 901, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 21 with value: 0.7158590564473585.


0.7087564663412582


[I 2023-06-13 22:00:27,507] Trial 38 finished with value: 0.662390902665163 and parameters: {'max_depth': 384, 'max_features': 'log2', 'max_leaf_nodes': 746, 'n_estimators': 582, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.662390902665163


[I 2023-06-13 22:00:41,569] Trial 39 finished with value: 0.6374027797382549 and parameters: {'max_depth': 240, 'max_features': 'sqrt', 'max_leaf_nodes': 998, 'n_estimators': 824, 'min_samples_split': 3, 'min_samples_leaf': 6}. Best is trial 21 with value: 0.7158590564473585.


0.6374027797382549


[I 2023-06-13 22:00:46,643] Trial 40 finished with value: 0.18440863826853532 and parameters: {'max_depth': 4, 'max_features': None, 'max_leaf_nodes': 899, 'n_estimators': 279, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 21 with value: 0.7158590564473585.


0.18440863826853532


[I 2023-06-13 22:01:11,985] Trial 41 finished with value: 0.714544920374579 and parameters: {'max_depth': 183, 'max_features': None, 'max_leaf_nodes': 818, 'n_estimators': 626, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 21 with value: 0.7158590564473585.


0.714544920374579


[I 2023-06-13 22:01:40,946] Trial 42 finished with value: 0.7159298849669515 and parameters: {'max_depth': 197, 'max_features': None, 'max_leaf_nodes': 867, 'n_estimators': 672, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.7159298849669515


[I 2023-06-13 22:02:10,705] Trial 43 finished with value: 0.7137092741710679 and parameters: {'max_depth': 80, 'max_features': None, 'max_leaf_nodes': 854, 'n_estimators': 700, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 42 with value: 0.7159298849669515.


0.7137092741710679


[I 2023-06-13 22:02:44,851] Trial 44 finished with value: 0.7156971037315467 and parameters: {'max_depth': 342, 'max_features': None, 'max_leaf_nodes': 963, 'n_estimators': 774, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.7156971037315467


[I 2023-06-13 22:02:59,994] Trial 45 finished with value: 0.6610054941385561 and parameters: {'max_depth': 981, 'max_features': 'sqrt', 'max_leaf_nodes': 962, 'n_estimators': 767, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 42 with value: 0.7159298849669515.


0.6610054941385561


[I 2023-06-13 22:03:31,115] Trial 46 finished with value: 0.6785120011452296 and parameters: {'max_depth': 338, 'max_features': None, 'max_leaf_nodes': 303, 'n_estimators': 886, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 42 with value: 0.7159298849669515.


0.6785120011452296


[I 2023-06-13 22:03:47,388] Trial 47 finished with value: 0.6639445921358925 and parameters: {'max_depth': 205, 'max_features': 'log2', 'max_leaf_nodes': 708, 'n_estimators': 831, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.6639445921358925


[I 2023-06-13 22:04:17,402] Trial 48 finished with value: 0.7080919775651641 and parameters: {'max_depth': 282, 'max_features': None, 'max_leaf_nodes': 784, 'n_estimators': 675, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 42 with value: 0.7159298849669515.


0.7080919775651641


[I 2023-06-13 22:04:18,866] Trial 49 finished with value: 0.7078618857178556 and parameters: {'max_depth': 132, 'max_features': None, 'max_leaf_nodes': 890, 'n_estimators': 29, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 42 with value: 0.7159298849669515.


0.7078618857178556


[I 2023-06-13 22:04:33,056] Trial 50 finished with value: 0.6141281331318016 and parameters: {'max_depth': 250, 'max_features': 'sqrt', 'max_leaf_nodes': 997, 'n_estimators': 761, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 42 with value: 0.7159298849669515.


0.6141281331318016


[I 2023-06-13 22:04:58,551] Trial 51 finished with value: 0.7144843099457013 and parameters: {'max_depth': 413, 'max_features': None, 'max_leaf_nodes': 915, 'n_estimators': 542, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.7144843099457013


[I 2023-06-13 22:05:30,586] Trial 52 finished with value: 0.7150912132633199 and parameters: {'max_depth': 358, 'max_features': None, 'max_leaf_nodes': 961, 'n_estimators': 669, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.7150912132633199


[I 2023-06-13 22:06:04,487] Trial 53 finished with value: 0.7151693197791843 and parameters: {'max_depth': 482, 'max_features': None, 'max_leaf_nodes': 874, 'n_estimators': 737, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.7151693197791843


[I 2023-06-13 22:06:37,057] Trial 54 finished with value: 0.7120305444212215 and parameters: {'max_depth': 503, 'max_features': None, 'max_leaf_nodes': 853, 'n_estimators': 739, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 42 with value: 0.7159298849669515.


0.7120305444212215


[I 2023-06-13 22:07:03,707] Trial 55 finished with value: 0.5916175967076482 and parameters: {'max_depth': 556, 'max_features': None, 'max_leaf_nodes': 75, 'n_estimators': 939, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 42 with value: 0.7159298849669515.


0.5916175967076482


[I 2023-06-13 22:07:38,687] Trial 56 finished with value: 0.7121854300576567 and parameters: {'max_depth': 47, 'max_features': None, 'max_leaf_nodes': 864, 'n_estimators': 794, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 42 with value: 0.7159298849669515.


0.7121854300576567


[I 2023-06-13 22:08:17,928] Trial 57 finished with value: 0.7167117775657018 and parameters: {'max_depth': 167, 'max_features': None, 'max_leaf_nodes': 969, 'n_estimators': 858, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7167117775657018


[I 2023-06-13 22:08:51,161] Trial 58 finished with value: 0.6837269261173322 and parameters: {'max_depth': 154, 'max_features': None, 'max_leaf_nodes': 259, 'n_estimators': 931, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 57 with value: 0.7167117775657018.


0.6837269261173322


[I 2023-06-13 22:09:30,049] Trial 59 finished with value: 0.7143052407364612 and parameters: {'max_depth': 114, 'max_features': None, 'max_leaf_nodes': 966, 'n_estimators': 859, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7143052407364612


[I 2023-06-13 22:09:44,079] Trial 60 finished with value: 0.6671970701978459 and parameters: {'max_depth': 198, 'max_features': 'log2', 'max_leaf_nodes': 915, 'n_estimators': 698, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.6671970701978459


[I 2023-06-13 22:10:18,329] Trial 61 finished with value: 0.7137545590333694 and parameters: {'max_depth': 780, 'max_features': None, 'max_leaf_nodes': 963, 'n_estimators': 736, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7137545590333694


[I 2023-06-13 22:10:55,872] Trial 62 finished with value: 0.7126029869660784 and parameters: {'max_depth': 217, 'max_features': None, 'max_leaf_nodes': 811, 'n_estimators': 860, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7126029869660784


[I 2023-06-13 22:11:20,470] Trial 63 finished with value: 0.6951520409034393 and parameters: {'max_depth': 313, 'max_features': None, 'max_leaf_nodes': 360, 'n_estimators': 655, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.6951520409034393


[I 2023-06-13 22:11:56,906] Trial 64 finished with value: 0.7135349628944458 and parameters: {'max_depth': 159, 'max_features': None, 'max_leaf_nodes': 908, 'n_estimators': 819, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7135349628944458


[I 2023-06-13 22:12:24,364] Trial 65 finished with value: 0.6760555766873582 and parameters: {'max_depth': 263, 'max_features': None, 'max_leaf_nodes': 968, 'n_estimators': 712, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 57 with value: 0.7167117775657018.


0.6760555766873582


[I 2023-06-13 22:13:00,732] Trial 66 finished with value: 0.7144275597530481 and parameters: {'max_depth': 721, 'max_features': None, 'max_leaf_nodes': 1000, 'n_estimators': 794, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7144275597530481


[I 2023-06-13 22:13:26,918] Trial 67 finished with value: 0.7107866850333722 and parameters: {'max_depth': 95, 'max_features': None, 'max_leaf_nodes': 760, 'n_estimators': 604, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.7107866850333722


[I 2023-06-13 22:14:10,049] Trial 68 finished with value: 0.7139737106317291 and parameters: {'max_depth': 38, 'max_features': None, 'max_leaf_nodes': 868, 'n_estimators': 959, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7139737106317291


[I 2023-06-13 22:14:41,892] Trial 69 finished with value: 0.6958710534436016 and parameters: {'max_depth': 309, 'max_features': None, 'max_leaf_nodes': 648, 'n_estimators': 753, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 57 with value: 0.7167117775657018.


0.6958710534436016


[I 2023-06-13 22:14:57,484] Trial 70 finished with value: 0.6698855948826384 and parameters: {'max_depth': 238, 'max_features': 'sqrt', 'max_leaf_nodes': 937, 'n_estimators': 687, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.6698855948826384


[I 2023-06-13 22:15:28,561] Trial 71 finished with value: 0.7153825981135505 and parameters: {'max_depth': 465, 'max_features': None, 'max_leaf_nodes': 926, 'n_estimators': 585, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7153825981135505


[I 2023-06-13 22:16:00,795] Trial 72 finished with value: 0.7150210071958023 and parameters: {'max_depth': 400, 'max_features': None, 'max_leaf_nodes': 904, 'n_estimators': 649, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7150210071958023


[I 2023-06-13 22:16:26,448] Trial 73 finished with value: 0.7122245499657976 and parameters: {'max_depth': 560, 'max_features': None, 'max_leaf_nodes': 836, 'n_estimators': 518, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7122245499657976


[I 2023-06-13 22:17:01,057] Trial 74 finished with value: 0.7144186747144349 and parameters: {'max_depth': 484, 'max_features': None, 'max_leaf_nodes': 945, 'n_estimators': 729, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7144186747144349


[I 2023-06-13 22:17:28,875] Trial 75 finished with value: 0.7136432099544109 and parameters: {'max_depth': 469, 'max_features': None, 'max_leaf_nodes': 972, 'n_estimators': 589, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.7136432099544109


[I 2023-06-13 22:17:54,645] Trial 76 finished with value: 0.714263530331393 and parameters: {'max_depth': 521, 'max_features': None, 'max_leaf_nodes': 882, 'n_estimators': 558, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.714263530331393


[I 2023-06-13 22:18:05,931] Trial 77 finished with value: 0.6533014409256606 and parameters: {'max_depth': 370, 'max_features': 'log2', 'max_leaf_nodes': 445, 'n_estimators': 615, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.6533014409256606


[I 2023-06-13 22:18:42,145] Trial 78 finished with value: 0.7136923853981937 and parameters: {'max_depth': 275, 'max_features': None, 'max_leaf_nodes': 926, 'n_estimators': 788, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7136923853981937


[I 2023-06-13 22:19:10,461] Trial 79 finished with value: 0.7141125367125848 and parameters: {'max_depth': 173, 'max_features': None, 'max_leaf_nodes': 806, 'n_estimators': 633, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7141125367125848


[I 2023-06-13 22:19:39,004] Trial 80 finished with value: 0.7115920356354707 and parameters: {'max_depth': 344, 'max_features': None, 'max_leaf_nodes': 720, 'n_estimators': 666, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.7115920356354707


[I 2023-06-13 22:20:05,274] Trial 81 finished with value: 0.7139654161872728 and parameters: {'max_depth': 457, 'max_features': None, 'max_leaf_nodes': 949, 'n_estimators': 573, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7139654161872728


[I 2023-06-13 22:20:27,508] Trial 82 finished with value: 0.7146477751494411 and parameters: {'max_depth': 442, 'max_features': None, 'max_leaf_nodes': 987, 'n_estimators': 479, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7146477751494411


[I 2023-06-13 22:20:49,427] Trial 83 finished with value: 0.7150450880707673 and parameters: {'max_depth': 580, 'max_features': None, 'max_leaf_nodes': 927, 'n_estimators': 456, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7150450880707673


[I 2023-06-13 22:21:19,522] Trial 84 finished with value: 0.7118796629134063 and parameters: {'max_depth': 226, 'max_features': None, 'max_leaf_nodes': 840, 'n_estimators': 541, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.7118796629134063


[I 2023-06-13 22:21:57,645] Trial 85 finished with value: 0.7151738642475678 and parameters: {'max_depth': 403, 'max_features': None, 'max_leaf_nodes': 892, 'n_estimators': 714, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7151738642475678


[I 2023-06-13 22:22:32,867] Trial 86 finished with value: 0.7137491180766748 and parameters: {'max_depth': 398, 'max_features': None, 'max_leaf_nodes': 881, 'n_estimators': 719, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7137491180766748


[I 2023-06-13 22:22:47,780] Trial 87 finished with value: 0.6670567784183735 and parameters: {'max_depth': 303, 'max_features': 'sqrt', 'max_leaf_nodes': 978, 'n_estimators': 692, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.6670567784183735


[I 2023-06-13 22:23:24,131] Trial 88 finished with value: 0.6853011612364813 and parameters: {'max_depth': 530, 'max_features': None, 'max_leaf_nodes': 913, 'n_estimators': 841, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 57 with value: 0.7167117775657018.


0.6853011612364813


[I 2023-06-13 22:24:03,629] Trial 89 finished with value: 0.7127750655411834 and parameters: {'max_depth': 377, 'max_features': None, 'max_leaf_nodes': 866, 'n_estimators': 754, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7127750655411834


[I 2023-06-13 22:24:23,801] Trial 90 finished with value: 0.659928046975023 and parameters: {'max_depth': 423, 'max_features': 'log2', 'max_leaf_nodes': 899, 'n_estimators': 877, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 57 with value: 0.7167117775657018.


0.659928046975023


[I 2023-06-13 22:24:55,051] Trial 91 finished with value: 0.7147282889195055 and parameters: {'max_depth': 333, 'max_features': None, 'max_leaf_nodes': 947, 'n_estimators': 650, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7147282889195055


[I 2023-06-13 22:25:38,326] Trial 92 finished with value: 0.7155850270634642 and parameters: {'max_depth': 504, 'max_features': None, 'max_leaf_nodes': 979, 'n_estimators': 804, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7155850270634642


[I 2023-06-13 22:26:16,928] Trial 93 finished with value: 0.7148096615820885 and parameters: {'max_depth': 502, 'max_features': None, 'max_leaf_nodes': 1000, 'n_estimators': 798, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7148096615820885


[I 2023-06-13 22:26:53,262] Trial 94 finished with value: 0.7157576961943267 and parameters: {'max_depth': 188, 'max_features': None, 'max_leaf_nodes': 974, 'n_estimators': 774, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7157576961943267


[I 2023-06-13 22:27:24,920] Trial 95 finished with value: 0.713603980294575 and parameters: {'max_depth': 183, 'max_features': None, 'max_leaf_nodes': 972, 'n_estimators': 765, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.713603980294575


[I 2023-06-13 22:27:58,335] Trial 96 finished with value: 0.7150685329943792 and parameters: {'max_depth': 140, 'max_features': None, 'max_leaf_nodes': 946, 'n_estimators': 843, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7150685329943792


[I 2023-06-13 22:28:25,695] Trial 97 finished with value: 0.7145244534694446 and parameters: {'max_depth': 212, 'max_features': None, 'max_leaf_nodes': 986, 'n_estimators': 709, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7145244534694446


[I 2023-06-13 22:29:05,084] Trial 98 finished with value: 0.7123899641225597 and parameters: {'max_depth': 256, 'max_features': None, 'max_leaf_nodes': 926, 'n_estimators': 914, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 57 with value: 0.7167117775657018.


0.7123899641225597


[I 2023-06-13 22:29:40,124] Trial 99 finished with value: 0.7149334556321394 and parameters: {'max_depth': 164, 'max_features': None, 'max_leaf_nodes': 950, 'n_estimators': 803, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 57 with value: 0.7167117775657018.


0.7149334556321394


In [7]:
rf_r = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_r.fit(train_99.drop(columns=['casual', 'registered', 'count']), np.log1p(train_99.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_99))

rf_c = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_c.fit(train_99.drop(columns=['casual', 'registered', 'count']), np.log1p(train_99.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_99))

pred = pred_r + pred_c

In [8]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_99.csv', index=False)

## 仮説 9 : peak time

### 仮説 9-1 : Add for both

In [9]:
data_9_1 = data_0.copy()
data_9_1.drop(columns=['windspeed', 'month'], inplace=True)

data_9_1.loc[:, 'peak'] = 0
peak_df = data_9_1.loc[data_9_1.loc[:, 'workingday'] == 1, :]
peak_df = peak_df.loc[peak_df.loc[:, 'hour'].isin([8, 17, 18]), :]
data_9_1.loc[peak_df.index, 'peak'] = 1

train_9_1 = data_9_1.loc[train.index, :]
test_9_1 = data_9_1.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [10]:
rf_r = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_r.fit(train_9_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_9_1.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_9_1))

rf_c = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_c.fit(train_9_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_9_1.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_9_1))

pred = pred_r + pred_c

In [11]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_9_1.csv', index=False)

### 仮説 9-2 : Add only for registered

In [12]:
rf_r = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_r.fit(train_9_1.drop(columns=['casual', 'registered', 'count']), np.log1p(train_9_1.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_9_1))

rf_c = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_c.fit(train_9_1.drop(columns=['casual', 'registered', 'count', 'peak']), np.log1p(train_9_1.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_9_1.drop(columns=['peak'])))

pred = pred_r + pred_c

In [13]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_9_2.csv', index=False)

### 仮説 9-3 : trend (8 - 17)

In [14]:
data_9_3 = data_0.copy()
data_9_3.drop(columns=['windspeed', 'month'], inplace=True)

data_9_3.loc[:, 'trend'] = 0
peak_df = data_9_3.loc[data_9_3.loc[:, 'workingday'] == 1, :]
peak_df = peak_df.loc[peak_df.loc[:, 'hour'].isin([x for x in range(8, 18)]), :]
data_9_3.loc[peak_df.index, 'trend'] = 1

train_9_3 = data_9_3.loc[train.index, :]
test_9_3 = data_9_3.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [15]:
rf_r = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_r.fit(train_9_3.drop(columns=['casual', 'registered', 'count']), np.log1p(train_9_3.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_9_3))

rf_c = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_c.fit(train_9_3.drop(columns=['casual', 'registered', 'count', 'trend']), np.log1p(train_9_3.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_9_3.drop(columns=['trend'])))

pred = pred_r + pred_c

In [16]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_9_3.csv', index=False)

## 仮説 10 : holiday

In [22]:
data_10 = data_0.copy()
data_10.drop(columns=['windspeed', 'month', 'holiday'], inplace=True)

data_10.loc[:, 'trend'] = 0
peak_df = data_10.loc[data_10.loc[:, 'workingday'] == 1, :]
peak_df = peak_df.loc[peak_df.loc[:, 'hour'].isin([x for x in range(8, 18)]), :]
data_10.loc[peak_df.index, 'trend'] = 1

train_10 = data_10.loc[train.index, :]
test_10 = data_10.loc[test.index, :].drop(columns=['casual', 'registered', 'count'])

In [23]:
rf_r = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_r.fit(train_10.drop(columns=['casual', 'registered', 'count']), np.log1p(train_10.loc[:, 'registered']))
pred_r = np.expm1(rf_r.predict(test_10))

rf_c = RFR(random_state=0, n_jobs=2, max_depth=167, max_features=None, max_leaf_nodes=969, min_samples_split=3, n_estimators=858)
rf_c.fit(train_10.drop(columns=['casual', 'registered', 'count', 'trend']), np.log1p(train_10.loc[:, 'casual']))
pred_c = np.expm1(rf_c.predict(test_10.drop(columns=['trend'])))

pred = pred_r + pred_c

In [24]:
submission = pd.DataFrame({'datetime': test.index, 'count': pred})
submission.to_csv('./data/output/t_10.csv', index=False)