In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR

train = pd.read_csv('./data/input/train.csv')
test = pd.read_csv('./data/input/test.csv')

In [2]:
def rmsle(y, y_):
    log1 = np.log(y + 1)
    log2 = np.log(y_ + 1)
    mean = np.square((log1- log2)).mean()
    return np.sqrt(mean)

In [3]:
scores_cols = ['conditions', 'descriptions', 'train', 'valid', 'submissions']
scores = pd.DataFrame(columns=scores_cols)

# データ前処理

## datetime の処理

In [4]:
train['year'] = train.datetime.apply(lambda x: x.split()[0].split('-')[0]).astype('int')
train['month'] = train.datetime.apply(lambda x: x.split()[0].split('-')[1]).astype('int')
train['day'] = train.datetime.apply(lambda x: x.split()[0].split('-')[2]).astype('int')
train['hour'] = train.datetime.apply(lambda x: x.split()[1].split(':')[0]).astype('int')

test['year'] = test.datetime.apply(lambda x: x.split()[0].split('-')[0]).astype('int')
test['month'] = test.datetime.apply(lambda x: x.split()[0].split('-')[1]).astype('int')
test['day'] = test.datetime.apply(lambda x: x.split()[0].split('-')[2]).astype('int')
test['hour'] = test.datetime.apply(lambda x: x.split()[1].split(':')[0]).astype('int')

## 検証用データの作成

In [5]:
drop_cols = ['datetime', 'count', 'casual', 'registered']
test_use_cols = [col for col in train.columns if col not in drop_cols]

train_X, valid_X, train_y, valid_y = train_test_split(
    train.drop(columns=drop_cols),
    train.loc[:, ['registered', 'casual', 'count']],
    test_size=0.3, random_state=0)

## RF 実装
count を直接予測する．

In [6]:
model_rf = RFR(random_state=0)
model_rf.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

valid_pred = model_rf.predict(valid_X)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred)

score = pd.DataFrame([['First', '', train_score, valid_score, None]], columns=scores_cols)
if score.values[0][0] not in scores.values:
    scores = pd.concat([scores, score], axis=0)
print(scores)

  conditions descriptions     train     valid submissions
0      First               0.158527  0.352344        None


### Highperparameter Tuning 1

In [7]:
# RF GridSearchCV
if False:
    params = [{
        'n_estimators': [700, 1000, 1300],
        'max_depth': [10, 15, 20],
        'min_samples_split': [3, 5, 7],
        'max_features': ['sqrt', 'log2', None]
    }]

    gscv = GridSearchCV(
        RFR(random_state=0, n_jobs=-1),
        param_grid=params,
        cv=5,
        verbose=2
        )

    gscv.fit(train_X, train_y.loc[:, 'count'])

In [8]:
# RF GridSearchCV check params
if False:
    model_rf_best = gscv.best_estimator_
    print(gscv.best_params_)

    print(pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score').loc[:, 'params'].values)

In [9]:
model_rf_best = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_best.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf_best.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

test_pred = model_rf_best.predict(valid_X)
test_score = rmsle(valid_y.loc[:, 'count'], test_pred)

sc = pd.DataFrame([['0-1', 'Hparam Tuning 1', train_score, test_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.158527  0.352344        None
1        0-1  Hparam Tuning 1  0.283048  0.362909        None


In [10]:
model_rf_test = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_test.fit(train.drop(columns=drop_cols), train.loc[:, 'count'])

pred = model_rf_test.predict(test.loc[:, test_use_cols])
df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred})
df_submit.to_csv('./data/output/submittion_RFR.csv', index=False)

In [11]:
scores.loc[scores['conditions'] == '0-1', 'submissions'] = 0.48367
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.158527  0.352344        None
1        0-1  Hparam Tuning 1  0.283048  0.362909     0.48367


### Highper parameter Tuning 2

In [12]:
model_rf = RFR(random_state=0, max_depth=20, min_samples_split=5, n_estimators=700, n_jobs=-1)
model_rf.fit(train_X, train_y.loc[:, 'count'])

train_pred = model_rf.predict(train_X)
train_score = rmsle(train_y.loc[:, 'count'], train_pred)

test_pred = model_rf.predict(valid_X)
test_score = rmsle(valid_y.loc[:, 'count'], test_pred)

sc = pd.DataFrame([['0-2', 'Hparam Tuning 2', train_score, test_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.158527  0.352344        None
1        0-1  Hparam Tuning 1  0.283048  0.362909     0.48367
2        0-2  Hparam Tuning 2  0.195116  0.354192        None


## 仮説検証

### 仮説 1
count を registered と casual の和として予測する．

In [13]:
rf_1_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
rf_1_registered.fit(train_X, train_y.loc[:, 'registered'])

rf_1_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
rf_1_casual.fit(train_X, train_y.loc[:, 'casual'])

In [14]:
train_pred_registered = rf_1_registered.predict(train_X)
train_pred_casual = rf_1_casual.predict(train_X)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = rf_1_registered.predict(valid_X)
valid_pred_casual = rf_1_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['1', 'split obj var', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions     descriptions     train     valid submissions
0      First                   0.158527  0.352344        None
1        0-1  Hparam Tuning 1  0.283048  0.362909     0.48367
2        0-2  Hparam Tuning 2  0.195116  0.354192        None
3          1    split obj var  0.164491  0.347886        None


**仮説 1 の下で submit**

In [25]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train.drop(drop_cols, axis=1), train.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train.drop(drop_cols, axis=1), train.loc[:, 'casual'])

pred_registered = model_registered.predict(test.loc[:, test_use_cols])
pred_casual = model_casual.predict(test.loc[:, test_use_cols])
pred_count = pred_registered + pred_casual

df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred_count})
df_submit.to_csv('./data/output/submission_rf_1.csv', index=False)

KeyError: "['datetime', 'count', 'casual', 'registered'] not found in axis"

### 仮説 2-1
風速 0 を削除

In [15]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X.loc[train_X.loc[:, 'windspeed'] != 0, :], train_y.loc[train_X.loc[:, 'windspeed'] != 0, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X.loc[train_X.loc[:, 'windspeed'] != 0, :], train_y.loc[train_X.loc[:, 'windspeed'] != 0, 'casual'])

train_pred_registered = model_registered.predict(train_X)
train_pred_casual = model_casual.predict(train_X)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X)
valid_pred_casual = model_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-1', 'windspeed == 0 -> drop', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions            descriptions     train     valid submissions
0      First                          0.158527  0.352344        None
1        0-1         Hparam Tuning 1  0.283048  0.362909     0.48367
2        0-2         Hparam Tuning 2  0.195116  0.354192        None
3          1           split obj var  0.164491  0.347886        None
4        2-1  windspeed == 0 -> drop  0.201615  0.349233        None


### 仮説 2-2
風速 0 を平均値に置き換え

In [16]:
train_X_2 = train_X.copy()
train_X_2.loc[train_X_2.loc[:, 'windspeed'] == 0, 'windspeed'] = train.loc[train.loc[:, 'windspeed'] != 0, 'windspeed'].mean()

model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_2, train_y.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_2, train_y.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_2)
train_pred_casual = model_casual.predict(train_X_2)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X)
valid_pred_casual = model_casual.predict(valid_X)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-2', 'windspeed == 0 -> mean', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions            descriptions     train     valid submissions
0      First                          0.158527  0.352344        None
1        0-1         Hparam Tuning 1  0.283048  0.362909     0.48367
2        0-2         Hparam Tuning 2  0.195116  0.354192        None
3          1           split obj var  0.164491  0.347886        None
4        2-1  windspeed == 0 -> drop  0.201615  0.349233        None
5        2-2  windspeed == 0 -> mean  0.164706  0.348899        None


### 仮説 2-3
風速 0 を線形補完

In [17]:
train_2 = train.copy()
train_2.loc[:, 'datetime'] = pd.to_datetime(train_2.loc[:, 'datetime'])
train_2.set_index('datetime', inplace=True)
train_2.replace({'windspeed': 0}, np.nan, inplace=True)
train_2.interpolate(method='time', inplace=True, limit_direction='both')

test_2 = test.copy()
test_2.loc[:, 'datetime'] = pd.to_datetime(test_2.loc[:, 'datetime'])
test_2.set_index('datetime', inplace=True)
test_2.replace({'windspeed': 0}, np.nan, inplace=True)
test_2.interpolate(method='time', inplace=True, limit_direction='both')

In [18]:
train_X_2, valid_X_2, train_y_2, valid_y_2 = train_test_split(
    train_2.drop(columns=['registered', 'casual', 'count']),
    train_2.loc[:, ['registered', 'casual', 'count']],
    test_size=0.3, random_state=0)

In [19]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_2, train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_2, train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_2)
train_pred_casual = model_casual.predict(train_X_2)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2)
valid_pred_casual = model_casual.predict(valid_X_2)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['2-3', 'windspeed == 0 -> linear replacement', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.158527  0.352344  \
1        0-1                       Hparam Tuning 1  0.283048  0.362909   
2        0-2                       Hparam Tuning 2  0.195116  0.354192   
3          1                         split obj var  0.164491  0.347886   
4        2-1                windspeed == 0 -> drop  0.201615  0.349233   
5        2-2                windspeed == 0 -> mean  0.164706  0.348899   
6        2-3  windspeed == 0 -> linear replacement  0.164512  0.348137   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  


## 仮説 3
天候 4 を 3 とする．  
仮説 1, 仮説 2-3 の手法を用いる．

In [20]:
print(len(train_X_2[train_X_2.weather == 4]), len(test_2[test_2.weather == 4]))

1 2


In [21]:
train_X_3 = train_X_2.replace({'weather': 4}, 3, inplace=False)
test_3 = test_2.replace({'weather': 4}, 3, inplace=False)

In [22]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_3, train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_3, train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_3)
train_pred_casual = model_casual.predict(train_X_3)
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2)
valid_pred_casual = model_casual.predict(valid_X_2)
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['3', 'weather == 4 -> 3', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.158527  0.352344  \
1        0-1                       Hparam Tuning 1  0.283048  0.362909   
2        0-2                       Hparam Tuning 2  0.195116  0.354192   
3          1                         split obj var  0.164491  0.347886   
4        2-1                windspeed == 0 -> drop  0.201615  0.349233   
5        2-2                windspeed == 0 -> mean  0.164706  0.348899   
6        2-3  windspeed == 0 -> linear replacement  0.164512  0.348137   
7          3                     weather == 4 -> 3  0.164487  0.348088   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  
7        None  


## 仮説 4
atemp を使用しない．  
仮説 3 の下で行う．

In [23]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_X_3.drop('atemp', axis=1), train_y_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_X_3.drop('atemp', axis=1), train_y_2.loc[:, 'casual'])

train_pred_registered = model_registered.predict(train_X_3.drop('atemp', axis=1))
train_pred_casual = model_casual.predict(train_X_3.drop('atemp', axis=1))
train_pred_count = train_pred_registered + train_pred_casual

valid_pred_registered = model_registered.predict(valid_X_2.drop('atemp', axis=1))
valid_pred_casual = model_casual.predict(valid_X_2.drop('atemp', axis=1))
valid_pred_count = valid_pred_registered + valid_pred_casual

train_score = rmsle(train_y_2.loc[:, 'count'], train_pred_count)
valid_score = rmsle(valid_y_2.loc[:, 'count'], valid_pred_count)

sc = pd.DataFrame([['4', 'atemp -> drop', train_score, valid_score, None]], columns=scores_cols)
if sc.values[0][0] not in scores.values:
    scores = pd.concat([scores, sc], axis=0, ignore_index=True)
print(scores)

  conditions                          descriptions     train     valid   
0      First                                        0.158527  0.352344  \
1        0-1                       Hparam Tuning 1  0.283048  0.362909   
2        0-2                       Hparam Tuning 2  0.195116  0.354192   
3          1                         split obj var  0.164491  0.347886   
4        2-1                windspeed == 0 -> drop  0.201615  0.349233   
5        2-2                windspeed == 0 -> mean  0.164706  0.348899   
6        2-3  windspeed == 0 -> linear replacement  0.164512  0.348137   
7          3                     weather == 4 -> 3  0.164487  0.348088   
8          4                         atemp -> drop  0.165335  0.349203   

  submissions  
0        None  
1     0.48367  
2        None  
3        None  
4        None  
5        None  
6        None  
7        None  
8        None  


**仮説 3 の下で submit**

In [24]:
model_registered = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_registered.fit(train_2.drop(['casual', 'registered', 'count'], axis=1), train_2.loc[:, 'registered'])

model_casual = RFR(random_state=0, max_depth=20, min_samples_split=3, n_estimators=700, n_jobs=-1)
model_casual.fit(train_2.drop(['casual', 'registered', 'count'], axis=1), train_2.loc[:, 'casual'])

test_pred_registered = model_registered.predict(test_2)
test_pred_casual = model_casual.predict(test_2)
test_pred_count = test_pred_registered + test_pred_casual

test_submittion = pd.DataFrame({'datetime': test_2.index, 'count': test_pred_count})
test_submittion.to_csv('./data/output/submission_rf_3.csv', index=False)