In [8]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from tpot import TPOTRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

In [2]:
def load_ds(train_path='../data/train.csv', test_path='../data/test.csv'):
    """Read the train/test CSVs and return model-ready frames.

    Both files are stacked and preprocessed in a single pass, so the
    missing-value filling and one-hot encoding see every row at once and the
    two splits end up with the same encoded columns. The stacked frame is then
    cut back apart at the original number of training rows.

    Args:
        train_path: CSV containing the features plus 'SalePrice' and 'Id'.
        test_path: CSV containing the features plus 'Id'.

    Returns:
        A tuple ``(X_train, y_train, X_test)``:
        ``X_train`` has 'SalePrice' and 'Id' removed, ``y_train`` is the
        'SalePrice' column of the training rows, and ``X_test`` has only
        'SalePrice' removed -- its 'Id' column is kept.
    """
    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv(test_path)
    boundary = len(raw_train)

    processed = _preprocess_data(pd.concat([raw_train, raw_test]))
    # Positional slicing: the stacked frame keeps each file's own row labels.
    train_part, test_part = processed.iloc[:boundary], processed.iloc[boundary:]

    y_train = train_part['SalePrice']
    X_train = train_part.drop(['SalePrice', 'Id'], axis=1)
    X_test = test_part.drop('SalePrice', axis=1)
    return X_train, y_train, X_test

    

def _preprocess_data(df):
    """Fill missing values, then one-hot encode the categorical columns.

    Args:
        df: Raw feature frame.

    Returns:
        The frame produced by applying `_fill_missing_values` followed by
        `_categorical_values2one_hot`.
    """
    return _categorical_values2one_hot(_fill_missing_values(df))


def _fill_missing_values(df: pd.DataFrame):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == 'O':
            # カテゴリー変数の場合は、プールが無いなどの意味が多いのでNoneとしている。
            df[col] = df[col].fillna('None')
        else:
            df[col] = df[col].fillna(0)
    return df


def _categorical_values2one_hot(df: pd.DataFrame):
    return pd.get_dummies(df)

In [3]:
# Load and preprocess both CSVs from their default locations.
X_train, y_train, X_test = load_ds()

# Report the sizes of the three pieces returned by load_ds.
print(
    'X_train shape:', X_train.shape,
    'y_train shape:', y_train.shape,
)
print('X_test shape:', X_test.shape)

X_train shape: (1460, 310) y_train shape: (1460,)
X_test shape: (1459, 311)


## TPOTRegressorを使用して学習

- 回数は、以下のようになる。

population_size(パイプライン数) + generations(世代数) x offspringsize(子孫数.デフォルトでは、population_sizeと同じ)

- 評価指標は、「予測値」と「実際の値」の対数に対してrmse(Root Mean Squared Error)を計算したものを使用

- 簡単のため、「generations」, 「population_size」は小さめの値(それぞれ10, 50)を使用(デフォルトだと10,000ステップ以上となり学習がなかなか終わらない。)

 - 上記の設定で0.124程度のスコアが出る(submit結果では、0.124程度)。
 
 - デフォルトの設定(generations: 100, population_size: 100)では、0.1159程度のスコアが出る(submit結果では、0.126程度なので単にgenerationsやpopulation_sizeを上げればいいというわけではない。)

In [5]:
def log_rmse(y_true, y_pred):
    """Root mean squared error between the natural logs of two value sets.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        The square root of the mean squared error computed on
        ``np.log(y_true)`` versus ``np.log(y_pred)``.
    """
    log_true = np.log(y_true)
    log_pred = np.log(y_pred)
    mse = mean_squared_error(y_true=log_true, y_pred=log_pred)
    return np.sqrt(mse)


def learn_tpot(x_train, y_train, seed=42, generations=10, population_size=50):
    """Fit a ``TPOTRegressor`` on the given data and return it.

    The optimizer is scored with `log_rmse` wrapped by ``make_scorer`` with
    ``greater_is_better=False``, since a lower log-RMSE is better.

    Args:
        x_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        seed: Value forwarded as ``random_state``.
        generations: Number of generations for the search.
        population_size: Number of pipelines in the population.

    Returns:
        The fitted ``TPOTRegressor`` instance.
    """
    tpot = TPOTRegressor(
        random_state=seed,
        n_jobs=-1,
        verbosity=2,
        generations=generations,
        population_size=population_size,
        scoring=make_scorer(log_rmse, greater_is_better=False)
    )
    # Fit on the argument, not on a notebook-level global: the function must
    # train on whatever features the caller supplies.
    tpot.fit(x_train, y_train)
    return tpot

ちなみに、単純に平均値を予測結果とした場合の対数RMSE(`log_rmse`)は、0.407程度となる。

In [7]:
# Baseline: score a constant prediction equal to the mean training target.
log_rmse(y_true=y_train, y_pred=np.full(len(y_train), y_train.mean()))

0.40760050769850525

In [10]:
# Settings for the TPOT run.
SEED = 42
GENERATIONS = 10  # number of generations
POPULATION_SIZE = 50  # population size (number of pipelines)

# Every run writes into its own folder under ../submit/, named after the
# current time down to the minute.
date_format = '%Y_%m_%d_%H_%M'
experiment_date = datetime.now().strftime(date_format)
DST_ROOT = Path(f'../submit/{experiment_date}/')
DST_ROOT.mkdir(parents=True, exist_ok=True)

# Run the pipeline search and save the best pipeline as a Python script.
tpot = learn_tpot(
    x_train=X_train,
    y_train=y_train,
    seed=SEED,
    generations=GENERATIONS,
    population_size=POPULATION_SIZE,
)
tpot.export(str(DST_ROOT / 'tpot_pipeline.py'))

# X_test still carries 'Id': drop it for prediction, reuse it for the
# submission file.
y_pred = tpot.predict(X_test.drop('Id', axis=1))
submit_df = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': y_pred})
dst = str(DST_ROOT / 'submit.csv')
submit_df.to_csv(dst, index=False)

Optimization Progress:   0%|          | 0/2 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -0.1554847844572039

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=11, min_samples_split=9, n_estimators=100)
