In [6]:
import pickle
import pandas as pd
import numpy as np

from heamy.dataset import Dataset
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

from catboost import Pool
from catboost import CatBoostRegressor

In [3]:
# Load the pickled dataset object; later cells read its X_train, y_train and
# X_test attributes.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
filename = 'dataset/stack.sav'
with open(filename, 'rb') as stack_file:  # context manager closes the handle
    dataset_stack = pickle.load(stack_file)

In [4]:
# Keyword arguments unpacked into CatBoostRegressor(**best_params_cat) by the
# cross-validation cell.
# NOTE(review): 'objective' and 'loss_function' are both set to 'MAE' and look
# redundant — confirm against the CatBoost version in use before dropping one.
best_params_cat = {
    'l2_leaf_reg': 33.51917340457483,
    'random_strength': 0.16779085114202497,
    'subsample': 0.9828709713363581,
    'objective': 'MAE',
    'colsample_bylevel': 0.09,
    'depth': 12,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'eval_metric': 'MAE',
    'learning_rate': 0.1,
    'early_stopping_rounds': 50,
    'iterations': 20000,
    'verbose': 500,
    'loss_function': 'MAE',
    'random_seed': 42,
}


In [7]:
# K-fold cross-validation: fit one CatBoostRegressor per fold, record its
# validation MAE and keep the fitted model for test-set prediction.
FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
valid_scores_cat = []  # validation MAE of each fold
models_cat = []        # fitted model of each fold

# Pull the feature matrix and target out once; they do not change per fold.
X_all = dataset_stack.X_train.values
y_all = dataset_stack.y_train

for fold, (train_indices, valid_indices) in enumerate(kf.split(X_all)):
    X_train = X_all[train_indices]
    X_valid = X_all[valid_indices]
    y_train = y_all[train_indices]
    y_valid = y_all[valid_indices]

    # NOTE(review): the validation fold is passed as eval_set (used together
    # with 'early_stopping_rounds' from best_params_cat) and is also the fold
    # the MAE below is measured on, so the reported score may be optimistic.
    model = CatBoostRegressor(**best_params_cat)
    model.fit(Pool(X_train, y_train), eval_set=Pool(X_valid, y_valid))

    y_valid_pred = model.predict(X_valid)
    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')

    valid_scores_cat.append(score)
    models_cat.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(valid_scores_cat)
print(f'CV score: {cv_score}')

0:	learn: 0.2424987	test: 0.2431377	best: 0.2431377 (0)	total: 523ms	remaining: 2h 54m 15s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.06779303872
bestIteration = 279

Shrink model to first 280 iterations.
fold 0 MAE: 0.06779404141230155
0:	learn: 0.2433564	test: 0.2439967	best: 0.2439967 (0)	total: 406ms	remaining: 2h 15m 29s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.06728300569
bestIteration = 343

Shrink model to first 344 iterations.
fold 1 MAE: 0.06728400767721814
0:	learn: 0.2423649	test: 0.2429946	best: 0.2429946 (0)	total: 366ms	remaining: 2h 1m 55s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.06742354255
bestIteration = 359

Shrink model to first 360 iterations.
fold 2 MAE: 0.06742454449256575
0:	learn: 0.2428823	test: 0.2445630	best: 0.2445630 (0)	total: 391ms	remaining: 2h 10m 17s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.06829816772
bestIteration = 362

Shrink model to first 3

In [87]:
# Predict the test set with every fold model, then average the predictions.
# Iterating over models_cat directly (rather than range(FOLD)) keeps this cell
# correct even if FOLD no longer matches the number of fitted models.
X_test_values = dataset_stack.X_test.values  # converted once, shared by all models
submit_pred_cats = [fold_model.predict(X_test_values) for fold_model in models_cat]

# Element-wise mean across the fold models (one value per test row).
submit_pred_cat = np.mean(submit_pred_cats, axis=0)

In [88]:
submit_pred_cat

array([7.27821623, 7.27267218, 7.27977657, ..., 6.89096343, 6.37370112,
       7.59301751])

In [89]:
import decimal

# Configure the thread's active decimal context, which the later rounding cell
# relies on: keep 2 significant digits and round up (away from zero).
# Alternative rounding modes: decimal.ROUND_05UP, decimal.ROUND_DOWN.
# Example under this context: +decimal.Decimal(1518000) -> Decimal('1.6E+6')
decimal_ctx = decimal.getcontext()
decimal_ctx.prec = 2
decimal_ctx.rounding = decimal.ROUND_UP
# c = +decimal.Decimal(1518000)

In [53]:
# c

In [90]:
# Back-transform the log10 predictions with 10**x and round each value under
# the active decimal context (unary + applies the context's precision and
# rounding mode to the exact Decimal built from the float).
floats = [
    float(+decimal.Decimal(raw_value))
    for raw_value in np.power(10, submit_pred_cat)
]

In [91]:
# Preview: the rounded values converted back to log10 scale.
np.log10(floats)

array([7.2787536 , 7.2787536 , 7.30103   , ..., 6.8920946 , 6.38021124,
       7.60205999])

In [76]:
# NOTE(review): same preview as the previous cell, but this cell's execution
# count (76) predates the rounding cell (90), so the output shown beneath it
# appears to come from an earlier run — consider removing this cell.
np.log10(floats)

array([7.25527251, 7.25527251, 7.2787536 , ..., 6.88649073, 6.36172784,
       7.59106461])

In [92]:
# Replace the averaged predictions with their rounded counterparts, expressed
# on the log10 scale again.
# NOTE(review): this rebinds submit_pred_cat, which the rounding cell reads as
# its input; re-running that cell afterwards would round already-rounded values.
submit_pred_cat = np.log10(floats)

In [None]:
# submit_pred_cat = np.log10(np.ceil(np.power(10, submit_pred_cat)/100000)*100000)

In [93]:
# Build the (ID, prediction) frame for the submission file.
ID = 'ID'
TARGET = '取引価格（総額）_log'
X_submit_df = dataset_stack.X_test

# Pair IDs with predictions by position. The predictions were produced from
# X_test.values row by row, so row k of submit_pred_cat belongs to row k of
# X_test. Assigning the ID Series directly would instead align on index labels
# and misplace IDs (or introduce NaN) whenever X_test does not carry a default
# 0..n-1 index. A length mismatch raises ValueError here instead of passing
# silently.
submit_pred_cat_df = pd.DataFrame(
    {ID: X_submit_df[ID].values, TARGET: submit_pred_cat}
).astype({ID: int})

In [94]:
# Order the predictions like the sample submission and write the final CSV.
sample_df = pd.read_csv('data/sample_submission.csv')
sub_df = pd.merge(sample_df[[ID]], submit_pred_cat_df[[ID, TARGET]], on=ID)

# pd.merge defaults to an inner join, which silently drops sample IDs that have
# no prediction (and duplicates rows for repeated IDs); fail loudly instead of
# writing a submission with the wrong number of rows.
assert len(sub_df) == len(sample_df), (
    f'submission has {len(sub_df)} rows, expected {len(sample_df)}'
)

sub_df.to_csv('output/submission_cat01_10.csv', index=False)