In [None]:
!pip install -U heamy

In [None]:
import lightgbm as lgb
import japanize_matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import optuna
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from dinamiclr import LrSchedulingCallback
from catboost import Pool
from catboost import CatBoostRegressor

from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the preprocessed dataset (used below as a pandas DataFrame).
# The file is opened in a `with` block so the handle is closed right after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open('fixed/df_std.sav', 'rb') as df_file:
    df = pickle.load(df_file)

In [None]:
# --- Notebook-wide settings ---------------------------------------------
ID = 'ID'                       # identifier column
TARGET = '取引価格（総額）_log'  # column the models are trained to predict
dropcol = ['市区町村コード', '取引時点int']  # columns removed from df right below
dropval = 0.9                   # bound on '価格/面積log' used when filtering train rows

# NOTE(review): df is reassigned here, so running this cell a second time
# raises KeyError because the columns are already gone.
df = df.drop(columns=dropcol)

# Feature columns: everything left in df except the target, the train/test
# flag and the two area-related columns. ID is deliberately kept in the list —
# later cells read the ID back out of the feature matrix.
non_feature_cols = [TARGET, '学習データ', '価格/面積log', '面積（㎡）']
cols = df.columns
for excluded_name in non_feature_cols:
    cols = cols[cols != excluded_name]

# Split on the boolean-like '学習データ' flag.
is_train = df['学習データ']
test = df[is_train == False]
train = df[is_train == True]

In [None]:
# Keep only the training rows whose '価格/面積log' lies inside the closed
# interval [-dropval, dropval], then report how many rows survived and the
# extreme values that remain.
within_band = train['価格/面積log'].between(-dropval, dropval)
train_1 = train[within_band]

kept_values = train_1['価格/面積log']
print(f'train:{len(train)}, train_1:{len(train_1)}')
print(f"max:{kept_values.max()}, min:{kept_values.min()}")

In [None]:
# Convert the frames to numpy for KFold indexing and CatBoost.
# Unfiltered training rows:
X_np, y_np = np.array(train[cols]), train['取引価格（総額）_log'].values
# Training rows that passed the dropval filter (the set used for fitting below):
X_np_1, y_np_1 = np.array(train_1[cols]), train_1['取引価格（総額）_log'].values
# Rows to predict for the submission:
X_submit = np.array(test[cols])

In [None]:
# Show only the columns that contain at least one missing value,
# together with their missing-value counts.
null_counts = df.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Keyword arguments handed verbatim to CatBoostRegressor(**best_params_cat).
# NOTE(review): 'objective', 'eval_metric' and 'loss_function' are all set to
# 'MAE'; presumably the tuned values come from an earlier search — TODO confirm.
best_params_cat = {
    'l2_leaf_reg': 33.51917340457483,
    'random_strength': 0.16779085114202497,
    'subsample': 0.9828709713363581,
    'objective': 'MAE',
    'colsample_bylevel': 0.09,
    'depth': 12,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'eval_metric': 'MAE',
    'learning_rate': 0.1,
    'early_stopping_rounds': 50,
    'iterations': 20000,
    'verbose': 500,
    'loss_function': 'MAE',
    'random_seed': 42,
}

In [None]:
%%time
# K-fold cross-validation of CatBoost on the dropval-filtered training data.
# For every fold this keeps the fitted model, the validation MAE and a frame
# of out-of-fold predictions (TARGET plus the row's ID).
FOLD = 5
valid_scores_cat, models_cat, y_pred_cat = [], [], []

kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
for fold, (train_indices, valid_indices) in enumerate(kf.split(X_np_1)):
    X_train, y_train = X_np_1[train_indices], y_np_1[train_indices]
    X_valid, y_valid = X_np_1[valid_indices], y_np_1[valid_indices]

    train_pool = Pool(X_train, y_train)
    validate_pool = Pool(X_valid, y_valid)

    # The validation fold is passed as eval_set while fitting.
    model = CatBoostRegressor(**best_params_cat)
    model.fit(train_pool, eval_set=validate_pool)

    y_valid_pred = model.predict(X_valid)

    # ID is one of the feature columns, so it is recovered from the
    # validation feature matrix and stored next to the prediction.
    X_valid_df = pd.DataFrame(X_valid, columns=cols)
    y_valid_pred_df = pd.DataFrame({TARGET: y_valid_pred, ID: X_valid_df[ID]})
    y_pred_cat.append(y_valid_pred_df)

    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')
    valid_scores_cat.append(score)

    models_cat.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(valid_scores_cat)
print(f'CV score: {cv_score}')

In [None]:
# Save the models: one pickle file per fold model, written to
# models/catboost_<dropval>/models_<fold index>.sav.
# Iterates over however many models were trained (no hard-coded count) and
# closes every file handle via `with` so the data is flushed to disk.
# The target directory must already exist; open() does not create it.
filename = 'models/catboost_{}/models_{}.sav'
for i, fold_model in enumerate(models_cat):
    with open(filename.format(dropval, i), 'wb') as model_file:
        pickle.dump(fold_model, model_file)

In [None]:
# Stack the per-fold out-of-fold prediction frames into a single frame with a
# fresh 0..n-1 index, cast ID to int and write [ID, TARGET] to CSV.
# Works for any number of folds. The isinstance guard keeps the cell
# re-runnable: after the first run y_pred_cat is already a DataFrame.
if isinstance(y_pred_cat, list):
    y_pred_cat = pd.concat(y_pred_cat, ignore_index=True)
y_pred_cat = y_pred_cat.astype({ID: int})
y_pred_cat[[ID, TARGET]].to_csv(f'./pre-pred/prepred_cat_{dropval}.csv', index=False)

In [None]:
# Load the saved models back from models/catboost_<dropval>/models_<i>.sav.
# Each file is opened in a `with` block so its handle is closed after reading.
# NOTE(review): pickle.load can execute arbitrary code — only load model files
# written by this notebook or another trusted source.
filename = 'models/catboost_{}/models_{}.sav'
models_cat = []
for i in range(5):
    with open(filename.format(dropval, i), 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    models_cat.append(loaded_model)

In [None]:
# Build the submit predictions with the CatBoost fold models: predict
# X_submit with every model in models_cat and take the element-wise mean.
# The average is taken over len(models_cat) models, not a hard-coded five.
fold_submit_preds = [fold_model.predict(X_submit) for fold_model in models_cat]
submit_pred_cat = sum(fold_submit_preds) / len(fold_submit_preds)

In [None]:
# Post-process the averaged predictions: raise 10 to the predicted value,
# round the result UP to the next multiple of 100000, and take log10 again.
# NOTE(review): submit_pred_cat is overwritten, so re-running this cell
# applies the rounding a second time to already-rounded values.
price_unit = 100000
linear_pred = np.power(10, submit_pred_cat)
rounded_up_pred = np.ceil(linear_pred / price_unit) * price_unit
submit_pred_cat = np.log10(rounded_up_pred)

In [None]:
# Pair every submit prediction with its ID (recovered from the ID column of
# the submit feature matrix) and write [ID, TARGET] to the submission CSV.
X_submit_df = pd.DataFrame(X_submit, columns=cols)
submit_pred_cat_df = pd.DataFrame(submit_pred_cat, columns=[TARGET])
submit_pred_cat_df[ID] = X_submit_df[ID]
submit_pred_cat_df = submit_pred_cat_df[[ID, TARGET]].astype({ID: int})
# The file name carries dropval, matching the out-of-fold prediction file;
# the original f-string had an empty `{}` placeholder, which is a SyntaxError.
submit_pred_cat_df.to_csv(f'./pre-pred/submit/prepred_cat_{dropval}.csv', index=False)