In [None]:
!pip install git+https://github.com/pfnet-research/xfeat.git > /dev/null
!pip install catboost > /dev/null

In [None]:
import pandas as pd
import numpy as np
import xfeat
from sklearn.model_selection import KFold
from functools import partial
import seaborn as sns
import optuna
import pickle
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer, CountEncoder

In [None]:
# Load the pre-cleaned dataset (train and test rows together) and preview it.
df = pd.read_csv('fixed/rawdata.csv')
display(df.head(2))
print(df.columns)
# Names of the columns xfeat's SelectCategorical keeps from df.
category_cols = SelectCategorical().fit_transform(df).columns
category_cols

In [None]:
# Area-related features.
# log10 of the floor area; assigned as a Series (no DataFrame wrapper needed).
df['面積log'] = np.log10(df['面積（㎡）'])
# log(price) - log(area) = log(price per area). It is only defined for training
# rows; every other row keeps 0.0. The column is created as float up front so
# the float assignment below does not have to upcast an int column (pandas
# warns about that since 2.1).
df['価格/面積log'] = 0.0
df.loc[df['学習データ'] == True, '価格/面積log'] = df['取引価格（総額）_log'] - df['面積log']

In [None]:
# Target-encode the categorical columns against the log price.
# The encoder is fitted on the training rows only (2-fold, shuffled, seed 42)
# and then applied to the test rows.
fold = KFold(n_splits=2, shuffle=True, random_state=42)
te = TargetEncoder(
    input_cols=[
        '都道府県名', '市区町村名', '地区名', '最寄駅：名称', '間取り', '建物の構造',
        '用途', '今後の利用目的', '都市計画', '改装', '取引の事情等',
    ],
    target_col='取引価格（総額）_log',
    fold=fold,
)
train = te.fit_transform(df[df['学習データ'] == True])
test = te.transform(df[df['学習データ'] == False])
# Stack the two parts back together: training rows first, then test rows.
df = pd.concat([train, test])
del train, test

In [None]:
# Count-encode the same categorical columns; the new columns get the "_ce" suffix.
ce = CountEncoder(
    input_cols=[
        '都道府県名', '市区町村名', '地区名', '最寄駅：名称', '間取り', '建物の構造',
        '用途', '今後の利用目的', '都市計画', '改装', '取引の事情等',
    ],
    output_suffix="_ce",
)
df = ce.fit_transform(df)
df.sample(2)

In [None]:
# 人口密度データのマージ
# 人口密度はあまり役に立たなさそう。
# population = pd.read_csv('reference/population.csv')
# population = population.drop(['都道府県・市区町村名'], axis=1)
# population = population[['市区町村コード','総数（人）', '面積（参考）（km2）', '人口密度（人/km2）', '平均年齢（歳）', '年齢中位数（歳）', '人口性比（％）']]
# display(population.head(2))
# df = pd.merge(df, population, left_on=['市区町村コード'], right_on=['市区町村コード'], how='left')
# del population

In [None]:
# Merge per-station ridership data (passengers.csv) onto df by station name.
passengers = pd.read_csv('reference/passengers.csv')


def _replace_literals(names, fixes):
    """Apply (old, new) substring replacements to a string Series, in order.

    Every rule is a plain substring, so regex=False is passed explicitly; the
    result then does not depend on the pandas version's default for `regex`.
    """
    for old, new in fixes:
        names = names.str.replace(old, new, regex=False)
    return names


# Spelling fixes for the station name in df. Order matters and is kept exactly
# as originally written: "(" becomes "_" and ")" is dropped first, so the later
# rules match names of the form "<station>_<qualifier>".
station_name_fixes = [
    ('(', '_'),
    (')', ''),
    ('ヶ', 'ケ'),
    ('なんば', '難波'),
    ('なかもず', '中百舌鳥'),
    ('本町4丁目', '本町四丁目'),
    ('本町6丁目', '本町六丁目'),
    ('本町3丁目', '本町三丁目'),
    ('本町5丁目', '本町五丁目'),
    ('萱町6丁目', '萱町六丁目'),
    ('平和通1丁目', '平和通一丁目'),
    ('下祇園', '下祗園'),
    # These two get their parentheses back (as written in the original rules).
    ('ジヤトコ前_ジヤトコ1地区前', 'ジヤトコ前(ジヤトコ1地区前)'),
    ('西鉄福岡_天神', '西鉄福岡(天神)'),
    ('大山_東京', '大山'),
    ('トヨタモビリティ富山 Gスクエア五福前', '富山トヨペット本社前(五福末広町)'),
    # Strip the "_<region/operator>" qualifier.
    ('平岸_札幌市営', '平岸'),
    ('落合_東京', '落合'),
    ('松原_東京', '松原'),
    ('中川_神奈川', '中川'),
    ('生田_神奈川', '生田'),
    ('北山田_神奈川', '北山田'),
    ('番田_神奈川', '番田'),
    ('根岸_神奈川', '根岸'),
    ('杉田_神奈川', '杉田'),
    ('江田_神奈川', '江田'),
    ('月岡_新潟', '月岡'),
    ('内海_愛知', '内海'),
    ('黄金_愛知', '黄金'),
    ('羽黒_愛知', '羽黒'),
    ('瀬田_滋賀', '瀬田'),
    ('醍醐_京都', '醍醐'),
    ('扇町_大阪', '扇町'),
    ('玉川_大阪', '玉川'),
    ('平林_大阪', '平林'),
    ('南方_大阪', '南方'),
    ('今川_大阪', '今川'),
    ('滝谷_大阪', '滝谷'),
    ('庄内_大阪', '庄内'),
    ('額田_大阪', '額田'),
    ('上牧_大阪', '上牧'),
    ('船尾_大阪', '船尾'),
    ('妙法寺_兵庫', '妙法寺'),
    ('岩屋_兵庫', '岩屋'),
    ('藤崎_福岡', '藤崎'),
    ('祇園_福岡', '祇園'),
    ('北方_福岡', '北方'),
    ('西山_福岡', '西山'),
    ('浦田_福岡', '浦田'),
    ('加納_宮崎', '加納'),
    ('旭ケ丘_宮城', '旭ケ丘'),
    ('ひばりケ丘_東京', 'ひばりケ丘'),
    ('霞ケ関_埼玉', '霞ケ関'),
    ('愛宕_千葉', '愛宕'),
    ('入谷_神奈川', '入谷'),
    ('蓮町_馬場記念公園前', '蓮町'),
    ('桐原_長野', '桐原'),
    ('柳津_岐阜', '柳津'),
    ('自由ケ丘_愛知', '自由ケ丘'),
    ('五条_京都市営', '五条'),
    ('とうきょうスカイツリー', '押上'),
]

# Spelling fixes for the station name in the ridership table (order kept:
# '西武園ゆうえんち' must become '多摩湖' before '遊園地西' takes over its name).
passenger_name_fixes = [
    ('ヶ', 'ケ'),
    ('押上_スカイツリー前', '押上'),
    ('西武園ゆうえんち', '多摩湖'),
    ('遊園地西', '西武園ゆうえんち'),
    ('蓮町(馬場記念公園前)', '蓮町'),
    ('本諫早(諫早市役所前)', '本諫早'),
    ('押上(スカイツリー前)', '押上'),
]

# Unicode-normalise both sides (NFKC) before applying the literal fixes.
df['最寄駅：名称'] = _replace_literals(
    df['最寄駅：名称'].str.normalize("NFKC"), station_name_fixes
)
passengers['駅名'] = _replace_literals(
    passengers['駅名'].str.normalize("NFKC"), passenger_name_fixes
)

# NOTE(review): this is a left join on the station name only. If passengers
# contains the same 駅名 more than once after the renames above, the matching
# df rows are duplicated - confirm 駅名 is unique in passengers.csv.
df = pd.merge(df, passengers, left_on=['最寄駅：名称'], right_on=['駅名'], how='left').drop(['駅名'],axis=1)

# Hand-set ridership: 多摩湖 is set to 0, 伏石 to roughly 3000.
df.loc[df['最寄駅：名称'] == '多摩湖', '乗降客数2019'] = 0
df.loc[df['最寄駅：名称'] == '伏石', '乗降客数2019'] = 3000

display(df.head(2))
del passengers

In [None]:
# Merge the official land price data, keyed by municipality code and year.
correction = pd.read_csv('reference/correction.csv')
# NOTE(review): how='inner' drops every df row (train or test) that has no
# matching (所在地コード, 年度) pair in correction.csv - confirm that is intended.
df = df.merge(
    correction,
    how='inner',
    left_on=['市区町村コード', '取引時点int'],
    right_on=['所在地コード', '年度'],
)
# The right-hand key columns duplicate the left-hand ones, so drop them.
df = df.drop(columns=['所在地コード', '年度'])
display(df.head(2))
del correction

In [None]:
# 特徴量について。
# ID、市区町村コード：削除？
# '都道府県名', '市区町村名', '地区名', '最寄駅：名称'
# '最寄駅：距離（分）', '面積（㎡）', '建築年', '取引時点築年数', '平均地価log'：そのまま
# '間取り', '建物の構造', '用途', '今後の利用目的', '取引の事情等' : 要素の抜き出し
# '都市計画', '改装': カテゴリデータとして利用
# '建ぺい率（％）', '容積率（％）', '面積（㎡）' : かけ合わせデータを作成
# '取引時点', '取引時点int': どちらかを落とす。
# '取引価格（総額）_log': 目的変数
# '学習データ': 学習データを示すフラグ

In [None]:
# Break the floor-plan label (間取り) into numeric components.
colname = '間取り'

madori_value_cols = ['部屋の数', 'リビング', 'ダイニング', 'キッチン', 'サービス',
                     'オープンフロア', 'スタジオ', 'メゾネット']
madori_records = []
for room in df[colname].unique():
    if room in ['オープンフロア', 'スタジオ', 'メゾネット']:
        # Special layouts carry no leading digit; they count as one room.
        room_num = 1
    else:
        # Otherwise the first character is the room count. This raises on a
        # missing or non-numeric label, exactly as before.
        room_num = int(room[:1])
    madori_records.append({
        '間取り': room,
        '部屋の数': room_num,
        # The layout letters in the labels are full-width (Ｌ/Ｄ/Ｋ/Ｓ).
        'リビング': room.count('Ｌ'),
        'ダイニング': room.count('Ｄ'),
        'キッチン': room.count('Ｋ'),
        'サービス': room.count('Ｓ'),
        'オープンフロア': room.count('オープンフロア'),
        'スタジオ': room.count('スタジオ'),
        'メゾネット': room.count('メゾネット'),
    })

# Build the lookup table once instead of concatenating row by row onto an
# empty frame (quadratic, and deprecated in recent pandas). The value columns
# are cast to float explicitly, as the original astype call did.
madori = pd.DataFrame(madori_records, columns=['間取り'] + madori_value_cols)
madori = madori.astype({col: float for col in madori_value_cols})
df = pd.merge(df, madori, left_on=['間取り'], right_on=['間取り'], how='left')
display(df.head(2))
del madori

In [None]:
# Break the building-structure label (建物の構造) into one indicator per material.
colname = '建物の構造'
kouzo_flag_cols = ['ＲＣ', 'ＳＲＣ', '鉄骨造', '軽量鉄骨造', '木造', 'ブロック造']

kouzo_records = []
for value in df[colname].unique():
    # A label is a '、'-separated list of materials. Splitting and testing
    # exact membership replaces the hand-written if/elif table, which
    #   * flagged 'ＲＣ、木造' as 鉄骨造 (copy-paste error), and
    #   * silently re-appended the previous row for any label it did not list.
    # Exact membership also keeps '軽量鉄骨造' from counting as '鉄骨造'.
    # A non-string label (e.g. NaN) gets all-zero flags.
    parts = set(value.split('、')) if isinstance(value, str) else set()
    record = {'建物の構造': value}
    record.update({flag: int(flag in parts) for flag in kouzo_flag_cols})
    kouzo_records.append(record)

# Build the lookup table in one go; flags are cast to float explicitly, as the
# original astype call did.
kouzo = pd.DataFrame(kouzo_records, columns=['建物の構造'] + kouzo_flag_cols)
kouzo = kouzo.astype({flag: float for flag in kouzo_flag_cols})
df = pd.merge(df, kouzo, left_on=['建物の構造'], right_on=['建物の構造'], how='left')

display(df.head(2))
del kouzo

In [None]:
# Break the usage label (用途) into per-keyword occurrence counts.
colname = '用途'
youto_keywords = ['住宅', '店舗', '事務所', '駐車場', '倉庫', '作業場', '工場', 'その他', '不明']

# One record per distinct label. str.count is a substring count, so a keyword
# inside a longer word is counted too (same as before).
youto_records = [
    {'用途': value, **{keyword: value.count(keyword) for keyword in youto_keywords}}
    for value in df[colname].unique()
]

# Build the lookup table once instead of concatenating row by row onto an
# empty frame; counts are cast to float explicitly, as the original astype did.
youto = pd.DataFrame(youto_records, columns=['用途'] + youto_keywords)
youto = youto.astype({keyword: float for keyword in youto_keywords})
df = pd.merge(df, youto, left_on=['用途'], right_on=['用途'], how='left')
display(df.head(2))
del youto

In [None]:
# Break the transaction-circumstances label (取引の事情等) into per-keyword counts.
colname = '取引の事情等'
# Output column -> substring counted in the label.
torihiki_keywords = {
    '取引の事情なし': 'なし',
    '調停・競売等': '調停・競売等',
    '関係者間取引': '関係者間取引',
    'その他事情有り': 'その他事情有り',
    '瑕疵有りの可能性': '瑕疵有りの可能性',
    '他の権利・負担付き': '他の権利・負担付き',
}

torihiki_records = [
    {'取引の事情等': value,
     **{column: value.count(keyword) for column, keyword in torihiki_keywords.items()}}
    for value in df[colname].unique()
]

# Build the lookup table once instead of concatenating row by row onto an
# empty frame; counts are cast to float explicitly, as the original astype did.
torihiki = pd.DataFrame(torihiki_records, columns=['取引の事情等'] + list(torihiki_keywords))
torihiki = torihiki.astype({column: float for column in torihiki_keywords})
df = pd.merge(df, torihiki, left_on=['取引の事情等'], right_on=['取引の事情等'], how='left')
display(df.head(2))
del torihiki

In [None]:
# Interaction features between building coverage ratio, floor-area ratio and area.
df = df.assign(**{
    '建ぺい率×容積率': lambda d: d['建ぺい率（％）'] * d['容積率（％）'],
    '容積率/建ぺい率': lambda d: d['容積率（％）'] / d['建ぺい率（％）'],
    '建ぺい率×面積': lambda d: d['建ぺい率（％）'] * d['面積（㎡）'],
    '容積率×面積': lambda d: d['容積率（％）'] * d['面積（㎡）'],
})

In [None]:
# Checkpoint: save the DataFrame before label encoding and aggregate features.
filename = 'fixed/df.sav'
# The with-block closes the file even if pickling fails.
with open(filename, 'wb') as f:
    pickle.dump(df, f)

In [None]:
# Reload the checkpoint written by the previous cell.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
filename = 'fixed/df.sav'
with open(filename, 'rb') as f:
    df = pickle.load(f)

In [None]:
# Label-encode the categorical columns. output_suffix="" keeps the original
# column names, so each encoded column replaces its source column in df.
encoder = Pipeline([SelectCategorical(), LabelEncoder(output_suffix="")])
encoded_df = encoder.fit_transform(df)
for encode, encoded_values in encoded_df.items():
    df[encode] = encoded_values

In [None]:
# 集積特徴量
# 都道府県ごとに最寄り駅の距離の平均、分散、最大値、最小値、最大-最小を算出する。
def max_min(x):
    """Return the range (max - min) of the non-missing values in ``x``.

    Used as a custom aggregation; the function name is part of the generated
    column name, so it must stay ``max_min``.

    Missing values are dropped first. Python's builtin ``max``/``min`` compare
    with ``>``/``<``, which is always False against NaN, so their result
    depended on where the NaN sat in the group.

    :param x: a pandas Series or any 1-D sequence of values.
    :return: ``max - min`` of the non-missing values, or NaN when there are
        none (empty or all-missing input).
    """
    valid = pd.Series(x).dropna()
    if valid.empty:
        return np.nan
    return valid.max() - valid.min()
# Group keys: for each one, every value column below is aggregated with
# count / min / max / mean / std / (max - min).
columns = ['都道府県名', '市区町村名', '地区名', '最寄駅：名称', '建物の構造', '用途', '今後の利用目的', '都市計画', '改装', '取引の事情等']
# xfeat.aggregation returns (frame, new column names); keep only the new columns.
agg_dfs = [
    grouped[new_cols]
    for grouped, new_cols in (
        xfeat.aggregation(
            df,
            group_key=key,
            group_values=[
                '最寄駅：距離（分）', '建ぺい率（％）', '容積率（％）',
                '面積（㎡）', '建築年', '取引時点', '取引時点築年数', '面積log', '乗降客数2019',
            ],
            agg_methods=['count', 'min', 'max', 'mean', 'std', max_min],
        )
        for key in columns
    )
]
# Append all aggregate columns to df side by side (aligned on the index).
df = pd.concat([df, *agg_dfs], axis=1)

In [None]:
# Drop the list of per-key aggregate frames, then print df's column/dtype summary.
del agg_dfs
df.info()

In [None]:
# Names of the per-group standard-deviation columns that are NaN-filled later:
# 'agg_std_<value column>_grpby_<group key>' for 6 group keys x 9 value columns,
# ordered group key first, value column second.
colnames = [
    f'agg_std_{value_col}_grpby_{group_key}'
    for group_key in (
        '市区町村名', '地区名', '最寄駅：名称', '建物の構造', '用途', '取引の事情等',
    )
    for value_col in (
        '最寄駅：距離（分）', '建ぺい率（％）', '容積率（％）', '面積（㎡）', '建築年',
        '取引時点', '取引時点築年数', '面積log', '乗降客数2019',
    )
]

In [None]:
# Replace missing values in the listed std columns with 0.0.
df[colnames] = df[colnames].fillna(0.0)

In [None]:
# Show only the columns that still contain missing values, with their counts.
null_counts = df.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Print df's column/dtype summary again after the NaN fill.
df.info()

In [None]:
# Checkpoint: save the DataFrame after the aggregate features were added.
filename = 'fixed/df_agg.sav'
# The with-block closes the file even if pickling fails.
with open(filename, 'wb') as f:
    pickle.dump(df, f)

In [None]:
# Reload the checkpoint written by the previous cell.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
filename = 'fixed/df_agg.sav'
with open(filename, 'rb') as f:
    df = pickle.load(f)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every float column except the target.
float_cols = df.select_dtypes(float).columns
float_cols = float_cols[float_cols != '取引価格（総額）_log']
scaler = StandardScaler()
# The scaler returns a bare ndarray. Rebuild the frame on df's own index so the
# column assignments below line up row for row; with the default RangeIndex
# they only matched when df.index happened to be 0..n-1.
scale_df = pd.DataFrame(
    scaler.fit_transform(df[float_cols]),
    columns=float_cols,
    index=df.index,
)
for colname in float_cols:
    # '面積log' keeps its raw values; its scaled version goes to a new column.
    # Every other float column is overwritten with its scaled values.
    target_col = '面積log_scaled' if colname == '面積log' else colname
    df[target_col] = scale_df[colname]
del scale_df
df.head(2)

In [None]:
# Checkpoint: save the DataFrame after standardisation.
filename = 'fixed/df_std.sav'
# The with-block closes the file even if pickling fails.
with open(filename, 'wb') as f:
    pickle.dump(df, f)

In [None]:
# Reload the checkpoint written by the previous cell.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
filename = 'fixed/df_std.sav'
with open(filename, 'rb') as f:
    df = pickle.load(f)

# 学習の実施

In [None]:
import lightgbm as lgb
import japanize_matplotlib
import optuna
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from modules.dinamiclr import LrSchedulingCallback
from catboost import Pool
from catboost import CatBoostRegressor

In [None]:
# Column-name constants: the row identifier and the regression target.
ID = 'ID'
TARGET = '取引価格（総額）_log'

In [None]:
# Remove the two code columns, then split on the training-data flag.
# Note: running this cell twice fails, because the columns are already gone.
dropcol = ['市区町村コード', '取引時点int']
df = df.drop(columns=dropcol)
train = df.loc[df['学習データ'] == True]
test = df.loc[df['学習データ'] == False]

In [None]:
# Preview the first training row.
train.head(1)

In [None]:
# Joint scatter plot of 面積log against 価格/面積log on the training rows.
sns.jointplot(x='面積log', y='価格/面積log', data=train, kind='scatter')

In [None]:
# Same plot with the standardised area column (面積log_scaled) on the x axis.
sns.jointplot(x='面積log_scaled', y='価格/面積log', data=train, kind='scatter')

In [None]:
# 価格/面積log > 3.0 のtrainデータを削除する。
# 標準化した結果-1~1とする。（この範囲に68%が入っているはず。）
# 2つ目以降のモデルでは-2～2や-1.5～1.5などで検討したい。

In [None]:
# train_2は価格/面積log > 1 の上位16%を取ったデータを利用する。 
# train_2 = train[train['価格/面積log'] < -1]

In [None]:
# train[train['価格/面積log'] < -0.5]

In [None]:
# Keep the training rows whose 価格/面積log lies in [-1, 1] (both ends included).
train_1 = train[train['価格/面積log'].between(-1, 1)]

In [None]:
# Feature columns: every df column except the target, the train/test flag,
# the price-per-area helper and the raw area.
cols = df.columns[
    ~df.columns.isin(['取引価格（総額）_log', '学習データ', '価格/面積log', '面積（㎡）'])
]
# cols = cols[cols != 'ID']

In [None]:
# NumPy views of the data: the full training set, the filtered training set
# (train_1), and the test rows to predict for submission.
X_np = train[cols].to_numpy(copy=True)
y_np = train['取引価格（総額）_log'].to_numpy()
X_np_1 = train_1[cols].to_numpy(copy=True)
y_np_1 = train_1['取引価格（総額）_log'].to_numpy()
X_submit = test[cols].to_numpy(copy=True)

In [None]:
# del df

In [None]:
# 遺伝的アルゴリズムの検証
# from GaSolverImpl import GaSolverImpl

In [None]:
# solver = GaSolverImpl(
#     chromosome_length = train[cols].shape[1], 
#     population_size = 30,
#     pick_out_size = 10,
#     individual_mutation_probability = 0.3,
#     gene_mutation_probability = 0.1,
#     iteration = 50,
#     verbose = True
# )

In [None]:
# history = solver.solve(train[cols], train['取引価格（総額）_log'])

In [None]:
# 50回で1700sec程度

In [None]:
# min(history['Min'])

In [None]:
# history['Min'][1]

In [None]:
# for i in range(len(history['Min'])):
#     if min(history['Min']) == history['Min'][i]:
#         bestchromosome = history['BestChromosome'][i]
#         print('{}:{}'.format(i,history['Min'][i]))


In [None]:
# print("最も優れた個体は{}".format(bestchromosome))

In [None]:
# use_cols = [bool(gene) for gene in bestchromosome]

In [None]:
# X_best = train[cols].iloc[:, use_cols]

In [None]:
# X_best.head(2)

In [None]:
# bestcols = X_best.columns

In [None]:
# X_np = np.array(X_best)
# y_np = train['取引価格（総額）_log'].values
# X_submit = np.array(test[bestcols])

In [None]:
# train：KFoldするため。test：検証のため。
# X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_np, y_np, test_size=0.3, random_state=42)
# X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_np_1, y_np_1, test_size=0.3, random_state=42)

In [None]:
# X_valid_0, X_test_0, y_valid_0, y_test_0 = train_test_split(X_test_0, y_test_0, test_size=0.3, random_state=42)
# X_valid_1, X_test_1, y_valid_1, y_test_1 = train_test_split(X_test_1, y_test_1, test_size=0.3, random_state=42)

In [None]:
# # DataFrameの作成
# X_train_df = pd.DataFrame(X_train, columns=cols)
# X_test_df = pd.DataFrame(X_test, columns=cols)
# y_train_df = pd.DataFrame(y_train, columns=['取引価格（総額）_log'])
# y_test_df = pd.DataFrame(y_test, columns=['取引価格（総額）_log'])

In [None]:
# X_train_df_1 = pd.DataFrame(X_train_1, columns=cols)
# X_test_df_1 = pd.DataFrame(X_test_1, columns=cols)
# y_train_df_1 = pd.DataFrame(y_train_1, columns=['取引価格（総額）_log'])
# y_test_df_1 = pd.DataFrame(y_test_1, columns=['取引価格（総額）_log'])

In [None]:
# X_train_df.head(2)

In [None]:
# y_train_df.head(2)

In [None]:
# del X_np
# del y_np
# del X_train
# del y_train
# del X_test
# del y_test

In [None]:
# del X_np_1
# del y_np_1
# del X_train_1
# del y_train_1
# del X_test_1
# del y_test_1

In [None]:
# 価格/面積log > 3.0 のtrainデータを削除する。→どこで削除すべきか・・・
# train = train[train['価格/面積log'] >= 3.0]

In [None]:
# lgb_train = lgb.Dataset(X_train_df, y_train_df)
# lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [None]:
# params = {
#     'objective': 'regression',
#     'metric': 'mae',
#     'num_leaves': 42,
#     'max_depth': 7,
#     "feature_fraction": 0.8,
#     'subsample_freq': 1,
#     "bagging_fraction": 0.95,
#     'min_data_in_leaf': 2,
#     'learning_rate': 0.1,
#     "boosting": "gbdt",
#     "lambda_l1": 0.1,
#     "lambda_l2": 10,
#     "verbosity": -1,
#     "random_state": 42,
#     "num_boost_round": 100, # 50000
#     "early_stopping_rounds": 100
# }
# num_round = 100

In [None]:
# lgb_results = {}                                    # 学習の履歴を入れる入物

# model = lgb.train(
#                   params=params,                    # ハイパーパラメータをセット
#                   train_set=lgb_train,              # 訓練データを訓練用にセット
#                   valid_sets=[lgb_train, lgb_test], # 訓練データとテストデータをセット
#                   valid_names=['Train', 'Test'],    # データセットの名前をそれぞれ設定
#                   evals_result=lgb_results,             # 学習の履歴を保存
#                   verbose_eval=100,                           # ログを100置きに表示
#                   num_boost_round = 100
#                   )  

In [None]:
# test_pred = model.predict(X_test, num_iteration=model.best_iteration)
# mae = mean_absolute_error(y_test, test_pred)

In [None]:
# mae

In [None]:
# Early stopping, best iteration is:
# [8466]	Train's l1: 0.0591203	Test's l1: 0.0770325

# Optunaを利用したハイパーパラメータチューニング

In [None]:
def sample_scheduler_func(current_lr, eval_history, best_round, is_higher_better):
    """Decide the learning rate to use for the next boosting round.

    The schedule: start from ``current_lr`` (0.2 when it is unset or falsy);
    once more than 900 rounds have been evaluated, divide the rate by 1.1 on
    every round count that is a multiple of 100; never go below 0.01.
    The rate is printed on every round count that is a multiple of 300
    (including 0).

    :param current_lr: current learning rate (``None`` if not set yet)
    :param eval_history: history of the evaluation metric on the validation
        data; only its length is used
    :param best_round: round with the best metric so far (unused here)
    :param is_higher_better: whether a higher metric is better (unused here)
    :return: learning rate for the next round
    """
    rounds_seen = len(eval_history)

    # Fall back to the default when no learning rate has been set.
    lr = current_lr if current_lr else 0.2

    # Late, gentle decay: every 100th round after round 900.
    if rounds_seen > 900 and rounds_seen % 100 == 0:
        lr = lr / 1.1

    # Floor, so training does not stall on a vanishing rate.
    min_threshold = 0.01
    if not lr >= min_threshold:
        lr = min_threshold

    if rounds_seen % 300 == 0:
        print(f'現在の学習率：{lr}')

    return lr

In [None]:
# lgb_train = lgb.Dataset(X_train_df, y_train_df)
# lgb_test = lgb.Dataset(X_test_df, y_test_df, reference=lgb_train)
# lgb_train_1 = lgb.Dataset(X_train_df_1, y_train_df_1)
# lgb_test_1 = lgb.Dataset(X_test_df_1, y_test_df_1, reference=lgb_train_1)

In [None]:
# LightGBMで
def objective(trial):
    """Optuna objective: train LightGBM on the full training set, return test MAE.

    Reads the notebook globals ``lgb_train``, ``lgb_test``, ``X_test`` and
    ``y_test``. In the visible notebook these are only built in commented-out
    cells, so they must be defined before this function is called.

    :param trial: the ``optuna.trial.Trial`` used to sample hyperparameters.
    :return: mean absolute error of the trained model on ``X_test``/``y_test``.
    """
    # The suggest_* calls are kept in their original order. Continuous ranges
    # use suggest_float; suggest_uniform is deprecated in Optuna.
    num_leaves = trial.suggest_int("num_leaves", 40, 50)
    max_depth = trial.suggest_int("max_depth", 8, 12)
    feature_fraction = trial.suggest_float('feature_fraction', 0.6, 1.0)
    lambda_l1 = trial.suggest_float('lambda_l1', 0.0, 10.0)
    lambda_l2 = trial.suggest_float('lambda_l2', 20.0, 50.0)
    subsample_freq = trial.suggest_int('subsample_freq', 0, 5)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.6, 1.0)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 20)
    tree_learner = trial.suggest_categorical('tree_learner', ["serial", "feature", "data", "voting"])

    params = {
        'objective': 'regression',      # fixed
        'metric': 'mae',                # fixed
        'num_leaves': num_leaves,
        'tree_learner': tree_learner,
        'max_depth': max_depth,
        "feature_fraction": feature_fraction,
        'subsample_freq': subsample_freq,
        "bagging_fraction": bagging_fraction,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': 0.3,
        "boosting": "gbdt",             # fixed
        "lambda_l1": lambda_l1,
        "lambda_l2": lambda_l2,
        "verbosity": -1,                # fixed
        "random_state": 42,             # fixed
        # With num_boost_round=100 below, a patience of 100 rounds can never
        # trigger; it only matters once the round budget is raised.
        "early_stopping_rounds": 100,   # fixed
        "feature_pre_filter": False
    }
    callbacks = [
        lgb.log_evaluation(100),        # log every 100 rounds
    ]
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_test],   # evaluate on both train and test
        valid_names=['Train', 'Test'],
        callbacks=callbacks,
        num_boost_round=100,                # quick-run value; 50000 noted for full runs
    )
    # Score the trial.
    return mean_absolute_error(y_test, model.predict(X_test))

In [None]:
def objective_1(trial):
    """Optuna objective for model 2: train LightGBM and return the hold-out MAE.

    Same search space as ``objective``, but trains on the notebook-level
    ``lgb_train_1`` dataset and scores on ``X_test_1`` / ``y_test_1``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the trained model on ``X_test_1``.
    """
    # Search space. ``suggest_float`` replaces ``suggest_uniform``, which Optuna
    # has deprecated; both draw uniformly from [low, high]. The order in which
    # parameters are sampled is unchanged.
    num_leaves = trial.suggest_int("num_leaves", 40, 50)
    max_depth = trial.suggest_int("max_depth", 8, 12)
    feature_fraction = trial.suggest_float('feature_fraction', 0.6, 1.0)
    lambda_l1 = trial.suggest_float('lambda_l1', 0.0, 10.0)
    lambda_l2 = trial.suggest_float('lambda_l2', 20.0, 50.0)
    subsample_freq = trial.suggest_int('subsample_freq', 0, 5)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.6, 1.0)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 20)
    tree_learner = trial.suggest_categorical('tree_learner', ["serial", "feature", "data", "voting"])

    params = {
        'objective': 'regression',      # fixed
        'metric': 'mae',                # fixed
        'num_leaves': num_leaves,
        'tree_learner': tree_learner,
        'max_depth': max_depth,
        "feature_fraction": feature_fraction,
        'subsample_freq': subsample_freq,
        "bagging_fraction": bagging_fraction,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': 0.3,
        "boosting": "gbdt",             # fixed
        "lambda_l1": lambda_l1,
        "lambda_l2": lambda_l2,
        "verbosity": -1,                # fixed
        "random_state": 42,             # fixed
        # NOTE(review): the patience equals num_boost_round below, so early
        # stopping presumably never fires during tuning -- confirm intended.
        "early_stopping_rounds": 100,   # fixed
        "feature_pre_filter": False
    }
    callbacks = [
        lgb.log_evaluation(100),        # log the metrics every 100 rounds
    ]
    model = lgb.train(
        params=params,                          # sampled hyperparameters
        train_set=lgb_train_1,                  # training data
        valid_sets=[lgb_train_1, lgb_test_1],   # evaluate on both train and test data
        valid_names=['Train', 'Test'],          # display names of the two sets
        callbacks=callbacks,
        num_boost_round=100,                    # short budget for tuning (50000 for the real run)
    )
    # Score the trial on the hold-out split.
    return mean_absolute_error(y_test_1, model.predict(X_test_1))

In [None]:
# n_trials = 50 # 50
# study = optuna.create_study()
# study.optimize(objective, n_trials=n_trials)
# best_params = study.best_params
# add_params = {
#         'learning_rate': 0.3,
#         'objective': 'regression',
#         'metric': 'mae',
#         # 'tree_learner': 'feature',
#         # 'max_depth': 7,
#         # "feature_fraction": 0.8,
#         # 'subsample_freq': 1,
#         # "bagging_fraction": 0.95,
#         # 'min_data_in_leaf': 2,
#         "boosting": "gbdt",
#         # "lambda_l1": 0.1,
#         # "lambda_l2": 10,
#         "verbosity": -1,
#         "random_state": 42,
#         "early_stopping_rounds": 50,
#     }
# best_params.update(add_params)

In [None]:
# optuna.visualization.plot_contour(study)

In [None]:
# optuna.visualization.plot_param_importances(study)

In [None]:
# print(best_params)

In [None]:
# best_params_1 = study.best_params
# best_params_1.update(add_params)
# print(best_params_1)

- 'learning_rate': 0.27246396622859664,
- 'num_leaves': 48,
- 'tree_learner': 'feature'
- [1700]	Train's l1: 0.060631	Test's l1: 0.076648
- 提出後：0.0815

In [None]:
# Hard-coded LightGBM hyperparameters (presumably pasted from the commented-out
# Optuna search in this notebook, so the search need not be re-run).
# `best_params` is passed to lgb.train for model 1 and `best_params_1` for model 2.
best_params = {'num_leaves': 50, 'max_depth': 9, 'feature_fraction': 0.6771189004705872, 'lambda_l1': 3.769927741005735, 'lambda_l2': 33.11864954025415, 'subsample_freq': 0, 'bagging_fraction': 0.9811102265887287, 'min_data_in_leaf': 1, 'tree_learner': 'voting', 'learning_rate': 0.3, 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
best_params_1 = {'num_leaves': 50, 'max_depth': 12, 'feature_fraction': 0.9936244753324097, 'lambda_l1': 1.99436831601929, 'lambda_l2': 41.53561366155952, 'subsample_freq': 4, 'bagging_fraction': 0.9055551419531833, 'min_data_in_leaf': 2, 'tree_learner': 'voting', 'learning_rate': 0.3, 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}


In [None]:
# min, max, countを追加
# {'learning_rate': 0.3137985421886106, 'num_leaves': 47, 'max_depth': 9, 'feature_fraction': 0.8823213918895785, 'lambda_l1': 8.487232300704612, 'lambda_l2': 18.508332918744934, 'subsample_freq': 5, 'bagging_fraction': 0.8210512881187388, 'min_data_in_leaf': 5, 'objective': 'regression', 'metric': 'mae', 'tree_learner': 'feature', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# [1900]	Train's l1: 0.0652579	Test's l1: 0.0767014
# 提出後 0.0808

In [None]:
# 地価にmin,max,max-min,count,stdを追加
# {'learning_rate': 0.25612821629251814, 'num_leaves': 47, 'max_depth': 10, 'feature_fraction': 0.6513130781826354, 'lambda_l1': 4.866437424633705, 'lambda_l2': 8.9330464563469, 'subsample_freq': 4, 'bagging_fraction': 0.7674893318623637, 'min_data_in_leaf': 1, 'objective': 'regression', 'metric': 'mae', 'tree_learner': 'feature', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# [1600]	Train's l1: 0.0656753	Test's l1: 0.0765212
# 提出後 0.0801

In [None]:
# 3.0以下の外れ値を除去
# [1800]	Train's l1: 0.0656376	Test's l1: 0.0768981
# 提出後 0.0800

In [None]:
# 3.0以下の外れ値を除去
# {'learning_rate': 0.28103437166049444, 'num_leaves': 48, 'max_depth': 9, 'feature_fraction': 0.896819587342583, 'lambda_l1': 8.446775572884537, 'lambda_l2': 4.793434982955629, 'subsample_freq': 2, 'bagging_fraction': 0.9435825781121129, 'min_data_in_leaf': 15, 'objective': 'regression', 'metric': 'mae', 'tree_learner': 'feature', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# 0.075Xくらいだったような
# 提出後 0.0798

In [None]:
# {'learning_rate': 0.2816136833111954, 'num_leaves': 50, 'max_depth': 11, 'feature_fraction': 0.9992504277288726, 'lambda_l1': 1.4111483663786961, 'lambda_l2': 22.908251871514096, 'subsample_freq': 1, 'bagging_fraction': 0.9748873660775911, 'min_data_in_leaf': 2, 'tree_learner': 'feature', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# 特徴量を274個に減じた。
# [1500]	Train's l1: 0.0617913	Test's l1: 0.0764189
# 提出後 0.0814

In [None]:
# {'learning_rate': 0.2997952552607652, 'num_leaves': 49, 'max_depth': 12, 'feature_fraction': 0.7534749533266163, 'lambda_l1': 2.7270263662026006, 'lambda_l2': 5.205898059128051, 'subsample_freq': 0, 'bagging_fraction': 0.7276572071292827, 'min_data_in_leaf': 3, 'tree_learner': 'voting', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# Standard Scalerを利用した。
# 0.0759くらい。
# 提出後 0.0800

In [None]:
# {'learning_rate': 0.2825500205118874, 'num_leaves': 50, 'max_depth': 9, 'feature_fraction': 0.9517762533772877, 'lambda_l1': 7.5440020768205285, 'lambda_l2': 14.881282532779924, 'subsample_freq': 3, 'bagging_fraction': 0.6767205770881366, 'min_data_in_leaf': 3, 'tree_learner': 'feature', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# Standard Scalerを利用した。900超の特徴量。（人口密度等）
# [1400]	Train's l1: 0.0671291	Test's l1: 0.0764916
# 提出後 0.0804

In [None]:
# {'learning_rate': 0.2001026063764013, 'num_leaves': 48, 'max_depth': 10, 'feature_fraction': 0.6569581542581295, 'lambda_l1': 3.671429674023342, 'lambda_l2': 21.104969811169425, 'subsample_freq': 5, 'bagging_fraction': 0.9606337279489678, 'min_data_in_leaf': 20, 'tree_learner': 'serial', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# CountEncoder TargetEncoder
# Standard Scaler
# [2000]	Train's l1: 0.0626875	Test's l1: 0.074743
# 提出後 0.0788

In [None]:
# {'learning_rate': 0.28514803450634096, 'num_leaves': 49, 'max_depth': 10, 'feature_fraction': 0.983224445828606, 'lambda_l1': 3.7055473835000097, 'lambda_l2': 31.122941940770808, 'subsample_freq': 2, 'bagging_fraction': 0.928818087754744, 'min_data_in_leaf': 3, 'tree_learner': 'feature', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# CountEncoder TargetEncoder 乗降客数
# [4800]	Train's l1: 0.0608743	Test's l1: 0.0755986
# 提出後 0.0787

In [None]:
# {'learning_rate': 0.28514803450634096, 'num_leaves': 49, 'max_depth': 10, 'feature_fraction': 0.983224445828606, 'lambda_l1': 3.7055473835000097, 'lambda_l2': 31.122941940770808, 'subsample_freq': 2, 'bagging_fraction': 0.928818087754744, 'min_data_in_leaf': 3, 'tree_learner': 'feature', 'objective': 'regression', 'metric': 'mae', 'boosting': 'gbdt', 'verbosity': -1, 'random_state': 42, 'early_stopping_rounds': 50}
# CountEncoder TargetEncoder 乗降客数 10万単位で切り上げ
# Test's l1: 0.07544327084053161
# 提出後 0.0785

In [None]:
# KFoldを利用して交差検証を行う。

In [None]:
# Number of cross-validation folds.
FOLD = 5

# Per-fold results collected by the loop below.
valid_scores = []   # validation MAE of each fold
models = []         # trained booster of each fold
y_pred = []         # per-fold frames holding the validation predictions and their IDs
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)

# Run model 1 (outliers NOT removed): K-fold training on X_np / y_np.
# NOTE: this loop rebinds notebook-level names (lgb_train, model, lr_scheduler_cb,
# y_valid_pred_df, ...) that other cells also read.

for fold, (train_indices, valid_indices) in enumerate(kf.split(X_np)):
    X_train, X_valid = X_np[train_indices], X_np[valid_indices]
    y_train, y_valid = y_np[train_indices], y_np[valid_indices]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)
    # LrSchedulingCallback / sample_scheduler_func are defined outside this section;
    # presumably they adjust the learning rate during training -- TODO confirm.
    lr_scheduler_cb = LrSchedulingCallback(strategy_func=sample_scheduler_func)

    callbacks = [
        lgb.log_evaluation(100),       # log the metrics every 100 rounds
        # lgb.record_evaluation(lgb_results),
        lr_scheduler_cb,
    ]
    model = lgb.train(
              params=best_params,                    # hyperparameters
              train_set=lgb_train,              # training data
              valid_sets=[lgb_train, lgb_eval], # evaluate on both the train and validation folds
              valid_names=['Train', 'Test'],    # display names of the two sets
              callbacks=callbacks,
              num_boost_round = 50000                   
              )  

    # Predictions for the held-out fold.
    y_valid_pred = model.predict(X_valid)
    
    # Recover the row IDs from the feature matrix so predictions can be joined back later.
    # NOTE(review): ID is read from a column of X_valid, i.e. it is also one of the
    # features the model is trained on -- confirm this is intended.
    X_valid_df = pd.DataFrame(X_valid, columns=cols)
    y_valid_pred_df = pd.DataFrame(y_valid_pred, columns=[TARGET])
    y_valid_pred_df[ID] = X_valid_df[ID]
    
    y_pred.append(y_valid_pred_df)
    
    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')
    valid_scores.append(score)

    models.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')

In [None]:
# Save the five fold models of model 1, one pickle file per fold.
filename = 'models/lightgbm_0/models_{}.sav'
for i in range(5):
    # Context manager: the file is flushed and closed as soon as the dump is
    # done, instead of leaving an open handle for the garbage collector.
    with open(filename.format(i), 'wb') as f:
        pickle.dump(models[i], f)

In [None]:
y_pred_lgb0 = pd.concat([y_pred[0], y_pred[1], y_pred[2], y_pred[3], y_pred[4]]).reset_index().drop(['index'], axis=1)
y_pred_lgb0 = y_pred_lgb0.astype({ID:int})
y_pred_lgb0[[ID, TARGET]].to_csv('./pre-pred/prepred_lgb0.csv', index = False)

In [None]:
# csv読み込み
y_pred_lgb0 = pd.read_csv('./pre-pred/prepred_lgb0.csv')
y_pred_lgb0.columns = [ID, '取引価格（総額）_log_pred']

In [None]:
y_pred_lgb0 = pd.merge(train_1[[ID, TARGET]].astype({ID:int}), y_pred_lgb0, on=ID)

In [None]:
# Reload the five saved fold models of model 1.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
filename = 'models/lightgbm_0/models_{}.sav'
models = []
for i in range(5):
    # Context manager so the file handle is closed right after loading.
    with open(filename.format(i), 'rb') as f:
        loaded_model = pickle.load(f)
    models.append(loaded_model)

In [None]:
# Build the submission predictions of model 1 (lgb0): predict X_submit with
# each of the five fold models, then average the five predictions.
submit_pred_lgb0 = []
for i in range(5):
    submit_pred = models[i].predict(X_submit)
    submit_pred_lgb0.append(submit_pred)
submit_pred_lgb0 = (submit_pred_lgb0[0] + submit_pred_lgb0[1] + submit_pred_lgb0[2] + submit_pred_lgb0[3] + submit_pred_lgb0[4])/5

In [None]:
# Pair each averaged prediction with its ID, read from the ID column of X_submit.
X_submit_df = pd.DataFrame(X_submit, columns=cols)
submit_pred_lgb0_df = pd.DataFrame(submit_pred_lgb0, columns=[TARGET])
submit_pred_lgb0_df[ID] = X_submit_df[ID]
submit_pred_lgb0_df = submit_pred_lgb0_df[[ID, TARGET]].astype({ID:int})

In [None]:
# Save the [ID, TARGET] submission predictions of model 1.
submit_pred_lgb0_df.to_csv('./pre-pred/submit/prepred_lgb0.csv', index = False)

In [None]:
# Run model 2 (outliers already removed): K-fold training on X_np_1 / y_np_1.
# NOTE: this loop rebinds notebook-level names (lgb_train, model, lr_scheduler_cb,
# y_valid_pred_df, ...) that other cells also read.
FOLD = 5
valid_scores_1 = []   # validation MAE of each fold
models_1 = []         # trained booster of each fold
y_pred_1 = []         # per-fold frames holding the validation predictions and their IDs
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
for fold, (train_indices, valid_indices) in enumerate(kf.split(X_np_1)):
    X_train, X_valid = X_np_1[train_indices], X_np_1[valid_indices]
    y_train, y_valid = y_np_1[train_indices], y_np_1[valid_indices]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)
    # LrSchedulingCallback / sample_scheduler_func are defined outside this section;
    # presumably they adjust the learning rate during training -- TODO confirm.
    lr_scheduler_cb = LrSchedulingCallback(strategy_func=sample_scheduler_func)
    callbacks = [
        lgb.log_evaluation(100),       # log the metrics every 100 rounds
        lr_scheduler_cb,
    ]
    model = lgb.train(
              params=best_params_1,                    # hyperparameters
              train_set=lgb_train,              # training data
              valid_sets=[lgb_train, lgb_eval], # evaluate on both the train and validation folds
              valid_names=['Train', 'Test'],    # display names of the two sets
              callbacks=callbacks,
              num_boost_round = 50000                   
              )

    # Predictions for the held-out fold.
    y_valid_pred = model.predict(X_valid)
    
    # Recover the row IDs from the feature matrix so predictions can be joined back later.
    # NOTE(review): ID is read from a column of X_valid, i.e. it is also one of the
    # features the model is trained on -- confirm this is intended.
    X_valid_df = pd.DataFrame(X_valid, columns=cols)
    y_valid_pred_df = pd.DataFrame(y_valid_pred, columns=[TARGET])
    y_valid_pred_df[ID] = X_valid_df[ID]
    
    y_pred_1.append(y_valid_pred_df)
    
    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')
    valid_scores_1.append(score)

    models_1.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(valid_scores_1)
print(f'CV score: {cv_score}')

In [None]:
# Save the five fold models of model 2, one pickle file per fold.
filename = 'models/lightgbm_1/models_{}.sav'
for i in range(5):
    # Context manager: the file is flushed and closed as soon as the dump is
    # done, instead of leaving an open handle for the garbage collector.
    with open(filename.format(i), 'wb') as f:
        pickle.dump(models_1[i], f)

In [None]:
y_pred_lgb = pd.concat([y_pred_1[0], y_pred_1[1], y_pred_1[2], y_pred_1[3], y_pred_1[4]]).reset_index().drop(['index'], axis=1)
y_pred_lgb = y_pred_lgb.astype({ID:int})
y_pred_lgb[[ID, TARGET]].to_csv('./pre-pred/prepred_lgb.csv', index = False)

In [None]:
# csv読み込み
y_pred_lgb = pd.read_csv('./pre-pred/prepred_lgb.csv')
y_pred_lgb.columns = [ID, '取引価格（総額）_log_pred']

In [None]:
y_pred_lgb = pd.merge(train_1[[ID, TARGET]].astype({ID:int}), y_pred_lgb, on=ID)

In [None]:
mean_absolute_error(y_pred_lgb[TARGET].values, y_pred_lgb[TARGET + '_pred'].values)

In [None]:
mean_absolute_error(y_pred_lgb[TARGET].values, np.log10(np.ceil(np.power(10, y_pred_lgb[TARGET + '_pred'].values)/100000)*100000))

In [None]:
y_pred_lgb

In [None]:
# Reload the five saved fold models of model 2.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
filename = 'models/lightgbm_1/models_{}.sav'
models_1 = []
for i in range(5):
    # Context manager so the file handle is closed right after loading.
    with open(filename.format(i), 'rb') as f:
        loaded_model = pickle.load(f)
    models_1.append(loaded_model)

In [None]:
# Build the submission predictions of model 2 (the `models_1` fold models):
# predict X_submit with each of the five fold models, then average them.
submit_pred_lgb1 = []
for i in range(5):
    submit_pred = models_1[i].predict(X_submit)
    submit_pred_lgb1.append(submit_pred)
submit_pred_lgb1 = (submit_pred_lgb1[0] + submit_pred_lgb1[1] + submit_pred_lgb1[2] + submit_pred_lgb1[3] + submit_pred_lgb1[4])/5

In [None]:
# Pair each averaged prediction with its ID (read from the ID column of
# X_submit) and save [ID, TARGET] to CSV.
X_submit_df = pd.DataFrame(X_submit, columns=cols)
submit_pred_lgb1_df = pd.DataFrame(submit_pred_lgb1, columns=[TARGET])
submit_pred_lgb1_df[ID] = X_submit_df[ID]
submit_pred_lgb1_df = submit_pred_lgb1_df[[ID, TARGET]].astype({ID:int})
submit_pred_lgb1_df.to_csv('./pre-pred/submit/prepred_lgb1.csv', index = False)

In [None]:
# CatBoostを利用してみる

In [None]:
# Display the categorical column names collected near the top of the notebook.
category_cols

In [None]:
# 70/30 hold-out split of the model-2 data, used for CatBoost tuning.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_np_1, y_np_1, test_size=0.3, random_state=42)

# Build DataFrames (currently disabled)
# X_train_df_1 = pd.DataFrame(X_train_1, columns=cols)
# X_test_df_1 = pd.DataFrame(X_test_1, columns=cols)
# y_train_df_1 = pd.DataFrame(y_train_1, columns=['取引価格（総額）_log'])
# y_test_df_1 = pd.DataFrame(y_test_1, columns=['取引価格（総額）_log'])

In [None]:
# CatBoost pools for the split above; cat_objective reads these as globals.
# No categorical feature indices are passed to Pool here.
train_pool = Pool(X_train_1, y_train_1)
validate_pool = Pool(X_test_1, y_test_1)

In [None]:
# CatBoostでチューニング
def cat_objective(trial):
    """Optuna objective for CatBoost: fit one regressor and return hold-out MAE.

    Only ``l2_leaf_reg``, ``random_strength`` and the bootstrap-specific
    parameter are sampled from ``trial``; every other setting is fixed. The
    model is fitted on the notebook-level ``train_pool`` with
    ``validate_pool`` as the evaluation set and scored on
    ``X_test_1`` / ``y_test_1``.

    Args:
        trial: Optuna trial used to sample the tuned parameters.

    Returns:
        Mean absolute error on ``X_test_1``.
    """
    print('開始')

    # Tuned values, sampled in the same order as they appear in the parameter set.
    l2_reg = trial.suggest_float('l2_leaf_reg', 2, 50)
    rand_strength = trial.suggest_float('random_strength', 0.001, 100)

    # Fixed settings; the inline notes record the search ranges that were tried
    # before the values were pinned.
    params = dict(
        objective='MAE',
        colsample_bylevel=0.09,        # was: suggest_float 0.01 .. 0.1
        depth=12,                      # was: suggest_int 4 .. 12
        l2_leaf_reg=l2_reg,
        random_strength=rand_strength,
        boosting_type='Plain',         # was: categorical Ordered / Plain
        bootstrap_type='Bernoulli',    # was: categorical Bayesian / Bernoulli / MVS
        eval_metric='MAE',
        learning_rate=0.1,
        early_stopping_rounds=50,
        iterations=20000,
        verbose=500,
        loss_function='MAE',
        random_seed=42,
    )

    # The bootstrap type decides which extra sampling parameter is tuned.
    bootstrap = params["bootstrap_type"]
    if bootstrap == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    elif bootstrap == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.01, 100)
    print(params)

    regressor = CatBoostRegressor(**params)
    regressor.fit(train_pool, eval_set=validate_pool)

    # Score on the hold-out split, then drop the model before returning.
    score = mean_absolute_error(y_test_1, regressor.predict(X_test_1))
    del regressor
    print(score)
    return score

In [None]:
# Run the CatBoost hyperparameter search: 20 trials of cat_objective.
# create_study() is called without arguments here.
n_trials = 20
study = optuna.create_study()
study.optimize(cat_objective, n_trials=n_trials)

In [None]:
# study.best_params only holds the values sampled via the trial, so the
# settings that cat_objective hard-codes are added back here.
# These values duplicate the ones in cat_objective and must be kept in sync by hand.
best_params_cat = study.best_params
add_params = {        
        'objective': 'MAE',
        "colsample_bylevel": 0.09, 
        "depth": 12, 
        "boosting_type": 'Plain',
        "bootstrap_type": 'Bernoulli',
        'eval_metric': 'MAE',
        'learning_rate' : 0.1,       # learning rate
        'early_stopping_rounds' : 50,
        'iterations' : 20000, 
        'verbose' : 500,
        'loss_function': 'MAE',
        'random_seed' :42
    }
best_params_cat.update(add_params)
print(best_params_cat)

In [None]:
# Hard-coded CatBoost parameters (presumably the printed result of an earlier
# search); this overrides the value computed in the previous cell.
best_params_cat = {'l2_leaf_reg': 33.51917340457483, 'random_strength': 0.16779085114202497, 'subsample': 0.9828709713363581, 'objective': 'MAE', 'colsample_bylevel': 0.09, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'eval_metric': 'MAE', 'learning_rate': 0.1, 'early_stopping_rounds': 50, 'iterations': 20000, 'verbose': 500, 'loss_function': 'MAE', 'random_seed': 42}


In [None]:
# K-fold training of CatBoost on the model-2 data (X_np_1 / y_np_1).
# NOTE: this loop rebinds the notebook-level train_pool / validate_pool (also
# used by cat_objective) and model / y_valid_pred_df, which other cells read.
FOLD = 5
valid_scores_cat = []   # validation MAE of each fold
models_cat = []         # fitted regressor of each fold
y_pred_cat = []         # per-fold frames holding the validation predictions and their IDs
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
for fold, (train_indices, valid_indices) in enumerate(kf.split(X_np_1)):
    X_train, X_valid = X_np_1[train_indices], X_np_1[valid_indices]
    y_train, y_valid = y_np_1[train_indices], y_np_1[valid_indices]
    train_pool = Pool(X_train, y_train)
    validate_pool = Pool(X_valid, y_valid)
    
    model = CatBoostRegressor(**best_params_cat)
    model.fit(train_pool, eval_set=validate_pool)

    # Predictions for the held-out fold.
    y_valid_pred = model.predict(X_valid)
    
    # Recover the row IDs from the feature matrix so predictions can be joined back later.
    X_valid_df = pd.DataFrame(X_valid, columns=cols)
    y_valid_pred_df = pd.DataFrame(y_valid_pred, columns=[TARGET])
    y_valid_pred_df[ID] = X_valid_df[ID]
    
    y_pred_cat.append(y_valid_pred_df)
    
    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')
    valid_scores_cat.append(score)

    models_cat.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(valid_scores_cat)
print(f'CV score: {cv_score}')

In [None]:
# Save the five CatBoost fold models, one pickle file per fold.
filename = 'models/catboost_1/models_{}.sav'
for i in range(5):
    # Context manager: the file is flushed and closed as soon as the dump is
    # done, instead of leaving an open handle for the garbage collector.
    with open(filename.format(i), 'wb') as f:
        pickle.dump(models_cat[i], f)

In [None]:
y_pred_cat = pd.concat([y_pred_cat[0], y_pred_cat[1], y_pred_cat[2], y_pred_cat[3], y_pred_cat[4]]).reset_index().drop(['index'], axis=1)
y_pred_cat = y_pred_cat.astype({ID:int})
y_pred_cat[[ID, TARGET]].to_csv('./pre-pred/prepred_cat.csv', index = False)

In [None]:
y_pred_cat

In [None]:
# csv読み込み
y_pred_cat = pd.read_csv('./pre-pred/prepred_cat.csv')

In [None]:
y_pred_cat

In [None]:
y_pred_cat.columns = [ID, '取引価格（総額）_log_pred']

In [None]:
y_pred_cat = pd.merge(train_1[[ID, TARGET]].astype({ID:int}), y_pred_cat, on=ID)

In [None]:
mean_absolute_error(y_pred_cat[TARGET].values, y_pred_cat[TARGET + '_pred'].values)

In [None]:
y_pred_cat

In [None]:
train_1[[ID, TARGET]].to_csv('./pre-pred/y_np.csv', index = False)

In [None]:
# Reload the five saved CatBoost fold models.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
filename = 'models/catboost_1/models_{}.sav'
models_cat = []
for i in range(5):
    # Context manager so the file handle is closed right after loading.
    with open(filename.format(i), 'rb') as f:
        loaded_model = pickle.load(f)
    models_cat.append(loaded_model)

In [None]:
# Build the submission predictions of the CatBoost model (`models_cat`):
# predict X_submit with each of the five fold models, then average them.
submit_pred_cat = []
for i in range(5):
    submit_pred = models_cat[i].predict(X_submit)
    submit_pred_cat.append(submit_pred)
submit_pred_cat = (submit_pred_cat[0] + submit_pred_cat[1] + submit_pred_cat[2] + submit_pred_cat[3] + submit_pred_cat[4])/5

In [None]:
# Pair each averaged prediction with its ID (read from the ID column of
# X_submit) and save [ID, TARGET] to CSV.
X_submit_df = pd.DataFrame(X_submit, columns=cols)
submit_pred_cat_df = pd.DataFrame(submit_pred_cat, columns=[TARGET])
submit_pred_cat_df[ID] = X_submit_df[ID]
submit_pred_cat_df = submit_pred_cat_df[[ID, TARGET]].astype({ID:int})
submit_pred_cat_df.to_csv('./pre-pred/submit/prepred_cat.csv', index = False)

In [None]:
# Create a single CatBoost regressor with the tuned parameters.
model = CatBoostRegressor(**best_params_cat)

# Fit on whatever train_pool / validate_pool currently hold.
# NOTE(review): the CatBoost CV loop rebinds these pools on every fold, so after
# that loop they refer to its last fold -- confirm which pools are intended here.
model.fit(train_pool, eval_set=validate_pool)
# X_valid_0 / y_valid_0 are not defined in this section; presumably a separate
# hold-out split created elsewhere -- TODO confirm.
cat_pred = model.predict(X_valid_0)

In [None]:
# MAE of the single CatBoost model on that hold-out set.
score = mean_absolute_error(y_valid_0, cat_pred)
print(f'catboost MAE: {score}')

In [None]:
# valid_predの予測値の平均を出す。

In [None]:
# Model 1: predict the hold-out set X_valid_0 with the fold models, print each
# model's MAE against y_valid_0, then score the average of the predictions.
# NOTE(review): only the first 4 fold models are used here although FOLD is 5
# in the CV cells of this notebook -- confirm whether the fifth is left out on purpose.
valid_preds = []
for i in range(4):
    valid_pred = models[i].predict(X_valid_0, num_iteration=models[i].best_iteration)
    score = mean_absolute_error(y_valid_0, valid_pred)
    valid_preds.append(valid_pred)
    print(f'fold {i} MAE: {score}')
valid_pred = (valid_preds[0] + valid_preds[1] + valid_preds[2] + valid_preds[3])/4
score = mean_absolute_error(y_valid_0, valid_pred)
print(f'fold all MAE: {score}')

In [None]:
# Model 2: same procedure; y_valid_0 is the ground truth.
# NOTE(review): again only 4 of the fold models are averaged.
valid_preds_1 = []
for i in range(4):
    valid_pred_1 = models_1[i].predict(X_valid_0, num_iteration=models_1[i].best_iteration)
    score = mean_absolute_error(y_valid_0, valid_pred_1)
    valid_preds_1.append(valid_pred_1)
    print(f'fold {i} MAE: {score}')
valid_pred_1 = (valid_preds_1[0] + valid_preds_1[1] + valid_preds_1[2] + valid_preds_1[3])/4
score = mean_absolute_error(y_valid_0, valid_pred_1)
print(f'fold all MAE: {score}')

In [None]:
# CatBoost: same procedure; y_valid_0 is the ground truth.
# NOTE(review): again only 4 of the fold models are averaged.
valid_preds_cat = []
for i in range(4):
    valid_pred_cat = models_cat[i].predict(X_valid_0)
    score = mean_absolute_error(y_valid_0, valid_pred_cat)
    valid_preds_cat.append(valid_pred_cat)
    print(f'fold {i} MAE: {score}')
valid_pred_cat = (valid_preds_cat[0] + valid_preds_cat[1] + valid_preds_cat[2] + valid_preds_cat[3])/4
score = mean_absolute_error(y_valid_0, valid_pred_cat)
print(f'fold all MAE: {score}')

In [None]:
# Build the submission predictions with the CatBoost models: average the
# predictions of all five fold models on X_submit.
# Note: this reuses (and overwrites) the valid_preds_cat / valid_pred_cat names
# although the data predicted here is the submission set.
valid_preds_cat = []
for i in range(5):
    valid_pred_cat = models_cat[i].predict(X_submit)
    valid_preds_cat.append(valid_pred_cat)
test_pred_cat = (valid_preds_cat[0] + valid_preds_cat[1] + valid_preds_cat[2] + valid_preds_cat[3] + valid_preds_cat[4])/5

In [None]:
test_pred_cat

In [None]:
# Write the CatBoost submission file.
sub_df = pd.read_csv('data/sample_submission.csv')
TARGET_1 = '取引価格（総額）_log_1'
# Convert the prediction back from log10, round it up to the next multiple of
# 100,000 and take log10 again.
test_pred = np.log10(np.ceil(np.power(10, test_pred_cat)/100000)*100000)
# Assigned positionally: assumes sample_submission.csv has the same row order as X_submit -- TODO confirm.
sub_df[TARGET] = test_pred
sub_df.to_csv('output/test_submission.csv', index=False)

In [None]:
# Build the test-side input for the meta-model: predict X_test_0 with the
# model-1 fold models, print each model's MAE against y_test_0, then average.
# X_test_0 / y_test_0 are not defined in this section -- presumably a separate
# hold-out split created elsewhere; TODO confirm.
# NOTE(review): only the first 4 fold models are used although FOLD is 5 in the CV cells.
test_preds = []
for i in range(4):
    test_pred = models[i].predict(X_test_0, num_iteration=models[i].best_iteration)
    score = mean_absolute_error(y_test_0, test_pred)
    test_preds.append(test_pred)
    print(f'fold {i} MAE: {score}')
test_pred = (test_preds[0] + test_preds[1] + test_preds[2] + test_preds[3])/4
score = mean_absolute_error(y_test_0, test_pred)
print(f'fold all MAE: {score}')

In [None]:
# Model 2 on the test split; y_test_0 is the ground truth.
# Predict X_test_0 with the model-2 fold models, print each model's MAE, then
# score the average of the predictions.
# NOTE(review): only the first 4 fold models are used although FOLD is 5 in the CV cells.
test_preds_1 = []
for i in range(4):
    # Each booster must be truncated at ITS OWN best iteration. The original
    # passed models[i].best_iteration (model 1's value) to a model-2 booster.
    test_pred_1 = models_1[i].predict(X_test_0, num_iteration=models_1[i].best_iteration)
    score = mean_absolute_error(y_test_0, test_pred_1)
    test_preds_1.append(test_pred_1)
    print(f'fold {i} MAE: {score}')
test_pred_1 = (test_preds_1[0] + test_preds_1[1] + test_preds_1[2] + test_preds_1[3])/4
score = mean_absolute_error(y_test_0, test_pred_1)
print(f'fold all MAE: {score}')

In [None]:
# Build the meta-model inputs: one column per base model (model 1, model 2),
# holding that model's averaged prediction for each row.
stacked_predictions_valid = np.column_stack((valid_pred, valid_pred_1))
stacked_predictions_test = np.column_stack((test_pred, test_pred_1))

In [None]:
# Train the LightGBM meta-model with K-fold CV on the stacked validation
# predictions; y_valid_0 is the target.
# meta_best_params, LrSchedulingCallback and sample_scheduler_func are defined
# outside this section.
meta_scores = []    # validation MAE of each fold
meta_models = []    # trained meta booster of each fold
y_meta_pred = []    # validation predictions of each fold
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
for fold, (train_indices, valid_indices) in enumerate(kf.split(stacked_predictions_valid)):
    X_train, X_valid = stacked_predictions_valid[train_indices], stacked_predictions_valid[valid_indices]
    y_train, y_valid = y_valid_0[train_indices], y_valid_0[valid_indices]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)
    lr_scheduler_cb = LrSchedulingCallback(strategy_func=sample_scheduler_func)
    callbacks = [
        lgb.log_evaluation(100),       # log the metrics every 100 rounds
        lr_scheduler_cb,
    ]
    model = lgb.train(
              params=meta_best_params,          # hyperparameters
              train_set=lgb_train,              # training data
              valid_sets=[lgb_train, lgb_eval], # evaluate on both the train and validation folds
              valid_names=['Train', 'Test'],    # display names of the two sets
              callbacks=callbacks,
              num_boost_round = 50000
              )

    y_valid_pred = model.predict(X_valid)
    # Store THIS fold's predictions. The original appended `y_valid_pred_df`,
    # a stale frame left over from an earlier CV loop, so every entry was the
    # same unrelated object.
    y_meta_pred.append(y_valid_pred)

    score = mean_absolute_error(y_valid, y_valid_pred)
    print(f'fold {fold} MAE: {score}')
    meta_scores.append(score)

    meta_models.append(model)

# Mean of the per-fold validation MAEs.
cv_score = np.mean(meta_scores)
print(f'CV score: {cv_score}')

In [None]:
# Score the meta-models on the stacked test predictions: print each
# meta-model's MAE against y_test_0, then score the average of the predictions.
# NOTE(review): only the first 4 meta-models are used although the loop that
# trains them runs FOLD times.
# NOTE: this rebinds test_preds / test_pred, which until now held the model-1
# base predictions; any later re-stacking from test_pred picks up these values.
test_preds = []
for i in range(4):
    test_pred = meta_models[i].predict(stacked_predictions_test, num_iteration=meta_models[i].best_iteration)
    score = mean_absolute_error(y_test_0, test_pred)
    test_preds.append(test_pred)
    print(f'fold {i} MAE: {score}')
test_pred = (test_preds[0] + test_preds[1] + test_preds[2] + test_preds[3])/4
score = mean_absolute_error(y_test_0, test_pred)
print(f'fold all MAE: {score}')

In [None]:
from sklearn.linear_model import Ridge
# train meta model 
meta_model = Ridge()
meta_model.fit(stacked_predictions, y_valid_0)

In [None]:
stacked_predictions_test = np.column_stack((test_pred, test_pred_1))

In [None]:
test_prediction = meta_model.predict(stacked_predictions_test)

In [None]:
score = mean_absolute_error(y_test_0, test_prediction)
print(f'fold all MAE: {score}')

In [None]:
# fold 0 MAE: 0.07283609637675516
# fold 1 MAE: 0.07251080524602205
# fold 2 MAE: 0.07285066014110207
# fold 3 MAE: 0.07246340988132395

In [None]:
# test_predとtest_pred_1をX軸を面積logとしてグラフ化する。

In [None]:
X_test_df = pd.DataFrame(X_submit, columns=cols)
y_test_pred_df = pd.DataFrame(test_pred, columns=['取引価格（総額）_log'])
X_test_df['取引価格（総額）_log'] = y_test_pred_df['取引価格（総額）_log']
X_test_df['価格/面積log'] = X_test_df['取引価格（総額）_log'] - X_test_df['面積log']
X_test_df

In [None]:
# 価格/面積logについて、標準化を行う。
float_cols = ['価格/面積log']
scaler = StandardScaler()
scale_df = X_test_df[float_cols]
scaler.fit(scale_df)
scale_df = scaler.transform(scale_df)
scale_df = pd.DataFrame(data=scale_df , columns=float_cols)
for colname in scale_df.columns:
    X_test_df[colname] = scale_df[colname]

In [None]:
sns.jointplot(x='面積log_scaled', y='価格/面積log', data=X_test_df, kind='scatter')

In [None]:
X_test_df_1 = pd.DataFrame(X_submit, columns=cols)
y_test_pred_df_1 = pd.DataFrame(test_pred_1, columns=['取引価格（総額）_log'])
X_test_df_1['取引価格（総額）_log'] = y_test_pred_df_1['取引価格（総額）_log']
X_test_df_1['価格/面積log'] = X_test_df_1['取引価格（総額）_log'] - X_test_df_1['面積log']
X_test_df_1

In [None]:
# 価格/面積logについて、標準化を行う。
float_cols = ['価格/面積log']
scaler = StandardScaler()
scale_df = X_test_df_1[float_cols]
scaler.fit(scale_df)
scale_df = scaler.transform(scale_df)
scale_df = pd.DataFrame(data=scale_df , columns=float_cols)
for colname in scale_df.columns:
    X_test_df_1[colname] = scale_df[colname]

In [None]:
sns.jointplot(x='面積log_scaled', y='価格/面積log', data=X_test_df_1, kind='scatter')

In [None]:
# Rows whose standardized price/area value lies outside [-1, 1] are blended
# between the two models. The formula originally noted here was:
#   min({abs(price/area log) - 1}^2, 1) * test_pred_1 + (1 - min({abs(price/area log) - 1}^2, 1)) * test_pred
# NOTE(review): the cells below differ from that note -- the excess is divided by
# its maximum instead of being clipped at 1, the exponent is 1.5 rather than 2,
# and the blending cell further down multiplies x with the `_log_0` column
# (taken from X_test_df) and y with the `_log_1` column. Confirm which is intended.

In [None]:
# Distance of the standardized value beyond 1 (negative while inside [-1, 1]).
x = X_test_df['価格/面積log'].abs() - 1

In [None]:
# Rows inside [-1, 1] get weight 0.
x.loc[x < 0] = 0

In [None]:
# Scale the remaining weights so the largest one becomes 1.
x.loc[x > 0] = x / x.max()

In [None]:
# Raise the weights to the power 1.5; since they are at most 1 this shrinks the small ones.
x.loc[x > 0] = x ** 1.5

In [None]:
# Complementary weight, so that x + y == 1 for every row.
y = 1 - x

In [None]:
y

In [None]:
pred = X_test_df[['ID', '取引価格（総額）_log']]

In [None]:
pred.columns = ['ID', '取引価格（総額）_log_0']

In [None]:
pred['取引価格（総額）_log_1'] = X_test_df_1['取引価格（総額）_log']

In [None]:
pred['x'] = x

In [None]:
pred['y'] = y

In [None]:
pred

In [None]:
pred['取引価格（総額）_log'] = pred['x'] * pred['取引価格（総額）_log_0'] + pred['y'] * pred['取引価格（総額）_log_1']

In [None]:
pred

In [None]:
# test_pred_tmp

In [None]:
# test_pred_1 = (test_preds[0] + test_preds[1] + test_preds[2] + test_preds[3])/4

In [None]:
# test_pred

In [None]:
# test_pred_1

In [None]:
# models[1].predict(X_submit, num_iteration=models[1].best_iteration)

In [None]:
# models[4].predict(X_submit, num_iteration=models[0].best_iteration)

In [None]:
# Load the submission template and (re)define the column-name constants.
# Note: ID and TARGET are already used by cells above this one.
sub_df = pd.read_csv('data/sample_submission.csv')
ID = 'ID'
TARGET = '取引価格（総額）_log'
TARGET_1 = '取引価格（総額）_log_1'

In [None]:
# Convert the blended prediction back from log10, round it up to the next
# multiple of 100,000 and take log10 again.
test_pred = np.log10(np.ceil(np.power(10, pred['取引価格（総額）_log'])/100000)*100000)

In [None]:
test_pred

In [None]:
# test_pred is a Series here, so this assignment aligns on the index.
# assumes sub_df and pred share the same default row index and row order -- TODO confirm.
sub_df[TARGET] = test_pred

In [None]:
# test.reset_index()
# sub_df = pd.merge(sub_df[['ID']], test.reset_index()[['ID', TARGET]], on='ID')
# Write the blended submission file and show it.
sub_df.to_csv('output/test_submission.csv', index=False)
display(sub_df)

In [None]:
# Same rounding applied to the prediction held in test_pred_1, stored on the `test` frame.
# NOTE(review): `test` is deleted near the top of the notebook; it must have been
# re-created elsewhere for this cell to run -- confirm.
test_pred_1 = np.log10(np.ceil(np.power(10, test_pred_1)/100000)*100000)
test[TARGET] = test_pred_1

In [None]:
# NOTE(review): needs a TARGET_1 column on `test`, which is not created in this section.
test[['面積log', TARGET, TARGET_1]]

In [None]:
# The bare reset_index() call below discards its result and has no effect;
# the merge on the next line resets the index itself.
test.reset_index()
# NOTE: this rebinds sub_df after the submission file above was already written.
sub_df = pd.merge(sub_df[['ID']], test.reset_index()[['ID', TARGET]], on='ID')

In [None]:
# LightGBM datasets for the single-model runs; the test sets reference their training sets.
# NOTE(review): X_train_df, y_train_df, y_test_df and the *_1 variants are not
# assigned in this section, and X_test_df / X_test_df_1 are assigned further up
# as frames built from X_submit -- confirm which definitions this cell expects.
lgb_train = lgb.Dataset(X_train_df, y_train_df)
lgb_test = lgb.Dataset(X_test_df, y_test_df, reference=lgb_train)
lgb_train_1 = lgb.Dataset(X_train_df_1, y_train_df_1)
lgb_test_1 = lgb.Dataset(X_test_df_1, y_test_df_1, reference=lgb_train_1)

In [None]:
lgb_results = {}                                    # container for the training history
# lr_scheduler_cb is not created in this cell; it is whatever an earlier CV loop left behind.
callbacks = [
    lgb.log_evaluation(100),       # log the metrics every 100 rounds
    lgb.record_evaluation(lgb_results),
    lr_scheduler_cb,
]
model = lgb.train(
              params=best_params,                    # hyperparameters
              train_set=lgb_train,              # training data
              valid_sets=[lgb_train, lgb_test], # evaluate on both train and test data
              valid_names=['Train', 'Test'],    # display names of the two sets
              callbacks=callbacks,
              num_boost_round = 50000                   
              )  

In [None]:
import matplotlib.pyplot as plt
import japanize_matplotlib

In [None]:
# Plot the 50 most important features by gain and save the figure to disk.
lgb.plot_importance(model, figsize=(12,8), max_num_features=50, importance_type='gain')
plt.tight_layout()
plt.savefig('output/feature_importance.png')
plt.show()
plt.close()

In [None]:
cols

In [None]:
# Raw importance values, one row per feature in model order (no feature names attached).
# NOTE(review): called without importance_type, whereas the plot above requests
# 'gain' -- confirm the two are meant to differ.
importance = pd.DataFrame(model.feature_importance(), columns=['importance'])
display(importance)

In [None]:
# Predict the hold-out split with the single model at its best iteration.

val_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Post-processing: convert target and prediction back from log10.
y_test_actual   = np.power(10, y_test)
val_pred_actual = np.power(10, val_pred)

In [None]:
y_test_actual

In [None]:
# Prediction rounded UP to the next multiple of 1,000,000, back in log10.
np.log10(np.ceil(val_pred_actual/1000000)*1000000)

In [None]:
# Prediction rounded to the NEAREST multiple of 1,000,000, back in log10.
np.log10(np.round(val_pred_actual, decimals=-6))

In [None]:
# MAE after rounding the prediction up to the next multiple of 100,000.
mean_absolute_error(y_test, np.log10(np.ceil(val_pred_actual/100000)*100000))

In [None]:
# Reload the submission template.
sub_df = pd.read_csv('data/sample_submission.csv')


In [None]:
# Predict the submission set with the single model, round the price up to the
# next multiple of 100,000 (in log10 space) and store it on the `test` frame.
test_pred = model.predict(X_submit, num_iteration=model.best_iteration)
test_pred = np.log10(np.ceil(np.power(10, test_pred)/100000)*100000)
# Assigned positionally: assumes `test` has the same row order as X_submit -- TODO confirm.
test[TARGET] = test_pred
# The bare reset_index() call below discards its result and has no effect;
# the merge on the next line resets the index itself.
test.reset_index()
sub_df = pd.merge(sub_df[['ID']], test.reset_index()[['ID', TARGET]], on='ID')

In [None]:
# Write the final submission file (same path as the earlier submission cells, so it overwrites them).
sub_df.to_csv('output/test_submission.csv', index=False)