# Read data and data processing

In [208]:
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 
    #y_true, y_pred = check_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [209]:
import pandas as pd
tochi_train = pd.read_csv('../data/train_genba.tsv', sep='\t')
build_train = pd.read_csv('../data/train_goto.tsv', sep='\t')
train = pd.merge(tochi_train, build_train, on="pj_no")

tochi_test = pd.read_csv('../data/test_genba.tsv', sep='\t')
build_test = pd.read_csv('../data/test_goto.tsv', sep='\t')
test = pd.merge(tochi_test, build_test, on="pj_no")

first_submission = pd.DataFrame()
first_submission['id'] = test['id']

In [210]:
# 名前系と相関性が高いカラムはとりあえず削除
# 土地のネームバリューが出てくると思うので、名前系はあとで追加するかも

name_columns = ['bastei_nm1','bastei_nm2','chiseki_kb_hb','eki_nm1','eki_nm2','gk_chu_tm','gk_sho_tm','hy1f_date_su', \
                'hy2f_date_su','jukyo','mseki_yt_hb','tc_mseki','yoseki2','id']
train.drop(name_columns, axis=1, inplace=True)
test.drop(name_columns, axis=1, inplace=True)

In [211]:
# カテゴリ系コラムにある「無」に値を振り分けたり、変換ミスに対処したり、マルバツからBooleanに変換

import numpy as np

train['fi3m_yohi'].replace('（無）','（不要）',inplace=True)
train['hiatari'].fillna('普通', inplace=True)
train['kborjs'].replace('公募','公簿',inplace=True)
test['fi3m_yohi'].replace('（無）','（不要）',inplace=True)
test['hiatari'].fillna('普通', inplace=True)
test['kborjs'].replace('公募','公簿',inplace=True)

maru_columns = ['rs_e_kdate2','rs_e_kdate3','rs_e_m_ari','rs_e_m_nashi','rs_e_parking','rs_e_tahata','rs_e_zoki', \
                'rs_n_kdate2','rs_n_kdate3','rs_n_m_ari','rs_n_m_nashi','rs_n_parking','rs_n_tahata','rs_n_zoki', \
                'rs_s_kdate2','rs_s_kdate3','rs_s_m_ari','rs_s_m_nashi','rs_s_parking','rs_s_tahata','rs_s_zoki', \
                'rs_w_kdate2','rs_w_kdate3','rs_w_m_ari','rs_w_m_nashi','rs_w_parking','rs_w_tahata','rs_w_zoki', \
                'sho_conv','sho_market','sho_shoten','sho_super','shu_bochi','shu_factory','shu_highway', \
                'shu_hvline','shu_jutaku','shu_kaido','shu_kokyo','shu_line_ari','shu_line_nashi','shu_park', \
                'shu_shop','shu_sogi','shu_soon','shu_tower','shu_zoki']

train[maru_columns] = train[maru_columns].replace({'○':1, np.nan:0})
test[maru_columns] = test[maru_columns].replace({'○':1, np.nan:0})

In [212]:
# 他規制や個別要因など、「複数ある場合は1～4」系のカラムに対処

hokakisei=['hokakisei1','hokakisei2','hokakisei3','hokakisei4']
kobetsu=['kobetsu1','kobetsu2','kobetsu3','kobetsu4']

train = pd.concat([train, train[hokakisei].stack().str.get_dummies().sum(level=0), \
                train[kobetsu].stack().str.get_dummies().sum(level=0)], axis=1)
train.drop(hokakisei+kobetsu, axis=1, inplace=True)
train.iloc[:,137:] = train.iloc[:,137:].fillna(0.0).apply(lambda x: [0 if y == 0.0 else 1 for y in x])
test = pd.concat([test, test[hokakisei].stack().str.get_dummies().sum(level=0), \
                test[kobetsu].stack().str.get_dummies().sum(level=0)], axis=1)
test.drop(hokakisei+kobetsu, axis=1, inplace=True)
test.iloc[:,136:] = test.iloc[:,136:].fillna(0.0).apply(lambda x: [0 if y == 0.0 else 1 for y in x])

In [213]:
# BooleanであるハズがCategoricalになってるカラムに対処

bool_columns = ['bus_yohi','chikukeikaku','fi3m_yohi','fi4m_yohi','gesui','hokakyoka','josui','kaihatsukyoka','kaoku_um', \
                'kborjs','keikakuroad','kinshijiko','t53kyoka','yheki_umu','yheki_yohi']

train[bool_columns] = train[bool_columns].replace({'（不要）':0, '（無）':0,'（要）':1,'（有）':1,'公共下水':0,'個別浄化槽':1,\
                                                   '公営':0,'私営':1,'実測':0,'公簿':1})
test[bool_columns] = test[bool_columns].replace({'（不要）':0, '（無）':0,'（要）':1,'（有）':1,'公共下水':0,'個別浄化槽':1,\
                                                 '公営':0,'私営':1,'実測':0,'公簿':1})

In [214]:
# 最後に、categoricalなカラムを全てone_hot_encode
categorical = ['bas_toho1','bas_toho2','bokachiiki','gas','hiatari','hw_status','jigata','kodochiku','levelplan', \
               'road1_hk','road1_sb','road2_hk','road2_sb','road3_sb','road3_hk','road4_sb','road4_hk','road_st', \
               'rosen_nm1','rosen_nm2','setsudo_hi','setsudo_kj','toshikuiki1','toshikuiki2','usui','yoto1','yoto2']

train = pd.concat([train, pd.get_dummies(train[categorical])], axis=1)
train.drop(categorical, axis=1, inplace=True)
test = pd.concat([test, pd.get_dummies(test[categorical])], axis=1)
test.drop(categorical, axis=1, inplace=True)

In [215]:
# 両方のDataframeに登場しないカラムを除外（価格はキープしとく）

train_columns = list(train.columns.values)
test_columns = list(test.columns.values)
unique_columns = list(set(train_columns) ^ set(test_columns))

y_train = train['keiyaku_pr']
train.drop(unique_columns, axis=1, inplace=True, errors='ignore')
test.drop(unique_columns, axis=1, inplace=True, errors='ignore')

# Try LightGBM

In [216]:
import lightgbm as lgb 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

# X_trainとY_trainをtrainとvalidに分割
train_x, valid_x, train_y, valid_y = train_test_split(train, y_train, test_size=0.33, random_state=0)

# create dataset for lightgbm
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=lgb_eval,
early_stopping_rounds=20)
                        
print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
# eval
print('The MAPE of prediction is:', mean_absolute_percentage_error(valid_y, y_pred))

Starting training...
[1]	valid_0's l1: 5.07444e+06	valid_0's l2: 4.28824e+13
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 4.92733e+06	valid_0's l2: 4.05485e+13
[3]	valid_0's l1: 4.79207e+06	valid_0's l2: 3.84275e+13
[4]	valid_0's l1: 4.66013e+06	valid_0's l2: 3.63093e+13
[5]	valid_0's l1: 4.53719e+06	valid_0's l2: 3.4444e+13
[6]	valid_0's l1: 4.42091e+06	valid_0's l2: 3.27779e+13
[7]	valid_0's l1: 4.31297e+06	valid_0's l2: 3.12266e+13
[8]	valid_0's l1: 4.20498e+06	valid_0's l2: 2.97447e+13
[9]	valid_0's l1: 4.10777e+06	valid_0's l2: 2.8455e+13
[10]	valid_0's l1: 4.0172e+06	valid_0's l2: 2.72714e+13
[11]	valid_0's l1: 3.92878e+06	valid_0's l2: 2.61602e+13
[12]	valid_0's l1: 3.84616e+06	valid_0's l2: 2.51489e+13
[13]	valid_0's l1: 3.77294e+06	valid_0's l2: 2.42554e+13
[14]	valid_0's l1: 3.70214e+06	valid_0's l2: 2.3438e+13
[15]	valid_0's l1: 3.6327e+06	valid_0's l2: 2.26514e+13
[16]	valid_0's l1: 3.57093e+06	valid_0's l2: 2.19566e+13
[17]	valid_0's l1: 

[176]	valid_0's l1: 2.19577e+06	valid_0's l2: 9.26006e+12
[177]	valid_0's l1: 2.19573e+06	valid_0's l2: 9.2568e+12
[178]	valid_0's l1: 2.19521e+06	valid_0's l2: 9.25446e+12
[179]	valid_0's l1: 2.19449e+06	valid_0's l2: 9.24631e+12
[180]	valid_0's l1: 2.19343e+06	valid_0's l2: 9.23825e+12
[181]	valid_0's l1: 2.19214e+06	valid_0's l2: 9.23182e+12
[182]	valid_0's l1: 2.19037e+06	valid_0's l2: 9.21762e+12
[183]	valid_0's l1: 2.18888e+06	valid_0's l2: 9.20749e+12
[184]	valid_0's l1: 2.18827e+06	valid_0's l2: 9.20618e+12
[185]	valid_0's l1: 2.18851e+06	valid_0's l2: 9.21013e+12
[186]	valid_0's l1: 2.18777e+06	valid_0's l2: 9.20845e+12
[187]	valid_0's l1: 2.18754e+06	valid_0's l2: 9.20662e+12
[188]	valid_0's l1: 2.18743e+06	valid_0's l2: 9.20511e+12
[189]	valid_0's l1: 2.18599e+06	valid_0's l2: 9.20025e+12
[190]	valid_0's l1: 2.18565e+06	valid_0's l2: 9.19808e+12
[191]	valid_0's l1: 2.18402e+06	valid_0's l2: 9.18683e+12
[192]	valid_0's l1: 2.18295e+06	valid_0's l2: 9.17687e+12
[193]	valid_0's

[389]	valid_0's l1: 2.10915e+06	valid_0's l2: 8.81123e+12
[390]	valid_0's l1: 2.10872e+06	valid_0's l2: 8.80811e+12
[391]	valid_0's l1: 2.10882e+06	valid_0's l2: 8.81018e+12
[392]	valid_0's l1: 2.10901e+06	valid_0's l2: 8.81242e+12
[393]	valid_0's l1: 2.10899e+06	valid_0's l2: 8.81325e+12
[394]	valid_0's l1: 2.10859e+06	valid_0's l2: 8.81281e+12
[395]	valid_0's l1: 2.10864e+06	valid_0's l2: 8.81331e+12
[396]	valid_0's l1: 2.10877e+06	valid_0's l2: 8.81341e+12
[397]	valid_0's l1: 2.10865e+06	valid_0's l2: 8.8122e+12
[398]	valid_0's l1: 2.10836e+06	valid_0's l2: 8.81076e+12
[399]	valid_0's l1: 2.10752e+06	valid_0's l2: 8.80794e+12
[400]	valid_0's l1: 2.10754e+06	valid_0's l2: 8.80684e+12
[401]	valid_0's l1: 2.10745e+06	valid_0's l2: 8.80257e+12
[402]	valid_0's l1: 2.10738e+06	valid_0's l2: 8.80251e+12
[403]	valid_0's l1: 2.1074e+06	valid_0's l2: 8.80114e+12
[404]	valid_0's l1: 2.10707e+06	valid_0's l2: 8.79766e+12
[405]	valid_0's l1: 2.10668e+06	valid_0's l2: 8.79614e+12
[406]	valid_0's 

In [218]:
test_pred = gbm.predict(test, num_iteration=gbm.best_iteration)

first_submission['price'] = test_pred
first_submission.to_csv('lightgbm.tsv', sep='\t', index=False, header=False)