# Read and preprocess the data

In [32]:
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are coerced to float NumPy arrays, so plain Python lists are
    accepted and pandas objects are compared element by element (by position)
    instead of being aligned on their index labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry makes its ratio infinite or NaN,
        exactly as with ordinary NumPy division.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [33]:
import pandas as pd

# Each split is stored as two tab-separated files that share the `pj_no` key;
# read both files and join them on that key.
tochi_train, build_train = (
    pd.read_csv(path, sep='\t')
    for path in ('../data/train_genba.tsv', '../data/train_goto.tsv')
)
train = tochi_train.merge(build_train, on="pj_no")

tochi_test, build_test = (
    pd.read_csv(path, sep='\t')
    for path in ('../data/test_genba.tsv', '../data/test_goto.tsv')
)
test = tochi_test.merge(build_test, on="pj_no")

# Keep the test ids aside for the submission file (the `id` column is dropped
# from `test` in a later cell).
first_submission = pd.DataFrame({'id': test['id']})

In [34]:
# For now, drop the name-like columns and the columns that are highly correlated with others.
# Place names probably carry some "brand value" for the land, so the name columns
# may be added back later.

name_columns = [
    'bastei_nm1', 'bastei_nm2', 'chiseki_kb_hb', 'eki_nm1', 'eki_nm2',
    'gk_chu_tm', 'gk_sho_tm', 'hy1f_date_su', 'hy2f_date_su',
    'mseki_yt_hb', 'tc_mseki', 'yoseki2', 'id',
]
# The same columns are removed, in place, from both splits.
for frame in (train, test):
    frame.drop(columns=name_columns, inplace=True)

In [35]:
# Clean up categorical values: fold the "（無）" label into "（不要）", fill missing
# `hiatari` values, fix a mistyped label, and turn the ○ marks into 0/1 flags.

import numpy as np

maru_columns = ['rs_e_kdate2','rs_e_kdate3','rs_e_m_ari','rs_e_m_nashi','rs_e_parking','rs_e_tahata','rs_e_zoki', \
                'rs_n_kdate2','rs_n_kdate3','rs_n_m_ari','rs_n_m_nashi','rs_n_parking','rs_n_tahata','rs_n_zoki', \
                'rs_s_kdate2','rs_s_kdate3','rs_s_m_ari','rs_s_m_nashi','rs_s_parking','rs_s_tahata','rs_s_zoki', \
                'rs_w_kdate2','rs_w_kdate3','rs_w_m_ari','rs_w_m_nashi','rs_w_parking','rs_w_tahata','rs_w_zoki', \
                'sho_conv','sho_market','sho_shoten','sho_super','shu_bochi','shu_factory','shu_highway', \
                'shu_hvline','shu_jutaku','shu_kaido','shu_kokyo','shu_line_ari','shu_line_nashi','shu_park', \
                'shu_shop','shu_sogi','shu_soon','shu_tower','shu_zoki']

# The identical clean-up is applied to both splits.
for frame in (train, test):
    # Assign the result back to the column instead of calling an inplace method
    # on `frame[col]`: the chained inplace form operates on a temporary object
    # and does not update the parent frame when pandas Copy-on-Write is active.
    frame['fi3m_yohi'] = frame['fi3m_yohi'].replace('（無）', '（不要）')
    frame['hiatari'] = frame['hiatari'].fillna('普通')
    frame['kborjs'] = frame['kborjs'].replace('公募', '公簿')

    # ○ -> 1, missing -> 0; any other value is left untouched.
    frame[maru_columns] = frame[maru_columns].replace({'○': 1, np.nan: 0})

In [36]:
# Handle the attributes that are spread over numbered slot columns (1-4) when a
# row has several values, such as `hokakisei*` and `kobetsu*`.

hokakisei=['hokakisei1','hokakisei2','hokakisei3','hokakisei4']
kobetsu=['kobetsu1','kobetsu2','kobetsu3','kobetsu4']


def _multi_hot(frame, columns):
    """Collapse several slot columns into one 0/1 indicator column per distinct value.

    The slot columns are stacked into one long Series, one-hot encoded, and
    folded back to one row per original index label. Rows whose slots are all
    missing get 0 in every indicator column.
    """
    # `groupby(level=0)` replaces the `sum(level=0)` form that pandas 2.0 removed;
    # `max` keeps the result binary even if a value appears in two slots.
    indicators = frame[columns].stack().str.get_dummies().groupby(level=0).max()
    # Re-add the rows that had no value at all, then force a plain integer dtype.
    return indicators.reindex(frame.index, fill_value=0).astype(int)


# Build the indicator columns by name rather than by position, so the cell does
# not depend on a hard-coded column offset.
train = pd.concat([train.drop(columns=hokakisei + kobetsu),
                   _multi_hot(train, hokakisei),
                   _multi_hot(train, kobetsu)], axis=1)
test = pd.concat([test.drop(columns=hokakisei + kobetsu),
                  _multi_hot(test, hokakisei),
                  _multi_hot(test, kobetsu)], axis=1)

In [37]:
# Handle the columns that should be Boolean but are stored as categorical labels.

bool_columns = [
    'bus_yohi', 'chikukeikaku', 'fi3m_yohi', 'fi4m_yohi', 'gesui',
    'hokakyoka', 'josui', 'kaihatsukyoka', 'kaoku_um', 'kborjs',
    'keikakuroad', 'kinshijiko', 't53kyoka', 'yheki_umu', 'yheki_yohi',
]

# One shared label -> 0/1 mapping, applied to both splits.
bool_mapping = {
    '（不要）': 0, '（無）': 0,
    '（要）': 1, '（有）': 1,
    '公共下水': 0, '個別浄化槽': 1,
    '公営': 0, '私営': 1,
    '実測': 0, '公簿': 1,
}

train[bool_columns] = train[bool_columns].replace(bool_mapping)
test[bool_columns] = test[bool_columns].replace(bool_mapping)

In [38]:
# Every address is in Saitama Prefecture, so strip the prefecture name from it.
# Also prepend the city/district to the addresses where it is missing.

shi_gun_dic = {
    'にっさい花みず木': '坂戸市にっさい花みず木',
    '西鶴ヶ岡': 'ふじみ野市西鶴ヶ岡',
    '杉戸町内田': '北葛飾郡杉戸町内田',
    '宮代町宮代台': '南埼玉郡宮代町宮代台',
    '大字下日出谷': '桶川市大字下日出谷',
    '杉戸町清地': '北葛飾郡杉戸町',
    '松伏町田中': '北葛飾郡松伏町',
    '大字水野字逃水': '狭山市大字水野字逃水',
}

# Remove the prefecture, then swap whole-value matches for their completed form.
train['jukyo'] = train['jukyo'].str.replace('埼玉県', '').replace(shi_gun_dic)
test['jukyo'] = test['jukyo'].str.replace('埼玉県', '').replace(shi_gun_dic)

# Keep only the text before the first 市 (city) or 郡 (district) character and
# discard the full address afterwards.
jukyo_split_train = train['jukyo'].str.split(r'市|郡', n=1, expand=True)
train['jukyo_shi_gun'] = jukyo_split_train[0]
del train['jukyo']

jukyo_split_test = test['jukyo'].str.split(r'市|郡', n=1, expand=True)
test['jukyo_shi_gun'] = jukyo_split_test[0]
del test['jukyo']

In [39]:
# Finally, one-hot encode all of the categorical columns.
categorical = [
    'bas_toho1', 'bas_toho2', 'bokachiiki', 'gas', 'hiatari', 'hw_status',
    'jigata', 'kodochiku', 'levelplan',
    'road1_hk', 'road1_sb', 'road2_hk', 'road2_sb', 'road3_sb', 'road3_hk',
    'road4_sb', 'road4_hk', 'road_st',
    'rosen_nm1', 'rosen_nm2', 'setsudo_hi', 'setsudo_kj',
    'toshikuiki1', 'toshikuiki2', 'usui', 'yoto1', 'yoto2',
    'jukyo_shi_gun',
]


def _append_dummies(frame, columns):
    """Return `frame` with the dummies of `columns` appended and `columns` themselves removed."""
    encoded = pd.get_dummies(frame[columns])
    return pd.concat([frame, encoded], axis=1).drop(columns=columns)


train = _append_dummies(train, categorical)
test = _append_dummies(test, categorical)

In [20]:
# Drop the columns that do not appear in both DataFrames (the price is kept aside first).

train_columns = train.columns.tolist()
test_columns = test.columns.tolist()
# Columns present in exactly one of the two frames.
unique_columns = list(set(train_columns).symmetric_difference(test_columns))

# Save the target before the drop below; it is removed from `train` if it is
# not also a column of `test`.
y_train = train['keiyaku_pr']
for frame in (train, test):
    # errors='ignore': each frame only contains part of `unique_columns`.
    frame.drop(columns=unique_columns, inplace=True, errors='ignore')

# Try LightGBM

In [21]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the features and the target into a training part and a validation part.
train_x, valid_x, train_y, valid_y = train_test_split(train, y_train, test_size=0.33, random_state=0)

# Create the datasets for LightGBM; the validation set references the training set.
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)

# Training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# Train for up to 5000 rounds. Early stopping is requested through the callback
# API because `lgb.train` no longer accepts `early_stopping_rounds=` as of
# LightGBM 4.0.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

print('Saving model...')
# Save the model to a file.
gbm.save_model('model.txt')

print('Starting predicting...')
# Predict on the validation part using the best iteration found by early stopping.
y_pred = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
# Evaluate with the MAPE helper defined at the top of the notebook.
print('The MAPE of prediction is:', mean_absolute_percentage_error(valid_y, y_pred))

Starting training...
[1]	valid_0's l1: 5.07444e+06	valid_0's l2: 4.28824e+13
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 4.93667e+06	valid_0's l2: 4.05759e+13
[3]	valid_0's l1: 4.79679e+06	valid_0's l2: 3.83795e+13
[4]	valid_0's l1: 4.66226e+06	valid_0's l2: 3.63322e+13
[5]	valid_0's l1: 4.53608e+06	valid_0's l2: 3.44584e+13
[6]	valid_0's l1: 4.42016e+06	valid_0's l2: 3.27777e+13
[7]	valid_0's l1: 4.31411e+06	valid_0's l2: 3.13041e+13
[8]	valid_0's l1: 4.21268e+06	valid_0's l2: 2.9909e+13
[9]	valid_0's l1: 4.11883e+06	valid_0's l2: 2.86664e+13
[10]	valid_0's l1: 4.03219e+06	valid_0's l2: 2.7527e+13
[11]	valid_0's l1: 3.94954e+06	valid_0's l2: 2.64868e+13
[12]	valid_0's l1: 3.86751e+06	valid_0's l2: 2.54967e+13
[13]	valid_0's l1: 3.7874e+06	valid_0's l2: 2.4575e+13
[14]	valid_0's l1: 3.71657e+06	valid_0's l2: 2.37655e+13
[15]	valid_0's l1: 3.64766e+06	valid_0's l2: 2.2955e+13
[16]	valid_0's l1: 3.5883e+06	valid_0's l2: 2.2247e+13
[17]	valid_0's l1: 3.

[162]	valid_0's l1: 2.18676e+06	valid_0's l2: 9.41839e+12
[163]	valid_0's l1: 2.18695e+06	valid_0's l2: 9.41975e+12
[164]	valid_0's l1: 2.18495e+06	valid_0's l2: 9.41062e+12
[165]	valid_0's l1: 2.18421e+06	valid_0's l2: 9.40343e+12
[166]	valid_0's l1: 2.18335e+06	valid_0's l2: 9.39622e+12
[167]	valid_0's l1: 2.18285e+06	valid_0's l2: 9.39683e+12
[168]	valid_0's l1: 2.18103e+06	valid_0's l2: 9.38882e+12
[169]	valid_0's l1: 2.18064e+06	valid_0's l2: 9.38437e+12
[170]	valid_0's l1: 2.1788e+06	valid_0's l2: 9.37301e+12
[171]	valid_0's l1: 2.17773e+06	valid_0's l2: 9.36679e+12
[172]	valid_0's l1: 2.17729e+06	valid_0's l2: 9.36337e+12
[173]	valid_0's l1: 2.17667e+06	valid_0's l2: 9.36111e+12
[174]	valid_0's l1: 2.17572e+06	valid_0's l2: 9.35512e+12
[175]	valid_0's l1: 2.17517e+06	valid_0's l2: 9.3537e+12
[176]	valid_0's l1: 2.17502e+06	valid_0's l2: 9.34915e+12
[177]	valid_0's l1: 2.17459e+06	valid_0's l2: 9.3439e+12
[178]	valid_0's l1: 2.1737e+06	valid_0's l2: 9.33828e+12
[179]	valid_0's l1

[346]	valid_0's l1: 2.10398e+06	valid_0's l2: 8.91726e+12
[347]	valid_0's l1: 2.10378e+06	valid_0's l2: 8.919e+12
[348]	valid_0's l1: 2.10401e+06	valid_0's l2: 8.92122e+12
[349]	valid_0's l1: 2.1037e+06	valid_0's l2: 8.9189e+12
[350]	valid_0's l1: 2.10386e+06	valid_0's l2: 8.9217e+12
[351]	valid_0's l1: 2.10357e+06	valid_0's l2: 8.91732e+12
[352]	valid_0's l1: 2.10306e+06	valid_0's l2: 8.91523e+12
[353]	valid_0's l1: 2.103e+06	valid_0's l2: 8.91528e+12
[354]	valid_0's l1: 2.10324e+06	valid_0's l2: 8.91677e+12
[355]	valid_0's l1: 2.10374e+06	valid_0's l2: 8.91925e+12
[356]	valid_0's l1: 2.10318e+06	valid_0's l2: 8.91627e+12
[357]	valid_0's l1: 2.10295e+06	valid_0's l2: 8.91631e+12
[358]	valid_0's l1: 2.10276e+06	valid_0's l2: 8.91522e+12
[359]	valid_0's l1: 2.10249e+06	valid_0's l2: 8.91466e+12
[360]	valid_0's l1: 2.10193e+06	valid_0's l2: 8.91539e+12
[361]	valid_0's l1: 2.10175e+06	valid_0's l2: 8.91649e+12
[362]	valid_0's l1: 2.10129e+06	valid_0's l2: 8.91454e+12
[363]	valid_0's l1: 2

In [218]:
# Predict the test set with the best iteration found during training.
test_pred = gbm.predict(data=test, num_iteration=gbm.best_iteration)

# Attach the predictions to the saved ids and write a tab-separated file
# without a header row or an index column.
first_submission = first_submission.assign(price=test_pred)
first_submission.to_csv('lightgbm.tsv', sep='\t', header=False, index=False)