# Data loading and preprocessing

In [140]:
import pandas as pd
# Read the two tab-separated training files and join them on the shared `pj_no` key
# (the merge uses pandas' default join type).
tochi_train, build_train = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/train_genba.tsv', '../data/train_goto.tsv')
)
train = tochi_train.merge(build_train, on="pj_no")

# Same procedure for the two test files.
tochi_test, build_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/test_genba.tsv', '../data/test_goto.tsv')
)
test = tochi_test.merge(build_test, on="pj_no")

In [141]:
# For now, drop the name-like columns and the columns that are highly correlated.
# Place names probably carry brand value for the land, so the name-like columns
# may be added back later.

name_columns = [
    'bastei_nm1', 'bastei_nm2', 'chiseki_kb_hb', 'eki_nm1', 'eki_nm2',
    'gk_chu_tm', 'gk_sho_tm', 'hy1f_date_su', 'hy2f_date_su', 'jukyo',
    'mseki_yt_hb', 'tc_mseki', 'yoseki2',
]
# The same columns are removed from both frames so train and test stay aligned.
train = train.drop(columns=name_columns)
test = test.drop(columns=name_columns)

In [142]:
# Clean up the categorical columns: relabel the "（無）" (none) entries, fill missing
# values, fix a kanji conversion typo, and turn the "○" marks into 0/1 flags.
# NOTE(review): only `train` is cleaned in this cell; confirm that `test` receives
# the same treatment elsewhere before modelling.

import numpy as np

# Assign each result back to its column instead of calling `inplace=True` on
# `train[col]`: that form is a chained assignment, which pandas 2.x warns about and
# which no longer updates `train` once Copy-on-Write is active (default in pandas 3.0).
train['fi3m_yohi'] = train['fi3m_yohi'].replace('（無）', '（不要）')
train['hiatari'] = train['hiatari'].fillna('普通')
train['kborjs'] = train['kborjs'].replace('公募', '公簿')

# Columns that are converted from "○"/missing to 1/0 below.
maru_columns = [
    'rs_e_kdate2', 'rs_e_kdate3', 'rs_e_m_ari', 'rs_e_m_nashi', 'rs_e_parking', 'rs_e_tahata', 'rs_e_zoki',
    'rs_n_kdate2', 'rs_n_kdate3', 'rs_n_m_ari', 'rs_n_m_nashi', 'rs_n_parking', 'rs_n_tahata', 'rs_n_zoki',
    'rs_s_kdate2', 'rs_s_kdate3', 'rs_s_m_ari', 'rs_s_m_nashi', 'rs_s_parking', 'rs_s_tahata', 'rs_s_zoki',
    'rs_w_kdate2', 'rs_w_kdate3', 'rs_w_m_ari', 'rs_w_m_nashi', 'rs_w_parking', 'rs_w_tahata', 'rs_w_zoki',
    'sho_conv', 'sho_market', 'sho_shoten', 'sho_super', 'shu_bochi', 'shu_factory', 'shu_highway',
    'shu_hvline', 'shu_jutaku', 'shu_kaido', 'shu_kokyo', 'shu_line_ari', 'shu_line_nashi', 'shu_park',
    'shu_shop', 'shu_sogi', 'shu_soon', 'shu_tower', 'shu_zoki',
]

# "○" becomes 1 and a missing value becomes 0; any other value is left untouched.
train[maru_columns] = train[maru_columns].replace({'○': 1, np.nan: 0})

In [143]:
# Handle the column families of the "if there are several, use slots 1-4" kind,
# such as other regulations (hokakisei) and individual factors (kobetsu).

hokakisei = ['hokakisei1', 'hokakisei2', 'hokakisei3', 'hokakisei4']
kobetsu = ['kobetsu1', 'kobetsu2', 'kobetsu3', 'kobetsu4']


def _label_counts(frame, columns):
    """Return one column per distinct label found in `columns`, summed per row of `frame`.

    The selected columns are stacked into a single Series, one-hot encoded with
    `str.get_dummies()`, and the indicators are added up per original row label.
    """
    # `sum(level=0)` was deprecated in pandas 1.3 and removed in 2.0; grouping on the
    # first index level is the documented replacement and works on older versions too.
    return frame[columns].stack().str.get_dummies().groupby(level=0).sum()


train = pd.concat(
    [train, _label_counts(train, hokakisei), _label_counts(train, kobetsu)],
    axis=1,
)
train = train.drop(columns=hokakisei + kobetsu)
# Rows that received no indicator values from the concat are NaN; count them as zero.
# NOTE(review): 137 is presumably the position of the first indicator column after
# the drop above; TODO confirm, since it silently breaks if the column count changes.
train.iloc[:, 137:] = train.iloc[:, 137:].fillna(0.0)

In [144]:
# Show the processed training frame as the cell output to inspect the appended indicator columns.
train

Unnamed: 0,pj_no,chiseki_js_hb,yoto1,yoto2,kempei1,kempei2,yoseki1,josui,gesui,gas,...,眺望良,行き止まり,行き止まり途中,街道沿い,裏道,角地,計画道路,踏切付近,車進入困難,高圧線下
0,0,109.26,工業地域,,60,0.0,200,公営,公共下水,個別プロパン,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,136.15,第一種低層住居専用地域,,60,0.0,200,公営,公共下水,個別プロパン,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,86.92,第一種低層住居専用地域,,50,0.0,100,私営,個別浄化槽,個別プロパン,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,164.91,工業地域,,60,0.0,200,公営,公共下水,都市ガス,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,403.58,第一種低層住居専用地域,,50,0.0,80,公営,公共下水,都市ガス,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4,403.58,第一種低層住居専用地域,,50,0.0,80,公営,公共下水,都市ガス,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4,403.58,第一種低層住居専用地域,,50,0.0,80,公営,公共下水,都市ガス,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5,114.44,第一種住居地域,,60,0.0,200,公営,公共下水,都市ガス,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,6,531.98,指定のない区域,,60,0.0,200,公営,個別浄化槽,個別プロパン,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,6,531.98,指定のない区域,,60,0.0,200,公営,個別浄化槽,個別プロパン,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
