# Read data and data processing

In [291]:
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 
    #y_true, y_pred = check_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [292]:
import pandas as pd
tochi_train = pd.read_csv('../data/train_genba.tsv', sep='\t')
build_train = pd.read_csv('../data/train_goto.tsv', sep='\t')
train = pd.merge(tochi_train, build_train, on="pj_no")

tochi_test = pd.read_csv('../data/test_genba.tsv', sep='\t')
build_test = pd.read_csv('../data/test_goto.tsv', sep='\t')
test = pd.merge(tochi_test, build_test, on="pj_no")

print ("Train: 販売数は",train.shape[0],"件、特徴量は",train.shape[1],"個。")
print ("Test: 販売数は",test.shape[0],"件、特徴量は",test.shape[1],"個。")

first_submission = pd.DataFrame()
first_submission['id'] = test['id']

Train: 販売数は 6461 件、特徴量は 158 個。
Test: 販売数は 4273 件、特徴量は 157 個。


In [293]:
# 名前系と相関性が高いカラムはとりあえず削除
# 土地のネームバリューが出てくると思うので、名前系はあとで追加するかも

name_columns = ['bastei_nm1','bastei_nm2','chiseki_kb_hb','eki_nm1','eki_nm2','gk_chu_tm','gk_sho_tm','hy1f_date_su', \
                'hy2f_date_su','mseki_yt_hb','tc_mseki','yoseki2','id']
train.drop(name_columns, axis=1, inplace=True)
test.drop(name_columns, axis=1, inplace=True)

y_train = train['keiyaku_pr']

In [294]:
# カテゴリ系コラムにある「無」に値を振り分けたり、変換ミスに対処したり、マルバツからBooleanに変換

import numpy as np

train['fi3m_yohi'].replace('（無）','（不要）',inplace=True)
train['hiatari'].fillna('普通', inplace=True)
train['kborjs'].replace('公募','公簿',inplace=True)
test['fi3m_yohi'].replace('（無）','（不要）',inplace=True)
test['hiatari'].fillna('普通', inplace=True)
test['kborjs'].replace('公募','公簿',inplace=True)

maru_columns = ['rs_e_kdate2','rs_e_kdate3','rs_e_m_ari','rs_e_m_nashi','rs_e_parking','rs_e_tahata','rs_e_zoki', \
                'rs_n_kdate2','rs_n_kdate3','rs_n_m_ari','rs_n_m_nashi','rs_n_parking','rs_n_tahata','rs_n_zoki', \
                'rs_s_kdate2','rs_s_kdate3','rs_s_m_ari','rs_s_m_nashi','rs_s_parking','rs_s_tahata','rs_s_zoki', \
                'rs_w_kdate2','rs_w_kdate3','rs_w_m_ari','rs_w_m_nashi','rs_w_parking','rs_w_tahata','rs_w_zoki', \
                'sho_conv','sho_market','sho_shoten','sho_super','shu_bochi','shu_factory','shu_highway', \
                'shu_hvline','shu_jutaku','shu_kaido','shu_kokyo','shu_line_ari','shu_line_nashi','shu_park', \
                'shu_shop','shu_sogi','shu_soon','shu_tower','shu_zoki']

train[maru_columns] = train[maru_columns].replace({'○':1, np.nan:0})
test[maru_columns] = test[maru_columns].replace({'○':1, np.nan:0})

In [295]:
# 他規制や個別要因など、「複数ある場合は1～4」系のカラムに対処

hokakisei=['hokakisei1','hokakisei2','hokakisei3','hokakisei4']
kobetsu=['kobetsu1','kobetsu2','kobetsu3','kobetsu4']

train = pd.concat([train, train[hokakisei].stack().str.get_dummies().sum(level=0), \
                train[kobetsu].stack().str.get_dummies().sum(level=0)], axis=1)
train.drop(hokakisei+kobetsu, axis=1, inplace=True)
train.iloc[:,137:] = train.iloc[:,137:].fillna(0.0).apply(lambda x: [0 if y == 0.0 else 1 for y in x])
test = pd.concat([test, test[hokakisei].stack().str.get_dummies().sum(level=0), \
                test[kobetsu].stack().str.get_dummies().sum(level=0)], axis=1)
test.drop(hokakisei+kobetsu, axis=1, inplace=True)
test.iloc[:,136:] = test.iloc[:,136:].fillna(0.0).apply(lambda x: [0 if y == 0.0 else 1 for y in x])

In [296]:
# BooleanであるハズがCategoricalになってるカラムに対処

bool_columns = ['bus_yohi','chikukeikaku','fi3m_yohi','fi4m_yohi','gesui','hokakyoka','josui','kaihatsukyoka','kaoku_um', \
                'kborjs','keikakuroad','kinshijiko','t53kyoka','yheki_umu','yheki_yohi']

train[bool_columns] = train[bool_columns].replace({'（不要）':0, '（無）':0,'（要）':1,'（有）':1,'公共下水':0,'個別浄化槽':1,\
                                                   '公営':0,'私営':1,'実測':0,'公簿':1})
test[bool_columns] = test[bool_columns].replace({'（不要）':0, '（無）':0,'（要）':1,'（有）':1,'公共下水':0,'個別浄化槽':1,\
                                                 '公営':0,'私営':1,'実測':0,'公簿':1})

In [297]:
# ちょっとデータを追加
# Levelplanから階数と部屋を分割

levelplan_split_train = train['levelplan'].str.split('/', n=1, expand=True)
train['level'] = levelplan_split_train[0]
train['rooms'] = levelplan_split_train[1]

levelplan_split_test = test['levelplan'].str.split('/', n=1, expand=True)
test['level'] = levelplan_split_test[0]
test['rooms'] = levelplan_split_test[1]

In [298]:
# 経度を利用する
# そのままではvarianceが低すぎるので、QuantileTransformerを利用する。

from sklearn.preprocessing import QuantileTransformer

latlon_dic = dict()
with open('longlat.txt','r') as f:
    for line in f:
        addr = line.replace('\n','').split(',')[0]
        latlon = line.replace('\n','').split(',')[1:]
        latlon_dic.update({addr:latlon})

def get_lon(addr):
    try:
        return float(latlon_dic.get(addr)[1])
    except IndexError:
        return np.nan

train['lat'] = train['jukyo'].apply(lambda x: float(latlon_dic.get(x)[0]))
train['lon'] = train['jukyo'].apply(lambda x: get_lon(x))
test['lat'] = test['jukyo'].apply(lambda x: float(latlon_dic.get(x)[0]))
test['lon'] = test['jukyo'].apply(lambda x: get_lon(x))

scaler = QuantileTransformer()
train[['lat','lon']] = scaler.fit_transform(train[['lat','lon']])
test[['lat','lon']] = scaler.transform(test[['lat','lon']])

In [299]:
# 形態素解析

import collections
import MeCab
import mojimoji
from string import digits

remove_digits = str.maketrans('', '', digits)
tagger = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd/")

def extract_words(line):
    keyword=[]
    node = tagger.parseToNode(line).next
    while node:
        keyword.append(node.surface)
        node = node.next
    keyword = str(keyword).replace("', '"," ")
    keyword = keyword.replace("\'","")
    keyword = keyword.replace("[","")
    keyword = keyword.replace("]","")
    return keyword

In [300]:
# 全部埼玉県なので住居から消去
# あと市が抜けてる住所情報は追加してあげる

shi_gun_dic = dict({'にっさい花みず木':'坂戸市にっさい花みず木','西鶴ヶ岡':'ふじみ野市西鶴ヶ岡', \
                    '杉戸町内田':'北葛飾郡杉戸町内田','宮代町宮代台':'南埼玉郡宮代町宮代台', \
                    '大字下日出谷':'桶川市大字下日出谷','杉戸町清地':'北葛飾郡杉戸町', \
                    '松伏町田中':'北葛飾郡松伏町','大字水野字逃水':'狭山市大字水野字逃水'})

train['jukyo'] = train['jukyo'].str.replace('埼玉県','')
test['jukyo'] = test['jukyo'].str.replace('埼玉県','')
train['jukyo'] = train['jukyo'].replace(shi_gun_dic)
test['jukyo'] = test['jukyo'].replace(shi_gun_dic)

jukyo_split_train = train['jukyo'].str.split(r'市|郡', n=1, expand=True)
train['jukyo_shi_gun'] = jukyo_split_train[0]
train['town'] = jukyo_split_train[1]
train.drop('jukyo', axis=1, inplace=True)

jukyo_split_test = test['jukyo'].str.split(r'市|郡', n=1, expand=True)
test['jukyo_shi_gun'] = jukyo_split_test[0]
test['town'] = jukyo_split_test[1]
test['town'].fillna('無', inplace=True)
test.drop('jukyo', axis=1, inplace=True)

In [301]:
from sklearn.feature_extraction.text import CountVectorizer

# word count feature
cv_shi_gun = CountVectorizer()
cv_town = CountVectorizer()

train_cv_shi = cv_shi_gun.fit_transform(train.jukyo_shi_gun)
train_cv_shi = pd.DataFrame(train_cv_shi.toarray(),columns=cv_shi_gun.get_feature_names())
test_cv_shi = cv_shi_gun.transform(test.jukyo_shi_gun)
test_cv_shi = pd.DataFrame(test_cv_shi.toarray(),columns=cv_shi_gun.get_feature_names())

train_cv_town = cv_town.fit_transform(train.town)
train_cv_town = pd.DataFrame(train_cv_town.toarray(),columns=cv_town.get_feature_names())
test_cv_town = cv_town.transform(test.town)
test_cv_town = pd.DataFrame(test_cv_town.toarray(),columns=cv_town.get_feature_names())

train.drop('town', axis=1, inplace=True)
test.drop('town', axis=1, inplace=True)

In [302]:
train = pd.concat([train, train_cv_shi, train_cv_town], axis=1)
test = pd.concat([test, test_cv_shi, test_cv_town], axis=1)
train.shape

(6461, 1370)

# EDA

Exploratory data analysis

In [274]:
# 数値とカテゴリに分ける

quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('keiyaku_pr')
quantitative.remove('pj_no')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# NaNが多いフィーチャーを表示

import seaborn as sns
import japanize_matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

sns.set_style("whitegrid")
missing = train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

このデータセットでは、メインとサブのフィーチャーが多い。（最寄り駅1、最寄り駅2など）
最寄り駅2がない家は、それに該当するフィーチャー（名前や距離など）がNaNになることが多い。

In [None]:
import scipy.stats as stats

plt.figure(1); plt.title('Johnson SU')
sns.distplot(y_train, kde=False, fit=stats.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y_train, kde=False, fit=stats.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y_train, kde=False, fit=stats.lognorm)

契約価格は、すでにNormal Distributionに近いため、調整は必要なし。

In [None]:
# qualitative情報の、各カテゴリの契約価格平均を割り当てて、それでcorrelationを測る

def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'keiyaku_pr']].groupby(feature).mean()['keiyaku_pr']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o
    
qual_encoded = []
for q in qualitative:  
    encode(train, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Measure strength and direction of monotonic (linear) relationship
def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [frame[f].corr(frame['keiyaku_pr'], 'spearman') for f in features]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25*len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
    
features = quantitative + qual_encoded
spearman(train, features)

In [None]:
plt.figure(1)
corr = train[quantitative+['keiyaku_pr']].corr()
sns.heatmap(corr)
plt.figure(2)
corr = train[qual_encoded+['keiyaku_pr']].corr()
sns.heatmap(corr)
plt.figure(3)
corr = pd.DataFrame(np.zeros([len(quantitative)+1, len(qual_encoded)+1]), index=quantitative+['keiyaku_pr'], columns=qual_encoded+['SalePrice'])
for q1 in quantitative+['keiyaku_pr']:
    for q2 in qual_encoded+['keiyaku_pr']:
        corr.loc[q1, q2] = train[q1].corr(train[q2])
sns.heatmap(corr)

# Try LightGBM

In [303]:
# 最後に、categoricalなカラムを全てone_hot_encode

categorical = ['bas_toho1','bas_toho2','bokachiiki','gas','hiatari','hw_status','jigata','kodochiku','levelplan', \
               'road1_hk','road1_sb','road2_hk','road2_sb','road3_sb','road3_hk','road4_sb','road4_hk','road_st', \
               'rosen_nm1','rosen_nm2','setsudo_hi','setsudo_kj','toshikuiki1','toshikuiki2','usui','yoto1','yoto2', \
               'jukyo_shi_gun','level','rooms']

train = pd.concat([train, pd.get_dummies(train[categorical])], axis=1)
train.drop(categorical, axis=1, inplace=True)
test = pd.concat([test, pd.get_dummies(test[categorical])], axis=1)
test.drop(categorical, axis=1, inplace=True)

In [304]:
# 両方のDataframeに登場しないカラムを除外（価格はキープしとく）

train_columns = list(train.columns.values)
test_columns = list(test.columns.values)
unique_columns = list(set(train_columns) ^ set(test_columns))

train.drop(unique_columns, axis=1, inplace=True, errors='ignore')
test.drop(unique_columns, axis=1, inplace=True, errors='ignore')

In [305]:
print(train.shape)
print(test.shape)

(6461, 1597)
(4273, 1597)


In [306]:
import lightgbm as lgb 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold

# 3分割交差検証を指定し、インスタンス化 
kf = KFold(n_splits=5) 

# スコアとモデルを格納するリスト 
score_list = [] 
models = [] 

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

for fold_, (train_index, valid_index) in enumerate(kf.split(train, y_train)):
    train_x = train.iloc[train_index]
    valid_x = train.iloc[valid_index]
    train_y = y_train[train_index]
    valid_y = y_train[valid_index]
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
    print(f'fold{fold_ + 1} start')
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=lgb_eval,
                early_stopping_rounds=20,
                verbose_eval=0)
    y_pred = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
    score_list.append(mean_absolute_percentage_error(valid_y, y_pred))
    models.append(gbm)  # 学習が終わったモデルをリストに入れておく
    print(f'fold{fold_ + 1} end\nAccuracy = {mean_absolute_percentage_error(valid_y, y_pred)}')
print(score_list, '平均score', np.mean(score_list))

fold1 start
fold1 end
Accuracy = 9.40647613601396
fold2 start
fold2 end
Accuracy = 9.33552241911634
fold3 start
fold3 end
Accuracy = 9.03553395287467
fold4 start
fold4 end
Accuracy = 11.371655392825806
fold5 start
fold5 end
Accuracy = 9.88582601353566
[9.40647613601396, 9.33552241911634, 9.03553395287467, 11.371655392825806, 9.88582601353566] 平均score 9.807002782873289


In [307]:
test_pred = np.zeros((len(test), 5)) 
for fold_, gbm in enumerate(models):
    pred_ = gbm.predict(test, num_iteration=gbm.best_iteration)# testを予測
    test_pred[:, fold_] = pred_ 
pred = np.mean(test_pred, axis=1)
first_submission['price'] = pred
first_submission.to_csv('lightgbm.tsv', sep='\t', index=False, header=False)