### Import packages

In [1]:
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from utilities import cal_score, convert_types

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Columns

In [2]:
# Numeric feature columns: the general attributes and the N_* columns first ...
cols_num = [
    'txn_dt', 'total_floor', 'building_complete_dt', 'parking_area', 'parking_price',
    'txn_floor', 'land_area', 'building_area', 'lat', 'lon',
    'village_income_median', 'town_population', 'town_area', 'town_population_density',
    'doc_rate', 'master_rate', 'bachelor_rate', 'jobschool_rate', 'highschool_rate',
    'junior_rate', 'elementary_rate', 'born_rate', 'death_rate', 'marriage_rate', 'divorce_rate',
    'N_50', 'N_500', 'N_1000', 'N_5000', 'N_10000',
]
# ... then the '<numeral>_<suffix>' columns for the numerals I..XIV, in order.
# 'VI_10' is skipped here; it is listed in cols_bin instead.
cols_num += [
    numeral + '_' + suffix
    for numeral in ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII',
                    'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV')
    for suffix in ('10', '50', '100', '250', '500', '1000', '5000', '10000', 'MIN')
    if (numeral, suffix) != ('VI', '10')
]

# Categorical feature columns.
cols_cat = [
    'building_material',
    'city',
    'building_type',
    'building_use',
    'parking_way',
    'town',
    'village',
]

# Columns handled together with cols_cat by the categorical encoder
# (the set of '<numeral>_index_<suffix>' columns is irregular, so it is spelled out).
cols_bin = [
    'I_index_50', 'I_index_500', 'I_index_1000',
    'II_index_50', 'II_index_500', 'II_index_1000',
    'III_index_50', 'III_index_500', 'III_index_1000',
    'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000',
    'V_index_50', 'V_index_500', 'V_index_1000',
    'VI_10', 'VI_index_50', 'VI_index_500', 'VI_index_1000',
    'VII_index_50', 'VII_index_500', 'VII_index_1000',
    'VIII_index_50', 'VIII_index_500', 'VIII_index_1000',
    'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000',
    'X_index_50', 'X_index_500', 'X_index_1000',
    'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000',
    'XII_index_50', 'XII_index_500', 'XII_index_1000',
    'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000',
    'XIV_index_50', 'XIV_index_500', 'XIV_index_1000',
]

# Every model input column, in the order numeric -> categorical -> binary.
cols_feats = [*cols_num, *cols_cat, *cols_bin]

# Raw target column and the column holding the target the model is trained on
# when training on the log1p scale.
col_target = 'total_price'
col_target_log1p = 'target'

### Read data

In [3]:
# Load the training table. dtype=object turns off pandas' type inference, so
# columns are not parsed into numeric dtypes at read time.
df = pd.read_csv('dataset/train.csv', dtype=object)

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['building_id', 'building_material', 'city', 'txn_dt', 'total_floor',
       'building_type', 'building_use', 'building_complete_dt', 'parking_way',
       'parking_area',
       ...
       'XIV_500', 'XIV_index_500', 'XIV_1000', 'XIV_index_1000', 'XIV_5000',
       'XIV_index_5000', 'XIV_10000', 'XIV_index_10000', 'XIV_MIN',
       'total_price'],
      dtype='object', length=235)

### Preprocessing

In [5]:
# Convert types with the project helper `convert_types` (from utilities).
# NOTE(review): presumably this casts `cols_num` to numeric dtypes and adds the
# log1p-transformed target column ('target') that the grid search trains on --
# confirm against utilities.convert_types.
df = convert_types(df, cols_num, col_target=col_target)

# Generate feats (train-test-same feats) -- currently disabled; `create_feats`
# is not imported in this notebook.
#create_feats(df)

### Feature engineering

In [6]:
from feature_engineering import CategoricalColumnsEncoder

In [7]:
class FeatureEngineering():
    """Feature-engineering step that is fitted on one frame and re-applied to others.

    `fit_transform` fits a `CategoricalColumnsEncoder` on a copy of the given
    frame; `transform` re-applies that fitted encoder to a copy of another
    frame. The caller's frames are never modified because both methods work on
    `df.copy()`.
    """

    def __init__(self):
        # Populated by fit_transform(); None means "not fitted yet".
        self.cat_encoder = None

    def fit_transform(self, df, cols_cat, cols_bin):
        """Fit the categorical encoder on a copy of `df` and return that copy.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to fit on; it is copied, not modified.
        cols_cat, cols_bin : list of str
            Column names; their concatenation is handed to the encoder.

        Returns
        -------
        pandas.DataFrame
            The copy of `df` that was passed to the encoder.
        """
        df = df.copy()
        # Denote categorical-type.
        # NOTE(review): the encoder's return value is not used, so this relies on
        # CategoricalColumnsEncoder modifying the frame in place -- confirm.
        encoder = CategoricalColumnsEncoder(mode='pandas')
        encoder.fit_transform(df, cols_cat + cols_bin)
        # Store the encoder only once fitting succeeded, so a failed fit never
        # leaves a half-initialised encoder behind for transform().
        self.cat_encoder = encoder
        return df

    def transform(self, df):
        """Apply the encoder fitted by `fit_transform` to a copy of `df`.

        Raises
        ------
        AttributeError
            If `fit_transform` has not been called yet. (Same exception type as
            before, but with an explicit message.)
        """
        encoder = getattr(self, 'cat_encoder', None)
        if encoder is None:
            raise AttributeError(
                'FeatureEngineering.transform() called before fit_transform()')
        df = df.copy()
        encoder.transform(df)
        return df

### Grid search

In [9]:
# ---- Settings ----
is_log1p = True  # if True, train on the log1p target column instead of the raw price

# Label column handed to LightGBM. Predictions are always scored against the
# raw `col_target` column further down.
# NOTE(review): assumes the `col_target_log1p` column already exists in `df`
# (presumably added by convert_types) -- confirm.
col_label = col_target_log1p if is_log1p else col_target

# ---- Parameter grid ----
# Every combination of these values is trained once per fold.
params_gsearch = {'task': ['train'], 'boosting_type': ['gbdt'], 'metric': ['mse', 'mae', 'mape'],
                  'objective': ['mse', 'mae', 'mape'],
                  'num_leaves': [255],
                  'learning_rate': [0.01],
                  'feature_fraction': [0.7],
                  'min_data_in_leaf': [20],
                  'lambda_l1': [0.1]}

params_names = list(params_gsearch)
# The grid does not depend on the fold, so build it once.
params_grid = list(itertools.product(*[params_gsearch[key] for key in params_names]))

# Maps a parameter combination (tuple of (name, value) pairs) to its per-fold scores.
gsearch = {}
folds = KFold(n_splits=3)
for i_fold, (itrain, ival) in enumerate(folds.split(df)):  # kfold
    print('==== Fold', i_fold+1, '====')

    # Split train / val. KFold.split yields *positional* indices, so select
    # with .iloc; .loc would only be equivalent for a default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # Feature engineering: fit on the training part only, then apply to validation.
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct the LightGBM datasets; the validation set references the
    # training set.
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_label]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_label], reference=lgb_train).construct()

    # Grid search over all parameter combinations for this fold.
    for values in params_grid:
        params = dict(zip(params_names, values))
        print('params:', params)

        # NOTE: `verbose_eval` / `early_stopping_rounds` are keyword arguments of
        # lgb.train only in LightGBM < 4.0; newer releases expect
        # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)] instead.
        model = lgb.train(params, lgb_train,
                          num_boost_round=10000,
                          valid_sets=lgb_val,
                          verbose_eval=200,
                          early_stopping_rounds=200)
        y_pred = model.predict(df_val[cols_feats])

        if is_log1p:
            # Undo the log1p transform and clip negatives to zero.
            y_pred_final = np.clip(np.expm1(y_pred), 0, None)
        else:
            y_pred_final = y_pred

        score = cal_score(df_val[col_target], y_pred_final)
        gsearch.setdefault(tuple(params.items()), []).append(score)

# Aggregate: [params, mean score over folds, per-fold scores], sorted ascending
# by mean score.
# NOTE(review): if cal_score is higher-is-better, the best configuration is the
# LAST entry of this list -- confirm against utilities.cal_score.
gsearch_results = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results.sort(key=lambda x: x[1])
display(gsearch_results)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mse', 'objective': 'mse', 'num_leaves': 255, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's l2: 0.106711
[400]	valid_0's l2: 0.0567184
[600]	valid_0's l2: 0.0495915
[800]	valid_0's l2: 0.0469991
[1000]	valid_0's l2: 0.0457271
[1200]	valid_0's l2: 0.0449713
[1400]	valid_0's l2: 0.044516
[1600]	valid_0's l2: 0.044191
[1800]	valid_0's l2: 0.0439695
[2000]	valid_0's l2: 0.0438008
[2200]	valid_0's l2: 0.0436661
[2400]	valid_0's l2: 0.0435732
[2600]	valid_0's l2: 0.0434891
[2800]	valid_0's l2: 0.0434246
[3000]	valid_0's l2: 0.0433769
[3200]	valid_0's l2: 0.0433338
[3400]	valid_0's l2: 0.0432912
[3600]	valid_0's l2: 0.0432677
[3800]	valid_0's l2: 0.0432452
[4000]	valid_0's l2: 0.0432298
[4200]	valid_0's l2: 0.0432112
[4400]	valid_0's l2: 0.0431972
[4600]	valid_0's l2: 0.0431844
[4800]	valid_0's 

[5400]	valid_0's l1: 0.14183
[5600]	valid_0's l1: 0.141668
[5800]	valid_0's l1: 0.141523
[6000]	valid_0's l1: 0.141422
[6200]	valid_0's l1: 0.141313
[6400]	valid_0's l1: 0.141221
[6600]	valid_0's l1: 0.141112
[6800]	valid_0's l1: 0.141032
[7000]	valid_0's l1: 0.140971
[7200]	valid_0's l1: 0.140868
[7400]	valid_0's l1: 0.140738
[7600]	valid_0's l1: 0.140629
[7800]	valid_0's l1: 0.140544
[8000]	valid_0's l1: 0.140478
[8200]	valid_0's l1: 0.140404
[8400]	valid_0's l1: 0.140334
[8600]	valid_0's l1: 0.140272
[8800]	valid_0's l1: 0.140204
[9000]	valid_0's l1: 0.140142
[9200]	valid_0's l1: 0.140092
[9400]	valid_0's l1: 0.140062
[9600]	valid_0's l1: 0.140033
[9800]	valid_0's l1: 0.140002
[10000]	valid_0's l1: 0.139971
Did not meet early stopping. Best iteration is:
[9999]	valid_0's l1: 0.139971
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mae', 'objective': 'mape', 'num_leaves': 255, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
Trai

[7600]	valid_0's mape: 0.00918701
[7800]	valid_0's mape: 0.00918401
[8000]	valid_0's mape: 0.00918098
[8200]	valid_0's mape: 0.00917805
[8400]	valid_0's mape: 0.00917496
[8600]	valid_0's mape: 0.00917239
[8800]	valid_0's mape: 0.00917026
[9000]	valid_0's mape: 0.00916838
[9200]	valid_0's mape: 0.00916608
[9400]	valid_0's mape: 0.00916429
[9600]	valid_0's mape: 0.00916255
[9800]	valid_0's mape: 0.00916096
[10000]	valid_0's mape: 0.00915884
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00915884
==== Fold 2 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mse', 'objective': 'mse', 'num_leaves': 255, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's l2: 0.104488
[400]	valid_0's l2: 0.0561633
[600]	valid_0's l2: 0.0493982
[800]	valid_0's l2: 0.0467835
[1000]	valid_0's l2: 0.045375
[1200]	valid_0's l2: 0.0445743
[1400]	valid_0's 

[2800]	valid_0's l1: 0.144384
[3000]	valid_0's l1: 0.143943
[3200]	valid_0's l1: 0.14359
[3400]	valid_0's l1: 0.143279
[3600]	valid_0's l1: 0.143026
[3800]	valid_0's l1: 0.142805
[4000]	valid_0's l1: 0.142603
[4200]	valid_0's l1: 0.142347
[4400]	valid_0's l1: 0.142102
[4600]	valid_0's l1: 0.141913
[4800]	valid_0's l1: 0.14173
[5000]	valid_0's l1: 0.141577
[5200]	valid_0's l1: 0.141479
[5400]	valid_0's l1: 0.141356
[5600]	valid_0's l1: 0.141171
[5800]	valid_0's l1: 0.141029
[6000]	valid_0's l1: 0.140901
[6200]	valid_0's l1: 0.140797
[6400]	valid_0's l1: 0.140703
[6600]	valid_0's l1: 0.140605
[6800]	valid_0's l1: 0.140496
[7000]	valid_0's l1: 0.140329
[7200]	valid_0's l1: 0.14021
[7400]	valid_0's l1: 0.140108
[7600]	valid_0's l1: 0.140024
[7800]	valid_0's l1: 0.13992
[8000]	valid_0's l1: 0.139836
[8200]	valid_0's l1: 0.139766
[8400]	valid_0's l1: 0.139695
[8600]	valid_0's l1: 0.139619
[8800]	valid_0's l1: 0.139542
[9000]	valid_0's l1: 0.13947
[9200]	valid_0's l1: 0.139416
[9400]	valid_0'

[5400]	valid_0's mape: 0.00919204
[5600]	valid_0's mape: 0.0091801
[5800]	valid_0's mape: 0.00916406
[6000]	valid_0's mape: 0.00914942
[6200]	valid_0's mape: 0.00914005
[6400]	valid_0's mape: 0.00913468
[6600]	valid_0's mape: 0.00912852
[6800]	valid_0's mape: 0.00912387
[7000]	valid_0's mape: 0.00912001
[7200]	valid_0's mape: 0.00911611
[7400]	valid_0's mape: 0.00911184
[7600]	valid_0's mape: 0.00910724
[7800]	valid_0's mape: 0.00910021
[8000]	valid_0's mape: 0.00909706
[8200]	valid_0's mape: 0.00909369
[8400]	valid_0's mape: 0.00908966
[8600]	valid_0's mape: 0.00908721
[8800]	valid_0's mape: 0.00908371
[9000]	valid_0's mape: 0.00907899
[9200]	valid_0's mape: 0.00907509
[9400]	valid_0's mape: 0.00907225
[9600]	valid_0's mape: 0.00906904
[9800]	valid_0's mape: 0.0090666
[10000]	valid_0's mape: 0.00906436
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00906436
==== Fold 3 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mse', 'objective': 'mse

[2600]	valid_0's l1: 0.144994
[2800]	valid_0's l1: 0.144005
[3000]	valid_0's l1: 0.143419
[3200]	valid_0's l1: 0.143066
[3400]	valid_0's l1: 0.142765
[3600]	valid_0's l1: 0.14252
[3800]	valid_0's l1: 0.142319
[4000]	valid_0's l1: 0.142093
[4200]	valid_0's l1: 0.14185
[4400]	valid_0's l1: 0.14167
[4600]	valid_0's l1: 0.141504
[4800]	valid_0's l1: 0.141353
[5000]	valid_0's l1: 0.14124
[5200]	valid_0's l1: 0.141145
[5400]	valid_0's l1: 0.141006
[5600]	valid_0's l1: 0.14086
[5800]	valid_0's l1: 0.140719
[6000]	valid_0's l1: 0.140575
[6200]	valid_0's l1: 0.140478
[6400]	valid_0's l1: 0.140403
[6600]	valid_0's l1: 0.140334
[6800]	valid_0's l1: 0.140246
[7000]	valid_0's l1: 0.140152
[7200]	valid_0's l1: 0.14007
[7400]	valid_0's l1: 0.139972
[7600]	valid_0's l1: 0.139888
[7800]	valid_0's l1: 0.139816
[8000]	valid_0's l1: 0.139735
[8200]	valid_0's l1: 0.139662
[8400]	valid_0's l1: 0.139594
[8600]	valid_0's l1: 0.139528
[8800]	valid_0's l1: 0.139461
[9000]	valid_0's l1: 0.139404
[9200]	valid_0's

[5200]	valid_0's mape: 0.00922116
[5400]	valid_0's mape: 0.009214
[5600]	valid_0's mape: 0.00920749
[5800]	valid_0's mape: 0.00920172
[6000]	valid_0's mape: 0.00919636
[6200]	valid_0's mape: 0.00919036
[6400]	valid_0's mape: 0.00918471
[6600]	valid_0's mape: 0.00917636
[6800]	valid_0's mape: 0.00916545
[7000]	valid_0's mape: 0.00915861
[7200]	valid_0's mape: 0.00915354
[7400]	valid_0's mape: 0.00914975
[7600]	valid_0's mape: 0.00914741
[7800]	valid_0's mape: 0.00914465
[8000]	valid_0's mape: 0.00914122
[8200]	valid_0's mape: 0.00913862
[8400]	valid_0's mape: 0.00913659
[8600]	valid_0's mape: 0.00913441
[8800]	valid_0's mape: 0.00913219
[9000]	valid_0's mape: 0.00912995
[9200]	valid_0's mape: 0.00912717
[9400]	valid_0's mape: 0.00912469
[9600]	valid_0's mape: 0.00912221
[9800]	valid_0's mape: 0.00912034
[10000]	valid_0's mape: 0.00911832
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00911832


[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mae'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173158219,
  [5242.851166629266, 5312.853633535916, 5226.857719309474]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173250374,
  [5242.851166629266, 5312.853633535916, 5226.857719585941]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mse'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173268893,
  [5242.851166629266, 5312.853633591472, 5226.857719585941]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt

In [10]:
# Show the aggregated grid-search results again.
display(gsearch_results)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mae'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173158219,
  [5242.851166629266, 5312.853633535916, 5226.857719309474]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173250374,
  [5242.851166629266, 5312.853633535916, 5226.857719585941]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mse'),
   ('objective', 'mae'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5260.854173268893,
  [5242.851166629266, 5312.853633591472, 5226.857719585941]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt

**Conclusion:** use `mse` as the objective; for the metric, use `mape` (for now) or `mae`.