### Import packages

In [1]:
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from utilities import cal_score, convert_types

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Columns

In [2]:
# Numeric feature columns; this list is what gets passed to convert_types()
# in the preprocessing cell.
cols_num = ['txn_dt', 'total_floor', 'building_complete_dt', 'parking_area', 'parking_price', 'txn_floor', 'land_area', 'building_area', 'lat', 'lon', 'village_income_median', 'town_population', 'town_area', 'town_population_density', 'doc_rate', 'master_rate', 'bachelor_rate', 'jobschool_rate', 'highschool_rate', 'junior_rate', 'elementary_rate', 'born_rate', 'death_rate', 'marriage_rate', 'divorce_rate', 'N_50', 'N_500', 'N_1000', 'N_5000', 'N_10000', 'I_10', 'I_50', 'I_100', 'I_250', 'I_500', 'I_1000', 'I_5000', 'I_10000', 'I_MIN', 'II_10', 'II_50', 'II_100', 'II_250', 'II_500', 'II_1000', 'II_5000', 'II_10000', 'II_MIN', 'III_10', 'III_50', 'III_100', 'III_250', 'III_500', 'III_1000', 'III_5000', 'III_10000', 'III_MIN', 'IV_10', 'IV_50', 'IV_100', 'IV_250', 'IV_500', 'IV_1000', 'IV_5000', 'IV_10000', 'IV_MIN', 'V_10', 'V_50', 'V_100', 'V_250', 'V_500', 'V_1000', 'V_5000', 'V_10000', 'V_MIN', 'VI_50', 'VI_100', 'VI_250', 'VI_500', 'VI_1000', 'VI_5000', 'VI_10000', 'VI_MIN', 'VII_10', 'VII_50', 'VII_100', 'VII_250', 'VII_500', 'VII_1000', 'VII_5000', 'VII_10000', 'VII_MIN', 'VIII_10', 'VIII_50', 'VIII_100', 'VIII_250', 'VIII_500', 'VIII_1000', 'VIII_5000', 'VIII_10000', 'VIII_MIN', 'IX_10', 'IX_50', 'IX_100', 'IX_250', 'IX_500', 'IX_1000', 'IX_5000', 'IX_10000', 'IX_MIN', 'X_10', 'X_50', 'X_100', 'X_250', 'X_500', 'X_1000', 'X_5000', 'X_10000', 'X_MIN', 'XI_10', 'XI_50', 'XI_100', 'XI_250', 'XI_500', 'XI_1000', 'XI_5000', 'XI_10000', 'XI_MIN', 'XII_10', 'XII_50', 'XII_100', 'XII_250', 'XII_500', 'XII_1000', 'XII_5000', 'XII_10000', 'XII_MIN', 'XIII_10', 'XIII_50', 'XIII_100', 'XIII_250', 'XIII_500', 'XIII_1000', 'XIII_5000', 'XIII_10000', 'XIII_MIN', 'XIV_10', 'XIV_50', 'XIV_100', 'XIV_250', 'XIV_500', 'XIV_1000', 'XIV_5000', 'XIV_10000', 'XIV_MIN']
# Categorical feature columns; encoded by FeatureEngineering together with cols_bin.
cols_cat = ['building_material', 'city', 'building_type', 'building_use', 'parking_way', 'town', 'village']
# Named "bin" -- presumably binary indicator columns (confirm against the data).
# NOTE(review): 'VI_10' is listed here rather than in cols_num (cols_num has no
# 'VI_10' entry, unlike the other *_10 columns) -- presumably intentional; confirm.
cols_bin = ['I_index_50', 'I_index_500', 'I_index_1000', 'II_index_50', 'II_index_500', 'II_index_1000', 'III_index_50', 'III_index_500', 'III_index_1000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'V_index_50', 'V_index_500', 'V_index_1000', 'VI_10', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'X_index_50', 'X_index_500', 'X_index_1000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000']
# Complete, ordered feature list used to build the LightGBM datasets.
cols_feats = cols_num + cols_cat + cols_bin

# Raw target column as it appears in the CSV.
col_target = 'total_price'
# Column name for the target on the log1p scale.
# NOTE(review): presumably this column is created by convert_types(); confirm.
col_target_log1p = 'target'

### Read data

In [3]:
# Load the training set with every column read as `object`; dtypes are assigned
# explicitly afterwards by convert_types() in the preprocessing cell.
df = pd.read_csv('dataset/train.csv', dtype=object)

In [4]:
# Quick look at the column names that were loaded.
df.columns

Index(['building_id', 'building_material', 'city', 'txn_dt', 'total_floor',
       'building_type', 'building_use', 'building_complete_dt', 'parking_way',
       'parking_area',
       ...
       'XIV_500', 'XIV_index_500', 'XIV_1000', 'XIV_index_1000', 'XIV_5000',
       'XIV_index_5000', 'XIV_10000', 'XIV_index_10000', 'XIV_MIN',
       'total_price'],
      dtype='object', length=235)

### Preprocessing

In [5]:
# Convert types
# convert_types is imported from the local `utilities` module; it is given the
# numeric column list and the raw target column name.
# NOTE(review): presumably it casts the all-string frame to numeric dtypes and
# adds the log1p target column ('target') used for training -- confirm in utilities.
# NOTE(review): `df` is overwritten here, so re-running this cell feeds already
# converted data back into convert_types -- confirm that is harmless.
df = convert_types(df, cols_num, col_target=col_target)

# Generate feats (train-test-same feats)
# Disabled: create_feats is neither imported nor defined in the visible cells.
#create_feats(df)

### Feat engineering

In [6]:
from feature_engineering import CategoricalColumnsEncoder

In [7]:
class FeatureEngineering():
    """Fit a categorical encoder on one frame and replay it on others.

    The class is a thin wrapper around ``CategoricalColumnsEncoder``: the
    encoder is created and fitted in ``fit_transform`` and reused, unchanged,
    in ``transform``. Both methods work on a copy, so the caller's frame is
    never modified.
    """

    def __init__(self):
        """No state is set up here; ``fit_transform`` creates the encoder."""

    def fit_transform(self, df, cols_cat, cols_bin):
        """Fit the encoder on a copy of ``df`` and return that copy.

        Args:
            df: Frame to fit on (typically the training fold).
            cols_cat: List of categorical column names.
            cols_bin: List of binary column names; encoded together with
                ``cols_cat``.

        Returns:
            The copy of ``df`` that was handed to the encoder.
        """
        encoded = df.copy()
        columns_to_encode = cols_cat + cols_bin
        # Denote categorical-type
        self.cat_encoder = CategoricalColumnsEncoder(mode='pandas')
        # The encoder's return value is not used, so this relies on the encoder
        # updating `encoded` in place -- TODO confirm in feature_engineering.
        self.cat_encoder.fit_transform(encoded, columns_to_encode)
        return encoded

    def transform(self, df):
        """Apply the already-fitted encoder to a copy of ``df`` and return it.

        ``fit_transform`` must have been called first; otherwise
        ``self.cat_encoder`` does not exist and an ``AttributeError`` is raised.
        """
        encoded = df.copy()
        # Same in-place assumption as in fit_transform.
        self.cat_encoder.transform(encoded)
        return encoded

### Grid search

In [22]:
def custom_asymmetric_train(y_true, y_pred):
    """Gradient and Hessian of an asymmetric squared-error objective.

    The loss is ``(y_true - y_pred)**2``, multiplied by 10 wherever the
    prediction exceeds the true value (negative residual), so over-prediction
    is penalised ten times harder than under-prediction.

    NOTE(review): the argument order is (y_true, y_pred); ``lgb.train``'s
    ``fobj`` hook passes (preds, Dataset) instead -- adapt before enabling it
    there.

    Args:
        y_true: Array of true target values.
        y_pred: Array of predicted values, same length as ``y_true``.

    Returns:
        Tuple ``(grad, hess)`` of arrays: first and second derivative of the
        loss with respect to ``y_pred``.
    """
    over_penalty = 10.0
    residual = (y_true - y_pred).astype("float")
    over_predicted = residual < 0

    grad = np.where(over_predicted, -2 * over_penalty * residual, -2 * residual)
    hess = np.where(over_predicted, 2 * over_penalty, 2.0)
    return grad, hess

def hit_rate(y_pred, dataset_true):
    """LightGBM ``feval`` metric: hit rate within 10 % plus a MAPE tie-breaker.

    Predictions and labels are taken to be on the log1p scale; both are mapped
    back with ``expm1`` (the same inverse used when the final predictions are
    scored) and clipped at zero before comparison.

    The value is ``round(hit_rate, 4) * 10000 + (1 - MAPE)``, where the hit
    rate is the share of rows whose absolute error is at most 10 % of the true
    value.

    Args:
        y_pred: Array of raw model predictions (log1p scale).
        dataset_true: Object exposing ``get_label()`` (e.g. ``lgb.Dataset``)
            that returns the true labels on the log1p scale.

    Returns:
        Tuple ``(eval_name, eval_result, is_higher_better)``. The flag is
        ``True`` because a larger score means a better model; the previous
        ``False`` would make early stopping optimise in the wrong direction.
    """
    def _to_price(log_values):
        # Invert log1p and guard against slightly negative results.
        return np.clip(np.expm1(log_values.astype("float")), 0, None)

    price_true = _to_price(dataset_true.get_label())
    price_pred = _to_price(y_pred)

    abs_error = np.abs(price_pred - price_true)
    rate = np.mean(abs_error <= 0.1 * price_true)
    # NOTE: rows with a true price of exactly 0 make this term inf/nan,
    # exactly as before; no special handling is added here.
    mape = np.mean(abs_error / price_true)

    score = np.round(rate, decimals=4) * 10000 + (1 - mape)
    return "hit_rate", score, True

In [23]:
# ---- Settings ----
# When True the model is trained on the log1p target column and predictions
# are mapped back with expm1 before scoring; otherwise the raw price is used.
is_log1p = True

# ---- Hyper-parameter grid ----
# Each key maps to a list of candidate values; the Cartesian product of all
# lists is evaluated on every fold.
params_gsearch = {'task': ['train'],'boosting_type': ['gbdt'],
                  #'metric': [['mse', 'mae']],
                  'objective': ['mse'], # 'mae', 'mape', 'regression'
                  'num_leaves': [255],
                  'learning_rate': [0.01],
                  'feature_fraction': [0.7],
                  'min_data_in_leaf': [20],
                  'lambda_l1': [0.1]}

params_names = params_gsearch.keys()

# Label column the model is trained on, selected once instead of per branch.
col_label = col_target_log1p if is_log1p else col_target

# ---- K-fold grid search ----
# gsearch maps a parameter combination (tuple of items) to its per-fold scores.
gsearch = {}
folds = KFold(n_splits=3)  # default shuffle=False: folds are contiguous row ranges
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields POSITIONAL indices, so select rows with .iloc; .loc
    # only gives the same rows while df has a default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng: fit the encoder on the training fold only, then apply to validation
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb datasets; the validation set references the training set
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_label]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_label], reference=lgb_train).construct()

    # grid search
    for values in itertools.product(*[params_gsearch[key] for key in params_names]):
        params = dict(zip(params_names, values))
        print('params:', params)

        model = lgb.train(params, lgb_train,
                          num_boost_round=10000,
                          valid_sets=lgb_val,
                          verbose_eval=200,
                          #fobj=custom_asymmetric_train,
                          # hit_rate exponentiates predictions and labels, so it
                          # is only meaningful for the log1p target.
                          feval=hit_rate if is_log1p else None,
                          #early_stopping_rounds=200,
                          )
        y_pred = model.predict(df_val[cols_feats])

        if is_log1p:
            # Back to the price scale; clip because prices cannot be negative.
            y_pred_expm1 = np.expm1(y_pred)
            y_pred_final = np.clip(y_pred_expm1, 0, None)
        else:
            y_pred_final = y_pred

        # The score is always computed against the raw price.
        score = cal_score(df_val[col_target], y_pred_final)
        tuple_params = tuple(params.items())
        gsearch.setdefault(tuple_params, []).append(score)

# aggregate, sort gsearch results: [params, mean score, per-fold scores], best first
gsearch_results = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results.sort(key= lambda x: x[1], reverse = True)
display(gsearch_results)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'num_leaves': 255, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
[200]	valid_0's l2: 0.106711	valid_0's hit_rate: 2912.75
[400]	valid_0's l2: 0.0567184	valid_0's hit_rate: 4294.83
[600]	valid_0's l2: 0.0495915	valid_0's hit_rate: 4708.84
[800]	valid_0's l2: 0.0469991	valid_0's hit_rate: 4904.85
[1000]	valid_0's l2: 0.0457271	valid_0's hit_rate: 5026.85
[1200]	valid_0's l2: 0.0449713	valid_0's hit_rate: 5114.85
[1400]	valid_0's l2: 0.044516	valid_0's hit_rate: 5164.85
[1600]	valid_0's l2: 0.044191	valid_0's hit_rate: 5212.85
[1800]	valid_0's l2: 0.0439695	valid_0's hit_rate: 5260.85
[2000]	valid_0's l2: 0.0438008	valid_0's hit_rate: 5283.85
[2200]	valid_0's l2: 0.0436661	valid_0's hit_rate: 5320.85
[2400]	valid_0's l2: 0.0435732	valid_0's hit_rate: 5332.86
[2600]	valid_0's l2: 0.0434891	valid_0's hit_rate: 5349.86
[2800]	valid_0's l2: 0.0434246	valid_0's hi

[6200]	valid_0's l2: 0.0429463	valid_0's hit_rate: 5436.86
[6400]	valid_0's l2: 0.0429458	valid_0's hit_rate: 5438.86
[6600]	valid_0's l2: 0.042946	valid_0's hit_rate: 5438.86
[6800]	valid_0's l2: 0.0429444	valid_0's hit_rate: 5440.86
[7000]	valid_0's l2: 0.0429479	valid_0's hit_rate: 5440.86
[7200]	valid_0's l2: 0.0429502	valid_0's hit_rate: 5441.86
[7400]	valid_0's l2: 0.0429503	valid_0's hit_rate: 5444.86
[7600]	valid_0's l2: 0.0429513	valid_0's hit_rate: 5450.86
[7800]	valid_0's l2: 0.042952	valid_0's hit_rate: 5444.86
[8000]	valid_0's l2: 0.0429527	valid_0's hit_rate: 5448.86
[8200]	valid_0's l2: 0.0429538	valid_0's hit_rate: 5448.86
[8400]	valid_0's l2: 0.0429552	valid_0's hit_rate: 5448.86
[8600]	valid_0's l2: 0.0429545	valid_0's hit_rate: 5443.86
[8800]	valid_0's l2: 0.0429546	valid_0's hit_rate: 5446.86
[9000]	valid_0's l2: 0.0429546	valid_0's hit_rate: 5444.86
[9200]	valid_0's l2: 0.0429546	valid_0's hit_rate: 5445.86
[9400]	valid_0's l2: 0.0429559	valid_0's hit_rate: 5445.86

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5482.19485441842,
  [5449.8584886062135, 5554.862112396429, 5441.863962252616]]]

In [14]:
# NOTE(review): the output saved under this cell lists a 'metric' entry and
# num_leaves=31, neither of which is in the params_gsearch grid defined in this
# notebook -- it appears to come from an earlier run (execution count 14 vs 23).
# Re-run after the grid search to refresh it.
display(gsearch_results)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mae'),
   ('objective', 'mse'),
   ('num_leaves', 31),
   ('learning_rate', 0.01),
   ('feature_fraction', 1.0),
   ('min_data_in_leaf', 100),
   ('lambda_l1', 1)),
  5023.182736892787,
  [5014.8459736956465, 5000.84919448768, 5053.853042495033]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mae'),
   ('objective', 'mse'),
   ('num_leaves', 31),
   ('learning_rate', 0.01),
   ('feature_fraction', 1.0),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 1)),
  5026.183789802272,
  [5038.847251680061, 5016.850701760834, 5022.853415965921]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mae'),
   ('objective', 'mse'),
   ('num_leaves', 31),
   ('learning_rate', 0.01),
   ('feature_fraction', 1.0),
   ('min_data_in_leaf', 100),
   ('lambda_l1', 0.1)),
  5028.84949301197,
  [5020.8461883311675, 5016.849252355429, 5048.853038349313]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
  