### Import packages

In [1]:
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from utilities import cal_score, convert_types

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Columns

In [2]:
# Numeric feature columns; this list is what gets passed to convert_types in
# the Preprocessing cell.
cols_num = ['txn_dt', 'total_floor', 'building_complete_dt', 'parking_area', 'parking_price', 'txn_floor', 'land_area', 'building_area', 'lat', 'lon', 'village_income_median', 'town_population', 'town_area', 'town_population_density', 'doc_rate', 'master_rate', 'bachelor_rate', 'jobschool_rate', 'highschool_rate', 'junior_rate', 'elementary_rate', 'born_rate', 'death_rate', 'marriage_rate', 'divorce_rate', 'N_50', 'N_500', 'N_1000', 'N_5000', 'N_10000', 'I_10', 'I_50', 'I_100', 'I_250', 'I_500', 'I_1000', 'I_5000', 'I_10000', 'I_MIN', 'II_10', 'II_50', 'II_100', 'II_250', 'II_500', 'II_1000', 'II_5000', 'II_10000', 'II_MIN', 'III_10', 'III_50', 'III_100', 'III_250', 'III_500', 'III_1000', 'III_5000', 'III_10000', 'III_MIN', 'IV_10', 'IV_50', 'IV_100', 'IV_250', 'IV_500', 'IV_1000', 'IV_5000', 'IV_10000', 'IV_MIN', 'V_10', 'V_50', 'V_100', 'V_250', 'V_500', 'V_1000', 'V_5000', 'V_10000', 'V_MIN', 'VI_50', 'VI_100', 'VI_250', 'VI_500', 'VI_1000', 'VI_5000', 'VI_10000', 'VI_MIN', 'VII_10', 'VII_50', 'VII_100', 'VII_250', 'VII_500', 'VII_1000', 'VII_5000', 'VII_10000', 'VII_MIN', 'VIII_10', 'VIII_50', 'VIII_100', 'VIII_250', 'VIII_500', 'VIII_1000', 'VIII_5000', 'VIII_10000', 'VIII_MIN', 'IX_10', 'IX_50', 'IX_100', 'IX_250', 'IX_500', 'IX_1000', 'IX_5000', 'IX_10000', 'IX_MIN', 'X_10', 'X_50', 'X_100', 'X_250', 'X_500', 'X_1000', 'X_5000', 'X_10000', 'X_MIN', 'XI_10', 'XI_50', 'XI_100', 'XI_250', 'XI_500', 'XI_1000', 'XI_5000', 'XI_10000', 'XI_MIN', 'XII_10', 'XII_50', 'XII_100', 'XII_250', 'XII_500', 'XII_1000', 'XII_5000', 'XII_10000', 'XII_MIN', 'XIII_10', 'XIII_50', 'XIII_100', 'XIII_250', 'XIII_500', 'XIII_1000', 'XIII_5000', 'XIII_10000', 'XIII_MIN', 'XIV_10', 'XIV_50', 'XIV_100', 'XIV_250', 'XIV_500', 'XIV_1000', 'XIV_5000', 'XIV_10000', 'XIV_MIN']
# Categorical feature columns.
cols_cat = ['building_material', 'city', 'building_type', 'building_use', 'parking_way', 'town', 'village']
# Columns treated as binary flags; they are handed to the categorical encoder
# together with cols_cat (see FeatureEngineering.fit_transform).
# NOTE(review): 'VI_10' is listed here while every other '<roman>_10' column
# is in cols_num — confirm this placement is intentional.
cols_bin = ['I_index_50', 'I_index_500', 'I_index_1000', 'II_index_50', 'II_index_500', 'II_index_1000', 'III_index_50', 'III_index_500', 'III_index_1000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'V_index_50', 'V_index_500', 'V_index_1000', 'VI_10', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'X_index_50', 'X_index_500', 'X_index_1000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000']
# Complete feature list used to build the LightGBM datasets
# (numeric + categorical + binary, in that order).
cols_feats = cols_num + cols_cat + cols_bin

# Name of the raw price column in the training data.
col_target = 'total_price'
# Name of the log1p-scale target column (the grid-search cell trains on
# df['target'] when is_log1p is True).
col_target_log1p = 'target'

### Read data

In [3]:
# Load the training set. dtype=object keeps pandas from inferring column
# types, so values stay uninterpreted until they are converted explicitly in
# the Preprocessing cell.
df = pd.read_csv('dataset/train.csv', dtype=object)

In [4]:
# Show the column labels of the loaded frame as a quick sanity check.
df.columns

Index(['building_id', 'building_material', 'city', 'txn_dt', 'total_floor',
       'building_type', 'building_use', 'building_complete_dt', 'parking_way',
       'parking_area',
       ...
       'XIV_500', 'XIV_index_500', 'XIV_1000', 'XIV_index_1000', 'XIV_5000',
       'XIV_index_5000', 'XIV_10000', 'XIV_index_10000', 'XIV_MIN',
       'total_price'],
      dtype='object', length=235)

### Preprocessing

In [5]:
# Convert types
# convert_types is imported from the local `utilities` module and receives
# the numeric feature columns plus the name of the target column.
# NOTE(review): presumably it casts those columns to numeric dtypes and adds
# the log1p 'target' column that the grid-search cell reads — confirm against
# utilities.convert_types.
# Rebinding `df` makes this cell non-idempotent: re-running it feeds the
# already-converted frame back into convert_types.
df = convert_types(df, cols_num, col_target=col_target)

# Generate feats (train-test-same feats)
# Currently disabled; create_feats is not among the names imported in the
# import cell at the top of this notebook.
#create_feats(df)

### Feat engineering

In [6]:
from feature_engineering import CategoricalColumnsEncoder

In [7]:
class FeatureEngineering:
    """Fold-local feature pipeline built around a categorical encoder.

    ``fit_transform`` creates and fits the encoder on one frame (the training
    split); ``transform`` re-applies that fitted encoder to another frame
    (the validation split). Both methods work on a copy, so the caller's
    frame is never modified.
    """

    def __init__(self):
        """No configuration is needed; the encoder is created in ``fit_transform``."""

    def fit_transform(self, df, cols_cat, cols_bin):
        """Fit the categorical encoder on a copy of ``df`` and return that copy.

        Args:
            df: Input frame; it is copied before anything else happens.
            cols_cat: List of categorical column names.
            cols_bin: List of binary column names; concatenated after
                ``cols_cat`` and encoded the same way.

        Returns:
            The copied frame after it has been passed through the encoder.
        """
        encoded = df.copy()
        # Denote categorical-type
        encoder = CategoricalColumnsEncoder(mode='pandas')
        self.cat_encoder = encoder
        # The encoder's return value is not used.
        # NOTE(review): this relies on CategoricalColumnsEncoder modifying the
        # frame in place — confirm against feature_engineering.
        encoder.fit_transform(encoded, cols_cat + cols_bin)
        return encoded

    def transform(self, df):
        """Apply the previously fitted encoder to a copy of ``df``.

        Requires ``fit_transform`` to have been called first, because that is
        where ``self.cat_encoder`` is created.

        Args:
            df: Input frame; it is copied before being encoded.

        Returns:
            The copied frame after it has been passed through the encoder.
        """
        encoded = df.copy()
        self.cat_encoder.transform(encoded)
        return encoded

### Grid search

In [8]:
%%time
# ---- Settings ---------------------------------------------------------------
# When True the model is trained on the log1p-scale target column and its
# predictions are mapped back with expm1 before scoring.
is_log1p = True

# Label column handed to LightGBM. Uses the names defined in the Columns cell
# rather than repeating the string literals.
col_label = col_target_log1p if is_log1p else col_target

# ---- Hyper-parameter grid ---------------------------------------------------
# Each key maps to its list of candidate values; the search below evaluates
# the full Cartesian product. Only learning_rate has more than one candidate.
params_gsearch = {'task': ['train'],'boosting_type': ['gbdt'], 'metric': ['mape'],
                  'objective': ['mse'], # 'mae', 'mape'
                  'num_leaves': [255],
                  'learning_rate': [0.01, 0.02, 0.05],
                  'feature_fraction': [0.7],
                  'min_data_in_leaf': [20],
                  'lambda_l1': [0.1]}

params_names = params_gsearch.keys()

# ---- Cross-validated search -------------------------------------------------
# Maps a parameter combination (tuple of (name, value) pairs) to the list of
# scores it obtained, one entry per fold.
gsearch = {}
folds = KFold(n_splits=3)
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields positional row indices, so rows are selected with
    # .iloc. Label-based .loc only gives the same rows while df still has a
    # default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng: fit on the training split only, then reuse on validation
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb dataset (built once per fold and reused for every
    # parameter combination)
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_label]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_label], reference=lgb_train).construct()

    # grid search
    for values in itertools.product(*params_gsearch.values()):
        params = dict(zip(params_names, values))
        print('params:', params)

        # Train for up to 10000 rounds, logging every 200 rounds and stopping
        # once the validation metric has not improved for 200 rounds.
        # NOTE(review): the verbose_eval / early_stopping_rounds keyword
        # arguments were removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks — confirm the pinned LightGBM version before upgrading.
        model = lgb.train(params, lgb_train,
                          num_boost_round=10000,
                          valid_sets=lgb_val,
                          verbose_eval=200,
                          early_stopping_rounds=200)
        y_pred = model.predict(df_val[cols_feats])

        if is_log1p:
            # Back to the price scale; clip so no prediction is negative.
            y_pred_expm1 = np.expm1(y_pred)
            y_pred_final = np.clip(y_pred_expm1, 0, None)
        else:
            y_pred_final = y_pred

        # Scoring is always done against the raw price column.
        score = cal_score(df_val[col_target], y_pred_final)
        tuple_params = tuple(params.items())
        gsearch.setdefault(tuple_params, []).append(score)

# aggregate, sort gsearch results
# Each entry is [params, mean score over folds, per-fold scores], sorted by
# mean score in ascending order.
# NOTE(review): the Conclusion cell treats the configuration with the highest
# mean score as the best one, which this ordering puts last — confirm the
# direction of cal_score.
gsearch_results = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results.sort(key=lambda x: x[1])
display(gsearch_results)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'objective': 'mse', 'num_leaves': 255, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's mape: 0.0157149
[400]	valid_0's mape: 0.0107572
[600]	valid_0's mape: 0.00985822
[800]	valid_0's mape: 0.00949818
[1000]	valid_0's mape: 0.00930301
[1200]	valid_0's mape: 0.00917594
[1400]	valid_0's mape: 0.00909001
[1600]	valid_0's mape: 0.00902346
[1800]	valid_0's mape: 0.00897299
[2000]	valid_0's mape: 0.00893353
[2200]	valid_0's mape: 0.00889884
[2400]	valid_0's mape: 0.00887175
[2600]	valid_0's mape: 0.00884689
[2800]	valid_0's mape: 0.0088266
[3000]	valid_0's mape: 0.00880928
[3200]	valid_0's mape: 0.00879428
[3400]	valid_0's mape: 0.00878036
[3600]	valid_0's mape: 0.00876913
[3800]	valid_0's mape: 0.00875897
[4000]	valid_0's mape: 0.00875061
[4200]	valid_0's mape: 0.00874219
[4400]	valid_0'

[7200]	valid_0's mape: 0.00862525
[7400]	valid_0's mape: 0.00862494
[7600]	valid_0's mape: 0.00862466
[7800]	valid_0's mape: 0.00862436
[8000]	valid_0's mape: 0.00862422
[8200]	valid_0's mape: 0.00862406
[8400]	valid_0's mape: 0.00862404
Early stopping, best iteration is:
[8209]	valid_0's mape: 0.00862404
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'objective': 'mse', 'num_leaves': 255, 'learning_rate': 0.05, 'feature_fraction': 0.7, 'min_data_in_leaf': 20, 'lambda_l1': 0.1}
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's mape: 0.0094055
[400]	valid_0's mape: 0.00900253
[600]	valid_0's mape: 0.00887773
[800]	valid_0's mape: 0.00881865
[1000]	valid_0's mape: 0.00878498
[1200]	valid_0's mape: 0.0087641
[1400]	valid_0's mape: 0.00875294
[1600]	valid_0's mape: 0.0087439
[1800]	valid_0's mape: 0.00873781
[2000]	valid_0's mape: 0.00873407
[2200]	valid_0's mape: 0.00873096
[2400]	valid_0's mape: 0.00872843
[2600]	valid_0's mape: 0.008726

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.05),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5386.1922439952805,
  [5346.855265902674, 5453.860038770365, 5357.861427312805]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.02),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5461.194405837524,
  [5404.857581824218, 5521.861633218514, 5456.864002469839]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5481.861521114605,
  [5449.8584886062135, 5554.862112396429, 5440.863962341173]]]

CPU times: user 5h 32min 10s, sys: 3min 53s, total: 5h 36min 4s
Wall time: 29min 22s


In [9]:
# Show the aggregated grid-search results again, without the training log
# that precedes them in the output of the search cell.
display(gsearch_results)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.05),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5386.1922439952805,
  [5346.855265902674, 5453.860038770365, 5357.861427312805]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.02),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5461.194405837524,
  [5404.857581824218, 5521.861633218514, 5456.864002469839]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('objective', 'mse'),
   ('num_leaves', 255),
   ('learning_rate', 0.01),
   ('feature_fraction', 0.7),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0.1)),
  5481.861521114605,
  [5449.8584886062135, 5554.862112396429, 5440.863962341173]]]

**Conclusion:**
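- The smaller the learning rate, the better the results (mean score 5481.9 at lr=0.01 vs. 5461.2 at lr=0.02 and 5386.2 at lr=0.05).
- lr=0.01 did not early-stop within the 10,000 boosting rounds.
- lr=0.02 did early-stop.
- Let's use lr=0.015.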