### Import packages

In [1]:
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from utilities import cal_score, convert_types

### Columns

In [2]:
# Numeric columns that do not belong to the I..XIV column families.
_cols_num_base = [
    'txn_dt', 'total_floor', 'building_complete_dt', 'parking_area', 'parking_price',
    'txn_floor', 'land_area', 'building_area', 'lat', 'lon',
    'village_income_median', 'town_population', 'town_area', 'town_population_density',
    'doc_rate', 'master_rate', 'bachelor_rate', 'jobschool_rate', 'highschool_rate',
    'junior_rate', 'elementary_rate', 'born_rate', 'death_rate', 'marriage_rate',
    'divorce_rate',
    'N_50', 'N_500', 'N_1000', 'N_5000', 'N_10000',
]

# Column families, in the order they appear in the feature lists.
_families = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV']

# Every family contributes '<family>_<suffix>' numeric columns, except that
# 'VI_10' is listed among the binary columns instead.
_num_suffixes = ['10', '50', '100', '250', '500', '1000', '5000', '10000', 'MIN']
cols_num = _cols_num_base + [
    family + '_' + suffix
    for family in _families
    for suffix in _num_suffixes
    if (family, suffix) != ('VI', '10')
]

cols_cat = ['building_material', 'city', 'building_type', 'building_use', 'parking_way', 'town', 'village']

# Binary columns: '<family>_index_<suffix>' for the three common suffixes,
# plus the extra suffixes that only some families have, plus 'VI_10'
# (placed right before the VI index columns).
_index_suffixes = ['50', '500', '1000']
_extra_index_suffixes = {
    'IV': ['5000'],
    'IX': ['5000'],
    'XI': ['5000', '10000'],
    'XIII': ['5000', '10000'],
}
cols_bin = []
for family in _families:
    if family == 'VI':
        cols_bin.append('VI_10')
    for suffix in _index_suffixes + _extra_index_suffixes.get(family, []):
        cols_bin.append(family + '_index_' + suffix)

# Full feature list, in the order numeric -> categorical -> binary.
cols_feats = cols_num + cols_cat + cols_bin

col_target = 'total_price'
col_target_log1p = 'target'

### Read data

In [3]:
# Load the training set with every column read as `object`; numeric columns
# are converted afterwards in the preprocessing cell.
df = pd.read_csv('dataset/train.csv', dtype=object)

In [4]:
# Inspect the column names of the raw training frame.
df.columns

Index(['building_id', 'building_material', 'city', 'txn_dt', 'total_floor',
       'building_type', 'building_use', 'building_complete_dt', 'parking_way',
       'parking_area',
       ...
       'XIV_500', 'XIV_index_500', 'XIV_1000', 'XIV_index_1000', 'XIV_5000',
       'XIV_index_5000', 'XIV_10000', 'XIV_index_10000', 'XIV_MIN',
       'total_price'],
      dtype='object', length=235)

### Preprocessing

In [5]:
# Convert types
# `convert_types` is imported from the project-local `utilities` module; it is
# given the numeric column names and the name of the target column.
# NOTE(review): the frame displayed after this call has an extra `target`
# column — presumably log1p(total_price) added by convert_types; confirm there.
df = convert_types(df, cols_num, col_target=col_target)

# Generate feats (train-test-same feats)
# Disabled: `create_feats` is not imported in the cells visible here.
#create_feats(df)

In [6]:
# Preview the first rows after type conversion.
df.head()

Unnamed: 0,building_id,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,...,XIV_index_500,XIV_1000,XIV_index_1000,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,total_price,target
0,e3mMIMR3JJqCaXz1,8,21,18674.0,4.0,3,2,6271.0,2,,...,1,157.0,1,2483.0,1,6011.0,1,34.469803,647603.75,13.381036
1,LgwzgklNvy4QCtq5,8,7,18800.0,5.0,1,2,7885.0,2,,...,1,115.0,1,15872.0,1,32221.0,1,40.073574,3321452.0,15.015913
2,ucIR2NLLsC3T650L,8,7,19289.0,4.0,1,2,6028.0,2,,...,1,212.0,1,15760.0,1,32228.0,1,54.462082,9570885.0,16.074236
3,jre1pJhcQj91Kdky,8,21,20385.0,24.0,0,2,18325.0,0,,...,1,125.0,1,2568.0,1,7271.0,1,99.628967,14215011.0,16.469809
4,rQpYpY9nRG7X5mmr,1,21,20657.0,2.0,4,2,6880.0,2,,...,1,47.0,1,2587.0,1,7442.0,1,124.131233,762712.0,13.544637


In [7]:
# Print the per-column dtype listing for the whole frame.
# The first positional parameter of DataFrame.info is `verbose`, so the former
# call `df.info(300)` only worked because 300 is truthy; request the full
# listing explicitly instead (the printed output is unchanged).
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 236 columns):
building_id                object
building_material          object
city                       object
txn_dt                     float32
total_floor                float32
building_type              object
building_use               object
building_complete_dt       float32
parking_way                object
parking_area               float32
parking_price              float32
txn_floor                  float32
land_area                  float32
building_area              float32
town                       object
lat                        float32
lon                        float32
village_income_median      float32
town_population            float32
town_area                  float32
town_population_density    float32
doc_rate                   float32
master_rate                float32
bachelor_rate              float32
jobschool_rate             float32
highschool_rate       

### Feature engineering

In [8]:
from feature_engineering import CategoricalColumnsEncoder

In [9]:
class FeatureEngineering():
    """Per-fold feature engineering built around a CategoricalColumnsEncoder.

    Call :meth:`fit_transform` on the training split first, then
    :meth:`transform` on any other split so that both go through the same
    fitted encoder.
    """

    def __init__(self):
        # Encoder created by fit_transform(); None means "not fitted yet".
        self.cat_encoder = None

    def fit_transform(self, df, cols_cat, cols_bin):
        """Fit the categorical encoder on ``df`` and return the processed copy.

        Parameters
        ----------
        df : pandas.DataFrame
            Training split. It is copied first, so the caller's frame is not
            modified by this method.
        cols_cat, cols_bin : list of str
            Categorical and binary column names; both lists are handed to the
            encoder together.

        Returns
        -------
        pandas.DataFrame
            The copy of ``df`` that was passed through the encoder.
        """
        df = df.copy()
        # Denote categorical-type
        self.cat_encoder = CategoricalColumnsEncoder(mode='pandas')
        # The encoder's return value is ignored here.
        # NOTE(review): this assumes the encoder modifies `df` in place — confirm
        # against feature_engineering.CategoricalColumnsEncoder.
        self.cat_encoder.fit_transform(df, cols_cat+cols_bin)
        return df

    def transform(self, df):
        """Apply the already fitted encoder to a copy of ``df`` and return it.

        Raises
        ------
        AttributeError
            If :meth:`fit_transform` has not been called yet.
        """
        if self.cat_encoder is None:
            # Same exception type as the implicit missing-attribute failure,
            # but with an actionable message.
            raise AttributeError(
                'FeatureEngineering is not fitted yet; call fit_transform() '
                'before transform().'
            )
        df = df.copy()
        self.cat_encoder.transform(df)
        return df

### Grid search

In [10]:
# Switch: set to True to train on the log1p-transformed target column
# instead of the raw price.
is_log1p = False

# LightGBM parameters held constant during the grid search; each candidate
# starts from a copy of this dict.
params_fix = dict(
    task='train',
    boosting_type='gbdt',
    metric='mape',
    learning_rate=0.015,
)

#### Round 1

In [11]:
%%time
# Round-1 search space: every combination (3 * 2 * 3 * 3 = 54) is trained
# once per fold on top of `params_fix`.
params_gsearch1 = {'objective': ['mse', 'mae', 'mape'],
                   'num_leaves': [255, 511],           # may reduce in dim-reduction exp
                   'feature_fraction': [0.5, 0.75, 1.0],
                   'min_data_in_leaf': [5, 20, 50]
                   }

# Label column the models are trained on; scoring always uses the raw price.
col_label = col_target_log1p if is_log1p else col_target

gsearch = {}  # tuple(params.items()) -> list of per-fold scores
folds = KFold(n_splits=3, shuffle=True, random_state=123)
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields positional indices, so select rows by position;
    # label-based .loc is only equivalent while df has its default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng (fitted on the training fold only, then applied to validation)
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb dataset
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_label]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_label], reference=lgb_train).construct()

    # grid search
    for values in itertools.product(*params_gsearch1.values()):
        params = params_fix.copy()
        params.update(zip(params_gsearch1.keys(), values))
        print('params:', params)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead — confirm against the installed version.
        model = lgb.train(params, lgb_train,
                          num_boost_round=10000,
                          valid_sets=lgb_val,
                          verbose_eval=2000,
                          early_stopping_rounds=200)
        y_pred = model.predict(df_val[cols_feats])

        if is_log1p:
            # Undo the log1p transform and clip negative prices to zero.
            y_pred_final = np.clip(np.expm1(y_pred), 0, None)
        else:
            y_pred_final = y_pred

        score = cal_score(df_val[col_target], y_pred_final)
        # Dicts are not hashable, so key the results by the items tuple.
        gsearch.setdefault(tuple(params.items()), []).append(score)

# aggregate, sort gsearch results
# Each entry is [params tuple, mean score over folds, per-fold scores],
# sorted by mean score in descending order.
gsearch_results1 = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results1.sort(key=lambda x: x[1], reverse=True)
display(gsearch_results1)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.224583
[4000]	valid_0's mape: 0.217386
[6000]	valid_0's mape: 0.215695
[8000]	valid_0's mape: 0.215167
[10000]	valid_0's mape: 0.214936
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.214936
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[352]	valid_0's mape: 0.299328
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50}
Trainin

[2000]	valid_0's mape: 0.164264
Early stopping, best iteration is:
[3095]	valid_0's mape: 0.162738
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 0.75, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.164709
[4000]	valid_0's mape: 0.162287
[6000]	valid_0's mape: 0.161122
[8000]	valid_0's mape: 0.160393
[10000]	valid_0's mape: 0.160012
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.160012
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1409]	valid_0's mape: 0.172388
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective'

[2000]	valid_0's mape: 0.24328
[4000]	valid_0's mape: 0.240761
[6000]	valid_0's mape: 0.239124
[8000]	valid_0's mape: 0.238069
Early stopping, best iteration is:
[9730]	valid_0's mape: 0.237389
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.213379
[4000]	valid_0's mape: 0.210414
[6000]	valid_0's mape: 0.208945
[8000]	valid_0's mape: 0.207681
[10000]	valid_0's mape: 0.206719
Did not meet early stopping. Best iteration is:
[9999]	valid_0's mape: 0.206719
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 511, 'feature_fraction': 0.5, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[729]	valid_0's mape: 0.260645
params: {'

Early stopping, best iteration is:
[335]	valid_0's mape: 0.305615
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 0.5, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[313]	valid_0's mape: 0.333597
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.201381
[4000]	valid_0's mape: 0.199509
[6000]	valid_0's mape: 0.199258
[8000]	valid_0's mape: 0.199202
[10000]	valid_0's mape: 0.199184
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.199184
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'featu

params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.163049
[4000]	valid_0's mape: 0.161863
Early stopping, best iteration is:
[4098]	valid_0's mape: 0.16171
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.265588
Early stopping, best iteration is:
[1947]	valid_0's mape: 0.264934
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.243572
[4000]

[10000]	valid_0's mape: 0.223652
Did not meet early stopping. Best iteration is:
[9983]	valid_0's mape: 0.223651
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[345]	valid_0's mape: 0.312558
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[330]	valid_0's mape: 0.350571
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 255, 'feature_fraction': 0.75, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 

[2000]	valid_0's mape: 0.166472
[4000]	valid_0's mape: 0.164023
Early stopping, best iteration is:
[3953]	valid_0's mape: 0.163928
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1168]	valid_0's mape: 0.177239
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.16919
Early stopping, best iteration is:
[3372]	valid_0's mape: 0.167195
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 50}
Training until validation scores don

params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 511, 'feature_fraction': 0.5, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1407]	valid_0's mape: 0.25098
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 511, 'feature_fraction': 0.5, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.211964
[4000]	valid_0's mape: 0.208678
[6000]	valid_0's mape: 0.207084
Early stopping, best iteration is:
[7082]	valid_0's mape: 0.206509
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mape', 'num_leaves': 511, 'feature_fraction': 0.5, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.179662
[4000

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mse'),
   ('num_leaves', 511),
   ('feature_fraction', 1.0),
   ('min_data_in_leaf', 5)),
  5068.815361201078,
  [5059.822590806029, 5087.808112964584, 5058.815379832622]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mse'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 5)),
  5059.806372887585,
  [5083.809621836049, 5074.80162158584, 5020.807875240866]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50)),
  5052.838400261109,
  [5130.8416503865865, 4932.836160968937, 5094.837389427806]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('obj

Wall time: 17h 38min 36s


In [14]:
# Show the round-1 results again: a list of [params tuple, mean score,
# per-fold scores] entries sorted by mean score in descending order.
display(gsearch_results1)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mse'),
   ('num_leaves', 511),
   ('feature_fraction', 1.0),
   ('min_data_in_leaf', 5)),
  5068.815361201078,
  [5059.822590806029, 5087.808112964584, 5058.815379832622]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mse'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 5)),
  5059.806372887585,
  [5083.809621836049, 5074.80162158584, 5020.807875240866]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50)),
  5052.838400261109,
  [5130.8416503865865, 4932.836160968937, 5094.837389427806]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('obj

#### Round 2

In [12]:
%%time
# Round-2 search space: regularisation strengths tried on top of the five
# best parameter sets from round 1 (5 * 3 * 3 = 45 candidates per fold).
params_gsearch2 = {'lambda_l1': [0, 0.01, 0.1],
                   'lambda_l2': [0, 0.01, 0.1]
                  }

# Label column the models are trained on; scoring always uses the raw price.
col_label = col_target_log1p if is_log1p else col_target

gsearch = {}  # tuple(params.items()) -> list of per-fold scores
folds = KFold(n_splits=3, shuffle=True, random_state=123)
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields positional indices, so select rows by position;
    # label-based .loc is only equivalent while df has its default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng (fitted on the training fold only, then applied to validation)
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb dataset
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_label]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_label], reference=lgb_train).construct()

    # grid search
    # pick top 5 params from round 1
    for result1 in gsearch_results1[:5]:
        # result1[0] is the items tuple of a round-1 parameter dict.
        params1 = dict(result1[0])
        for values in itertools.product(*params_gsearch2.values()):
            params = params1.copy()
            params.update(zip(params_gsearch2.keys(), values))
            print('params:', params)

            # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
            # arguments of older LightGBM releases; newer releases expect
            # callbacks instead — confirm against the installed version.
            model = lgb.train(params, lgb_train,
                              num_boost_round=10000,
                              valid_sets=lgb_val,
                              verbose_eval=2000,
                              early_stopping_rounds=200)
            y_pred = model.predict(df_val[cols_feats])

            if is_log1p:
                # Undo the log1p transform and clip negative prices to zero.
                y_pred_final = np.clip(np.expm1(y_pred), 0, None)
            else:
                y_pred_final = y_pred

            score = cal_score(df_val[col_target], y_pred_final)
            # Dicts are not hashable, so key the results by the items tuple.
            gsearch.setdefault(tuple(params.items()), []).append(score)

# aggregate, sort gsearch results
# Each entry is [params tuple, mean score over folds, per-fold scores],
# sorted by mean score in descending order.
gsearch_results2 = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results2.sort(key=lambda x: x[1], reverse=True)
display(gsearch_results2)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0, 'lambda_l2': 0}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.179925
[4000]	valid_0's mape: 0.178242
[6000]	valid_0's mape: 0.178005
[8000]	valid_0's mape: 0.177955
[10000]	valid_0's mape: 0.17794
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.17794
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.180389
[4000]	valid_0's mape: 0.178571
[6000]	valid_0's mape: 0.178334
[8000]	valid_0's mape: 0.178285
[10000]	valid_0's mape: 0.178274
Did not meet early stopping. 

[10000]	valid_0's mape: 0.190649
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.190649
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.193094
[4000]	valid_0's mape: 0.191193
[6000]	valid_0's mape: 0.190943
[8000]	valid_0's mape: 0.190901
[10000]	valid_0's mape: 0.190885
Did not meet early stopping. Best iteration is:
[9996]	valid_0's mape: 0.190885
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0.1}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.192542
[4000]	valid_0's mape: 0.190821
[6000]	valid_0's map

params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 50, 'lambda_l1': 0.1, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.166025
[4000]	valid_0's mape: 0.164168
[6000]	valid_0's mape: 0.163387
Early stopping, best iteration is:
[7441]	valid_0's mape: 0.163017
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 50, 'lambda_l1': 0.1, 'lambda_l2': 0.1}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.165355
[4000]	valid_0's mape: 0.163512
[6000]	valid_0's mape: 0.162657
Early stopping, best iteration is:
[5983]	valid_0's mape: 0.162619
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 

[8000]	valid_0's mape: 0.193257
[10000]	valid_0's mape: 0.193243
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.193243
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.195383
[4000]	valid_0's mape: 0.193726
[6000]	valid_0's mape: 0.193483
[8000]	valid_0's mape: 0.193438
[10000]	valid_0's mape: 0.193424
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.193424
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.196121
[4000]	valid_0's mape: 

Early stopping, best iteration is:
[1727]	valid_0's mape: 0.162318
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.163049
[4000]	valid_0's mape: 0.161863
Early stopping, best iteration is:
[4098]	valid_0's mape: 0.16171
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.164377
Early stopping, best iteration is:
[1983]	valid_0's mape: 0.164112
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction

Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.189179
[4000]	valid_0's mape: 0.187582
[6000]	valid_0's mape: 0.18737
[8000]	valid_0's mape: 0.187336
[10000]	valid_0's mape: 0.187324
Did not meet early stopping. Best iteration is:
[9997]	valid_0's mape: 0.187324
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5, 'lambda_l1': 0.01, 'lambda_l2': 0}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.187049
[4000]	valid_0's mape: 0.1854
[6000]	valid_0's mape: 0.185215
[8000]	valid_0's mape: 0.185176
[10000]	valid_0's mape: 0.185167
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.185167
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mse', 'num_leaves': 511, 'feature_fraction': 1.0, 'min_data_in_leaf': 5,

[2000]	valid_0's mape: 0.164538
[4000]	valid_0's mape: 0.16291
Early stopping, best iteration is:
[4268]	valid_0's mape: 0.162799
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0.01}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.163563
[4000]	valid_0's mape: 0.161659
[6000]	valid_0's mape: 0.161017
[8000]	valid_0's mape: 0.160602
Early stopping, best iteration is:
[8147]	valid_0's mape: 0.160586
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0.1}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.164297
[4000]	valid_0's mape: 0.16248
Early stopping, best iteration is:
[4501]	valid_0's m

Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.165906
[4000]	valid_0's mape: 0.163612
[6000]	valid_0's mape: 0.162738
Early stopping, best iteration is:
[6539]	valid_0's mape: 0.162627
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 0.75, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0.1}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.164881
[4000]	valid_0's mape: 0.162574
Early stopping, best iteration is:
[4840]	valid_0's mape: 0.162099
params: {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'mape', 'learning_rate': 0.015, 'objective': 'mae', 'num_leaves': 255, 'feature_fraction': 0.75, 'min_data_in_leaf': 50, 'lambda_l1': 0.01, 'lambda_l2': 0}
Training until validation scores don't improve for 200 rounds.
[2000]	valid_0's mape: 0.166151
[4000]	valid_0's mape: 0.163821
Early stopping, best

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0.1),
   ('lambda_l2', 0)),
  5099.840262356635,
  [5122.841172168337, 5053.83927293868, 5122.840341962888]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0.01),
   ('lambda_l2', 0)),
  5091.839465138038,
  [5111.841291586655, 5058.8386467667715, 5104.838457060687]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0),
   ('lambda_l2', 0.1)),
  5089.172793211539,
  [5081.840608358431, 5082.83966662220

Wall time: 1d 13h 37min 17s


In [15]:
# Show the round-2 results again: a list of [params tuple, mean score,
# per-fold scores] entries sorted by mean score in descending order.
display(gsearch_results2)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0.1),
   ('lambda_l2', 0)),
  5099.840262356635,
  [5122.841172168337, 5053.83927293868, 5122.840341962888]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0.01),
   ('lambda_l2', 0)),
  5091.839465138038,
  [5111.841291586655, 5058.8386467667715, 5104.838457060687]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('objective', 'mae'),
   ('num_leaves', 511),
   ('feature_fraction', 0.75),
   ('min_data_in_leaf', 50),
   ('lambda_l1', 0),
   ('lambda_l2', 0.1)),
  5089.172793211539,
  [5081.840608358431, 5082.83966662220