# Exp 7 - New feature - building_area per land_area

### Import packages

In [1]:
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from utilities import cal_score, convert_types

### Columns

In [2]:
cols_num = ['txn_dt', 'total_floor', 'building_complete_dt', 'parking_area', 'parking_price', 'txn_floor', 'land_area', 'building_area', 'lat', 'lon', 'village_income_median', 'town_population', 'town_area', 'town_population_density', 'doc_rate', 'master_rate', 'bachelor_rate', 'jobschool_rate', 'highschool_rate', 'junior_rate', 'elementary_rate', 'born_rate', 'death_rate', 'marriage_rate', 'divorce_rate', 'N_50', 'N_500', 'N_1000', 'N_5000', 'N_10000', 'I_10', 'I_50', 'I_100', 'I_250', 'I_500', 'I_1000', 'I_5000', 'I_10000', 'I_MIN', 'II_10', 'II_50', 'II_100', 'II_250', 'II_500', 'II_1000', 'II_5000', 'II_10000', 'II_MIN', 'III_10', 'III_50', 'III_100', 'III_250', 'III_500', 'III_1000', 'III_5000', 'III_10000', 'III_MIN', 'IV_10', 'IV_50', 'IV_100', 'IV_250', 'IV_500', 'IV_1000', 'IV_5000', 'IV_10000', 'IV_MIN', 'V_10', 'V_50', 'V_100', 'V_250', 'V_500', 'V_1000', 'V_5000', 'V_10000', 'V_MIN', 'VI_50', 'VI_100', 'VI_250', 'VI_500', 'VI_1000', 'VI_5000', 'VI_10000', 'VI_MIN', 'VII_10', 'VII_50', 'VII_100', 'VII_250', 'VII_500', 'VII_1000', 'VII_5000', 'VII_10000', 'VII_MIN', 'VIII_10', 'VIII_50', 'VIII_100', 'VIII_250', 'VIII_500', 'VIII_1000', 'VIII_5000', 'VIII_10000', 'VIII_MIN', 'IX_10', 'IX_50', 'IX_100', 'IX_250', 'IX_500', 'IX_1000', 'IX_5000', 'IX_10000', 'IX_MIN', 'X_10', 'X_50', 'X_100', 'X_250', 'X_500', 'X_1000', 'X_5000', 'X_10000', 'X_MIN', 'XI_10', 'XI_50', 'XI_100', 'XI_250', 'XI_500', 'XI_1000', 'XI_5000', 'XI_10000', 'XI_MIN', 'XII_10', 'XII_50', 'XII_100', 'XII_250', 'XII_500', 'XII_1000', 'XII_5000', 'XII_10000', 'XII_MIN', 'XIII_10', 'XIII_50', 'XIII_100', 'XIII_250', 'XIII_500', 'XIII_1000', 'XIII_5000', 'XIII_10000', 'XIII_MIN', 'XIV_10', 'XIV_50', 'XIV_100', 'XIV_250', 'XIV_500', 'XIV_1000', 'XIV_5000', 'XIV_10000', 'XIV_MIN']
cols_cat = ['building_material', 'city', 'building_type', 'building_use', 'parking_way', 'town', 'village']
cols_bin = ['I_index_50', 'I_index_500', 'I_index_1000', 'II_index_50', 'II_index_500', 'II_index_1000', 'III_index_50', 'III_index_500', 'III_index_1000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'V_index_50', 'V_index_500', 'V_index_1000', 'VI_10', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'X_index_50', 'X_index_500', 'X_index_1000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000']
# Full list of model input columns: numeric + categorical + binary, in that order
cols_feats = cols_num + cols_cat + cols_bin

# Name of the raw target column as it appears in the training CSV
col_target = 'total_price'
# NOTE(review): not referenced anywhere else in the visible notebook — presumably a
# log1p-target column name shared with helpers or earlier experiments; confirm or remove
col_target_log1p = 'target'

### Read data

In [3]:
# Read every column as object dtype (no type inference at load time);
# presumably the numeric columns are cast later by convert_types — verify in utilities
df = pd.read_csv('dataset/train.csv', dtype=object)

In [4]:
df.columns

Index(['building_id', 'building_material', 'city', 'txn_dt', 'total_floor',
       'building_type', 'building_use', 'building_complete_dt', 'parking_way',
       'parking_area',
       ...
       'XIV_500', 'XIV_index_500', 'XIV_1000', 'XIV_index_1000', 'XIV_5000',
       'XIV_index_5000', 'XIV_10000', 'XIV_index_10000', 'XIV_MIN',
       'total_price'],
      dtype='object', length=235)

### Preprocessing

In [5]:
# Convert types
df = convert_types(df, cols_num, col_target=col_target)

# Generate feats (train-test-same feats)
#create_feats(df)

### Target transform

In [6]:
def target_transform(df):  # NOTE: writes a new column onto `df` in place
    '''Attach the model-side target to `df` and return the same frame.

    A column ``log_total_price`` holding log(1 + total_price) is written
    directly onto the DataFrame that was passed in; no copy is made.
    '''
    raw_price = df['total_price']
    df['log_total_price'] = np.log1p(raw_price)
    return df

def target_inverse_transform(df, y_pred):
    '''Map model-space predictions back to the original target scale.

    Applies expm1 (the inverse of the log1p used by target_transform) and
    floors the result at 0. `df` is accepted but not used by this function.
    '''
    return np.clip(np.expm1(y_pred), 0, None)

In [7]:
df = target_transform(df)

In [8]:
# transformed target fit by the model
col_model_target = 'log_total_price'
#col_model_target = 'total_price'

### Feat engineering

In [9]:
df.head()

Unnamed: 0,building_id,building_material,city,txn_dt,total_floor,building_type,building_use,building_complete_dt,parking_way,parking_area,...,XIV_5000,XIV_index_5000,XIV_10000,XIV_index_10000,XIV_MIN,total_price,target,price_per_area,log_price_per_area,log_total_price
0,e3mMIMR3JJqCaXz1,8,21,18674.0,4.0,3,2,6271.0,2,,...,2483.0,1,6011.0,1,34.469803,647603.75,13.381036,189458.9,12.151933,13.381036
1,LgwzgklNvy4QCtq5,8,7,18800.0,5.0,1,2,7885.0,2,,...,15872.0,1,32221.0,1,40.073574,3321452.0,15.015913,821875.3,13.619346,15.015913
2,ucIR2NLLsC3T650L,8,7,19289.0,4.0,1,2,6028.0,2,,...,15760.0,1,32228.0,1,54.462082,9570885.0,16.074236,1713898.0,14.354281,16.074236
3,jre1pJhcQj91Kdky,8,21,20385.0,24.0,0,2,18325.0,0,,...,2568.0,1,7271.0,1,99.628967,14215011.0,16.469809,1048070.0,13.862462,16.469809
4,rQpYpY9nRG7X5mmr,1,21,20657.0,2.0,4,2,6880.0,2,,...,2587.0,1,7442.0,1,124.131233,762712.0,13.544637,162690.8,11.999613,13.544637


In [10]:
from feature_engineering import CategoricalColumnsEncoder

In [11]:
class FeatureEngineering():
    '''Per-fold feature pipeline: categorical encoding plus derived ratio features.

    Call `fit_transform` on the training split, then `transform` on the
    validation split. Both methods work on a copy, so the frame passed in
    is not given the derived columns.
    '''
    def __init__(self):
        # Set by fit_transform(); transform() needs it to be fitted first.
        self.cat_encoder = None

    @staticmethod
    def _add_derived_feats(df):
        '''Add the derived numeric columns to `df` in place and return it.

        Kept in one place so fit_transform() and transform() can never
        produce different feature definitions.
        '''
        # NOTE(review): a land_area of 0 yields inf (or NaN for 0/0) with float
        # columns — confirm that is acceptable for the downstream model.
        df['building_per_land_area'] = df['building_area'] / df['land_area']
        return df

    def fit_transform(self, df, cols_cat, cols_bin):
        '''Fit the categorical encoder on `df` and return the engineered copy.

        Args:
            df: training split.
            cols_cat: names of the categorical columns.
            cols_bin: names of the binary columns (encoded together with cols_cat).

        Returns:
            A copy of `df` with the derived feature columns added.
        '''
        df = df.copy()
        # Denote categorical-type
        self.cat_encoder = CategoricalColumnsEncoder(mode='pandas')
        # The return value is ignored: the encoder is presumed to modify `df`
        # in place — TODO confirm against feature_engineering.
        self.cat_encoder.fit_transform(df, cols_cat+cols_bin)
        return self._add_derived_feats(df)

    def transform(self, df):
        '''Apply the already-fitted encoder to `df` and return the engineered copy.'''
        df = df.copy()
        self.cat_encoder.transform(df)
        return self._add_derived_feats(df)

In [12]:
# Register the derived ratio feature exactly once (safe to re-run this cell),
# then rebuild the full feature list so the model actually receives it.
if 'building_per_land_area' not in cols_num:
    cols_num = [*cols_num, 'building_per_land_area']
    cols_feats = [*cols_num, *cols_cat, *cols_bin]

### Grid search

In [13]:
# grid search
# Booster parameters held constant for every grid-search candidate
params_fix = dict(
    task='train',
    boosting_type='gbdt',
    objective='mse',
    metric='mape',
    learning_rate=0.015,
)
# Extra keyword arguments forwarded to lgb.train alongside the booster params
lgb_other_params = dict(
    num_boost_round=10000,
    verbose_eval=2000,
    early_stopping_rounds=1000,
)

#### Round 1

In [14]:
%%time
# Round 1: exhaustive search over tree-shape / sampling parameters with 3-fold CV.
params_gsearch1 = {'num_leaves': [63, 255, 511],           # may reduce in dim-reduction exp
                   'feature_fraction': [0.5, 0.75, 1.0],
                   'min_data_in_leaf': [5, 20, 50]
                   }

# Maps tuple(params.items()) -> list of per-fold scores
gsearch = {}
folds = KFold(n_splits=3, shuffle=True, random_state=123)
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields *positional* indices, so rows must be selected with
    # iloc; loc (label-based) only matches while df has a default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng: fit on the training split only, then apply to validation
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb dataset (built once per fold and reused by every candidate)
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_model_target]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_model_target], reference=lgb_train).construct()

    # grid search over the cartesian product of all candidate values
    for values in itertools.product(*params_gsearch1.values()):
        params = params_fix.copy()
        params.update(zip(params_gsearch1.keys(), values))
        print('params:', params)

        model = lgb.train(params, lgb_train, valid_sets=lgb_val, **lgb_other_params)

        y_pred = model.predict(df_val[cols_feats])

        # Back to the original price scale before scoring
        y_pred_final = target_inverse_transform(df_val, y_pred)

        score = cal_score(df_val['total_price'], y_pred_final)
        print(score)
        # Dicts are unhashable; the items tuple is identical across folds
        # because params is always built in the same insertion order.
        tuple_params = tuple(params.items())
        gsearch.setdefault(tuple_params, []).append(score)

# aggregate, sort gsearch results: [params, mean score, per-fold scores],
# ordered by mean score, descending
gsearch_results1 = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results1.sort(key=lambda x: x[1], reverse=True)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 63, 'feature_fraction': 0.5, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00920153
[4000]	valid_0's mape: 0.00885253
[6000]	valid_0's mape: 0.00869965
[8000]	valid_0's mape: 0.00862097
[10000]	valid_0's mape: 0.00856954
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00856954
5438.86301663445
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 63, 'feature_fraction': 0.5, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00918472
[4000]	valid_0's mape: 0.00882637
[6000]	valid_0's mape: 0.00867709
[8000]	valid_0's mape: 0.00860363
[10000]	valid_0's mape: 0.00855722
Did not meet early stopping. Best iteration is:
[10000

5451.8594256590095
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00891484
[4000]	valid_0's mape: 0.00881421
[6000]	valid_0's mape: 0.00878968
[8000]	valid_0's mape: 0.00878196
[10000]	valid_0's mape: 0.00877913
Did not meet early stopping. Best iteration is:
[9997]	valid_0's mape: 0.00877912
5442.85926116186
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 1.0, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00894392
[4000]	valid_0's mape: 0.00886062
[6000]	valid_0's mape: 0.00884432
[8000]	valid_0's mape: 0.00884068
[10000]	valid_0's mape: 0.00884009
Did not meet early stopping. Best iteration is:
[9

[2000]	valid_0's mape: 0.00930362
[4000]	valid_0's mape: 0.00898454
[6000]	valid_0's mape: 0.0088562
[8000]	valid_0's mape: 0.00878837
[10000]	valid_0's mape: 0.00875071
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00875071
5387.861900253113
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 63, 'feature_fraction': 1.0, 'min_data_in_leaf': 5}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.0094658
[4000]	valid_0's mape: 0.00914449
[6000]	valid_0's mape: 0.00900896
[8000]	valid_0's mape: 0.00893189
[10000]	valid_0's mape: 0.00889251
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00889251
5367.859980636984
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 63, 'feature_fraction': 1.0, 'min_data_in_leaf': 20}
Training until validation scores do

[2000]	valid_0's mape: 0.00877939
[4000]	valid_0's mape: 0.00873731
[6000]	valid_0's mape: 0.00873128
[8000]	valid_0's mape: 0.00872976
[10000]	valid_0's mape: 0.00872941
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00872941
5474.862568542014
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 20}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00875665
[4000]	valid_0's mape: 0.00872884
[6000]	valid_0's mape: 0.00872507
[8000]	valid_0's mape: 0.00872379
[10000]	valid_0's mape: 0.00872317
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00872317
5454.862564839149
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 511, 'feature_fraction': 0.75, 'min_data_in_leaf': 50}
Training until validation sc

[2000]	valid_0's mape: 0.00872924
[4000]	valid_0's mape: 0.00862566
[6000]	valid_0's mape: 0.00860266
[8000]	valid_0's mape: 0.00859485
[10000]	valid_0's mape: 0.0085918
Did not meet early stopping. Best iteration is:
[9999]	valid_0's mape: 0.00859179
5528.860886262806
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.0087284
[4000]	valid_0's mape: 0.00862735
[6000]	valid_0's mape: 0.00860587
[8000]	valid_0's mape: 0.00860028
[10000]	valid_0's mape: 0.00859856
Did not meet early stopping. Best iteration is:
[9980]	valid_0's mape: 0.00859854
5479.860481407688
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.75, 'min_data_in_leaf': 5}
Training until validation scores d

[8000]	valid_0's mape: 0.00898137
Early stopping, best iteration is:
[7150]	valid_0's mape: 0.00898129
5371.853285917429
Wall time: 13h 46min 59s


In [15]:
display(gsearch_results1)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 20)),
  5537.19730187704,
  [5530.865148314858, 5551.865871053458, 5528.860886262806]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 5)),
  5528.530761484195,
  [5532.864981333453, 5539.865555650635, 5512.861747468498]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 50)),
  5512.196954158579,
  [5533.865008097947, 5522.865372970104, 5479.860481407688]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_

#### Round 2

In [16]:
%%time
# Round 2: search L1/L2 regularisation on top of the three best round-1 settings.
# Requires gsearch_results1 from the round-1 cell to be present in the kernel.
params_gsearch2 = {'lambda_l1': [0, 0.01, 0.1],
                   'lambda_l2': [0, 0.01, 0.1]
                  }

# Maps tuple(params.items()) -> list of per-fold scores
gsearch = {}
# Different seed from round 1, so the fold assignment differs between rounds
folds = KFold(n_splits=3, shuffle=True, random_state=246)
for i_fold, (itrain, ival) in enumerate(folds.split(df)): # kfold
    print('==== Fold', i_fold+1, '====')

    # split train, val
    # KFold.split yields *positional* indices, so rows must be selected with
    # iloc; loc (label-based) only matches while df has a default RangeIndex.
    df_train = df.iloc[itrain]
    df_val = df.iloc[ival]

    # feat eng: fit on the training split only, then apply to validation
    feat_eng = FeatureEngineering()
    df_train = feat_eng.fit_transform(df_train, cols_cat, cols_bin)
    df_val = feat_eng.transform(df_val)

    # Construct lgb dataset (built once per fold and reused by every candidate)
    lgb_train = lgb.Dataset(df_train[cols_feats], df_train[col_model_target]).construct()
    lgb_val = lgb.Dataset(df_val[cols_feats], df_val[col_model_target], reference=lgb_train).construct()

    # grid search
    # pick top 3 params from round 1
    for result1 in gsearch_results1[:3]:
        # result1[0] is the tuple of (name, value) pairs stored in round 1
        params1 = dict(result1[0])
        for values in itertools.product(*params_gsearch2.values()):
            params = params1.copy()
            params.update(zip(params_gsearch2.keys(), values))
            print('params:', params)

            model = lgb.train(params, lgb_train, valid_sets=lgb_val, **lgb_other_params)

            y_pred = model.predict(df_val[cols_feats])

            # Back to the original price scale before scoring
            y_pred_final = target_inverse_transform(df_val, y_pred)

            score = cal_score(df_val['total_price'], y_pred_final)
            print(score)
            # Dicts are unhashable; the items tuple is identical across folds
            # because params is always built in the same insertion order.
            tuple_params = tuple(params.items())
            gsearch.setdefault(tuple_params, []).append(score)

# aggregate, sort gsearch results: [params, mean score, per-fold scores],
# ordered by mean score, descending
gsearch_results2 = [[key, np.mean(val), val] for key, val in gsearch.items()]
gsearch_results2.sort(key=lambda x: x[1], reverse=True)

==== Fold 1 ====
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20, 'lambda_l1': 0, 'lambda_l2': 0}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00870683
[4000]	valid_0's mape: 0.00859522
[6000]	valid_0's mape: 0.00857165
[8000]	valid_0's mape: 0.0085648
[10000]	valid_0's mape: 0.00856282
Did not meet early stopping. Best iteration is:
[9997]	valid_0's mape: 0.00856282
5540.863323876507
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20, 'lambda_l1': 0, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00870904
[4000]	valid_0's mape: 0.00859524
[6000]	valid_0's mape: 0.00857135
[8000]	valid_0's mape: 0.00856337
[10000]	valid_0's ma

5540.863774012067
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00870053
[4000]	valid_0's mape: 0.00858115
[6000]	valid_0's mape: 0.00855334
[8000]	valid_0's mape: 0.00854206
[10000]	valid_0's mape: 0.00853737
Did not meet early stopping. Best iteration is:
[10000]	valid_0's mape: 0.00853737
5576.863948669197
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 5, 'lambda_l1': 0.1, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00869954
[4000]	valid_0's mape: 0.00858032
[6000]	valid_0's mape: 0.0085537
[8000]	valid_0's mape: 0.00854347
[10000]	valid_0'

5586.863289113547
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20, 'lambda_l1': 0.01, 'lambda_l2': 0}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00867503
[4000]	valid_0's mape: 0.00856792
[6000]	valid_0's mape: 0.00854398
[8000]	valid_0's mape: 0.00853681
[10000]	valid_0's mape: 0.00853409
Did not meet early stopping. Best iteration is:
[9997]	valid_0's mape: 0.00853408
5567.86345986746
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20, 'lambda_l1': 0.01, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00869961
[4000]	valid_0's mape: 0.00858969
[6000]	valid_0's mape: 0.0085671
[8000]	valid_0's mape: 0.00856037
[10000]	valid_

5578.863775246249
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00870535
[4000]	valid_0's mape: 0.00860358
[6000]	valid_0's mape: 0.00858485
[8000]	valid_0's mape: 0.0085795
[10000]	valid_0's mape: 0.00857849
Did not meet early stopping. Best iteration is:
[9921]	valid_0's mape: 0.00857835
5541.862397640403
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50, 'lambda_l1': 0, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00873422
[4000]	valid_0's mape: 0.00863003
[6000]	valid_0's mape: 0.00861067
[8000]	valid_0's mape: 0.00860673
[10000]	valid_0's m

[2000]	valid_0's mape: 0.00866912
[4000]	valid_0's mape: 0.00855118
[6000]	valid_0's mape: 0.00852362
[8000]	valid_0's mape: 0.00851343
[10000]	valid_0's mape: 0.00850928
Did not meet early stopping. Best iteration is:
[9999]	valid_0's mape: 0.00850927
5536.864328085671
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 20, 'lambda_l1': 0.1, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00867243
[4000]	valid_0's mape: 0.00855541
[6000]	valid_0's mape: 0.00852688
[8000]	valid_0's mape: 0.00851696
[10000]	valid_0's mape: 0.00851205
Did not meet early stopping. Best iteration is:
[9990]	valid_0's mape: 0.00851204
5558.864354661227
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf':

[2000]	valid_0's mape: 0.00865243
[4000]	valid_0's mape: 0.00853718
[6000]	valid_0's mape: 0.00851383
[8000]	valid_0's mape: 0.0085081
[10000]	valid_0's mape: 0.00850558
Did not meet early stopping. Best iteration is:
[9993]	valid_0's mape: 0.00850557
5523.864622833541
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf': 50, 'lambda_l1': 0.01, 'lambda_l2': 0.01}
Training until validation scores don't improve for 1000 rounds.
[2000]	valid_0's mape: 0.00864928
[4000]	valid_0's mape: 0.00853704
[6000]	valid_0's mape: 0.00851578
[8000]	valid_0's mape: 0.00850953
[10000]	valid_0's mape: 0.00850747
Did not meet early stopping. Best iteration is:
[9915]	valid_0's mape: 0.00850744
5532.864634987517
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'mse', 'metric': 'mape', 'learning_rate': 0.015, 'num_leaves': 255, 'feature_fraction': 0.5, 'min_data_in_leaf':

In [17]:
display(gsearch_results2)

[[(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 5),
   ('lambda_l1', 0.01),
   ('lambda_l2', 0.01)),
  5573.197479493909,
  [5571.863946760461, 5586.863980462948, 5560.864511258318]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 20),
   ('lambda_l1', 0),
   ('lambda_l2', 0.1)),
  5570.196991228332,
  [5570.863562206722, 5586.863289113547, 5552.864122364727]],
 [(('task', 'train'),
   ('boosting_type', 'gbdt'),
   ('objective', 'mse'),
   ('metric', 'mape'),
   ('learning_rate', 0.015),
   ('num_leaves', 255),
   ('feature_fraction', 0.5),
   ('min_data_in_leaf', 5),
   ('lambda_l1', 0),
   ('lambda_l2', 0.01)),
  5564.530794751401,
  [5582.863947405401, 5555.863677030766

### Conclusion



Baseline Best params: ??  
   ('task', 'train'),  
   ('boosting_type', 'gbdt'),  
   ('objective', 'mse'),  
   ('metric', 'mape'),  
   ('learning_rate', 0.015),  
   ('num_leaves', 255),  
   ('feature_fraction', 0.5),  
   ('min_data_in_leaf', 5),  
   ('lambda_l1', 0.1),  
   ('lambda_l2', 0)  
  
Baseline scores: *5515.529915334645*  
[5532.864763855017, 5514.864357360793, 5498.860624788125]