In [17]:
%autosave 0

Autosave disabled


In [18]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error
from utilities import cal_score, cal_mape

In [19]:
# --- Stacking configuration ---------------------------------------------
# Identifier of this stacking run; it is embedded in the submission file name.
stack_idx = '15'
# Base models to blend: comma-separated indices and inclusive 'start-end' ranges.
models = '1-23,25-27'
# Models whose test prediction is read from the '*test-kfold.csv' file
# instead of the '*test-one.csv' file.
use_test_kfold = {2, 7, 8, 12, 13}

# Blend in log1p(price / building_area) space instead of log1p(price) space.
is_per_area = True
# Append a constant column so the blend can also fit an intercept term.
add_intercept = True

### Read CV predictions and test predictions

In [20]:
def parse_models(exp):
    """Expand a model-selection expression into a list of model indices.

    `exp` is a comma-separated string whose items are either a single
    integer (e.g. ``'7'``) or an inclusive ``start-end`` range
    (e.g. ``'1-3'``). Indices are returned in the order in which they
    appear in the expression.
    """
    idx_models = []
    for token in exp.split(','):
        if '-' not in token:
            # Plain index.
            idx_models.append(int(token))
            continue
        # Inclusive range 'first-last'.
        first, last = token.split('-')
        idx_models += range(int(first), int(last) + 1)
    return idx_models

In [21]:
# Expand the `models` expression into the explicit list of model indices.
idx_models = parse_models(models)
print(idx_models)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27]


In [22]:
def _find_model_file(filenames, idx, suffix):
    """Return the first entry of `filenames` that belongs to model `idx`.

    A file belongs to the model when its name starts with 'model-XX-'
    (XX = zero-padded `idx`) and ends with `suffix`.

    Raises:
        FileNotFoundError: if no entry matches, naming the missing pattern
            (instead of the bare IndexError a `[...][0]` lookup would give).
    """
    prefix = 'model-%02d-' % idx
    for name in filenames:
        if name.startswith(prefix) and name.endswith(suffix):
            return name
    raise FileNotFoundError(
        "no file matching '{}*{}' found in 'output/'".format(prefix, suffix))


# Regular files directly inside 'output/' (sub-directories are ignored).
files_in_output = [f for f in os.listdir('output/') if os.path.isfile('output/'+f)]

# Per-model file names: CV predictions and the two kinds of test predictions.
files_cv = {idx: _find_model_file(files_in_output, idx, 'cv.csv')
            for idx in idx_models}
files_test_one = {idx: _find_model_file(files_in_output, idx, 'test-one.csv')
                  for idx in idx_models}
files_test_kf = {idx: _find_model_file(files_in_output, idx, 'test-kfold.csv')
                 for idx in idx_models}

In [23]:
# List, for every model index, the resolved CV, test-kfold and test-one files.
for k in files_cv: 
    print('%2d'%k, files_cv[k])
    print('%2d'%k, files_test_kf[k])
    print('%2d'%k, files_test_one[k])

 1 model-01-lgb-cv.csv
 1 model-01-lgb-test-kfold.csv
 1 model-01-lgb-test-one.csv
 2 model-02-keras-search-cv.csv
 2 model-02-keras-search-test-kfold.csv
 2 model-02-keras-search-test-one.csv
 3 model-03-lgb-feats-selection-cv.csv
 3 model-03-lgb-feats-selection-test-kfold.csv
 3 model-03-lgb-feats-selection-test-one.csv
 4 model-04-lgb-PCA-cv.csv
 4 model-04-lgb-PCA-test-kfold.csv
 4 model-04-lgb-PCA-test-one.csv
 5 model-05-lgb-wo-per-area-cv.csv
 5 model-05-lgb-wo-per-area-test-kfold.csv
 5 model-05-lgb-wo-per-area-test-one.csv
 6 model-06-lgb-lr0.001-cv.csv
 6 model-06-lgb-lr0.001-test-kfold.csv
 6 model-06-lgb-lr0.001-test-one.csv
 7 model-07-keras-embedding-cv.csv
 7 model-07-keras-embedding-test-kfold.csv
 7 model-07-keras-embedding-test-one.csv
 8 model-08-keras-search-long-cv.csv
 8 model-08-keras-search-long-test-kfold.csv
 8 model-08-keras-search-long-test-one.csv
 9 model-09-lgb-feats-selection-75-cv.csv
 9 model-09-lgb-feats-selection-75-test-kfold.csv
 9 model-09-lgb-fea

#### Load train/test data (building area and target price)

In [24]:
# Load the train and test tables; the next cell keeps only the id, area and
# (for train) target columns.
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [25]:
# Base frames onto which each model's predictions are merged by building_id.
# `cv` also carries the true total_price used for scoring and optimization.
cv = df_train[['building_id', 'building_area', 'total_price']]
test = df_test[['building_id', 'building_area']]

In [26]:
print('CV predictions:')
print(len(idx_models))
for idx_model in idx_models:
    cv_file = files_cv[idx_model]
    print(cv_file)
    cv_preds = pd.read_csv('output/' + cv_file)

    # Attach this model's CV prediction as 'pred_<idx>' (default inner merge
    # on building_id), then derive its two log-space variants.
    pred_col = 'pred_{}'.format(idx_model)
    cv = cv.merge(cv_preds[['building_id', 'total_price_predict']], on='building_id')
    cv = cv.rename(columns={'total_price_predict': pred_col})

    cv['log_' + pred_col] = np.log1p(cv[pred_col])
    cv['log_parea_' + pred_col] = np.log1p(cv[pred_col] / cv['building_area'])

CV predictions:
26
model-01-lgb-cv.csv
model-02-keras-search-cv.csv
model-03-lgb-feats-selection-cv.csv
model-04-lgb-PCA-cv.csv
model-05-lgb-wo-per-area-cv.csv
model-06-lgb-lr0.001-cv.csv
model-07-keras-embedding-cv.csv
model-08-keras-search-long-cv.csv
model-09-lgb-feats-selection-75-cv.csv
model-10-lgb-feats-selection-75-lr-0.001-cv.csv
model-11-rf-cv.csv
model-12-predict-keras-search-prelu-cv.csv
model-13-predict-keras-he_uni-cv.csv
model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
model-16-lgb-feats-selection-68-lr-0.001-mix5-cv.csv
model-17-lgb-feats-selection-70-lr-0.001-mix5-cv.csv
model-18-lgb-feats-selection-70-lr-0.001-p5-cv.csv
model-19-lgb-search-bins-lr-0.0005-cv.csv
model-20-lgb-lr-0.0008-mix5-cv.csv
model-21-lgb-wo-per-area-long-cv.csv
model-22-lgb-wo-per-area-long-2-cv.csv
model-23-lgb-binary-cv.csv
model-25-lgb-search-bins-lr-0.0005-250-cv.csv
model-26-lgb-search-bins-lr-0.0005-350-cv.csv
model-27-lgb-feat_rm_n

In [27]:
# Log-space targets, in total-price form and in price-per-area form.
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p(cv['total_price'] / cv['building_area'])

print('Test predictions:')
for i, idx in enumerate(idx_models):
    # Models listed in use_test_kfold read their '*test-kfold.csv' file;
    # all other models read their '*test-one.csv' file.
    if idx in use_test_kfold:
        test_file = files_test_kf[idx]
    else:
        test_file = files_test_one[idx]
    print('No. {} file: {}'.format(i, test_file))
    test_preds = pd.read_csv('output/' + test_file)

    # Attach the prediction as 'pred_<idx>' and derive its log-space variants.
    pred_col = 'pred_{}'.format(idx)
    test = test.merge(test_preds[['building_id', 'total_price']], on='building_id')
    test = test.rename(columns={'total_price': pred_col})

    test['log_' + pred_col] = np.log1p(test[pred_col])
    test['log_parea_' + pred_col] = np.log1p(test[pred_col] / test['building_area'])

Test predictions:
No. 0 file: model-01-lgb-test-one.csv
No. 1 file: model-02-keras-search-test-kfold.csv
No. 2 file: model-03-lgb-feats-selection-test-one.csv
No. 3 file: model-04-lgb-PCA-test-one.csv
No. 4 file: model-05-lgb-wo-per-area-test-one.csv
No. 5 file: model-06-lgb-lr0.001-test-one.csv
No. 6 file: model-07-keras-embedding-test-kfold.csv
No. 7 file: model-08-keras-search-long-test-kfold.csv
No. 8 file: model-09-lgb-feats-selection-75-test-one.csv
No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-test-one.csv
No. 10 file: model-11-rf-test-one.csv
No. 11 file: model-12-predict-keras-search-prelu-test-kfold.csv
No. 12 file: model-13-predict-keras-he_uni-test-kfold.csv
No. 13 file: model-14-lgb-feats-selection-75-lr-0.001-rand-test-one.csv
No. 14 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-test-one.csv
No. 15 file: model-16-lgb-feats-selection-68-lr-0.001-mix5-test-one.csv
No. 16 file: model-17-lgb-feats-selection-70-lr-0.001-mix5-test-one.csv
No. 17 file: model-18-l

In [28]:
# Preview the assembled CV and test frames (first rows only).
display(cv.head())
display(test.head())

Unnamed: 0,building_id,building_area,total_price,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,...,log_pred_25,log_parea_pred_25,pred_26,log_pred_26,log_parea_pred_26,pred_27,log_pred_27,log_parea_pred_27,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,647603.8,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,...,13.368623,12.13952,645283.2,13.377446,12.148343,647790.5,13.381324,12.152221,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3321452.0,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,...,14.964998,13.56843,3164660.0,14.967557,13.570989,3127411.0,14.955716,13.559149,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9570885.0,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,...,16.099372,14.379417,9832843.0,16.101239,14.381284,9745913.0,16.092359,14.372404,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,14215010.0,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,...,16.36068,13.753334,12741790.0,16.360398,13.753051,12595480.0,16.348848,13.741502,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,762712.0,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,...,13.950262,12.405236,1114412.0,13.923838,12.378812,1131428.0,13.938992,12.393966,13.544637,11.999613


Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_23,pred_25,log_pred_25,log_parea_pred_25,pred_26,log_pred_26,log_parea_pred_26,pred_27,log_pred_27,log_parea_pred_27
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.235527,14562140.0,16.493936,15.264829,13581200.0,16.424197,15.195091,14054240.0,16.458435,15.229328
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.130465,3903389.0,15.177356,13.132737,3915563.0,15.18047,13.135851,3932854.0,15.184876,13.140257
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.837332,10640550.0,16.180183,13.681162,10764140.0,16.19173,13.69271,10400410.0,16.157355,13.658335
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.785526,6006309.0,15.608321,14.796389,6034948.0,15.613078,14.801146,5905111.0,15.591329,14.779397
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.195684,1091787.0,13.903327,12.143065,1096191.0,13.907353,12.147091,1048505.0,13.862877,12.102615


### Check models scores

In [29]:
# Score every base model's CV prediction against the true total price.
for i, idx_model in enumerate(idx_models):
    model_score = cal_score(cv['total_price'], cv['pred_{}'.format(idx_model)])
    print('%2d' % i, 'model-%02d' % idx_model, '%.6f' % model_score)

 0 model-01 5870.873059
 1 model-02 5400.852164
 2 model-03 5877.873452
 3 model-04 5713.867808
 4 model-05 5724.869598
 5 model-06 5886.873769
 6 model-07 5171.836449
 7 model-08 5514.858826
 8 model-09 5872.873118
 9 model-10 5897.873845
10 model-11 5075.838018
11 model-12 5486.856963
12 model-13 5506.858055
13 model-14 5908.873901
14 model-15 5900.873836
15 model-16 5907.874126
16 model-17 5905.874165
17 model-18 5908.874297
18 model-19 5911.874156
19 model-20 5908.874040
20 model-21 5758.870702
21 model-22 5752.870671
22 model-23 5852.871357
23 model-25 5908.874238
24 model-26 5918.873998
25 model-27 5892.873194


In [30]:
# Constant column, used as the intercept term when add_intercept is set.
cv['constant_1'] = 1
test['constant_1'] = 1

# One feature column per base model, taken from the log space selected by
# is_per_area.
col_prefix = 'log_parea_pred_' if is_per_area else 'log_pred_'
cols_opt = [col_prefix + str(idx) for idx in idx_models]

if add_intercept:
    cols_opt.append('constant_1')

### Define opt function

In [31]:
def objective(x, metric):
    """Loss of the linear blend with coefficients `x`, evaluated on `cv`.

    The blend is the dot product of the `cols_opt` columns of `cv` with `x`,
    mapped back from log space with expm1 (and multiplied by building_area
    when `is_per_area` is set).

    Side effect: whenever the blend's cal_score exceeds best_score[metric],
    the score and a copy of `x` are stored in best_score / best_coeffs and
    printed.

    Args:
        x: coefficient vector, one entry per column in `cols_opt`.
        metric: 'mape', 'mse' or 'mae' -- selects the returned loss.

    Raises:
        Exception: if `metric` is not one of the supported names.
    """
    losses = {
        'mape': cal_mape,
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
    }

    # Back-transform the blended log prediction to a total price.
    cv_pred_final = np.expm1(cv.loc[:, cols_opt].dot(x))
    if is_per_area:
        cv_pred_final = cv_pred_final * cv['building_area']

    # Track the best cal_score seen so far, independently of the loss that
    # the optimizer is minimizing.
    score = cal_score(cv['total_price'], cv_pred_final)
    if score > best_score[metric]:
        best_score[metric] = score
        best_coeffs[metric] = x.copy()
        print('find better score:')
        print('score: ', score)
        print('coeffs: ', x)
        print()

    if metric not in losses:
        raise Exception('metric unknown: {}'.format(metric))
    return losses[metric](cv['total_price'], cv_pred_final)

### Optimize

In [32]:
# Filled in by objective(): best cal_score per metric and its coefficients.
best_score = {}
best_coeffs = {}

len_x = len(cols_opt)
rev_len_x = 1/len_x

# Fixed seed so that the random starting points (and therefore the whole
# search) are reproducible across kernel restarts.
random_seed = 0
rng = np.random.RandomState(random_seed)

# Starting points: uniform average, a few constant vectors, and three random
# draws centred on the uniform average.
# NOTE(review): the random draws can be negative, i.e. outside the bounds
# below; the recorded runs end with those coefficients at 0 -- confirm the
# installed scipy clips x0 to the bounds.
x0s = [[rev_len_x] * len_x,
       [0] * len_x,
       [0.1] * len_x,
       [0.2] * len_x,
       [0.5] * len_x]
x0s += [(rng.randn(len_x) + rev_len_x).tolist() for _ in range(3)]

# Model weights are constrained to be non-negative; only the intercept column
# (present when add_intercept is set) is left unbounded. Deriving the bounds
# from the column names keeps every model weight bounded even when there is
# no intercept column.
bounds = tuple((None, None) if col == 'constant_1' else (0, None)
               for col in cols_opt)

# Other supported metrics: 'mae', 'mse'.
for metric in ['mape']:
    best_score[metric] = 0
    best_coeffs[metric] = []
    for x0 in x0s:
        print('Optimizing with init x0: {}'.format(x0))
        print()
        # args must be a tuple: (metric,) -- not the bare string (metric).
        display(minimize(objective, x0, args=(metric,), bounds=bounds, tol=1e-4))

Optimizing with init x0: [0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035, 0.037037037037037035]

find better score:
score:  221.63000982252524
coeffs:  [0.03703704 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704
 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704
 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704
 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704 0.03703704
 0.03703704 0.03703704 0.03703704]

find better score:
s

      fun: 0.12468742934995816
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([0.08012324, 0.08046836, 0.07975742, 0.08048161, 0.07883048,
       0.07994983, 0.08068062, 0.0795981 , 0.08004719, 0.07985691,
       0.09120675, 0.08036959, 0.08059984, 0.0797955 , 0.07987056,
       0.07968933, 0.07964874, 0.079501  , 0.07958463, 0.07965904,
       0.07884517, 0.07881486, 0.0787879 , 0.07938445, 0.07974885,
       0.07953639, 0.013721  ])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 308
      nit: 3
   status: 0
  success: True
        x: array([0.0383303 , 0.03832994, 0.03833033, 0.03833032, 0.03833046,
       0.03833031, 0.0383298 , 0.0383304 , 0.0383303 , 0.03833033,
       0.03833015, 0.03833028, 0.03833019, 0.03833034, 0.03833033,
       0.03833034, 0.03833035, 0.03833036, 0.03833034, 0.03833034,
       0.03833046, 0.03833046, 0.03833041, 0.03833036, 0.03833032,
       0.03833034, 0.03713223])

Optimizing with init x0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



      fun: 1.0
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([-2.65232281e-05, -2.65121258e-05, -2.65232281e-05, -2.65343303e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.66231481e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65232281e-05,
       -2.65121258e-05, -2.65232281e-05, -2.65232281e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65121258e-05,
       -2.65232281e-05, -2.65232281e-05, -2.08721929e-06])
  message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 28
      nit: 0
   status: 0
  success: True
        x: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Optimizing with init x0: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

find better score:
score:  5934.8758155327405
coeffs:  [ 0.03995878  0.03492177  0.04002801  0.04155648  0.04005009  0.03980939
  0.0319225   0.03614408  0.03980149  0.03983933  0.04225859  0.03664198
  0.03605399  0.03989509  0.03987791  0.03989896  0.03984965  0.04004124
  0.04001604  0.03987616  0.04012029  0.04010826  0.03895787  0.04001785
  0.04005102  0.04013738 -0.25406262]

find better score:
score:  5944.87597896387
coeffs:  [ 0.03953324  0.03445959  0.03960299  0.04114258  0.03962525  0.03938278
  0.03143849  0.03569082  0.03937482  0.03941293  0.04184955  0.03619233
  0.03560004  0.0394691   0.0394518   0.03947301  0.03942334  0.03961633
  0.03959094  0.03945003  0.03969596  0.03968384  0.03852508  0.03959276
  0.03962616  0.03971316 -0.10081488]



      fun: 0.123968494993015
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([-0.02529189, -0.02560284, -0.02567702, -0.02559187, -0.02669572,
       -0.02549886, -0.0256159 , -0.02640645, -0.02536467, -0.02558011,
       -0.01734908, -0.02580503, -0.02570488, -0.02563885, -0.02556242,
       -0.02575438, -0.02580373, -0.02600106, -0.02588443, -0.0257931 ,
       -0.02666214, -0.02670689, -0.02667053, -0.02602314, -0.02576856,
       -0.0260401 , -0.00209815])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 1792
      nit: 49
   status: 0
  success: True
        x: array([ 0.03968921,  0.0346291 ,  0.039759  ,  0.04129444,  0.03978179,
        0.03953925,  0.03161611,  0.03585757,  0.03953125,  0.03956938,
        0.04199476,  0.0363574 ,  0.03576662,  0.03962544,  0.03960814,
        0.03962939,  0.03957989,  0.03977247,  0.03974709,  0.0396065 ,
        0.03985228,  0.03984022,  0.03868451,  0.03974899,  0.03978215,
        0.039869

Optimizing with init x0: [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]



      fun: 0.9999249561380518
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.61121116e-03, 1.67468261e-03, 1.60835789e-03, 1.59097180e-03,
       1.62586611e-03, 1.61072267e-03, 1.73124848e-03, 1.67199588e-03,
       1.61076708e-03, 1.61088920e-03, 1.56127333e-03, 1.65629732e-03,
       1.69088077e-03, 1.60986779e-03, 1.61038960e-03, 1.61111124e-03,
       1.61106684e-03, 1.60869096e-03, 1.60879088e-03, 1.60983449e-03,
       1.62507785e-03, 1.62521108e-03, 1.62423408e-03, 1.60803593e-03,
       1.60902403e-03, 1.60728097e-03, 8.95394869e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 2996
      nit: 106
   status: 0
  success: True
        x: array([  0.19863602,   0.198185  ,   0.19864438,   0.19876743,
         0.19861573,   0.19862974,   0.1979499 ,   0.19819901,
         0.19862709,   0.1986285 ,   0.19889525,   0.19834491,
         0.1981638 ,   0.19863372,   0.19863089,   0.19863103,
         0.19862677,   0.19864232,

Optimizing with init x0: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]



      fun: 1.0000201218069065
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([9.05808761e-04, 9.09050613e-04, 9.03677133e-04, 9.02966590e-04,
       9.15179044e-04, 9.02788955e-04, 9.25703958e-04, 9.38493727e-04,
       9.02966590e-04, 9.03410680e-04, 8.85469476e-04, 9.08606523e-04,
       9.34807787e-04, 9.03654929e-04, 9.03654929e-04, 9.03788155e-04,
       9.03588315e-04, 9.03654929e-04, 9.02766750e-04, 9.03987996e-04,
       9.15378884e-04, 9.14424092e-04, 9.05053810e-04, 9.02478092e-04,
       9.02700137e-04, 9.01523300e-04, 5.13589171e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 8484
      nit: 302
   status: 0
  success: True
        x: array([   0.49999404,    0.50000454,    0.49999512,    0.49999156,
          0.49998668,    0.4999953 ,    0.49999603,    0.50000195,
          0.49999543,    0.49999539,    0.49999654,    0.49999872,
          0.49999431,    0.49999529,    0.49999544,    0.49999491,
          0.499995

Optimizing with init x0: [-0.32612340149380836, 0.3850984581378499, 0.5248725503576949, -0.019393071429940456, -0.15146589874438612, -0.19698797494023945, -0.3523293853834723, -0.07960173398708137, 1.169463323991356, 1.5500823502615255, 1.7045111447843215, -0.09989693244825995, -0.45897566418448565, -0.7342030826900312, 0.8663445103122507, -0.23963654215132346, 0.32313255660597195, 0.2974032636854622, -0.6091325616779175, -0.0704313255051451, 0.3529762529871914, -0.2646393968712387, -0.1382334399538735, -0.28088785935152366, -1.1761705922614456, 0.29446454103717074, -0.059758858582909885]



      fun: 0.9999682624431258
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.74613657e-03, 1.74287251e-03, 1.74481540e-03, 1.74176229e-03,
       1.75714998e-03, 1.74426029e-03, 1.74924519e-03, 1.75680581e-03,
       1.74473769e-03, 1.74508186e-03, 1.70433667e-03, 1.74030790e-03,
       1.76564319e-03, 1.74480430e-03, 1.74500414e-03, 1.74577020e-03,
       1.74495973e-03, 1.74505965e-03, 1.74447123e-03, 1.74451564e-03,
       1.75761627e-03, 1.75757187e-03, 1.74348314e-03, 1.74451564e-03,
       1.74439352e-03, 1.74286141e-03, 9.95092897e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 4536
      nit: 161
   status: 0
  success: True
        x: array([   0.        ,    0.38497544,    0.52472622,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          1.16931836,    1.54993698,    1.70439959,    0.        ,
          0.        ,    0.        ,    0.86619899,    0.        ,
          0.322987

Optimizing with init x0: [0.11690314279572289, -1.6649014221225087, -1.516384269807705, -1.4275880637683598, -1.4558868836800327, -0.9728129159563224, -1.163876798697098, -0.4182399313992773, 1.6574652405646964, -1.682863705513128, 1.1839513354803801, -0.09311838089551269, -0.8345171727482175, -0.2436150703381832, -0.38067117962364144, -0.05785173878115092, -1.8237362896542904, 1.3144660642892252, 0.28227509019597513, -0.4611497671334818, 1.6137873714765105, -0.38839744125356573, -0.3019208131142435, 0.6304768405751355, -0.7489224235065313, -0.8574857767978388, 1.802256278928514]



      fun: 0.9999541503757818
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([-2.44648746e-04, -2.47579734e-04, -2.45603538e-04, -2.43194354e-04,
       -2.40951703e-04, -2.46003218e-04, -2.38897790e-04, -2.37765363e-04,
       -2.46191956e-04, -2.45958809e-04, -2.41018316e-04, -2.43927101e-04,
       -2.38786768e-04, -2.45814480e-04, -2.45836684e-04, -2.45858889e-04,
       -2.45881093e-04, -2.45758969e-04, -2.46103138e-04, -2.45714560e-04,
       -2.41651144e-04, -2.41895393e-04, -2.44793075e-04, -2.46314080e-04,
       -2.46103138e-04, -2.45992116e-04, -1.45661261e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 4172
      nit: 148
   status: 0
  success: True
        x: array([   0.11659005,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          1.65715643,    0.        ,    1.18370939,    0.        ,
          0.        ,    0.        ,    0.        ,    0. 

Optimizing with init x0: [-1.1644217078034254, -0.628143566451876, 0.7954857952026582, 0.7635671646147046, 0.427780497628105, 0.8161728663444225, -0.19665030789397828, 0.5966194837677237, -0.8488021243156341, 1.3222255470993383, 0.1850064970920219, -0.5022368155177832, 1.2423054896027201, 2.1316257176832822, 0.1159215617397534, 1.5449642500205105, -0.04086269593243011, 0.19358526696737946, -0.931102399432431, -0.30314520684791674, -0.5534880716228003, -0.4713229150639486, 1.173196878527645, -0.6271879318183566, 0.9619879266414644, 1.2485175321635928, 0.6011222815054769]



      fun: 1.0000238234104921
 hess_inv: <27x27 LbfgsInvHessProduct with dtype=float64>
      jac: array([9.71134284e-04, 9.74820225e-04, 9.68802816e-04, 9.68025660e-04,
       9.81237314e-04, 9.67892433e-04, 9.92739224e-04, 1.00646158e-03,
       9.68070069e-04, 9.68536362e-04, 9.49351708e-04, 9.74109682e-04,
       1.00248698e-03, 9.68780611e-04, 9.68825020e-04, 9.68913838e-04,
       9.68736202e-04, 9.68758407e-04, 9.67848024e-04, 9.69158087e-04,
       9.81459358e-04, 9.80437953e-04, 9.70312719e-04, 9.67514957e-04,
       9.67781411e-04, 9.66493552e-04, 5.50670620e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 8876
      nit: 316
   status: 0
  success: True
        x: array([ 0.00000000e+00,  0.00000000e+00,  7.95478617e-01,  7.63555344e-01,
        4.27759898e-01,  8.16166209e-01,  0.00000000e+00,  5.96607693e-01,
        0.00000000e+00,  1.32221889e+00,  1.85000113e-01,  0.00000000e+00,
        1.24228689e+00,  2.13161879e+00,  1.15914807e-01,  1.

In [33]:
# Best cal_score recorded by objective() for each metric, and the coefficient
# vector that produced it.
display(best_score)
display(best_coeffs)

{'mape': 5944.87597896387}

{'mape': array([ 0.03953324,  0.03445959,  0.03960299,  0.04114258,  0.03962525,
         0.03938278,  0.03143849,  0.03569082,  0.03937482,  0.03941293,
         0.04184955,  0.03619233,  0.03560004,  0.0394691 ,  0.0394518 ,
         0.03947301,  0.03942334,  0.03961633,  0.03959094,  0.03945003,
         0.03969596,  0.03968384,  0.03852508,  0.03959276,  0.03962616,
         0.03971316, -0.10081488])}

### Compute submission

In [None]:
# Blend the test predictions in log space with the best coefficients found
# for the 'mape' objective.
log_blend = test.loc[:, cols_opt].dot(best_coeffs['mape'])

# Back-transform to a total price and choose the matching output file name.
if is_per_area:
    blended_price = np.expm1(log_blend) * test['building_area']
    submission_path = 'output/stack_spopt-parea_{}_{}_poscoeffs.csv'.format(stack_idx, models)
else:
    blended_price = np.expm1(log_blend)
    submission_path = 'output/stack_spopt_{}_{}_poscoeffs.csv'.format(stack_idx, models)

# Prices cannot be negative, so clip at zero before writing the submission.
test_pred_final = pd.DataFrame({
    'building_id': test['building_id'],
    'total_price': np.clip(blended_price, 0, None),
})
test_pred_final.to_csv(submission_path, index=False)

### Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Compare the distribution of log1p(price / area): true train values vs. the
# blended test prediction.
# `density=True` replaces the `normed` argument, which was removed from
# plt.hist in Matplotlib 3.1.
plt.hist(cv['log_parea_total_price'], bins=100, label='train true', density=True)
plt.hist(np.log1p(test_pred_final['total_price'] / test['building_area']), bins=100, label='test',
         density=True, alpha=0.7)
plt.xlabel('log(price/area + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
# Compare the distribution of log1p(price): true train values vs. the blended
# test prediction.
# `density=True` replaces the `normed` argument, which was removed from
# plt.hist in Matplotlib 3.1.
plt.hist(cv['log_total_price'], bins=100, label='train true', density=True)
plt.hist(np.log1p(test_pred_final['total_price']), bins=100, label='test', density=True, alpha=0.7)
plt.xlabel('log(price + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
# Compare the distribution of log1p(building_area) between train and test
# (log-scaled y axis).
# `density=True` replaces the `normed` argument, which was removed from
# plt.hist in Matplotlib 3.1.
plt.hist(np.log1p(cv['building_area']), bins=100, label='train', density=True)
plt.hist(np.log1p(test['building_area']), bins=100, label='test', density=True, alpha=0.7)
plt.xlabel('log(building_area + 1)'); plt.ylabel('ratio'); plt.yscale('log')
plt.legend(); plt.grid(); plt.show()

In [None]:
# Scratch notes: pasted listing of CV prediction files. This is not Python
# code, so it is kept as comments to stop the cell raising a SyntaxError when
# the notebook is run top to bottom.
#
# model-01-lgb-cv.csv
# model-02-keras-search-cv.csv
# model-03-lgb-feats-selection-cv.csv
# model-04-lgb-PCA-cv.csv
# model-05-lgb-wo-per-area-cv.csv
# model-06-lgb-lr0.001-cv.csv
# model-07-keras-embedding-cv.csv
# model-08-keras-search-long-cv.csv
# No. 8 file: model-09-lgb-feats-selection-75-cv.csv
# No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-cv.csv
# No. 10 file: model-11-rf-cv.csv
# No. 11 file: model-12-predict-keras-search-prelu-cv.csv
# No. 12 file: model-13-predict-keras-he_uni-cv.csv
# No. 13 file: model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
# No. 14 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
# No. 15 file: model-16-lgb-feats-selection-68-lr-0.001-mix5-cv.csv
# No. 16 file: model-17-lgb-feats-selection-70-lr-0.001-mix5-cv.csv
# No. 17 file: model-18-lgb-feats-selection-70-lr-0.001-p5-cv.csv
# No. 18 file: model-19-lgb-search-bins-lr-0.0005-cv.csv
# No. 19 file: model-20-lgb-lr-0.0008-mix5-cv.csv
# No. 20 file: model-21-lgb-wo-per-area-long-cv.csv
# No. 21 file: model-22-lgb-wo-per-area-long-2-cv.csv
# No. 22 file: model-25-lgb-search-bins-lr-0.0005-250-cv.csv
# No. 23 file: model-26-lgb-search-bins-lr-0.0005-350-cv.csv
# No. 24 file: model-27-lgb-feat_rm_new-cv.csv

In [None]:
# Print an equal-weight vector: 1/17 for each of the 17 hand-picked models
# below and 0 for every other model index in 1-23, 25-27.
selected_models = [3, 4, 7, 8, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26]
candidate_models = list(range(1, 24)) + list(range(25, 28))

equal_weights = []
for i in candidate_models:
    equal_weights.append(1/17 if i in selected_models else 0)
print(equal_weights)