In [1]:
%autosave 0

Autosave disabled


In [2]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error
from utilities import cal_score, cal_mape

In [3]:
# Identifier of this stacking run; it ends up in the submission file name.
stack_idx = '16'
# Models to blend: comma-separated indices and inclusive "a-b" ranges.
models = '1-27'
# Models whose test predictions are read from the "test-kfold" file
# instead of the "test-one" file.
use_test_kfold = {2, 7, 8, 12, 13}

# Blend log1p(price / building_area) columns rather than log1p(price) columns.
is_per_area = True
# Append a constant column to the blended columns (intercept term).
add_intercept = True

### Read CV and test predictions

In [4]:
def parse_models(exp):
    """Expand a model-selection expression into a list of model indices.

    Parameters
    ----------
    exp : str
        Comma-separated tokens. Each token is either a single integer
        ("5") or an inclusive range "a-b" ("1-27"). Whitespace around a
        token is ignored and empty tokens (e.g. from a trailing comma)
        are skipped.

    Returns
    -------
    list of int
        Indices in the order they appear in `exp`; duplicates are kept.

    Raises
    ------
    ValueError
        If a token is neither an integer nor a well-formed "a-b" range.
    """
    idx_models = []
    for token in exp.split(','):
        token = token.strip()
        if not token:
            # Tolerate "1-3," or "1,,3" instead of failing on int('').
            continue
        if '-' in token:
            n0, n1 = token.split('-')
            # Ranges are inclusive on both ends.
            idx_models.extend(range(int(n0), int(n1) + 1))
        else:
            idx_models.append(int(token))
    return idx_models

In [5]:
# Expand the `models` expression into the explicit list of model indices to blend.
idx_models = parse_models(models)
print(idx_models)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


In [6]:
# Regular files directly under output/. os.listdir returns names in arbitrary
# order, so sort them to make the "first match" selection below deterministic.
files_in_output = sorted(f for f in os.listdir('output/')
                         if os.path.isfile(os.path.join('output/', f)))


def _find_model_file(idx, suffix):
    """Return the first file named 'model-<idx, 2 digits>-...<suffix>' in output/.

    Raises FileNotFoundError with the missing pattern when no file matches,
    instead of an anonymous IndexError.
    """
    prefix = 'model-%02d-' % idx
    matches = [f for f in files_in_output
               if f.startswith(prefix) and f.endswith(suffix)]
    if not matches:
        raise FileNotFoundError(
            "no file matching '{}*{}' in output/".format(prefix, suffix))
    return matches[0]


# One file per model index for each of the three prediction kinds.
files_cv = {idx: _find_model_file(idx, 'cv.csv') for idx in idx_models}
files_test_one = {idx: _find_model_file(idx, 'test-one.csv') for idx in idx_models}
files_test_kf = {idx: _find_model_file(idx, 'test-kfold.csv') for idx in idx_models}

In [7]:
# List, per model index, the CV / k-fold test / single-fit test files picked up above.
for k in files_cv:
    tag = '%2d' % k
    for chosen in (files_cv, files_test_kf, files_test_one):
        print(tag, chosen[k])

 1 model-01-lgb-cv.csv
 1 model-01-lgb-test-kfold.csv
 1 model-01-lgb-test-one.csv
 2 model-02-keras-search-cv.csv
 2 model-02-keras-search-test-kfold.csv
 2 model-02-keras-search-test-one.csv
 3 model-03-lgb-feats-selection-cv.csv
 3 model-03-lgb-feats-selection-test-kfold.csv
 3 model-03-lgb-feats-selection-test-one.csv
 4 model-04-lgb-PCA-cv.csv
 4 model-04-lgb-PCA-test-kfold.csv
 4 model-04-lgb-PCA-test-one.csv
 5 model-05-lgb-wo-per-area-cv.csv
 5 model-05-lgb-wo-per-area-test-kfold.csv
 5 model-05-lgb-wo-per-area-test-one.csv
 6 model-06-lgb-lr0.001-cv.csv
 6 model-06-lgb-lr0.001-test-kfold.csv
 6 model-06-lgb-lr0.001-test-one.csv
 7 model-07-keras-embedding-cv.csv
 7 model-07-keras-embedding-test-kfold.csv
 7 model-07-keras-embedding-test-one.csv
 8 model-08-keras-search-long-cv.csv
 8 model-08-keras-search-long-test-kfold.csv
 8 model-08-keras-search-long-test-one.csv
 9 model-09-lgb-feats-selection-75-cv.csv
 9 model-09-lgb-feats-selection-75-test-kfold.csv
 9 model-09-lgb-fea

#### Load building area (train and test sets)

In [8]:
# Raw train / test tables; the next cell keeps only the id, area and price columns.
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [9]:
# Base frames that the per-model predictions are merged onto (keyed by building_id).
# `cv` also carries the true total_price; `test` has no target column.
cv = df_train[['building_id', 'building_area', 'total_price']]
test = df_test[['building_id', 'building_area']]

In [10]:
print('CV predictions:')
print(len(idx_models))

# The merge below is an inner join: a prediction file with missing or duplicated
# building_ids would silently change the number of rows being scored.
n_rows_cv = len(cv)
for idx_model in idx_models:
    f = files_cv[idx_model]
    print(f)
    df = pd.read_csv('output/'+f)

    cv = pd.merge(cv, df[['building_id', 'total_price_predict']], on='building_id')
    if len(cv) != n_rows_cv:
        raise ValueError(
            'merging {} changed the number of CV rows from {} to {}; '
            'its building_id column has missing or duplicated ids'.format(f, n_rows_cv, len(cv)))

    # Raw prediction plus the two log-space views used for blending.
    cv = cv.rename(columns = {'total_price_predict': 'pred_{}'.format(idx_model)})
    cv[f'log_pred_{idx_model}'] = np.log1p(cv[f'pred_{idx_model}'])
    cv[f'log_parea_pred_{idx_model}'] = np.log1p( cv[f'pred_{idx_model}'] / cv['building_area'] )

CV predictions:
27
model-01-lgb-cv.csv
model-02-keras-search-cv.csv
model-03-lgb-feats-selection-cv.csv
model-04-lgb-PCA-cv.csv
model-05-lgb-wo-per-area-cv.csv
model-06-lgb-lr0.001-cv.csv
model-07-keras-embedding-cv.csv
model-08-keras-search-long-cv.csv
model-09-lgb-feats-selection-75-cv.csv
model-10-lgb-feats-selection-75-lr-0.001-cv.csv
model-11-rf-cv.csv
model-12-predict-keras-search-prelu-cv.csv
model-13-predict-keras-he_uni-cv.csv
model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
model-16-lgb-feats-selection-68-lr-0.001-mix5-cv.csv
model-17-lgb-feats-selection-70-lr-0.001-mix5-cv.csv
model-18-lgb-feats-selection-70-lr-0.001-p5-cv.csv
model-19-lgb-search-bins-lr-0.0005-cv.csv
model-20-lgb-lr-0.0008-mix5-cv.csv
model-21-lgb-wo-per-area-long-cv.csv
model-22-lgb-wo-per-area-long-2-cv.csv
model-23-lgb-binary-cv.csv
model-24-lgb-binary-augment-cv.csv
model-25-lgb-search-bins-lr-0.0005-250-cv.csv
model-26-lgb-search-bins-lr-0.000

In [11]:
# Log-space targets, matching the log-space prediction columns.
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p( cv['total_price'] / cv['building_area'] )

print('Test predictions:')
# The merge below is an inner join: a prediction file with missing or duplicated
# building_ids would silently produce a submission with the wrong number of rows.
n_rows_test = len(test)
for i, idx in enumerate(idx_models):
    # Use the k-fold test prediction for the models listed in use_test_kfold.
    f = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    print('No. {} file: {}'.format(i, f))
    df = pd.read_csv('output/'+f)

    test = pd.merge(test, df[['building_id','total_price']], on='building_id')
    if len(test) != n_rows_test:
        raise ValueError(
            'merging {} changed the number of test rows from {} to {}; '
            'its building_id column has missing or duplicated ids'.format(f, n_rows_test, len(test)))

    # Raw prediction plus the two log-space views used for blending.
    test = test.rename(columns = {'total_price': 'pred_{}'.format(idx)})
    test[f'log_pred_{idx}'] = np.log1p(test[f'pred_{idx}'])
    test[f'log_parea_pred_{idx}'] = np.log1p( test[f'pred_{idx}'] / test['building_area'] )

Test predictions:
No. 0 file: model-01-lgb-test-one.csv
No. 1 file: model-02-keras-search-test-kfold.csv
No. 2 file: model-03-lgb-feats-selection-test-one.csv
No. 3 file: model-04-lgb-PCA-test-one.csv
No. 4 file: model-05-lgb-wo-per-area-test-one.csv
No. 5 file: model-06-lgb-lr0.001-test-one.csv
No. 6 file: model-07-keras-embedding-test-kfold.csv
No. 7 file: model-08-keras-search-long-test-kfold.csv
No. 8 file: model-09-lgb-feats-selection-75-test-one.csv
No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-test-one.csv
No. 10 file: model-11-rf-test-one.csv
No. 11 file: model-12-predict-keras-search-prelu-test-kfold.csv
No. 12 file: model-13-predict-keras-he_uni-test-kfold.csv
No. 13 file: model-14-lgb-feats-selection-75-lr-0.001-rand-test-one.csv
No. 14 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-test-one.csv
No. 15 file: model-16-lgb-feats-selection-68-lr-0.001-mix5-test-one.csv
No. 16 file: model-17-lgb-feats-selection-70-lr-0.001-mix5-test-one.csv
No. 17 file: model-18-l

In [12]:
# Quick look at the merged frames: one pred_/log_pred_/log_parea_pred_ triple per model.
display(cv.head())
display(test.head())

Unnamed: 0,building_id,building_area,total_price,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,...,log_pred_25,log_parea_pred_25,pred_26,log_pred_26,log_parea_pred_26,pred_27,log_pred_27,log_parea_pred_27,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,647603.8,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,...,13.368623,12.13952,645283.2,13.377446,12.148343,647790.5,13.381324,12.152221,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3321452.0,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,...,14.964998,13.56843,3164660.0,14.967557,13.570989,3127411.0,14.955716,13.559149,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9570885.0,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,...,16.099372,14.379417,9832843.0,16.101239,14.381284,9745913.0,16.092359,14.372404,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,14215010.0,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,...,16.36068,13.753334,12741790.0,16.360398,13.753051,12595480.0,16.348848,13.741502,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,762712.0,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,...,13.950262,12.405236,1114412.0,13.923838,12.378812,1131428.0,13.938992,12.393966,13.544637,11.999613


Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_24,pred_25,log_pred_25,log_parea_pred_25,pred_26,log_pred_26,log_parea_pred_26,pred_27,log_pred_27,log_parea_pred_27
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.189986,14562140.0,16.493936,15.264829,13581200.0,16.424197,15.195091,14054240.0,16.458435,15.229328
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.140432,3903389.0,15.177356,13.132737,3915563.0,15.18047,13.135851,3932854.0,15.184876,13.140257
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.643765,10640550.0,16.180183,13.681162,10764140.0,16.19173,13.69271,10400410.0,16.157355,13.658335
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.778239,6006309.0,15.608321,14.796389,6034948.0,15.613078,14.801146,5905111.0,15.591329,14.779397
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.188114,1091787.0,13.903327,12.143065,1096191.0,13.907353,12.147091,1048505.0,13.862877,12.102615


### Check individual model scores

In [13]:
# cal_score of every base model's CV prediction on its own.
for i, idx_model in enumerate(idx_models):
    score = cal_score(cv['total_price'], cv[f'pred_{idx_model}'])
    print('%2d'%i, 'model-%02d'%idx_model, '%.6f'%score)

 0 model-01 5870.873059
 1 model-02 5400.852164
 2 model-03 5877.873452
 3 model-04 5713.867808
 4 model-05 5724.869598
 5 model-06 5886.873769
 6 model-07 5171.836449
 7 model-08 5514.858826
 8 model-09 5872.873118
 9 model-10 5897.873845
10 model-11 5075.838018
11 model-12 5486.856963
12 model-13 5506.858055
13 model-14 5908.873901
14 model-15 5900.873836
15 model-16 5907.874126
16 model-17 5905.874165
17 model-18 5908.874297
18 model-19 5911.874156
19 model-20 5908.874040
20 model-21 5758.870702
21 model-22 5752.870671
22 model-23 5852.871357
23 model-24 5866.873835
24 model-25 5908.874238
25 model-26 5918.873998
26 model-27 5892.873194


In [14]:
# Constant column; it is blended as the intercept term when add_intercept is set.
cv['constant_1'] = 1
test['constant_1'] = 1

# Columns the optimizer blends: per-area log predictions or plain log predictions.
cols_opt = ([f'log_parea_pred_{idx}' for idx in idx_models] if is_per_area
            else [f'log_pred_{idx}' for idx in idx_models])

# The intercept column, when requested, always goes last.
if add_intercept:
    cols_opt += ['constant_1']

### Define the optimization objective

In [15]:
def objective(x, metric):
    """Loss of the blend with coefficients `x` on the CV predictions.

    The blend is the dot product of the `cols_opt` columns of `cv` with `x`,
    mapped back to a price with expm1 (and multiplied by building_area when
    `is_per_area` is set).

    Side effect: whenever the blend's cal_score exceeds the value stored in
    the module-level `best_score[metric]`, that score and a copy of `x` are
    stored in `best_score` / `best_coeffs` and printed.

    Parameters
    ----------
    x : array-like
        One coefficient per column in `cols_opt`.
    metric : str
        'mape', 'mse' or 'mae'; selects the returned loss.

    Returns
    -------
    The value of the selected metric between cv['total_price'] and the blend.

    Raises
    ------
    Exception
        If `metric` is not one of the three known names.
    """
    truth = cv['total_price']

    blended = np.expm1(cv.loc[:, cols_opt].dot(x))
    if is_per_area:
        blended = blended * cv['building_area']

    # Track the best cal_score seen so far, independently of the loss being minimized.
    score = cal_score(truth, blended)
    if score > best_score[metric]:
        best_score[metric] = score
        best_coeffs[metric] = x.copy()
        print('find better score:')
        print('score: ', score)
        print('coeffs: ', x)
        print()

    losses = {
        'mape': cal_mape,
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
    }
    if metric not in losses:
        raise Exception('metric unknown: {}'.format(metric))
    return losses[metric](truth, blended)

### Optimize

In [16]:
best_score = {}
best_coeffs = {}

len_x = len(cols_opt)
rev_len_x = 1/len_x

# Seeded generator so the random restarts are the same on every run.
rng = np.random.RandomState(0)

# Starting points: uniform weights, a few constant vectors, and three random
# perturbations around the uniform weights.
x0s = [ [rev_len_x] * len_x,
        [0] * len_x,
        [0.1] * len_x,
        [0.2] * len_x,
        [0.5] * len_x ]
x0s += [(rng.randn(len_x) + rev_len_x).tolist() for _ in range(3)]

# Model coefficients are constrained to be non-negative; only the intercept
# column (present when add_intercept is set) is left unbounded.
bounds = tuple((None, None) if col == 'constant_1' else (0, None) for col in cols_opt)

# Other supported metrics: 'mae', 'mse'.
for metric in ['mape']:
    best_score[metric] = 0
    best_coeffs[metric] = []
    for x0 in x0s:
        print('Optimizing with init x0: {}'.format(x0))
        print()
        # args must be a tuple: (metric) without the comma is just the string.
        display(minimize(objective, x0, args=(metric,), bounds=bounds, tol=1e-4))

Optimizing with init x0: [0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571, 0.03571428571428571]

find better score:
score:  243.6399206950022
coeffs:  [0.03571429 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429
 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429
 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429
 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429 0.03571429
 0.03571429 0.03571429 0.03571429 0.03571429]

find better scor

find better score:
score:  5915.875338244682
coeffs:  [0.03690223 0.03690193 0.03690224 0.03690225 0.03690232 0.03690223
 0.03690179 0.0369023  0.03690222 0.03690224 0.03690249 0.03690223
 0.03690216 0.03690224 0.03690224 0.03690224 0.03690225 0.03690225
 0.03690224 0.03690224 0.03690231 0.03690233 0.03690227 0.03690229
 0.03690225 0.03690223 0.03690223 0.03580202]

find better score:
score:  5915.875338244686
coeffs:  [0.03690223 0.03690193 0.03690224 0.03690225 0.03690232 0.03690223
 0.03690179 0.0369023  0.03690222 0.03690224 0.03690249 0.03690223
 0.03690216 0.03690224 0.03690224 0.03690224 0.03690225 0.03690225
 0.03690224 0.03690224 0.03690231 0.03690232 0.03690227 0.0369023
 0.03690225 0.03690223 0.03690223 0.03580202]

find better score:
score:  5925.875395585104
coeffs:  [0.03691353 0.03691322 0.03691355 0.03691355 0.03691367 0.03691354
 0.03691308 0.03691363 0.03691353 0.03691355 0.0369134  0.03691353
 0.03691345 0.03691356 0.03691355 0.03691356 0.03691357 0.03691358
 0.03691

      fun: 0.12460441489567801
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([0.04107366, 0.04121492, 0.0406952 , 0.04137629, 0.03972942,
       0.04088965, 0.04147199, 0.04038841, 0.04098206, 0.04080005,
       0.05190581, 0.04114459, 0.0413315 , 0.04073569, 0.04081299,
       0.04064149, 0.04059947, 0.04046795, 0.04055162, 0.0406248 ,
       0.03976569, 0.03973459, 0.03973859, 0.03928523, 0.04036179,
       0.04070968, 0.04046556, 0.01070464])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 319
      nit: 3
   status: 0
  success: True
        x: array([0.03691353, 0.03691322, 0.03691355, 0.03691355, 0.03691367,
       0.03691354, 0.03691308, 0.03691363, 0.03691353, 0.03691355,
       0.0369134 , 0.03691353, 0.03691345, 0.03691356, 0.03691355,
       0.03691356, 0.03691357, 0.03691358, 0.03691356, 0.03691356,
       0.03691366, 0.03691367, 0.03691362, 0.03691366, 0.03691357,
       0.03691354, 0.03691356, 0.03580257])

Optimizing with init x0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



      fun: 1.0
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([-2.65232281e-05, -2.65121258e-05, -2.65232281e-05, -2.65343303e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.66231481e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65232281e-05,
       -2.65121258e-05, -2.65232281e-05, -2.65232281e-05, -2.65232281e-05,
       -2.65232281e-05, -2.65232281e-05, -2.65121258e-05, -2.65121258e-05,
       -2.65121258e-05, -2.65232281e-05, -2.65232281e-05, -2.08721929e-06])
  message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 29
      nit: 0
   status: 0
  success: True
        x: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Optimizing with init x0: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]



  """
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]


      fun: 0.7100881848248042
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([-0.90649976, -0.89725284, -0.9066065 , -0.9097274 , -0.90712279,
       -0.90620126, -0.89187626, -0.89951891, -0.90617036, -0.90623833,
       -0.91235566, -0.90074465, -0.89998888, -0.90634101, -0.90630836,
       -0.90637392, -0.90630903, -0.90665599, -0.90657802, -0.90628575,
       -0.9073119 , -0.90726084, -0.90461537, -0.90742799, -0.90654952,
       -0.9066889 , -0.90690266, -0.08321384])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 1508
      nit: 46
   status: 0
  success: True
        x: array([  0.07770974,   0.07565391,   0.07774107,   0.07836397,
         0.07774319,   0.07765078,   0.07443797,   0.07615331,
         0.07764719,   0.07766316,   0.07864393,   0.07636419,
         0.07610567,   0.07768567,   0.07767891,   0.0776873 ,
         0.07766662,   0.07774508,   0.07773504,   0.07767852,
         0.07776933,   0.07776529,   0.0773095 

Optimizing with init x0: [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]



      fun: 0.9999179881064014
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.46928025e-03, 1.50989221e-03, 1.46715973e-03, 1.45510270e-03,
       1.48081547e-03, 1.46855861e-03, 1.54753987e-03, 1.51153534e-03,
       1.46869183e-03, 1.46884727e-03, 1.42760248e-03, 1.49730228e-03,
       1.52517998e-03, 1.46812562e-03, 1.46852530e-03, 1.46912482e-03,
       1.46892498e-03, 1.46742618e-03, 1.46734846e-03, 1.46808121e-03,
       1.48046020e-03, 1.48050461e-03, 1.47715173e-03, 1.46271883e-03,
       1.46687107e-03, 1.46745949e-03, 1.46601620e-03, 8.25783886e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 3248
      nit: 111
   status: 0
  success: True
        x: array([  0.19895663,   0.19867018,   0.19896247,   0.19904417,
         0.19894095,   0.1989544 ,   0.19852767,   0.19866017,
         0.1989519 ,   0.19895267,   0.19915842,   0.19877756,
         0.19863743,   0.19895601,   0.19895404,   0.19895398,
         0.1989515

Optimizing with init x0: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]



      fun: 1.0000282121439756
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.04776188e-03, 1.05175868e-03, 1.04540820e-03, 1.04438680e-03,
       1.05795372e-03, 1.04447562e-03, 1.06969988e-03, 1.08415499e-03,
       1.04465325e-03, 1.04516396e-03, 1.02431397e-03, 1.05080389e-03,
       1.08002496e-03, 1.04540820e-03, 1.04543041e-03, 1.04556364e-03,
       1.04534159e-03, 1.04538600e-03, 1.04443121e-03, 1.04580788e-03,
       1.05826459e-03, 1.05719877e-03, 1.04691811e-03, 1.04769526e-03,
       1.04414255e-03, 1.04438680e-03, 1.04303233e-03, 5.94635452e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 9164
      nit: 315
   status: 0
  success: True
        x: array([   0.49999192,    0.49999968,    0.49999374,    0.49998984,
          0.49998296,    0.49999436,    0.49998659,    0.49999023,
          0.49999444,    0.49999419,    0.49999545,    0.49999375,
          0.49998518,    0.49999414,    0.49999421,    0.49999401,
  

Optimizing with init x0: [-0.006171548125817633, -0.6839408141194336, 0.11535958701412853, -0.6223710626412973, 0.838452742632921, 0.3657222623626336, 0.6312945692737545, 0.07653834570302179, 0.9176843450142236, -0.45632297762480384, 1.998172179178511, 0.43251013272327177, -1.0699228317047584, 0.12255867260633085, 0.3828961511176666, 0.19821543025483912, -0.8003487322408913, 1.4511672787595358, -0.9304527747794283, -0.8284175138024734, -0.8951095903410852, 0.1529490136742184, 0.08987383851370694, -0.2680052096166296, -0.038818029459293135, -1.2213984602034016, -0.3894135785838446, 0.12474045497848404]



      fun: 0.9999615192734861
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([-1.11444187e-04, -1.20781163e-04, -1.12965193e-04, -1.08790754e-04,
       -1.04694031e-04, -1.13831167e-04, -1.10800258e-04, -1.02062803e-04,
       -1.13964393e-04, -1.13620224e-04, -1.10955689e-04, -1.14896981e-04,
       -1.03783648e-04, -1.13320464e-04, -1.13375975e-04, -1.13298260e-04,
       -1.13520304e-04, -1.13098420e-04, -1.13753451e-04, -1.13187237e-04,
       -1.05471187e-04, -1.05915277e-04, -1.13231646e-04, -1.10556009e-04,
       -1.14008802e-04, -1.13775656e-04, -1.13953291e-04, -7.22755189e-06])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 4930
      nit: 169
   status: 0
  success: True
        x: array([ 0.00000000e+00,  0.00000000e+00,  1.15231732e-01,  0.00000000e+00,
        8.38309923e-01,  3.65597197e-01,  6.31223869e-01,  7.64313700e-02,
        9.17558966e-01,  0.00000000e+00,  1.99807325e+00,  4.32415657e-01,
        0.0000000

Optimizing with init x0: [1.1565246157895481, -0.10241728057551829, 2.5427021037413753, 3.135509629790989, 1.596129111448769, 0.9586372657138222, -1.5596945753614941, -1.4797911539546291, -1.2502750612605593, -0.6707887529346401, -0.931235543349877, 1.0581727395018021, 0.3547340938280621, -0.27315053404110523, 0.17532941268331692, 0.5733832780017227, 1.2841699153971757, 0.5896783627761278, 2.138618419536294, -0.18212269320442614, 0.2114967507684034, 0.879684443697266, -1.3380967722662302, -0.38817616151623247, 0.1378005646640042, -2.0816593489082225, 0.6014516420240814, 0.4973640361103583]



      fun: 1.000002081631378
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.49761350e+121, 1.50409812e+121, 1.49630986e+121, 1.49209867e+121,
       1.50412104e+121, 1.49566160e+121, 1.51128018e+121, 1.52620736e+121,
       1.49612944e+121, 1.49646702e+121, 1.46450686e+121, 1.49879210e+121,
       1.52416643e+121, 1.49646476e+121, 1.49656001e+121, 1.49673535e+121,
       1.49649661e+121, 1.49641660e+121, 1.49578121e+121, 1.49670135e+121,
       1.50532718e+121, 1.50455957e+121, 1.49651005e+121, 1.49700346e+121,
       1.49575045e+121, 1.49569112e+121, 1.49424519e+121, 8.55427652e+119])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 1218
      nit: 1
   status: 2
  success: False
        x: array([ 1.15652462,  0.        ,  2.5427021 ,  3.13550963,  1.59612911,
        0.95863727,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.05817274,  0.35473409,  0.        ,  0.17532941,
        0.57338328,  1.28416992,  0.58967836,

Optimizing with init x0: [0.7408683110665444, 2.2145306424460958, -0.29092640162055766, -0.15963104637280906, 0.1496076279087828, 1.3415464695618569, -0.32406679625762475, 0.19721555829392762, -0.48326086888414654, 0.31812127290456027, 0.43631205383825356, -0.18045380929533977, -0.39352755245009075, -1.0291608936679046, -1.3546980154739374, -0.886377399538567, 0.06746234077847273, -0.2672471725465588, 0.7623143416412979, -0.8839920879119842, -0.2632486079137709, -0.6959964243534731, -0.9350654187750874, 0.908578205413302, 0.5268221629127621, 0.10813816048849637, -0.7209603141979491, 1.6400426704710886]



      fun: 0.9999516354799837
 hess_inv: <28x28 LbfgsInvHessProduct with dtype=float64>
      jac: array([8.69115890e-04, 8.95394869e-04, 8.71991368e-04, 8.60300720e-04,
       8.43036752e-04, 8.71069883e-04, 8.49620374e-04, 9.08839670e-04,
       8.72524275e-04, 8.72102390e-04, 8.55338023e-04, 8.72624195e-04,
       8.71658301e-04, 8.72324435e-04, 8.72257822e-04, 8.69948558e-04,
       8.71636097e-04, 8.71369643e-04, 8.71702710e-04, 8.73812134e-04,
       8.49453841e-04, 8.47177883e-04, 8.67550476e-04, 8.73146000e-04,
       8.72635297e-04, 8.71402950e-04, 8.71913652e-04, 4.95603558e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 5046
      nit: 173
   status: 0
  success: True
        x: array([ 7.40821144e-01,  2.21446596e+00,  0.00000000e+00,  0.00000000e+00,
        1.49578929e-01,  1.34149795e+00,  0.00000000e+00,  1.97142191e-01,
        0.00000000e+00,  3.18072228e-01,  4.36272109e-01,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.0

In [17]:
# Best cal_score seen during optimization and the coefficients that produced it, per metric.
display(best_score)
display(best_coeffs)

{'mape': 5925.875395585104}

{'mape': array([0.03691353, 0.03691322, 0.03691355, 0.03691355, 0.03691367,
        0.03691354, 0.03691308, 0.03691363, 0.03691353, 0.03691355,
        0.0369134 , 0.03691353, 0.03691345, 0.03691356, 0.03691355,
        0.03691356, 0.03691357, 0.03691358, 0.03691356, 0.03691356,
        0.03691366, 0.03691367, 0.03691362, 0.03691366, 0.03691357,
        0.03691354, 0.03691356, 0.03580257])}

### Compute submission

In [None]:
# Blend the test predictions with the best coefficients found for the 'mape' objective.
blend = np.expm1(test.loc[:, cols_opt].dot(best_coeffs['mape']))
if is_per_area:
    # Per-area mode: scale the blended value by the building area.
    blend = blend * test['building_area']

# Negative prices are clipped to zero.
test_pred_final = pd.DataFrame({
    'building_id': test['building_id'],
    'total_price': np.clip(blend, 0, None),
})

name_fmt = ('output/stack_spopt-parea_{}_{}_poscoeffs.csv' if is_per_area
            else 'output/stack_spopt_{}_{}_poscoeffs.csv')
test_pred_final.to_csv(name_fmt.format(stack_idx, models), index=False)

### Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of log(price/area + 1): train truth vs. blended test prediction.
# `density=True` replaces `normed=True`, which current Matplotlib no longer accepts.
plt.hist(cv['log_parea_total_price'], bins=100, label='train true', density=True)
plt.hist(np.log1p(test_pred_final['total_price'] / test['building_area']), bins=100, label='test',
         density=True, alpha=0.7)
plt.xlabel('log(price/area + 1)')
plt.ylabel('ratio')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Distribution of log(price + 1): train truth vs. blended test prediction.
# `density=True` replaces `normed=True`, which current Matplotlib no longer accepts.
plt.hist(cv['log_total_price'], bins=100, label='train true', density=True)
plt.hist(np.log1p(test_pred_final['total_price']), bins=100, label='test', density=True, alpha=0.7)
plt.xlabel('log(price + 1)')
plt.ylabel('ratio')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Distribution of log(building_area + 1) in train vs. test, on a log-scaled y axis.
# `density=True` replaces `normed=True`, which current Matplotlib no longer accepts.
plt.hist(np.log1p(cv['building_area']), bins=100, label='train', density=True)
plt.hist(np.log1p(test['building_area']), bins=100, label='test', density=True, alpha=0.7)
plt.xlabel('log(building_area + 1)')
plt.ylabel('ratio')
plt.yscale('log')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Print a weight vector over indices 1-23 and 25-27 (24 is skipped):
# 1/17 for each of the 17 listed indices, 0 for the others.
print([
    1/17 if i in [3, 4, 7, 8, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26] else 0
    for i in [*range(1, 24), *range(25, 28)]
])