In [1]:
%autosave 0

Autosave disabled


In [2]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error
from utilities import cal_score, cal_mape

In [3]:
stack_idx = '17'
models = '1-31'
use_test_kfold = set([2, 7, 8, 12, 13])

is_per_area = False
add_intercept = True

### Read CV predictions and test

In [4]:
def parse_models(exp):
    exp_split = exp.split(',')
    idx_models = []
    for e in exp_split:
        if '-' in e:
            n0, n1 = e.split('-')
            idx_models.extend(list(range(int(n0), int(n1)+1, 1)))
        else:
            idx_models.append(int(e))
    return idx_models

In [5]:
idx_models = parse_models(models)
print(idx_models)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


In [6]:
files_in_output = [f for f in os.listdir('output/') if os.path.isfile('output/'+f)]
files_cv = {idx: [f for f in files_in_output 
                  if f.startswith('model-%02d-' % idx) and f.endswith('cv.csv')][0] 
            for idx in idx_models}
files_test_one = {idx: [f for f in files_in_output 
                        if f.startswith('model-%02d-' % idx) and f.endswith('test-one.csv')][0]
                  for idx in idx_models}
files_test_kf = {idx: [f for f in files_in_output 
                       if f.startswith('model-%02d-' % idx) and f.endswith('test-kfold.csv')][0]
                 for idx in idx_models}

In [7]:
for k in files_cv: 
    print('%2d'%k, files_cv[k])
    print('%2d'%k, files_test_kf[k])
    print('%2d'%k, files_test_one[k])

 1 model-01-lgb-cv.csv
 1 model-01-lgb-test-kfold.csv
 1 model-01-lgb-test-one.csv
 2 model-02-keras-search-cv.csv
 2 model-02-keras-search-test-kfold.csv
 2 model-02-keras-search-test-one.csv
 3 model-03-lgb-feats-selection-cv.csv
 3 model-03-lgb-feats-selection-test-kfold.csv
 3 model-03-lgb-feats-selection-test-one.csv
 4 model-04-lgb-PCA-cv.csv
 4 model-04-lgb-PCA-test-kfold.csv
 4 model-04-lgb-PCA-test-one.csv
 5 model-05-lgb-wo-per-area-cv.csv
 5 model-05-lgb-wo-per-area-test-kfold.csv
 5 model-05-lgb-wo-per-area-test-one.csv
 6 model-06-lgb-lr0.001-cv.csv
 6 model-06-lgb-lr0.001-test-kfold.csv
 6 model-06-lgb-lr0.001-test-one.csv
 7 model-07-keras-embedding-cv.csv
 7 model-07-keras-embedding-test-kfold.csv
 7 model-07-keras-embedding-test-one.csv
 8 model-08-keras-search-long-cv.csv
 8 model-08-keras-search-long-test-kfold.csv
 8 model-08-keras-search-long-test-one.csv
 9 model-09-lgb-feats-selection-75-cv.csv
 9 model-09-lgb-feats-selection-75-test-kfold.csv
 9 model-09-lgb-fea

#### Load area

In [8]:
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [9]:
cv = df_train[['building_id', 'building_area', 'total_price']]
test = df_test[['building_id', 'building_area']]

In [10]:
print('CV predictions:')
print(len(idx_models))
for i, idx_model in enumerate(idx_models):
    f = files_cv[idx_model]
    print(f)
#    print('No. {} file: {}'.format(i, f))
    df = pd.read_csv('output/'+f)
    
    cv = pd.merge(cv, df[['building_id', 'total_price_predict']], on='building_id')
    
    cv = cv.rename(columns = {'total_price_predict': 'pred_{}'.format(idx_model)})
    cv[f'log_pred_{idx_model}'] = np.log1p(cv[f'pred_{idx_model}'])
    cv[f'log_parea_pred_{idx_model}'] = np.log1p( cv[f'pred_{idx_model}'] / cv['building_area'] )

CV predictions:
31
model-01-lgb-cv.csv
model-02-keras-search-cv.csv
model-03-lgb-feats-selection-cv.csv
model-04-lgb-PCA-cv.csv
model-05-lgb-wo-per-area-cv.csv
model-06-lgb-lr0.001-cv.csv
model-07-keras-embedding-cv.csv
model-08-keras-search-long-cv.csv
model-09-lgb-feats-selection-75-cv.csv
model-10-lgb-feats-selection-75-lr-0.001-cv.csv
model-11-rf-cv.csv
model-12-predict-keras-search-prelu-cv.csv
model-13-predict-keras-he_uni-cv.csv
model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
model-16-lgb-feats-selection-68-lr-0.001-mix5-cv.csv
model-17-lgb-feats-selection-70-lr-0.001-mix5-cv.csv
model-18-lgb-feats-selection-70-lr-0.001-p5-cv.csv
model-19-lgb-search-bins-lr-0.0005-cv.csv
model-20-lgb-lr-0.0008-mix5-cv.csv
model-21-lgb-wo-per-area-long-cv.csv
model-22-lgb-wo-per-area-long-2-cv.csv
model-23-lgb-binary-cv.csv
model-24-lgb-binary-augment-cv.csv
model-25-lgb-search-bins-lr-0.0005-250-cv.csv
model-26-lgb-search-bins-lr-0.000

In [11]:
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p( cv['total_price'] / cv['building_area'] )

print('Test predictions:')
for i, idx in enumerate(idx_models):
    f = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    print('No. {} file: {}'.format(i, f))
    df = pd.read_csv('output/'+f)

    test = pd.merge(test, df[['building_id','total_price']], on='building_id')
        
    test = test.rename(columns = {'total_price': 'pred_{}'.format(idx)})
    test[f'log_pred_{idx}'] = np.log1p(test[f'pred_{idx}'])
    test[f'log_parea_pred_{idx}'] = np.log1p( test[f'pred_{idx}'] / test['building_area'] )

Test predictions:
No. 0 file: model-01-lgb-test-one.csv
No. 1 file: model-02-keras-search-test-kfold.csv
No. 2 file: model-03-lgb-feats-selection-test-one.csv
No. 3 file: model-04-lgb-PCA-test-one.csv
No. 4 file: model-05-lgb-wo-per-area-test-one.csv
No. 5 file: model-06-lgb-lr0.001-test-one.csv
No. 6 file: model-07-keras-embedding-test-kfold.csv
No. 7 file: model-08-keras-search-long-test-kfold.csv
No. 8 file: model-09-lgb-feats-selection-75-test-one.csv
No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-test-one.csv
No. 10 file: model-11-rf-test-one.csv
No. 11 file: model-12-predict-keras-search-prelu-test-kfold.csv
No. 12 file: model-13-predict-keras-he_uni-test-kfold.csv
No. 13 file: model-14-lgb-feats-selection-75-lr-0.001-rand-test-one.csv
No. 14 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-test-one.csv
No. 15 file: model-16-lgb-feats-selection-68-lr-0.001-mix5-test-one.csv
No. 16 file: model-17-lgb-feats-selection-70-lr-0.001-mix5-test-one.csv
No. 17 file: model-18-l

In [12]:
display(cv.head())
display(test.head())

Unnamed: 0,building_id,building_area,total_price,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,...,log_pred_29,log_parea_pred_29,pred_30,log_pred_30,log_parea_pred_30,pred_31,log_pred_31,log_parea_pred_31,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,647603.8,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,...,13.349368,12.120265,668726.2,13.413132,12.184028,678901.1,13.428232,12.199129,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3321452.0,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,...,14.961478,13.56491,3180461.0,14.972537,13.575969,2996046.0,14.912804,13.516237,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9570885.0,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,...,16.094058,14.374103,9739343.0,16.091684,14.371729,9840726.0,16.10204,14.382085,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,14215010.0,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,...,16.351248,13.743901,12424920.0,16.335215,13.727868,12463210.0,16.338292,13.730945,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,762712.0,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,...,13.936413,12.391388,1227169.0,14.020221,12.475195,1159105.0,13.96316,12.418134,13.544637,11.999613


Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_28,pred_29,log_pred_29,log_parea_pred_29,pred_30,log_pred_30,log_parea_pred_30,pred_31,log_pred_31,log_parea_pred_31
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.284537,14588510.0,16.495745,15.266638,13418230.0,16.412125,15.183018,12356580.0,16.329699,15.100592
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.138282,3930381.0,15.184247,13.139628,3931973.0,15.184652,13.140033,3933268.0,15.184982,13.140363
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.683307,10536250.0,16.170332,13.671312,12212390.0,16.317962,13.818941,10399000.0,16.15722,13.658199
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.799625,5922934.0,15.594343,14.78241,5957837.0,15.600218,14.788286,5944207.0,15.597928,14.785996
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.131217,1092658.0,13.904125,12.143863,1110080.0,13.919943,12.159682,1140621.0,13.947084,12.186822


### Check models scores

In [13]:
for i, idx_model in enumerate(idx_models):
    print('%2d'%i, 'model-%02d'%idx_model, '%.6f'%cal_score(cv['total_price'], cv[f'pred_{idx_model}']))

 0 model-01 5870.873059
 1 model-02 5400.852164
 2 model-03 5877.873452
 3 model-04 5713.867808
 4 model-05 5724.869598
 5 model-06 5886.873769
 6 model-07 5171.836449
 7 model-08 5514.858826
 8 model-09 5872.873118
 9 model-10 5897.873845
10 model-11 5075.838018
11 model-12 5486.856963
12 model-13 5506.858055
13 model-14 5908.873901
14 model-15 5900.873836
15 model-16 5907.874126
16 model-17 5905.874165
17 model-18 5908.874297
18 model-19 5911.874156
19 model-20 5908.874040
20 model-21 5758.870702
21 model-22 5752.870671
22 model-23 5852.871357
23 model-24 5866.873835
24 model-25 5908.874238
25 model-26 5918.873998
26 model-27 5892.873194
27 model-28 5901.874202
28 model-29 5884.873848
29 model-30 5864.872101
30 model-31 5783.868923


In [14]:
cv['constant_1'] = 1
test['constant_1'] = 1

if is_per_area:
    cols_opt = [f'log_parea_pred_{idx}' for idx in idx_models]
else:
    cols_opt = [f'log_pred_{idx}' for idx in idx_models]

if add_intercept:
    cols_opt.append('constant_1')

### Define opt function

In [15]:
def objective(x, idx, metric, best_score, best_coeffs, verbose):
    cv_pred_final = cv.loc[idx,cols_opt].dot(x)
    
    if is_per_area:
        cv_pred_final = np.expm1(cv_pred_final) * cv.loc[idx,'building_area']
    else:
        cv_pred_final = np.expm1(cv_pred_final)

    score = cal_score(cv.loc[idx,'total_price'], cv_pred_final)
    if score > best_score[metric]:
        best_score[metric] = score
        best_coeffs[metric] = x.copy()
        if verbose:
            print('find better score:')
            print('score: ', score)
            print('coeffs: ', x)
            print()
    
    if metric == 'mape':
        return cal_mape(cv.loc[idx,'total_price'], cv_pred_final)
    elif metric == 'mse':
        return mean_squared_error(cv.loc[idx,'total_price'], cv_pred_final)
    elif metric == 'mae':
        return mean_absolute_error(cv.loc[idx,'total_price'], cv_pred_final)
    elif metric == 'smooth':
        return cal_score_smooth(cv['total_price'], cv_pred_final)
    else:
        raise Exception('metric unknown: {}'.format(metric))
#    return 1 - (cal_score(cv['total_price'], cv_pred_final)/10000)

### CV

In [16]:
from sklearn.model_selection import KFold

cv = cv.reset_index(drop=True)
#cv = cv.head(100)

len_x = len(cols_opt)
rev_len_x = 1/len_x
x0s = [[1/len_x for i in range(len_x)],
       [0, 0, 0.05263157894736842, 0, 0.05263157894736842, 0, 0, 0.05263157894736842, 0, 0, 0, 0.05263157894736842,
        0.05263157894736842, 0, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842,
        0, 0.05263157894736842, 0, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842,
        0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0, 0.05263157894736842,
        0],
       [0 for i in range(len_x)],
       [0.1 for i in range(len_x)],
       [0.2 for i in range(len_x)],
       [0.5 for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)] ]

score_list = []

kf = KFold(shuffle= True)
for idx_train, idx_val in kf.split(cv):

    best_score = {}
    best_coeffs = {}

    for metric in ['mape']:
    #for metric in ['mape', 'mae', 'mse']:
        best_score[metric] = 0
        best_coeffs[metric] = []
        for x0 in x0s:
#            print('Optimizing with init x0: {}'.format(x0))
#            print()
            minimize(objective, x0, args=(idx_train, metric, best_score, best_coeffs, False), tol=1e-4)
    
    val_pred_final = cv.loc[idx_val, cols_opt].dot(best_coeffs['mape'])
    if is_per_area:
        val_pred_final = np.expm1(val_pred_final) * cv.loc[idx_val,'building_area']
    else:
        val_pred_final = np.expm1(val_pred_final)
    score = cal_score(cv.loc[idx_val, 'total_price'], val_pred_final)
    
    score_list.append(score)

print('CV score ?: {}; {}'.format(np.mean(score_list), score_list))

  import sys
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  import sys
  import sys
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  import sys
  import sys
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  import sys


CV score ?: 5953.542888454808; [5963.87614336965, 5933.875253045275, 5962.877268949499]


### Optimize

In [17]:
best_score = {}
best_coeffs = {}

len_x = len(cols_opt)
rev_len_x = 1/len_x
x0s = [[1/len_x for i in range(len_x)],
       [0, 0, 0.05263157894736842, 0, 0.05263157894736842, 0, 0, 0.05263157894736842, 0, 0, 0, 0.05263157894736842,
        0.05263157894736842, 0, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842,
        0, 0.05263157894736842, 0, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842,
        0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0.05263157894736842, 0, 0.05263157894736842,
        0],
       [0 for i in range(len_x)],
       [0.1 for i in range(len_x)],
       [0.2 for i in range(len_x)],
       [0.5 for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)] ]

for metric in ['mape']:
#for metric in ['mape', 'mae', 'mse']:
    best_score[metric] = 0
    best_coeffs[metric] = []
    for x0 in x0s:
        print('Optimizing with init x0: {}'.format(x0))
        print()
        minimize(objective, x0, args=(cv.index, metric, best_score, best_coeffs, True), tol=1e-4)

Optimizing with init x0: [0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125]

find better score:
score:  228.6362609930938
coeffs:  [0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125
 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125
 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125 0.03125
 0.03125 0.03125 0.03125 0.03125 0.03125]

find better score:
score:  228.6362611317718
coeffs:  [0.03125001 0.03125    0.03125    0.03125    0.03125    0.03125
 0.03125    0.03125    0.03125    0.03125    0.03125    0.03125
 0.03125    0.03125    0.03125    0.03125    0.03125    0.03125
 0.03125    0.03125    0.03125    0.03125    0.03125    0.03125
 0.03125    0.03125    0.03125    0.03125    0.03125    0.0

find better score:
score:  5958.876627948367
coeffs:  [ 0.02787818  0.03637302  0.03259742  0.02941103  0.04604568  0.03024134
  0.03408882  0.0457697   0.0289421   0.0306263  -0.07975444  0.03636437
  0.03502899  0.03149795  0.03051846  0.03274055  0.0332368   0.03518183
  0.03342985  0.03224797  0.04530024  0.04575447  0.03972202  0.0495342
  0.03549926  0.03162705  0.03588846  0.03424779  0.03520987  0.04408628
  0.04257159 -0.03911001]

find better score:
score:  5960.876668142638
coeffs:  [ 0.02635538  0.03645316  0.03251682  0.02965929  0.0493096   0.02946144
  0.03329448  0.04888462  0.02777094  0.02999079 -0.10128821  0.03657154
  0.03536909  0.03111985  0.02984551  0.0326986   0.03331159  0.03584088
  0.03364813  0.03199743  0.04834688  0.04893121  0.04128503  0.05396502
  0.03614813  0.03146285  0.03738042  0.03451507  0.03587883  0.04703462
  0.04503235 -0.05367066]

find better score:
score:  5960.876675515882
coeffs:  [ 0.02534738  0.03593241  0.03230726  0.03039076  0.050

  import sys
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  import sys


Optimizing with init x0: [-1.0002816207289291, 0.8691142193256435, -1.274593543507279, -0.0779756333541868, -0.20610742103419472, 0.4070408143466146, 0.4774126130344732, -1.0008209208579426, -0.439125127889783, -0.21725806695024838, 0.13701729669915785, 0.7151621143278234, 1.8484122348784255, -0.7106097962812747, 0.38359106456607756, -0.5116532089218171, -1.121776952663538, -0.6663317249733918, 0.09025391324086693, 0.8711553651693659, -1.0508298931292392, -0.20565958861547584, -0.6915383376782511, -0.7718862664896665, -0.46212994851815886, -0.6301641734863925, -0.7570704974409744, 1.2847203651427057, 0.42038450004015876, -0.009274428761146182, -2.167099473395166, -1.9848805360476711]

Optimizing with init x0: [-1.7211276202790733, 0.2821952493733353, -0.12568715556383206, -1.4327786749016507, -0.5191585186317217, -0.550185288318647, 1.9135545996782932, 0.18629908102060921, 0.6328894250307792, -0.6017552688156182, 1.3554739608344475, -1.6892385550859053, -1.5458406323876936, -0.70554177

In [18]:
display(best_score)
display(best_coeffs)

{'mape': 5967.876864732974}

{'mape': array([-0.04451305,  0.01554676,  0.04277345,  0.08348524,  0.08342526,
        -0.02442229,  0.02469982,  0.09148885, -0.03243444, -0.0288779 ,
        -0.08703782,  0.00519713,  0.03297476, -0.02196949,  0.02124937,
         0.03620286,  0.03864656,  0.05404498, -0.01135417,  0.02154517,
         0.03235905,  0.08781572,  0.06278014,  0.13330872,  0.04461335,
         0.03728363,  0.1014006 ,  0.03370472,  0.0467694 ,  0.05973887,
         0.06474666, -0.09147154])}

### Compute submission

In [None]:
test_pred_final = pd.DataFrame({'building_id': test['building_id']})

test_pred_final['total_price'] = test.loc[:,cols_opt].dot(best_coeffs['mape'])

if is_per_area:
    test_pred_final['total_price'] = np.expm1(test_pred_final['total_price']) * test['building_area'] 
else:
    test_pred_final['total_price'] = np.expm1(test_pred_final['total_price'])
    
test_pred_final['total_price'] = np.clip(test_pred_final['total_price'], 0, None)

if is_per_area:
    test_pred_final.to_csv('output/stack_spopt-parea_{}_{}.csv'.format(stack_idx, models), index=False)
else:
    test_pred_final.to_csv('output/stack_spopt_{}_{}.csv'.format(stack_idx, models), index=False)

### Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(cv['log_parea_total_price'], bins=100, label='train true', normed=True)
plt.hist(np.log1p(test_pred_final['total_price'] / test['building_area']), bins=100, label='test',
         normed=True, alpha=0.7)
plt.xlabel('log(price/area + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
plt.hist(cv['log_total_price'], bins=100, label='train true', normed=True)
plt.hist(np.log1p(test_pred_final['total_price']), bins=100, label='test', normed=True, alpha=0.7)
plt.xlabel('log(price + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
plt.hist(np.log1p(cv['building_area']), bins=100, label='train', normed=True)
plt.hist(np.log1p(test['building_area']), bins=100, label='test', normed=True, alpha=0.7)
plt.xlabel('log(building_area + 1)'); plt.ylabel('ratio'); plt.yscale('log')
plt.legend(); plt.grid(); plt.show()