In [1]:
%autosave 0

Autosave disabled


In [2]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error
from utilities import cal_score, cal_mape

In [3]:
stack_idx = '11'
models = '1-11,14-17'
use_test_kfold = set([2, 7, 8, 12, 13])

is_per_area = True
add_intercept = True

### Read CV predictions and test

In [4]:
def parse_models(exp):
    exp_split = exp.split(',')
    idx_models = []
    for e in exp_split:
        if '-' in e:
            n0, n1 = e.split('-')
            idx_models.extend(list(range(int(n0), int(n1)+1, 1)))
        else:
            idx_models.append(int(e))
    return idx_models

In [5]:
idx_models = parse_models(models)
print(idx_models)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17]


In [6]:
files_in_output = [f for f in os.listdir('output/') if os.path.isfile('output/'+f)]
files_cv = {idx: [f for f in files_in_output 
                  if f.startswith('model-%02d-' % idx) and f.endswith('cv.csv')][0] 
            for idx in idx_models}
files_test_one = {idx: [f for f in files_in_output 
                        if f.startswith('model-%02d-' % idx) and f.endswith('test-one.csv')][0]
                  for idx in idx_models}
files_test_kf = {idx: [f for f in files_in_output 
                       if f.startswith('model-%02d-' % idx) and f.endswith('test-kfold.csv')][0]
                 for idx in idx_models}

In [7]:
for k in files_cv: 
    print('%2d'%k, files_cv[k])
    print('%2d'%k, files_test_kf[k])
    print('%2d'%k, files_test_one[k])

 1 model-01-lgb-cv.csv
 1 model-01-lgb-test-kfold.csv
 1 model-01-lgb-test-one.csv
 2 model-02-keras-search-cv.csv
 2 model-02-keras-search-test-kfold.csv
 2 model-02-keras-search-test-one.csv
 3 model-03-lgb-feats-selection-cv.csv
 3 model-03-lgb-feats-selection-test-kfold.csv
 3 model-03-lgb-feats-selection-test-one.csv
 4 model-04-lgb-PCA-cv.csv
 4 model-04-lgb-PCA-test-kfold.csv
 4 model-04-lgb-PCA-test-one.csv
 5 model-05-lgb-wo-per-area-cv.csv
 5 model-05-lgb-wo-per-area-test-kfold.csv
 5 model-05-lgb-wo-per-area-test-one.csv
 6 model-06-lgb-lr0.001-cv.csv
 6 model-06-lgb-lr0.001-test-kfold.csv
 6 model-06-lgb-lr0.001-test-one.csv
 7 model-07-keras-embedding-cv.csv
 7 model-07-keras-embedding-test-kfold.csv
 7 model-07-keras-embedding-test-one.csv
 8 model-08-keras-search-long-cv.csv
 8 model-08-keras-search-long-test-kfold.csv
 8 model-08-keras-search-long-test-one.csv
 9 model-09-lgb-feats-selection-75-cv.csv
 9 model-09-lgb-feats-selection-75-test-kfold.csv
 9 model-09-lgb-fea

#### Load area

In [8]:
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [9]:
cv = df_train[['building_id', 'building_area', 'total_price']]
test = df_test[['building_id', 'building_area']]

In [10]:
print('CV predictions:')
print(len(idx_models))
for i, idx_model in enumerate(idx_models):
    f = files_cv[idx_model]
    print(f)
#    print('No. {} file: {}'.format(i, f))
    df = pd.read_csv('output/'+f)
    
    cv = pd.merge(cv, df[['building_id', 'total_price_predict']], on='building_id')
    
    cv = cv.rename(columns = {'total_price_predict': 'pred_{}'.format(idx_model)})
    cv[f'log_pred_{idx_model}'] = np.log1p(cv[f'pred_{idx_model}'])
    cv[f'log_parea_pred_{idx_model}'] = np.log1p( cv[f'pred_{idx_model}'] / cv['building_area'] )

CV predictions:
15
model-01-lgb-cv.csv
model-02-keras-search-cv.csv
model-03-lgb-feats-selection-cv.csv
model-04-lgb-PCA-cv.csv
model-05-lgb-wo-per-area-cv.csv
model-06-lgb-lr0.001-cv.csv
model-07-keras-embedding-cv.csv
model-08-keras-search-long-cv.csv
model-09-lgb-feats-selection-75-cv.csv
model-10-lgb-feats-selection-75-lr-0.001-cv.csv
model-11-rf-cv.csv
model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
model-16-lgb-feats-selection-68-lr-0.001-mix5-cv.csv
model-17-lgb-feats-selection-70-lr-0.001-mix5-cv.csv


In [11]:
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p( cv['total_price'] / cv['building_area'] )

print('Test predictions:')
for i, idx in enumerate(idx_models):
    f = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    print('No. {} file: {}'.format(i, f))
    df = pd.read_csv('output/'+f)

    test = pd.merge(test, df[['building_id','total_price']], on='building_id')
        
    test = test.rename(columns = {'total_price': 'pred_{}'.format(idx)})
    test[f'log_pred_{idx}'] = np.log1p(test[f'pred_{idx}'])
    test[f'log_parea_pred_{idx}'] = np.log1p( test[f'pred_{idx}'] / test['building_area'] )

Test predictions:
No. 0 file: model-01-lgb-test-one.csv
No. 1 file: model-02-keras-search-test-kfold.csv
No. 2 file: model-03-lgb-feats-selection-test-one.csv
No. 3 file: model-04-lgb-PCA-test-one.csv
No. 4 file: model-05-lgb-wo-per-area-test-one.csv
No. 5 file: model-06-lgb-lr0.001-test-one.csv
No. 6 file: model-07-keras-embedding-test-kfold.csv
No. 7 file: model-08-keras-search-long-test-kfold.csv
No. 8 file: model-09-lgb-feats-selection-75-test-one.csv
No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-test-one.csv
No. 10 file: model-11-rf-test-one.csv
No. 11 file: model-14-lgb-feats-selection-75-lr-0.001-rand-test-one.csv
No. 12 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-test-one.csv
No. 13 file: model-16-lgb-feats-selection-68-lr-0.001-mix5-test-one.csv
No. 14 file: model-17-lgb-feats-selection-70-lr-0.001-mix5-test-one.csv


In [12]:
display(cv.head())
display(test.head())

Unnamed: 0,building_id,building_area,total_price,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,...,log_pred_15,log_parea_pred_15,pred_16,log_pred_16,log_parea_pred_16,pred_17,log_pred_17,log_parea_pred_17,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,647603.8,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,...,13.411432,12.182328,654246.2,13.391241,12.162137,655244.9,13.392766,12.163663,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3321452.0,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,...,14.96392,13.567353,3128284.0,14.955995,13.559428,3130186.0,14.956603,13.560036,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9570885.0,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,...,16.09143,14.371475,9758797.0,16.09368,14.373725,9787304.0,16.096597,14.376642,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,14215010.0,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,...,16.351229,13.743883,12668680.0,16.354644,13.747297,12708930.0,16.357815,13.750469,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,762712.0,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,...,13.923267,12.378241,1122232.0,13.930831,12.385805,1110012.0,13.919882,12.374856,13.544637,11.999613


Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_14,pred_15,log_pred_15,log_parea_pred_15,pred_16,log_pred_16,log_parea_pred_16,pred_17,log_pred_17,log_parea_pred_17
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.270289,14763880.0,16.507694,15.278587,13886340.0,16.446416,15.21731,14499050.0,16.489594,15.260487
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.14467,3928353.0,15.183731,13.139112,3918650.0,15.181258,13.136639,3917960.0,15.181082,13.136463
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.677153,10629000.0,16.179096,13.680076,10735460.0,16.189063,13.690042,10633430.0,16.179513,13.680493
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.81515,6085773.0,15.621464,14.809532,6080935.0,15.620669,14.808737,6119193.0,15.626941,14.815009
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.116189,1061918.0,13.875588,12.115326,1072461.0,13.885468,12.125206,1084776.0,13.896885,12.136623


### Check models scores

In [13]:
for i, idx_model in enumerate(idx_models):
    print('%2d'%i, 'model-%02d'%idx_model, '%.6f'%cal_score(cv['total_price'], cv[f'pred_{idx_model}']))

 0 model-01 5870.873059
 1 model-02 5400.852164
 2 model-03 5877.873452
 3 model-04 5713.867808
 4 model-05 5724.869598
 5 model-06 5886.873769
 6 model-07 5171.836449
 7 model-08 5514.858826
 8 model-09 5872.873118
 9 model-10 5897.873845
10 model-11 5075.838018
11 model-14 5908.873901
12 model-15 5900.873836
13 model-16 5907.874126
14 model-17 5905.874165


In [14]:
cv['constant_1'] = 1
test['constant_1'] = 1

if is_per_area:
    cols_opt = [f'log_parea_pred_{idx}' for idx in idx_models]
else:
    cols_opt = [f'log_pred_{idx}' for idx in idx_models]

if add_intercept:
    cols_opt.append('constant_1')

### Define opt function

In [15]:
def objective(x, metric):
    cv_pred_final = cv.loc[:,cols_opt].dot(x)
    
    if is_per_area:
        cv_pred_final = np.expm1(cv_pred_final) * cv['building_area']
    else:
        cv_pred_final = np.expm1(cv_pred_final)

    global best_score
    global best_coeffs
    score = cal_score(cv['total_price'], cv_pred_final)
    if score > best_score[metric]:
        best_score[metric] = score
        best_coeffs[metric] = x.copy()
        print('find better score:')
        print('score: ', score)
        print('coeffs: ', x)
        print()
    
    if metric == 'mape':
        return cal_mape(cv['total_price'], cv_pred_final)
    elif metric == 'mse':
        return mean_squared_error(cv['total_price'], cv_pred_final)
    elif metric == 'mae':
        return mean_absolute_error(cv['total_price'], cv_pred_final)
    else:
        raise Exception('metric unknown: {}'.format(metric))
#    return 1 - (cal_score(cv['total_price'], cv_pred_final)/10000)

### Optimize

In [16]:
best_score = {}
best_coeffs = {}

len_x = len(cols_opt)
rev_len_x = 1/len_x
x0s = [ [1/len_x for i in range(len_x)],
       [0 for i in range(len_x)],
       [0.1 for i in range(len_x)],
       [0.2 for i in range(len_x)],
       [0.5 for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)],
       [np.random.randn()+1/len_x for i in range(len_x)] ]

#for metric in ['mape']:
for metric in ['mape', 'mae', 'mse']:
    best_score[metric] = 0
    best_coeffs[metric] = []
    for x0 in x0s:
        print('Optimizing with init x0: {}'.format(x0))
        print()
        try:
            minimize(objective, x0, args=(metric))
        except:
            pass

Optimizing with init x0: [0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]

find better score:
score:  32.46306661839966
coeffs:  [0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625
 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625]

find better score:
score:  32.46306670970732
coeffs:  [0.06250001 0.0625     0.0625     0.0625     0.0625     0.0625
 0.0625     0.0625     0.0625     0.0625     0.0625     0.0625
 0.0625     0.0625     0.0625     0.0625    ]

find better score:
score:  32.46306670971289
coeffs:  [0.0625     0.0625     0.0625     0.06250001 0.0625     0.0625
 0.0625     0.0625     0.0625     0.0625     0.0625     0.0625
 0.0625     0.0625     0.0625     0.0625    ]

find better score:
score:  32.46306670977765
coeffs:  [0.0625     0.0625     0.0625     0.0625     0.0625     0.0625
 0.0625     0.0625     0.0625     0.0625     0.06250001 0.0625
 0.0625     0.0625     0.0625     0.0625 

find better score:
score:  5951.876385450154
coeffs:  [ 0.06442318  0.0504569   0.07831481  0.0753702   0.12443708  0.07260677
  0.03846533  0.09497738  0.06760441  0.08058043 -0.08048914  0.08264033
  0.07990334  0.08741263  0.08881394 -0.08575263]

find better score:
score:  5954.876420476083
coeffs:  [ 0.05116755  0.04078176  0.07437522  0.08413418  0.14681545  0.06486983
  0.02747244  0.11266676  0.05676691  0.07798508 -0.0680091   0.08132371
  0.07685474  0.08912088  0.0913066  -0.11487316]

find better score:
score:  5954.876424586274
coeffs:  [ 0.05066112  0.04036093  0.07467303  0.08437222  0.14872186  0.0648133
  0.02733632  0.11378618  0.05646342  0.07839802 -0.07281855  0.0818519
  0.07722971  0.08989301  0.09215081 -0.11863717]

find better score:
score:  5954.876424586277
coeffs:  [ 0.05066112  0.04036093  0.07467303  0.08437222  0.14872186  0.0648133
  0.02733632  0.11378618  0.05646342  0.07839802 -0.07281855  0.0818519
  0.07722971  0.08989301  0.09215081 -0.11863715]



  """
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  """


Optimizing with init x0: [0.03972207102621735, -0.9297438947176777, -0.15485387425449446, 0.9020171214756327, 2.7290192415094934, 0.06937253292819484, 0.5451364361054207, 2.07466701220561, -0.08043049136928021, 1.7801025546900116, -0.506898548749846, 0.19925180292488592, 1.580100366001101, -0.3103856451108657, -0.5265560879545358, -1.4684488384457435]



  """
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  """


Optimizing with init x0: [-2.1122369666782315, 0.2305242708454739, -0.622899062733903, 0.1938246311237144, -0.740870755054716, -0.41802791346156526, -0.6080437897961751, -1.106691071677159, -1.1120260599266687, -0.25173667047862713, 1.4871211331380767, 1.0460408618170747, 1.53451960163091, 0.20075410725724077, -0.024741814675871984, 0.6381472567807771]

Optimizing with init x0: [-0.16210389449191376, 2.0282426185568743, 0.8463494933417586, 0.5688863756724017, 1.0926407715149298, -2.6202247860606844, 0.2906880026066536, -0.14117014463081132, 0.8153404568504985, 0.9082642060294497, 0.433621925361761, 0.8977118335482885, 0.04148670286377694, -2.6977170872532854, 1.237665318980641, -1.404187071738447]

Optimizing with init x0: [0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]

find better score:
score:  32.46306661839966
coeffs:  [0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625
 0.0625 0.06

  """


Optimizing with init x0: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

Optimizing with init x0: [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

Optimizing with init x0: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]



  """


Optimizing with init x0: [0.03972207102621735, -0.9297438947176777, -0.15485387425449446, 0.9020171214756327, 2.7290192415094934, 0.06937253292819484, 0.5451364361054207, 2.07466701220561, -0.08043049136928021, 1.7801025546900116, -0.506898548749846, 0.19925180292488592, 1.580100366001101, -0.3103856451108657, -0.5265560879545358, -1.4684488384457435]



  """


Optimizing with init x0: [-2.1122369666782315, 0.2305242708454739, -0.622899062733903, 0.1938246311237144, -0.740870755054716, -0.41802791346156526, -0.6080437897961751, -1.106691071677159, -1.1120260599266687, -0.25173667047862713, 1.4871211331380767, 1.0460408618170747, 1.53451960163091, 0.20075410725724077, -0.024741814675871984, 0.6381472567807771]

Optimizing with init x0: [-0.16210389449191376, 2.0282426185568743, 0.8463494933417586, 0.5688863756724017, 1.0926407715149298, -2.6202247860606844, 0.2906880026066536, -0.14117014463081132, 0.8153404568504985, 0.9082642060294497, 0.433621925361761, 0.8977118335482885, 0.04148670286377694, -2.6977170872532854, 1.237665318980641, -1.404187071738447]

Optimizing with init x0: [0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]

find better score:
score:  32.46306661839966
coeffs:  [0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625
 0.0625 0.06

  """


Optimizing with init x0: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

Optimizing with init x0: [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

Optimizing with init x0: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]



  """


Optimizing with init x0: [0.03972207102621735, -0.9297438947176777, -0.15485387425449446, 0.9020171214756327, 2.7290192415094934, 0.06937253292819484, 0.5451364361054207, 2.07466701220561, -0.08043049136928021, 1.7801025546900116, -0.506898548749846, 0.19925180292488592, 1.580100366001101, -0.3103856451108657, -0.5265560879545358, -1.4684488384457435]



  """


Optimizing with init x0: [-2.1122369666782315, 0.2305242708454739, -0.622899062733903, 0.1938246311237144, -0.740870755054716, -0.41802791346156526, -0.6080437897961751, -1.106691071677159, -1.1120260599266687, -0.25173667047862713, 1.4871211331380767, 1.0460408618170747, 1.53451960163091, 0.20075410725724077, -0.024741814675871984, 0.6381472567807771]

Optimizing with init x0: [-0.16210389449191376, 2.0282426185568743, 0.8463494933417586, 0.5688863756724017, 1.0926407715149298, -2.6202247860606844, 0.2906880026066536, -0.14117014463081132, 0.8153404568504985, 0.9082642060294497, 0.433621925361761, 0.8977118335482885, 0.04148670286377694, -2.6977170872532854, 1.237665318980641, -1.404187071738447]



  """


In [17]:
display(best_score)
display(best_coeffs)

{'mape': 5954.876424586277,
 'mae': 5914.8749480355655,
 'mse': 5828.869558149154}

{'mape': array([ 0.05066112,  0.04036093,  0.07467303,  0.08437222,  0.14872186,
         0.0648133 ,  0.02733632,  0.11378618,  0.05646342,  0.07839802,
        -0.07281855,  0.0818519 ,  0.07722971,  0.08989301,  0.09215081,
        -0.11863715]),
 'mae': array([-0.08153267,  0.02620861,  0.52079418, -0.00813862,  0.09563322,
         0.02024393,  0.10788832,  0.14710438,  0.05631462, -0.11357892,
        -0.16181791,  1.40911754, -1.99682421,  0.04225086,  0.9368028 ,
        -0.00809569]),
 'mse': array([0.06674593, 0.06730138, 0.06674593, 0.06711623, 0.06276525,
        0.06674593, 0.06776425, 0.06730138, 0.06674593, 0.06711623,
        0.06674593, 0.06702366, 0.06702366, 0.06711623, 0.06702366,
        0.00472128])}

### Compute submission

In [None]:
test_pred_final = pd.DataFrame({'building_id': test['building_id']})

test_pred_final['total_price'] = test.loc[:,cols_opt].dot(best_coeffs['mape'])

if is_per_area:
    test_pred_final['total_price'] = np.expm1(test_pred_final['total_price']) * test['building_area'] 
else:
    test_pred_final['total_price'] = np.expm1(test_pred_final['total_price'])
    
test_pred_final['total_price'] = np.clip(test_pred_final['total_price'], 0, None)

if is_per_area:
    test_pred_final.to_csv('output/stack_spopt-parea_{}_{}.csv'.format(stack_idx, models), index=False)
else:
    test_pred_final.to_csv('output/stack_spopt_{}_{}.csv'.format(stack_idx, models), index=False)

### Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(cv['log_parea_total_price'], bins=100, label='train true', normed=True)
plt.hist(np.log1p(test_pred_final['total_price'] / test['building_area']), bins=100, label='test',
         normed=True, alpha=0.7)
plt.xlabel('log(price/area + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
plt.hist(cv['log_total_price'], bins=100, label='train true', normed=True)
plt.hist(np.log1p(test_pred_final['total_price']), bins=100, label='test', normed=True, alpha=0.7)
plt.xlabel('log(price + 1)'); plt.ylabel('ratio')
plt.legend(); plt.grid(); plt.show()

In [None]:
plt.hist(np.log1p(cv['building_area']), bins=100, label='train', normed=True)
plt.hist(np.log1p(test['building_area']), bins=100, label='test', normed=True, alpha=0.7)
plt.xlabel('log(building_area + 1)'); plt.ylabel('ratio'); plt.yscale('log')
plt.legend(); plt.grid(); plt.show()

In [None]:
print([1/17 if i in [3, 4, 7, 8, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26] else 0 \
 for i in list(range(1,24)) + list(range(25,28))])