In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import KFold
from utilities import cal_score

In [2]:
# --- Stacking configuration -------------------------------------------------
# Tag written into the name of the submission file.
stack_idx = '10'
# Which base models to stack: comma-separated indices and inclusive ranges.
models = '1-11,14-15'
# Models whose test prediction is read from the "test-kfold" file
# (all others use the "test-one" file).
use_test_kfold = {2, 7, 8}

# True: regress on log price per unit of building_area; False: on log total price.
is_per_area = False

### Read CV and test predictions

In [3]:
def parse_models(exp):
    """Expand a model-selection expression into a list of model indices.

    `exp` is a comma-separated string whose items are either a single
    integer ('14') or an inclusive range written 'start-end' ('1-11').
    Items are expanded in the order they appear and duplicates are kept.
    Whitespace around an item is ignored, and empty items (an empty
    string, a doubled or trailing comma) are skipped instead of raising.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If an item is neither an integer nor a 'start-end' pair of integers.
    """
    idx_models = []
    for item in exp.split(','):
        item = item.strip()
        if not item:
            # '' and '1-3,' would otherwise reach int('') and fail.
            continue
        if '-' in item:
            n0, n1 = item.split('-')
            # The range is inclusive on both ends.
            idx_models.extend(range(int(n0), int(n1) + 1))
        else:
            idx_models.append(int(item))
    return idx_models

In [4]:
# Expand the `models` expression into the explicit list of model indices to stack.
idx_models = parse_models(models)

In [5]:
# Sanity check: show the expanded list of model indices.
idx_models

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15]

In [6]:
# Sorted because os.listdir returns names in arbitrary order; without it the
# "first match" picked below could differ between runs or machines.
files_in_output = sorted(f for f in os.listdir('output/') if os.path.isfile('output/'+f))


def _find_model_file(idx, tag, required=True):
    """Return the first name in `files_in_output` that belongs to model `idx`.

    A file matches when its name contains both 'model-XX-' (XX = zero-padded
    `idx`) and `tag`. If nothing matches, FileNotFoundError is raised when
    `required` is true; otherwise None is returned.
    """
    prefix = 'model-%02d-' % idx
    matches = [f for f in files_in_output if prefix in f and tag in f]
    if matches:
        return matches[0]
    if required:
        raise FileNotFoundError(
            "no '{}' file for model {} (expected a name containing '{}') in output/".format(
                tag, idx, prefix))
    return None


files_cv = {idx: _find_model_file(idx, 'cv') for idx in idx_models}

# Each model needs only the test variant that is actually read for it:
# 'test-kfold' for models listed in use_test_kfold, 'test-one' for the rest.
# The other variant is recorded when present but is not an error when absent.
files_test_one = {}
files_test_kf = {}
for idx in idx_models:
    f_one = _find_model_file(idx, 'test-one', required=idx not in use_test_kfold)
    f_kf = _find_model_file(idx, 'test-kfold', required=idx in use_test_kfold)
    if f_one is not None:
        files_test_one[idx] = f_one
    if f_kf is not None:
        files_test_kf[idx] = f_kf

In [7]:
# Show which file was picked per model index for each prediction kind.
print(files_cv)
print(files_test_kf)
print(files_test_one)

{1: 'model-01-lgb-cv.csv', 2: 'model-02-keras-search-cv.csv', 3: 'model-03-lgb-feats-selection-cv.csv', 4: 'model-04-lgb-PCA-cv.csv', 5: 'model-05-lgb-wo-per-area-cv.csv', 6: 'model-06-lgb-lr0.001-cv.csv', 7: 'model-07-keras-embedding-cv.csv', 8: 'model-08-keras-search-long-cv.csv', 9: 'model-09-lgb-feats-selection-75-cv.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-cv.csv', 11: 'model-11-rf-cv.csv', 14: 'model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv', 15: 'model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv'}
{1: 'model-01-lgb-test-kfold.csv', 2: 'model-02-keras-search-test-kfold.csv', 3: 'model-03-lgb-feats-selection-test-kfold.csv', 4: 'model-04-lgb-PCA-test-kfold.csv', 5: 'model-05-lgb-wo-per-area-test-kfold.csv', 6: 'model-06-lgb-lr0.001-test-kfold.csv', 7: 'model-07-keras-embedding-test-kfold.csv', 8: 'model-08-keras-search-long-test-kfold.csv', 9: 'model-09-lgb-feats-selection-75-test-kfold.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-test-kfold.csv', 11:

#### Load area

In [8]:
# Raw train/test tables; the following cell takes building_id and building_area from them.
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [9]:
def _add_model_predictions(frame, df_pred, source_col, idx):
    """Merge one model's predictions into `frame` and derive its log features.

    Parameters
    ----------
    frame : DataFrame
        Must contain 'building_id' and 'building_area'.
    df_pred : DataFrame
        Must contain 'building_id' and `source_col`.
    source_col : str
        Name of the prediction column in `df_pred`.
    idx : int
        Model index used to name the added columns.

    Returns
    -------
    DataFrame
        `frame` plus pred_{idx} (the raw prediction), log_pred_{idx}
        (log1p of it) and log_parea_pred_{idx} (log1p of the prediction
        divided by building_area).

    Raises
    ------
    ValueError
        If the merge changes the number of rows. pd.merge defaults to an
        inner join, which would otherwise silently drop buildings missing
        from `df_pred` or duplicate rows on repeated building_ids.
    """
    n_rows = len(frame)
    merged = pd.merge(frame, df_pred[['building_id', source_col]], on='building_id')
    if len(merged) != n_rows:
        raise ValueError(
            'model {}: merging on building_id changed the row count from {} to {}'.format(
                idx, n_rows, len(merged)))
    merged = merged.rename(columns={source_col: 'pred_{}'.format(idx)})
    merged[f'log_pred_{idx}'] = np.log1p(merged[f'pred_{idx}'])
    merged[f'log_parea_pred_{idx}'] = np.log1p(merged[f'pred_{idx}'] / merged['building_area'])
    return merged


cv = df_train[['building_id', 'building_area']]
test = df_test[['building_id', 'building_area']]

print('CV predictions:')
df_cv_last = None  # last CV file read; it also supplies the true total_price below
for i, idx in enumerate(idx_models):
    f = files_cv[idx]
    print('No. {} file: {}'.format(i, f))
    df_cv_last = pd.read_csv('output/'+f)
    cv = _add_model_predictions(cv, df_cv_last, 'total_price_predict', idx)

if df_cv_last is None:
    raise ValueError('idx_models is empty: there is no CV file to read total_price from')

# The target is taken from the last CV file that was read, referenced by an
# explicit name instead of relying on a variable leaked from the loop.
cv = pd.merge(cv, df_cv_last[['building_id','total_price']], on='building_id')
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p( cv['total_price'] / cv['building_area'] )

print('Test predictions:')
for i, idx in enumerate(idx_models):
    # Models in use_test_kfold use their 'test-kfold' file, the rest 'test-one'.
    f = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    print('No. {} file: {}'.format(i, f))
    # In the test files the prediction column is named 'total_price'.
    test = _add_model_predictions(test, pd.read_csv('output/'+f), 'total_price', idx)


CV predictions:
No. 0 file: model-01-lgb-cv.csv
No. 1 file: model-02-keras-search-cv.csv
No. 2 file: model-03-lgb-feats-selection-cv.csv
No. 3 file: model-04-lgb-PCA-cv.csv
No. 4 file: model-05-lgb-wo-per-area-cv.csv
No. 5 file: model-06-lgb-lr0.001-cv.csv
No. 6 file: model-07-keras-embedding-cv.csv
No. 7 file: model-08-keras-search-long-cv.csv
No. 8 file: model-09-lgb-feats-selection-75-cv.csv
No. 9 file: model-10-lgb-feats-selection-75-lr-0.001-cv.csv
No. 10 file: model-11-rf-cv.csv
No. 11 file: model-14-lgb-feats-selection-75-lr-0.001-rand-cv.csv
No. 12 file: model-15-lgb-feats-selection-75-lr-0.001-rand323-cv.csv
Test predictions:
No. 0 file: model-01-lgb-test-one.csv
No. 1 file: model-02-keras-search-test-kfold.csv
No. 2 file: model-03-lgb-feats-selection-test-one.csv
No. 3 file: model-04-lgb-PCA-test-one.csv
No. 4 file: model-05-lgb-wo-per-area-test-one.csv
No. 5 file: model-06-lgb-lr0.001-test-one.csv
No. 6 file: model-07-keras-embedding-test-kfold.csv
No. 7 file: model-08-keras

In [10]:
# Preview the merged CV frame: a pred_/log_pred_/log_parea_pred_ column triple
# per model, followed by total_price and its two log transforms.
cv.head()

Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_11,pred_14,log_pred_14,log_parea_pred_14,pred_15,log_pred_15,log_parea_pred_15,total_price,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,13.408579,...,12.618375,669433.6,13.414189,12.185085,667590.4,13.411432,12.182328,647603.75,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,14.940179,...,13.568522,3145496.0,14.961483,13.564915,3153173.0,14.96392,13.567353,3321452.0,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,16.099407,...,14.329188,9748917.0,16.092667,14.372712,9736865.0,16.09143,14.371475,9570885.0,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,16.346013,...,13.693612,12604450.0,16.34956,13.742214,12625500.0,16.351229,13.743883,14215011.0,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,13.936329,...,12.243638,1120621.0,13.929394,12.384368,1113775.0,13.923267,12.378241,762712.0,13.544637,11.999613


In [11]:
# Preview the merged test frame: a pred_/log_pred_/log_parea_pred_ column triple per model.
test.head()

Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_10,pred_11,log_pred_11,log_parea_pred_11,pred_14,log_pred_14,log_parea_pred_14,pred_15,log_pred_15,log_parea_pred_15
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.245186,10849480.0,16.199628,14.970521,14641870.0,16.499396,15.270289,14763880.0,16.507694,15.278587
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.137139,3840545.0,15.161125,13.116506,3950248.0,15.189289,13.14467,3928353.0,15.183731,13.139112
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.68407,8393805.0,15.943005,13.443984,10597980.0,16.176174,13.677153,10629000.0,16.179096,13.680076
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.808952,5798727.0,15.573149,14.761217,6120056.0,15.627082,14.81515,6085773.0,15.621464,14.809532
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.113012,1017048.0,13.832416,12.072154,1062834.0,13.876451,12.116189,1061918.0,13.875588,12.115326


### Make Xy

In [12]:
# Features are the per-model log predictions and the target is the matching
# log price; the per-area variant uses the columns divided by building_area.
if is_per_area:
    feature_prefix, target_col = 'log_parea_pred', 'log_parea_total_price'
else:
    feature_prefix, target_col = 'log_pred', 'log_total_price'

X = cv[['{}_{}'.format(feature_prefix, idx) for idx in idx_models]]
y = cv[target_col]

### Start regression

In [13]:
#reg = LassoCV(alphas=[0]+list(np.logspace(-4, 3, 7)), max_iter=100000, tol=1e-6, n_jobs=-1)
#reg.fit(X, y)

#print(reg.alpha_)
#print(reg.mse_path_)
#print(reg.coef_, reg.intercept_)

In [14]:
#for a in [0]+list(np.logspace(-4, 3, 7)):
#    reg_single = Lasso(alpha=a, max_iter=100000, tol=1e-6)
#    reg_single.fit(X, y)
#    print(reg_single.coef_, reg_single.intercept_)
#    print(reg_single.score(X,y))

In [15]:
# Candidate Lasso alpha values; 0 is handled as plain LinearRegression.
alphas = [0, 0.0001, 0.0002, 0.0005, 0.0008, 0.001, 0.002, 0.005, 0.008, 0.01, 0.02]
gsearch = {}  # alpha -> list of validation scores, one per fold

folds = KFold(n_splits=3, shuffle=True, random_state=1208)
for fold_no, (rows_fit, rows_val) in enumerate(folds.split(X), start=1):
    print('==== Fold', fold_no, '====')

    # Positional split of features and target for this fold.
    X_train, y_train = X.iloc[rows_fit], y.iloc[rows_fit]
    X_val, y_val = X.iloc[rows_val], y.iloc[rows_val]

    # Fit and score every candidate alpha on the same split.
    for a in alphas:
        reg_single = LinearRegression() if a == 0 else Lasso(alpha=a, max_iter=100000, tol=1e-6)
        reg_single.fit(X_train, y_train)
        y_pred = reg_single.predict(X_val)

        # Undo the log1p transform (and the per-area scaling, if used) so the
        # score is computed on prices.
        y_pred_final = np.expm1(y_pred)
        y_true_final = np.expm1(y_val)
        if is_per_area:
            y_pred_final = y_pred_final * cv.iloc[rows_val]['building_area']
            y_true_final = y_true_final * cv.iloc[rows_val]['building_area']
        score = cal_score(y_true_final, y_pred_final)

        print('alpha, score:', a, score)
        gsearch.setdefault(a, []).append(score)

# One row per alpha: [alpha, mean score, per-fold scores], ordered by
# descending mean score.
results = sorted(
    ([alpha, np.mean(scores), scores] for alpha, scores in gsearch.items()),
    key=lambda row: row[1],
    reverse=True,
)
for item in results:
    print(item)

==== Fold 1 ====
alpha, score: 0 5975.874868069612
alpha, score: 0.0001 5977.87493558506
alpha, score: 0.0002 5972.874944859689
alpha, score: 0.0005 5966.874880784547
alpha, score: 0.0008 5965.874693005316
alpha, score: 0.001 5962.874668889054
alpha, score: 0.002 5967.8746792445745
alpha, score: 0.005 5966.874660533554
alpha, score: 0.008 5970.874558976486
alpha, score: 0.01 5970.874444140524
alpha, score: 0.02 5950.8733793291
==== Fold 2 ====
alpha, score: 0 5926.875528823429
alpha, score: 0.0001 5929.875667848964
alpha, score: 0.0002 5932.875706053405
alpha, score: 0.0005 5938.875741592222
alpha, score: 0.0008 5938.87566144204
alpha, score: 0.001 5942.875597339285
alpha, score: 0.002 5950.8756220243695
alpha, score: 0.005 5940.875633720462
alpha, score: 0.008 5937.875564720708
alpha, score: 0.01 5936.8754721656505
alpha, score: 0.02 5906.874532700707
==== Fold 3 ====
alpha, score: 0 5890.874520636779
alpha, score: 0.0001 5895.8745985753685
alpha, score: 0.0002 5887.874621888219
alpha

In [16]:
# The first row of `results` has the highest mean fold score; its alpha is used
# for the final fit on all CV rows.
best_row = results[0]
alpha_set = best_row[0]
print(alpha_set)
# alpha 0 is fitted with LinearRegression rather than Lasso.
reg = LinearRegression() if alpha_set == 0 else Lasso(alpha=alpha_set, max_iter=1000000, tol=1e-6)
reg.fit(X, y)

0.0001


Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=1e-06, warm_start=False)

In [17]:
# Fitted stacking weights (one per column of X, i.e. per model in idx_models order) and the intercept.
print(reg.coef_, reg.intercept_)

[ 0.          0.0561674   0.08121574  0.01876432  0.23048302  0.
  0.10545712  0.04645659  0.          0.07741483 -0.10540269  0.49260326
  0.        ] -0.047196158777669694


### Calculate CV score

In [18]:
# Evaluate the fitted linear model by hand: start from zeros, add each feature
# column times its coefficient in column order, then add the intercept.
cv_pred_final = np.zeros(X.shape[0])
for weight, col in zip(reg.coef_, X):
    cv_pred_final = cv_pred_final + X[col] * weight
cv_pred_final = cv_pred_final + reg.intercept_

# Undo log1p; in per-area mode also multiply back by building_area to get
# total prices.
cv_pred_final = np.expm1(cv_pred_final)
cv_true_final = np.expm1(y)
if is_per_area:
    cv_pred_final = cv_pred_final * cv['building_area']
    cv_true_final = cv_true_final * cv['building_area']

In [19]:
# Side-by-side preview: 'a' = back-transformed target, 'b' = stacked prediction.
pd.DataFrame({'a':cv_true_final,'b':cv_pred_final}).head()

Unnamed: 0,a,b
0,647603.75,635259.3
1,3321452.0,3124398.0
2,9570885.0,9789021.0
3,14215011.0,12772480.0
4,762712.0,1266879.0


In [20]:
# Score of the stacked prediction on the same rows `reg` was fitted on (in-sample).
cal_score(cv_true_final, cv_pred_final)

5936.875242876889

### Compute submission

In [21]:
# Use the same feature family the regression was built on.
col_prefix = 'log_parea_pred' if is_per_area else 'log_pred'

# Apply the fitted weights to the test features: weighted sum over models in
# idx_models order (coefficient i belongs to idx_models[i]), plus the intercept.
log_price = np.zeros(test.shape[0])
for weight, idx in zip(reg.coef_, idx_models):
    log_price = log_price + test[f'{col_prefix}_{idx}'] * weight
log_price = log_price + reg.intercept_

# Undo log1p; in per-area mode multiply back by building_area.
price = np.expm1(log_price)
if is_per_area:
    price = price * test['building_area']
# Clamp negative values to 0.
price = np.clip(price, 0, None)

test_pred_final = pd.DataFrame({'building_id': test['building_id'], 'total_price': price})

# The file name records the stack tag and the model expression used.
out_template = 'output/stack_parea_{}_{}.csv' if is_per_area else 'output/stack_{}_{}.csv'
test_pred_final.to_csv(out_template.format(stack_idx, models), index=False)

In [22]:
#a= pd.read_csv('output/model-03-lgb-feats-selection-cv.csv')
#b= pd.read_csv('output/model-03-lgb-feats-selection-test-one.csv')

In [23]:
#a.rename(columns={'building_id':'id','total_price_predict':'target'}).to_csv('opt-pred3.csv',index=False)
#b.rename(columns={'building_id':'id','total_price':'target'}).to_csv('opt-test3.csv',index=False)