In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import KFold
from utilities import cal_score

In [2]:
# Tag of this stacking run; it becomes part of the submission file name.
stack_idx = '08'
# Base models to blend: comma-separated indices and inclusive "a-b" ranges.
models = '1-7,9-11'
# Models whose test predictions are read from the "test-kfold" file rather than "test-one".
use_test_kfold = {2, 7}

# True: regress on log price per unit of building area; False: regress on log total price.
is_per_area = False

### Read CV and test predictions

In [3]:
def parse_models(exp):
    """Expand a model-selection expression into a list of model indices.

    The expression is a comma-separated list of tokens. Each token is either
    a single integer (``'9'``) or an inclusive range (``'1-7'``).

    Parameters
    ----------
    exp : str
        Expression such as ``'1-7,9-11'``.

    Returns
    -------
    list of int
        Indices in the order they appear in ``exp``; duplicates are kept.
        An empty expression yields an empty list.

    Raises
    ------
    ValueError
        If a non-blank token is neither an integer nor a ``start-end`` pair
        of integers.
    """
    idx_models = []
    for token in exp.split(','):
        token = token.strip()
        if not token:
            # Tolerate empty input and stray/trailing commas instead of
            # failing inside int('') with an unhelpful message.
            continue
        if '-' in token:
            n0, n1 = token.split('-')
            # The upper bound is inclusive, hence the +1.
            idx_models.extend(range(int(n0), int(n1) + 1))
        else:
            idx_models.append(int(token))
    return idx_models

In [4]:
# Expand the `models` expression into the explicit list of model indices to stack.
idx_models = parse_models(models)

In [5]:
# Display the parsed indices as a sanity check.
idx_models

[1, 2, 3, 4, 5, 6, 7, 9, 10, 11]

In [6]:
def _find_output_file(filenames, idx, tag):
    """Return the first name in ``filenames`` that belongs to model ``idx``.

    A file matches when its name contains both ``'model-%02d-' % idx`` and
    ``tag`` as substrings.

    Parameters
    ----------
    filenames : list of str
        Candidate file names, searched in the given order.
    idx : int
        Model index.
    tag : str
        Substring identifying the kind of file (``'cv'``, ``'test-one'``,
        ``'test-kfold'``).

    Raises
    ------
    FileNotFoundError
        If no name matches; the message names the model and the tag.
    """
    prefix = 'model-%02d-' % idx
    matches = [f for f in filenames if prefix in f and tag in f]
    if not matches:
        raise FileNotFoundError(
            "no file in 'output/' contains both %r and %r" % (prefix, tag))
    return matches[0]


# os.listdir returns names in arbitrary order; sorting makes the "first match"
# reproducible when more than one file matches a model.
files_in_output = sorted(f for f in os.listdir('output/') if os.path.isfile('output/'+f))

files_cv = {idx: _find_output_file(files_in_output, idx, 'cv') for idx in idx_models}

files_test_one = {idx: _find_output_file(files_in_output, idx, 'test-one') for idx in idx_models}
files_test_kf = {idx: _find_output_file(files_in_output, idx, 'test-kfold') for idx in idx_models}

In [7]:
# Show which file was resolved for every model, in the order: CV, test-kfold, test-one.
for file_map in (files_cv, files_test_kf, files_test_one):
    print(file_map)

{1: 'model-01-lgb-cv.csv', 2: 'model-02-keras-search-cv.csv', 3: 'model-03-lgb-feats-selection-cv.csv', 4: 'model-04-lgb-PCA-cv.csv', 5: 'model-05-lgb-wo-per-area-cv.csv', 6: 'model-06-lgb-lr0.001-cv.csv', 7: 'exp-model-07-keras-embedding-small-baseline-cv.csv', 9: 'model-09-lgb-feats-selection-75-cv.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-cv.csv', 11: 'model-11-rf-cv.csv'}
{1: 'model-01-lgb-test-kfold.csv', 2: 'model-02-keras-search-test-kfold.csv', 3: 'model-03-lgb-feats-selection-test-kfold.csv', 4: 'model-04-lgb-PCA-test-kfold.csv', 5: 'model-05-lgb-wo-per-area-test-kfold.csv', 6: 'model-06-lgb-lr0.001-test-kfold.csv', 7: 'exp-model-07-keras-embedding-small-baseline-test-kfold.csv', 9: 'model-09-lgb-feats-selection-75-test-kfold.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-test-kfold.csv', 11: 'model-11-rf-test-kfold.csv'}
{1: 'model-01-lgb-test-one.csv', 2: 'model-02-keras-search-test-one.csv', 3: 'model-03-lgb-feats-selection-test-one.csv', 4: 'model-04-lgb-PCA

#### Load building area from the train and test datasets

In [8]:
# Read the train and test tables in one statement (train first, then test).
df_train, df_test = (pd.read_csv(path) for path in ('dataset/train.csv', 'dataset/test.csv'))

In [9]:
def _add_model_predictions(frame, preds, pred_col, idx):
    """Merge one model's predictions into ``frame`` and derive its log features.

    Adds three columns for model ``idx``:
    ``pred_<idx>`` (the raw prediction), ``log_pred_<idx>`` (``log1p`` of it)
    and ``log_parea_pred_<idx>`` (``log1p`` of prediction / building_area).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``building_id`` and ``building_area``.
    preds : pandas.DataFrame
        Must contain ``building_id`` and ``pred_col``.
    pred_col : str
        Name of the prediction column in ``preds``.
    idx : int
        Model index used in the new column names.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    pandas.errors.MergeError
        If ``building_id`` is duplicated on either side. Without this check
        the merge would silently multiply rows.
    """
    merged = pd.merge(frame, preds[['building_id', pred_col]],
                      on='building_id', validate='one_to_one')
    merged = merged.rename(columns={pred_col: f'pred_{idx}'})
    merged[f'log_pred_{idx}'] = np.log1p(merged[f'pred_{idx}'])
    merged[f'log_parea_pred_{idx}'] = np.log1p(merged[f'pred_{idx}'] / merged['building_area'])
    return merged


if not idx_models:
    raise ValueError('idx_models is empty: there are no model predictions to stack')

cv = df_train[['building_id', 'building_area']]
test = df_test[['building_id', 'building_area']]

# --- CV predictions: one group of pred/log_pred/log_parea_pred columns per model ---
df_cv_truth = None
for idx in idx_models:
    df_cv = pd.read_csv('output/' + files_cv[idx])
    cv = _add_model_predictions(cv, df_cv, 'total_price_predict', idx)
    # Keep the most recently read CV file explicitly: its `total_price` column
    # supplies the true targets below (same file the original code relied on
    # through the leaked loop variable).
    df_cv_truth = df_cv

cv = pd.merge(cv, df_cv_truth[['building_id', 'total_price']],
              on='building_id', validate='one_to_one')
cv['log_total_price'] = np.log1p(cv['total_price'])
cv['log_parea_total_price'] = np.log1p(cv['total_price'] / cv['building_area'])

# --- Test predictions: k-fold file for models in use_test_kfold, single-model file otherwise ---
for idx in idx_models:
    test_file = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    df_test_pred = pd.read_csv('output/' + test_file)
    # In the test files the prediction column itself is named `total_price`.
    test = _add_model_predictions(test, df_test_pred, 'total_price', idx)


In [10]:
# Preview the merged CV frame: per-model prediction columns plus the true-price columns.
cv.head()

Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_9,pred_10,log_pred_10,log_parea_pred_10,pred_11,log_pred_11,log_parea_pred_11,total_price,log_total_price,log_parea_total_price
0,e3mMIMR3JJqCaXz1,3.418175,633155.2,13.358472,12.129369,717209.94,13.483125,12.254022,665689.1,13.408579,...,12.161329,666699.7,13.410097,12.180993,1032484.0,13.84748,12.618375,647603.75,13.381036,12.151933
1,LgwzgklNvy4QCtq5,4.041309,3064324.0,14.935338,13.53877,2899842.2,14.880167,13.4836,3079196.0,14.940179,...,13.553184,3161088.0,14.966427,13.56986,3156863.0,14.96509,13.568522,3321452.0,15.015913,13.619345
2,ucIR2NLLsC3T650L,5.584279,9827776.0,16.100723,14.380769,9766813.0,16.094501,14.374546,9814852.0,16.099407,...,14.381977,9737051.0,16.091449,14.371494,9333706.0,16.049143,14.329188,9570885.0,16.074236,14.354282
3,jre1pJhcQj91Kdky,13.563031,12553500.0,16.34551,13.738164,12699800.0,16.357097,13.74975,12559810.0,16.346013,...,13.743142,12691760.0,16.356463,13.749117,12006500.0,16.300959,13.693612,14215011.0,16.469809,13.862462
4,rQpYpY9nRG7X5mmr,4.688108,1215194.0,14.010415,12.465389,2012610.5,14.514944,12.969916,1128419.0,13.936329,...,12.39769,1122204.0,13.930806,12.38578,973508.7,13.788663,12.243638,762712.0,13.544637,11.999613


In [11]:
# Preview the merged test frame: per-model prediction columns only (no true price).
test.head()

Unnamed: 0,building_id,building_area,pred_1,log_pred_1,log_parea_pred_1,pred_2,log_pred_2,log_parea_pred_2,pred_3,log_pred_3,...,log_parea_pred_7,pred_9,log_pred_9,log_parea_pred_9,pred_10,log_pred_10,log_parea_pred_10,pred_11,log_pred_11,log_parea_pred_11
0,X5gsdTWGS3W7JJQB,3.418175,15269120.0,16.541343,15.312236,12470072.0,16.338842,15.109735,15316850.0,16.544464,...,15.23483,14682270.0,16.502151,15.273045,14278890.0,16.474293,15.245186,10849480.0,16.199628,14.970521
1,BTshNOJyKHnT2YIT,7.726227,3924241.0,15.182684,13.138065,3916552.2,15.180723,13.136104,3977095.0,15.196062,...,13.166502,3899813.0,15.176439,13.13182,3920610.0,15.181758,13.137139,3840545.0,15.161125,13.116506
2,dhdymr0lV8N5kZOT,12.170581,10961270.0,16.209879,13.710858,11912735.0,16.293119,13.794098,10849670.0,16.199646,...,13.84341,11185570.0,16.230135,13.731115,10671530.0,16.18309,13.68407,8393805.0,15.943005,13.443984
3,VEwyGGMcD56w5BOc,2.252256,6155550.0,15.632865,14.820933,5940670.0,15.597333,14.7854,6015238.0,15.609807,...,14.797854,6070453.0,15.618944,14.807012,6082243.0,15.620884,14.808952,5798727.0,15.573149,14.761217
4,wmUeMoJZfsqaSX9b,5.813985,1062995.0,13.876602,12.11634,1088488.1,13.900301,12.140039,1027248.0,13.842395,...,12.10498,1050322.0,13.864608,12.104347,1059463.0,13.873273,12.113012,1017048.0,13.832416,12.072154


### Make Xy

In [12]:
# Pick the feature columns and the target once, based on the regression space.
if is_per_area:
    feature_template, target_col = 'log_parea_pred_{}', 'log_parea_total_price'
else:
    feature_template, target_col = 'log_pred_{}', 'log_total_price'

# One feature column per stacked model, in idx_models order.
X = cv[[feature_template.format(idx) for idx in idx_models]]
y = cv[target_col]

### Start regression: choose the Lasso alpha by K-fold grid search

In [13]:
#reg = LassoCV(alphas=[0]+list(np.logspace(-4, 3, 7)), max_iter=100000, tol=1e-6, n_jobs=-1)
#reg.fit(X, y)

#print(reg.alpha_)
#print(reg.mse_path_)
#print(reg.coef_, reg.intercept_)

In [14]:
#for a in [0]+list(np.logspace(-4, 3, 7)):
#    reg_single = Lasso(alpha=a, max_iter=100000, tol=1e-6)
#    reg_single.fit(X, y)
#    print(reg_single.coef_, reg_single.intercept_)
#    print(reg_single.score(X,y))

In [15]:
# Candidate regularisation strengths; 0 is fitted with LinearRegression instead of Lasso.
alphas = [0, 0.0001, 0.0002, 0.0005, 0.0008, 0.001, 0.002, 0.005, 0.008, 0.01, 0.02]
# alpha -> list of per-fold scores
gsearch = {}

folds = KFold(n_splits=3, shuffle=True, random_state=1208)
for fold_no, (train_rows, val_rows) in enumerate(folds.split(X), start=1):
    print('==== Fold', fold_no, '====')

    # Positional split of features and target for this fold.
    X_train, X_val = X.iloc[train_rows], X.iloc[val_rows]
    y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

    for a in alphas:
        reg_single = LinearRegression() if a == 0 else Lasso(alpha=a, max_iter=100000, tol=1e-6)
        reg_single.fit(X_train, y_train)

        # Undo the log1p transform; in per-area mode also scale back to total price.
        y_pred_final = np.expm1(reg_single.predict(X_val))
        y_true_final = np.expm1(y_val)
        if is_per_area:
            val_area = cv.iloc[val_rows]['building_area']
            y_pred_final = y_pred_final * val_area
            y_true_final = y_true_final * val_area
        score = cal_score(y_true_final, y_pred_final)

        print('alpha, score:', a, score)
        gsearch.setdefault(a, []).append(score)

# Rows of [alpha, mean score, per-fold scores], highest mean score first.
results = sorted(
    ([a, np.mean(scores), scores] for a, scores in gsearch.items()),
    key=lambda row: row[1],
    reverse=True,
)
for row in results:
    print(row)

==== Fold 1 ====
alpha, score: 0 5949.874552547644
alpha, score: 0.0001 5955.874603354052
alpha, score: 0.0002 5956.874614172277
alpha, score: 0.0005 5960.874550978052
alpha, score: 0.0008 5952.874352082446
alpha, score: 0.001 5958.874347850014
alpha, score: 0.002 5958.874352459448
alpha, score: 0.005 5956.874320515024
alpha, score: 0.008 5952.874211140084
alpha, score: 0.01 5944.87409414704
alpha, score: 0.02 5918.873022740997
==== Fold 2 ====
alpha, score: 0 5934.875730920331
alpha, score: 0.0001 5952.875819810957
alpha, score: 0.0002 5952.875853822493
alpha, score: 0.0005 5941.8758920493765
alpha, score: 0.0008 5941.875823495045
alpha, score: 0.001 5942.8757759215705
alpha, score: 0.002 5937.875793918769
alpha, score: 0.005 5936.875797039627
alpha, score: 0.008 5926.875721183328
alpha, score: 0.01 5929.875624584154
alpha, score: 0.02 5908.87465079867
==== Fold 3 ====
alpha, score: 0 5876.874423429842
alpha, score: 0.0001 5888.874504130206
alpha, score: 0.0002 5886.874530106288
alpha

In [16]:
# results is sorted best-first, so its first row holds the selected alpha.
alpha_set = results[0][0]
print(alpha_set)
# Same rule as in the grid search: alpha 0 uses LinearRegression, anything else Lasso.
reg = LinearRegression() if alpha_set == 0 else Lasso(alpha=alpha_set, max_iter=1000000, tol=1e-6)
# Refit on all rows; kept as the last expression so the fitted estimator is displayed.
reg.fit(X, y)

0.0001


Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=1e-06, warm_start=False)

In [17]:
# Blend weights (one per column of X, i.e. per model in idx_models order) and the intercept.
print(reg.coef_, reg.intercept_)

[ 0.          0.08810639  0.09664146  0.02611218  0.2343765   0.
  0.10551724  0.          0.55527385 -0.10298485] -0.04577187581008246


### Calculate cv score

In [18]:
# Rebuild the fitted linear blend by hand: sum(coef * feature) + intercept, in log space.
log_blend = np.zeros(X.shape[0])
for col, coef in zip(X.columns, reg.coef_):
    log_blend = log_blend + X[col] * coef
log_blend = log_blend + reg.intercept_

# Back-transform from log1p space; per-area mode multiplies by the area to get total price.
cv_pred_final = np.expm1(log_blend)
cv_true_final = np.expm1(y)
if is_per_area:
    cv_pred_final = cv_pred_final * cv['building_area']
    cv_true_final = cv_true_final * cv['building_area']

In [19]:
# Side-by-side preview: column 'a' holds the true prices, column 'b' the blended predictions.
pd.DataFrame({'a':cv_true_final,'b':cv_pred_final}).head()

Unnamed: 0,a,b
0,647603.75,645366.4
1,3321452.0,3130838.0
2,9570885.0,9799171.0
3,14215011.0,12826670.0
4,762712.0,1293574.0


In [20]:
# Score the blend on every CV row. NOTE(review): `reg` was fitted on these same rows,
# so this figure is in-sample for the stacker, unlike the per-fold scores above.
cal_score(cv_true_final, cv_pred_final)

5926.875103216106

### Compute submission

In [21]:
# Feature columns must be the same family the regressor was fitted on.
col_prefix = 'log_parea_pred' if is_per_area else 'log_pred'

# Linear blend in log space, accumulated one model at a time (coefficients follow idx_models order).
log_blend = np.zeros(test.shape[0])
for idx, coef in zip(idx_models, reg.coef_):
    log_blend = log_blend + test[f'{col_prefix}_{idx}'] * coef
log_blend = log_blend + reg.intercept_

# Undo log1p; in per-area mode scale by building area to recover the total price.
blended_price = np.expm1(log_blend)
if is_per_area:
    blended_price = blended_price * test['building_area']

# Negative prices are clipped to zero before writing the submission.
test_pred_final = pd.DataFrame({
    'building_id': test['building_id'],
    'total_price': np.clip(blended_price, 0, None),
})

out_template = 'output/stack_parea_{}_{}.csv' if is_per_area else 'output/stack_{}_{}.csv'
test_pred_final.to_csv(out_template.format(stack_idx, models), index=False)

In [22]:
#a= pd.read_csv('output/model-03-lgb-feats-selection-cv.csv')
#b= pd.read_csv('output/model-03-lgb-feats-selection-test-one.csv')

In [23]:
#a.rename(columns={'building_id':'id','total_price_predict':'target'}).to_csv('opt-pred3.csv',index=False)
#b.rename(columns={'building_id':'id','total_price':'target'}).to_csv('opt-test3.csv',index=False)