In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import KFold
from utilities import cal_score

In [2]:
stack_idx = '06'
models = '1,3-7,9-10'
use_test_kfold = set([7])

### Read CV predictions and test

In [3]:
def parse_models(exp):
    exp_split = exp.split(',')
    idx_models = []
    for e in exp_split:
        if '-' in e:
            n0, n1 = e.split('-')
            idx_models.extend(list(range(int(n0), int(n1)+1, 1)))
        else:
            idx_models.append(int(e))
    return idx_models

In [4]:
idx_models = parse_models(models)

In [5]:
idx_models

[1, 3, 4, 5, 6, 7, 9, 10]

In [6]:
files_in_output = [f for f in os.listdir('output/') if os.path.isfile('output/'+f)]

files_cv = {idx: [f for f in files_in_output if 'model-%02d-' % idx in f and 'cv' in f][0] for idx in idx_models}

files_test_one = {idx: [f for f in files_in_output if 'model-%02d-' % idx in f and 'test-one' in f][0] \
                  for idx in idx_models}
files_test_kf = {idx: [f for f in files_in_output if 'model-%02d-' % idx in f and 'test-kfold' in f][0] \
                 for idx in idx_models}

In [7]:
print(files_cv)
print(files_test_kf)
print(files_test_one)

{1: 'model-01-lgb-cv.csv', 3: 'model-03-lgb-feats-selection-cv.csv', 4: 'model-04-lgb-PCA-cv.csv', 5: 'model-05-lgb-wo-per-area-cv.csv', 6: 'model-06-lgb-lr0.001-cv.csv', 7: 'exp-model-07-keras-embedding-small-baseline-cv.csv', 9: 'model-09-lgb-feats-selection-75-cv.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-cv.csv'}
{1: 'model-01-lgb-test-kfold.csv', 3: 'model-03-lgb-feats-selection-test-kfold.csv', 4: 'model-04-lgb-PCA-test-kfold.csv', 5: 'model-05-lgb-wo-per-area-test-kfold.csv', 6: 'model-06-lgb-lr0.001-test-kfold.csv', 7: 'exp-model-07-keras-embedding-small-baseline-test-kfold.csv', 9: 'model-09-lgb-feats-selection-75-test-kfold.csv', 10: 'model-10-lgb-feats-selection-75-lr-0.001-test-kfold.csv'}
{1: 'model-01-lgb-test-one.csv', 3: 'model-03-lgb-feats-selection-test-one.csv', 4: 'model-04-lgb-PCA-test-one.csv', 5: 'model-05-lgb-wo-per-area-test-one.csv', 6: 'model-06-lgb-lr0.001-test-one.csv', 7: 'model-07-keras-embedding-test-one.csv', 9: 'model-09-lgb-feats-selection-75

In [8]:
cv = None
test = None

for i, idx in enumerate(idx_models):
    f = files_cv[idx]
    df = pd.read_csv('output/'+f)
    
    if cv is None:
        cv = df[['building_id','total_price_predict']].copy()
    else:
        cv = pd.merge(cv, df[['building_id','total_price_predict']], on='building_id')
    
    cv = cv.rename(columns = {'total_price_predict':'pred_{}'.format(idx_models[i])})
    cv[f'log_pred_{idx_models[i]}'] = np.log1p(cv[f'pred_{idx_models[i]}'])

cv = pd.merge(cv, df[['building_id','total_price']], on='building_id')
cv['log_total_price'] = np.log1p(cv['total_price'])

for i, idx in enumerate(idx_models):
    f = files_test_kf[idx] if idx in use_test_kfold else files_test_one[idx]
    df = pd.read_csv('output/'+f)

    if test is None:
        test = df[['building_id','total_price']].copy()
    else:
        test = pd.merge(test, df[['building_id','total_price']], on='building_id')
        
    test = test.rename(columns = {'total_price':'pred_{}'.format(idx)})
    test[f'log_pred_{idx}'] = np.log1p(test[f'pred_{idx}'])

In [9]:
cv.head()

Unnamed: 0,building_id,pred_1,log_pred_1,pred_3,log_pred_3,pred_4,log_pred_4,pred_5,log_pred_5,pred_6,log_pred_6,pred_7,log_pred_7,pred_9,log_pred_9,pred_10,log_pred_10,total_price,log_total_price
0,jre1pJhcQj91Kdky,12553500.0,16.34551,12559810.0,16.346013,12883810.0,16.371482,12822130.0,16.366683,12474840.0,16.339224,12575326.0,16.347247,12616160.0,16.350489,12691760.0,16.356463,14215011.0,16.469809
1,jcbuA8q3KPH9SzpS,9117771.0,16.025736,9675385.0,16.085096,7095057.0,15.774909,9570172.0,16.074162,9175013.0,16.031994,8287008.5,15.9302,9376357.0,16.053702,9338673.0,16.049675,7642884.5,15.849286
2,EqWJpHmkXPyfHkB8,4491672.0,15.317736,4511709.0,15.322187,4586149.0,15.338551,4687745.0,15.360462,4562418.0,15.333364,5103701.0,15.445477,4600447.0,15.341664,4486105.0,15.316496,4807053.0,15.385595
3,ihrMWGM8Nq99Uvu6,6918995.0,15.749781,6878882.0,15.743967,6906435.0,15.747964,6981481.0,15.758772,6937283.0,15.752421,7067820.5,15.771063,6959559.0,15.755627,6866963.0,15.742233,7493026.5,15.829483
4,jRhwzoVMFLFzxAX1,2750253.0,14.827204,2754684.0,14.828814,2725200.0,14.818053,2838090.0,14.858642,2766369.0,14.833047,2546584.2,14.750264,2727247.0,14.818804,2775738.0,14.836428,2846855.8,14.861726


In [10]:
X = cv[['log_pred_{}'.format(idx) for idx in idx_models]]
y = cv['log_total_price']

reg = LassoCV(alphas=[0]+list(np.logspace(-4, 3, 7)), max_iter=100000, tol=1e-6, n_jobs=-1)
reg.fit(X, y)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


LassoCV(alphas=[0, 0.0001, 0.0014677992676220704, 0.021544346900318846, 0.31622776601683794, 4.641588833612782, 68.12920690579622, 1000.0],
    copy_X=True, cv='warn', eps=0.001, fit_intercept=True, max_iter=100000,
    n_alphas=100, n_jobs=-1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=1e-06,
    verbose=False)

In [11]:
print(reg.alpha_)
print(reg.mse_path_)
print(reg.coef_, reg.intercept_)

0.0001
[[1.3539862  1.37718519 1.34124855]
 [1.3539862  1.37718519 1.34124855]
 [1.3539862  1.37718519 1.34124855]
 [0.11180859 0.11705994 0.11021192]
 [0.03576088 0.03798314 0.03672315]
 [0.03539801 0.03753207 0.03644233]
 [0.03539694 0.0375248  0.03644675]
 [0.03537252 0.03749974 0.03650501]]
[0.         0.09114964 0.         0.25254681 0.         0.14300199
 0.         0.51894123] -0.08703397457068185


In [12]:
for a in [0]+list(np.logspace(-4, 3, 7)):
    reg_single = Lasso(alpha=a, max_iter=100000, tol=1e-6)
    reg_single.fit(X, y)
    print(reg_single.coef_, reg.intercept_)
    print(reg_single.score(X,y))

  This is separate from the ipykernel package so we can avoid doing imports until
  positive)


[-0.18995522  0.21260467  0.00231696  0.25962775  0.02975889  0.14436319
 -0.06614862  0.61323657] -0.08703397457068185
0.9732006665587993
[0.         0.09114964 0.         0.25254681 0.         0.14300199
 0.         0.51894123] -0.08703397457068185
0.9731868065281322
[0.         0.09405304 0.         0.24886819 0.         0.14335845
 0.         0.51827503] -0.08703397457068185
0.9731855534618169
[0.         0.11203855 0.         0.19228747 0.         0.14817828
 0.         0.53609142] -0.08703397457068185
0.9729150466710734
[0.         0.0878288  0.         0.         0.         0.18630251
 0.01386337 0.47374562] -0.08703397457068185
0.916838964558573
[0. 0. 0. 0. 0. 0. 0. 0.] -0.08703397457068185
0.0
[0. 0. 0. 0. 0. 0. 0. 0.] -0.08703397457068185
0.0
[0. 0. 0. 0. 0. 0. 0. 0.] -0.08703397457068185
0.0


In [13]:
alphas = [0, 0.0001, 0.0002, 0.0005, 0.0008, 0.001, 0.002, 0.005, 0.008, 0.01, 0.02]
gsearch = {}

folds = KFold(n_splits=3, shuffle=True, random_state=1208)
for i_fold, (itrain, ival) in enumerate(folds.split(X)): # kfold
    print('==== Fold', i_fold+1, '====')
    
    # split train, val
    X_train = X.iloc[itrain]
    X_val = X.iloc[ival]
    y_train = y.iloc[itrain]
    y_val = y.iloc[ival]
    
    # random sample - grid search
    for a in alphas:
        if a == 0:
            reg_single = LinearRegression()
        else:
            reg_single = Lasso(alpha=a, max_iter=100000, tol=1e-6)
        reg_single.fit(X_train, y_train)
        
        y_pred = reg_single.predict(X_val)
        y_pred_final = np.expm1(y_pred)
        y_true_final = np.expm1(y_val)
        score = cal_score(y_true_final, y_pred_final)

        print('alpha, score:', a, score)
        gsearch[a] = gsearch.get(a,[]) + [score]

results = [[key, np.mean(value), value] for key, value, in gsearch.items()]
results.sort(key= lambda x: x[1], reverse=True)
for item in results:
    print(item)

==== Fold 1 ====
alpha, score: 0 5904.8726865782655
alpha, score: 0.0001 5916.872662325437
alpha, score: 0.0002 5916.872664266751
alpha, score: 0.0005 5914.872667949364
alpha, score: 0.0008 5913.872671278478
alpha, score: 0.001 5913.872673129196
alpha, score: 0.002 5912.872677850226
alpha, score: 0.005 5908.872640355782
alpha, score: 0.008 5902.8725185661915
alpha, score: 0.01 5898.872386839879
alpha, score: 0.02 5871.871247565909
==== Fold 2 ====
alpha, score: 0 5893.873584723424
alpha, score: 0.0001 5890.873588906723
alpha, score: 0.0002 5894.873592214265
alpha, score: 0.0005 5895.8736019632015
alpha, score: 0.0008 5891.8736111144935
alpha, score: 0.001 5890.873616768086
alpha, score: 0.002 5890.873639630076
alpha, score: 0.005 5888.873660711991
alpha, score: 0.008 5885.873606175742
alpha, score: 0.01 5885.873524237075
alpha, score: 0.02 5848.872636678581
==== Fold 3 ====
alpha, score: 0 5938.877545913836
alpha, score: 0.0001 5944.877596891769
alpha, score: 0.0002 5944.877597565735
a

In [14]:
alpha_set = results[0][0]
print(alpha_set)
if alpha_set == 0:
    reg = LinearRegression()
else:
    reg = Lasso(alpha=alpha_set, max_iter=1000000, tol=1e-6)
reg.fit(X, y)

0.0002


Lasso(alpha=0.0002, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=1e-06, warm_start=False)

In [15]:
print(reg.coef_, reg.intercept_)

[0.         0.09291752 0.         0.25244118 0.         0.14305409
 0.         0.5171497 ] -0.08583643617885173


### Calculate cv score

In [16]:
cv_pred_final = np.zeros(X.shape[0])
for i, col in enumerate(X):
    cv_pred_final = cv_pred_final + X[col] * reg.coef_[i]
cv_pred_final = cv_pred_final + reg.intercept_
cv_pred_final = np.expm1(cv_pred_final)
cv_true_final = np.expm1(y)

In [17]:
pd.DataFrame({'a':cv_true_final,'b':cv_pred_final}).head()

Unnamed: 0,a,b
0,14215011.0,12760950.0
1,7642884.5,9299729.0
2,4807053.0,4620142.0
3,7493026.5,6937296.0
4,2846855.8,2746101.0


In [18]:
cal_score(cv_true_final, cv_pred_final)

5918.874653412518

### Compute submission

In [19]:
test_pred_final = pd.DataFrame({'building_id': test['building_id'], 'total_price': np.zeros(test.shape[0])})
for i, idx in enumerate(idx_models):
    test_pred_final['total_price'] = test_pred_final['total_price'] + test[f'log_pred_{idx}'] * reg.coef_[i]
test_pred_final['total_price'] = test_pred_final['total_price'] + reg.intercept_
test_pred_final['total_price'] = np.expm1(test_pred_final['total_price'])
test_pred_final['total_price'] = np.clip(test_pred_final['total_price'], 0, None)

test_pred_final.to_csv('output/stack_{}_{}.csv'.format(stack_idx, models), index=False)

In [20]:
#a= pd.read_csv('output/model-03-lgb-feats-selection-cv.csv')
#b= pd.read_csv('output/model-03-lgb-feats-selection-test-one.csv')

In [21]:
#a.rename(columns={'building_id':'id','total_price_predict':'target'}).to_csv('opt-pred3.csv',index=False)
#b.rename(columns={'building_id':'id','total_price':'target'}).to_csv('opt-test3.csv',index=False)