In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import KFold
from utilities import cal_score

In [2]:
# Identifier of this stacking run; it becomes part of the submission file name.
stack_idx = '01'
# Base models to blend: comma-separated indices and inclusive 'lo-hi' ranges,
# expanded by parse_models() (so '1,4-5' selects models 1, 4 and 5).
models = '1,4-5'

### Read CV and test predictions

In [3]:
def parse_models(exp):
    """Expand a model-selection expression into a list of model indices.

    Parameters
    ----------
    exp : str
        Comma-separated items. Each item is either a single integer
        (``'4'``) or an inclusive range written ``'lo-hi'`` (``'4-6'``).
        Whitespace around items is ignored, and empty items (an empty
        string, a trailing or doubled comma) are skipped.

    Returns
    -------
    list of int
        Indices in the order they appear in ``exp``; duplicates are kept.
        A range whose start exceeds its end contributes nothing.

    Raises
    ------
    ValueError
        If an item is neither an integer nor a ``lo-hi`` pair of integers.
    """
    idx_models = []
    for item in exp.split(','):
        item = item.strip()
        if not item:
            # Tolerate '' and stray commas instead of failing inside int('').
            continue
        if '-' in item:
            lo, hi = item.split('-')
            idx_models.extend(range(int(lo), int(hi) + 1))
        else:
            idx_models.append(int(item))
    return idx_models

In [4]:
# Expand the `models` expression into the explicit list of model indices.
idx_models = parse_models(models)

In [5]:
def _pick_output_file(filenames, idx, tag):
    """Return the first name in `filenames` that belongs to model `idx` and contains `tag`.

    A file belongs to model `idx` when its name contains 'model-NN-', where NN
    is `idx` zero-padded to two digits.

    Raises
    ------
    FileNotFoundError
        If no file matches, naming the model and tag that were looked for.
    """
    prefix = 'model-%02d-' % idx
    matches = [f for f in filenames if prefix in f and tag in f]
    if not matches:
        raise FileNotFoundError(
            "no file in 'output/' matches %r with tag %r" % (prefix, tag))
    return matches[0]


# os.listdir() returns entries in arbitrary order; sorting makes the
# "first match" choice reproducible when several files match.
files_in_output = sorted(f for f in os.listdir('output/') if os.path.isfile('output/'+f))
files_cv = [_pick_output_file(files_in_output, idx, 'cv') for idx in idx_models]
files_test_one = [_pick_output_file(files_in_output, idx, 'test-one') for idx in idx_models]
files_test_kf = [_pick_output_file(files_in_output, idx, 'test-kfold') for idx in idx_models]

In [6]:
# Display the k-fold test prediction file matched for each selected model.
files_test_kf

['model-01-lgb-test-kfold.csv',
 'model-04-lgb-PCA-test-kfold.csv',
 'model-05-lgb-wo-per-area-test-kfold.csv']

In [7]:
def _merge_model_predictions(frames, model_ids, value_col):
    """Join the per-model prediction frames on 'building_id'.

    For each (model id, frame) pair, the column `value_col` is kept as
    'pred_<id>' and its log1p transform is added as 'log_pred_<id>'.
    Frames are inner-joined on 'building_id' in the order given.

    Returns the merged DataFrame, or None when `frames` is empty.
    """
    merged = None
    for idx, frame in zip(model_ids, frames):
        preds = frame[['building_id', value_col]].rename(columns={value_col: f'pred_{idx}'})
        preds[f'log_pred_{idx}'] = np.log1p(preds[f'pred_{idx}'])
        merged = preds if merged is None else pd.merge(merged, preds, on='building_id')
    return merged


# Out-of-fold (CV) predictions of every selected model, one column pair per model.
cv_frames = [pd.read_csv('output/'+f) for f in files_cv]
cv = _merge_model_predictions(cv_frames, idx_models, 'total_price_predict')
# The true price is taken explicitly from the last CV file rather than from
# whatever frame a loop variable happened to hold last.
cv = pd.merge(cv, cv_frames[-1][['building_id','total_price']], on='building_id')
cv['log_total_price'] = np.log1p(cv['total_price'])

# Test predictions: in the 'test-one' files the prediction column is 'total_price'.
test_one_frames = [pd.read_csv('output/'+f) for f in files_test_one]
test_one = _merge_model_predictions(test_one_frames, idx_models, 'total_price')

# Kept as a placeholder: nothing in this cell loads the k-fold test predictions.
test_kf = None

In [17]:
# Export the test predictions of model 5 alone as an (id, target) table.
# The 'pred_5' column only exists when model 5 is part of `models`.
model5_submission = test_one[['building_id','pred_5']].rename(
    columns={'building_id':'id','pred_5':'target'}
)
model5_submission.to_csv('test_5.csv', index=False)

In [18]:
# Preview the merged CV frame: per-model predictions, their logs, and the true price.
cv.head()

Unnamed: 0,building_id,pred_1,log_pred_1,pred_4,log_pred_4,pred_5,log_pred_5,total_price,log_total_price
0,jre1pJhcQj91Kdky,12553500.0,16.34551,12883810.0,16.371482,12822130.0,16.366683,14215011.0,16.469809
1,jcbuA8q3KPH9SzpS,9117771.0,16.025736,7095057.0,15.774909,9570172.0,16.074162,7642884.5,15.849286
2,EqWJpHmkXPyfHkB8,4491672.0,15.317736,4586149.0,15.338551,4687745.0,15.360462,4807053.0,15.385595
3,ihrMWGM8Nq99Uvu6,6918995.0,15.749781,6906435.0,15.747964,6981481.0,15.758772,7493026.5,15.829483
4,jRhwzoVMFLFzxAX1,2750253.0,14.827204,2725200.0,14.818053,2838090.0,14.858642,2846855.8,14.861726


In [19]:
# Stack in log space: the features are the log1p CV predictions of each base
# model, the target is the log1p true price.
feature_cols = [f'log_pred_{idx}' for idx in idx_models]
X = cv[feature_cols]
y = cv['log_total_price']

# Candidate penalties: 0 plus seven log-spaced values between 1e-4 and 1e3.
# NOTE(review): scikit-learn advises against fitting Lasso with alpha=0
# (plain least squares) - consider LinearRegression for that grid point.
alpha_grid = [0] + list(np.logspace(-4, 3, 7))
reg = LassoCV(alphas=alpha_grid, max_iter=100000, tol=1e-6, n_jobs=-1)
reg.fit(X, y)

  tol, rng, random, positive)
  tol, rng, random, positive)
  model.fit(X, y)
  positive)


LassoCV(alphas=[0, 0.0001, 0.0014677992676220704, 0.021544346900318846, 0.31622776601683794, 4.641588833612782, 68.12920690579622, 1000.0],
    copy_X=True, cv='warn', eps=0.001, fit_intercept=True, max_iter=100000,
    n_alphas=100, n_jobs=-1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=1e-06,
    verbose=False)

In [None]:
# Inspect the LassoCV fit: the selected alpha, the cross-validation MSE path,
# and the fitted blending weights with their intercept.
print(reg.alpha_)
print(reg.mse_path_)
print(reg.coef_, reg.intercept_)

In [None]:
# Refit on the full CV frame once per candidate alpha and report the weights,
# the intercept and the in-sample R^2 of each fit.
for a in [0]+list(np.logspace(-4, 3, 7)):
    if a == 0:
        # alpha == 0 is ordinary least squares; use LinearRegression for it,
        # as the K-fold grid-search cell does, instead of Lasso(alpha=0).
        reg_single = LinearRegression()
    else:
        reg_single = Lasso(alpha=a, max_iter=100000, tol=1e-6)
    reg_single.fit(X, y)
    # Report the intercept of the model just fitted, not that of `reg`.
    print(reg_single.coef_, reg_single.intercept_)
    print(reg_single.score(X,y))

In [None]:
# Candidate L1 penalties; 0 stands for an unpenalised linear regression.
alphas = [0, 0.0001, 0.0002, 0.0005, 0.0008, 0.001, 0.002, 0.005, 0.008, 0.01, 0.02]
# alpha -> list of validation scores, one entry per fold.
gsearch = {}

folds = KFold(n_splits=3, shuffle=True, random_state=1208)
for fold_no, (train_rows, val_rows) in enumerate(folds.split(X), start=1):
    print('==== Fold', fold_no, '====')

    # Training / validation parts of this fold.
    X_train, X_val = X.iloc[train_rows], X.iloc[val_rows]
    y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

    # Fit one blender per alpha and score it on the held-out rows.
    for a in alphas:
        reg_single = LinearRegression() if a == 0 else Lasso(alpha=a, max_iter=100000, tol=1e-6)
        reg_single.fit(X_train, y_train)

        # Undo the log1p transform so the score is computed on actual prices.
        y_pred_final = np.expm1(reg_single.predict(X_val))
        y_true_final = np.expm1(y_val)
        score = cal_score(y_true_final, y_pred_final)

        print('alpha, score:', a, score)
        gsearch.setdefault(a, []).append(score)

# Rank alphas by mean fold score, largest first.
# NOTE(review): this assumes cal_score is higher-is-better - confirm in utilities.
results = sorted(
    ([alpha, np.mean(fold_scores), fold_scores] for alpha, fold_scores in gsearch.items()),
    key=lambda row: row[1],
    reverse=True,
)
for item in results:
    print(item)

In [None]:
# `results` is ordered best-first, so its first row carries the chosen alpha.
alpha_set = results[0][0]
print(alpha_set)

# Refit on all CV rows; alpha 0 means an unpenalised linear regression.
reg = LinearRegression() if alpha_set == 0 else Lasso(alpha=alpha_set, max_iter=1000000, tol=1e-6)
reg.fit(X, y)

In [None]:
# Blending weights (one per column of X) and intercept of the refitted model.
print(reg.coef_, reg.intercept_)

### Calculate cv score

In [None]:
# Apply the fitted linear blend by hand: weighted sum of the log-space
# features plus the intercept, accumulated column by column.
log_blend = np.zeros(X.shape[0])
for col, weight in zip(X.columns, reg.coef_):
    log_blend = log_blend + X[col] * weight

# Undo the log1p transform to get blended and true prices on the price scale.
cv_pred_final = np.expm1(log_blend + reg.intercept_)
cv_true_final = np.expm1(y)

In [None]:
# Side-by-side peek at the true prices ('a') and the blended CV predictions ('b').
pd.DataFrame({'a':cv_true_final,'b':cv_pred_final}).head()

In [None]:
# Score the blended CV predictions against the true prices (both on the price scale).
cal_score(cv_true_final, cv_pred_final)

### Compute submission

In [None]:
# Blend the test predictions with the fitted weights. The weights are in the
# order of `idx_models`, which is also the column order X was built with.
blended_log = np.zeros(test_one.shape[0])
for idx, weight in zip(idx_models, reg.coef_):
    blended_log = blended_log + test_one[f'log_pred_{idx}'] * weight
blended_log = blended_log + reg.intercept_

# Back to the price scale, with negative values clipped to 0.
blended_price = np.clip(np.expm1(blended_log), 0, None)
test_pred_final = pd.DataFrame({'building_id': test_one['building_id'], 'total_price': blended_price})

submission_path = 'output/stack_{}_{}.csv'.format(stack_idx, models)
test_pred_final.to_csv(submission_path, index=False)