In [1]:
import numpy as np 
import pandas as pd 
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor
import seaborn as sns

In [2]:
# Load the measurement spreadsheet; every later cell reads its columns from `df`.
df = pd.read_excel(io='2015_7차_직접측정 데이터.xlsx')

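- target1: 얼굴수직길이 (face vertical length)
- target2: 머리너비 (head width)
- target3: 머리수직길이 (head vertical length)
- target4: 아래턱사이너비 (lower-jaw width)
- target5: 머리둘레 (head circumference)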

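We already have models for target1 and target2.

The notebook below was written to produce models for target3, target4 and target5. Note that the cells below call `produce_models` for all five targets, so the target1 and target2 models are trained and saved again as well.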

## Using all 5 features (gender, age, height, weight, shoe size)

In [3]:
# Predictor columns, pulled out of `df` as raw value arrays
# (order of names on the left matches the order of column labels on the right).
gender, age, height, weight, shoe = (
    df[column].values
    for column in (
        'ⓞ_02_성별',
        'ⓞ_06_나이_반올림',
        '①_003_키',
        '①_031_몸무게',
        '①_119_발직선길이',
    )
)

# Target columns (the five measurements to be predicted), same pairing rule.
target1, target2, target3, target4, target5 = (
    df[column].values
    for column in (
        '①_094_얼굴수직길이',
        '①_115_머리너비',
        '①_093_머리수직길이',
        '①_117_아래턱사이너비',
        '①_111_머리둘레',
    )
)

In [4]:
# Encode gender as an integer feature: '남' (Korean for "male") -> 0, anything else -> 1.
# A single vectorized comparison replaces the element-by-element Python loop.
# NOTE(review): missing or unexpected labels are also mapped to 1 (same as before) —
# confirm the column only ever contains the two expected labels.
gender_encoded = np.where(np.asarray(gender, dtype=object) == '남', 0, 1)

In [5]:
# Modelling table: the five predictor columns first, then the five target columns.
# `produce_models` reads this frame through the notebook-level name `train`.
train = pd.DataFrame(dict(
    gender=gender_encoded,
    age=age,
    height=height,
    weight=weight,
    shoe_size=shoe,
    target1=target1,
    target2=target2,
    target3=target3,
    target4=target4,
    target5=target5,
))

In [20]:
def produce_models(target_name, data=None, prefix="", early_stopping_rounds=None):
    """Train and save one CatBoost regressor per cross-validation fold for one target.

    Rows with a missing value in any feature or in the chosen target are
    dropped. The remaining rows are split with a shuffled 5-fold ``KFold``
    (``random_state=888``) and a ``CatBoostRegressor`` is fitted on each
    training split, using the held-out split as ``eval_set``. Because
    ``use_best_model=True``, each model is kept at its best validation
    iteration. Every model is then saved under the relative file name
    ``<prefix><target_name>_model_<fold>`` with folds numbered from 1.

    Parameters
    ----------
    target_name : str
        Name of the target column to predict, e.g. ``'target3'``.
    data : pandas.DataFrame, optional
        Table holding the feature columns and the target columns. Defaults to
        the notebook-level ``train`` frame, which is what a one-argument call
        has always used. Passing the frame explicitly avoids depending on
        whichever ``train`` happens to be bound in the kernel.
    prefix : str, optional
        Text prepended to every saved file name. Defaults to ``""``.
    early_stopping_rounds : int, optional
        Forwarded to ``CatBoostRegressor.fit``. ``None`` (the default) keeps
        the original behaviour of always running all 10000 iterations.

    Returns
    -------
    list
        The fitted ``CatBoostRegressor`` objects, in fold order.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        data = train

    target_names = ['target1', 'target2', 'target3', 'target4', 'target5']
    # Everything that is not a target column is treated as a feature.
    feature_names = [
        column for column in data.columns
        if column not in target_names and column != target_name
    ]

    # Keep only rows where every feature and the chosen target are present.
    complete_rows = data[feature_names + [target_name]].dropna()
    x_train = complete_rows[feature_names]
    y_train = complete_rows[target_name]

    print("fitting model...")
    models = []

    kfold = KFold(n_splits=5, shuffle=True, random_state=888)
    for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
        print("validating on fold {}...".format(idx + 1))
        train_x, val_x = x_train.iloc[train_idx, :], x_train.iloc[val_idx, :]
        train_y, val_y = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model = CatBoostRegressor(iterations=10000,
                                  learning_rate=0.01,
                                  loss_function='RMSE',
                                  eval_metric='RMSE',
                                  verbose=2500,
                                  use_best_model=True,
                                  random_seed=999)

        # The validation fold doubles as the eval set used to pick the best iteration.
        model.fit(train_x, train_y,
                  eval_set=(val_x, val_y),
                  early_stopping_rounds=early_stopping_rounds)

        models.append(model)

    print("saving models...")
    for fold_number, model in enumerate(models, start=1):
        model.save_model(prefix + target_name + "_model_" + str(fold_number))

    return models


In [24]:
# Fit and save the per-fold models for target1 (5-feature table); the cell echoes the returned list.
produce_models(target_name='target1')

fitting model...
validating on fold 1...
0:	learn: 7.3253502	test: 7.0934690	best: 7.0934690 (0)	total: 4.5ms	remaining: 45s
2500:	learn: 5.2958141	test: 5.7038761	best: 5.6959675 (1880)	total: 3.89s	remaining: 11.7s
5000:	learn: 4.8261555	test: 5.7540414	best: 5.6959675 (1880)	total: 8.63s	remaining: 8.63s
7500:	learn: 4.4689092	test: 5.8070423	best: 5.6959675 (1880)	total: 13.7s	remaining: 4.58s
9999:	learn: 4.1800609	test: 5.8569251	best: 5.6959675 (1880)	total: 18s	remaining: 0us

bestTest = 5.695967547
bestIteration = 1880

Shrink model to first 1881 iterations.
validating on fold 2...
0:	learn: 7.2336802	test: 7.4588208	best: 7.4588208 (0)	total: 1.78ms	remaining: 17.8s
2500:	learn: 5.2616999	test: 6.1138688	best: 6.0539788 (497)	total: 4.85s	remaining: 14.5s
5000:	learn: 4.7944642	test: 6.1903731	best: 6.0539788 (497)	total: 9.74s	remaining: 9.74s
7500:	learn: 4.4417082	test: 6.2561284	best: 6.0539788 (497)	total: 14.4s	remaining: 4.78s
9999:	learn: 4.1664834	test: 6.3062966	bes

[<catboost.core.CatBoostRegressor at 0x7f86f7d607f0>,
 <catboost.core.CatBoostRegressor at 0x7f86f0725700>,
 <catboost.core.CatBoostRegressor at 0x7f86f55c3b20>,
 <catboost.core.CatBoostRegressor at 0x7f86f7cc57c0>,
 <catboost.core.CatBoostRegressor at 0x7f86f6c2d670>]

In [25]:
# Fit and save the per-fold models for target2 (5-feature table); the cell echoes the returned list.
produce_models(target_name='target2')

fitting model...
validating on fold 1...
0:	learn: 7.1848582	test: 7.1864708	best: 7.1864708 (0)	total: 3.04ms	remaining: 30.4s
2500:	learn: 5.3747617	test: 5.8963681	best: 5.8322841 (552)	total: 5.51s	remaining: 16.5s
5000:	learn: 4.9284305	test: 5.9756803	best: 5.8322841 (552)	total: 9.98s	remaining: 9.98s
7500:	learn: 4.5746114	test: 6.0407675	best: 5.8322841 (552)	total: 14.9s	remaining: 4.97s
9999:	learn: 4.2842944	test: 6.0952727	best: 5.8322841 (552)	total: 19.6s	remaining: 0us

bestTest = 5.832284063
bestIteration = 552

Shrink model to first 553 iterations.
validating on fold 2...
0:	learn: 7.1971305	test: 7.1361163	best: 7.1361163 (0)	total: 1.83ms	remaining: 18.3s
2500:	learn: 5.3886376	test: 5.9256471	best: 5.8567414 (736)	total: 4.56s	remaining: 13.7s
5000:	learn: 4.9293153	test: 6.0368179	best: 5.8567414 (736)	total: 8.87s	remaining: 8.87s
7500:	learn: 4.5638686	test: 6.1229810	best: 5.8567414 (736)	total: 13.4s	remaining: 4.45s
9999:	learn: 4.2675397	test: 6.1956109	best

[<catboost.core.CatBoostRegressor at 0x7f86f3fb8160>,
 <catboost.core.CatBoostRegressor at 0x7f86f3e5abb0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3e5afd0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb80a0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3e5ae50>]

In [21]:
# Fit and save the per-fold models for target3 (5-feature table); the cell echoes the returned list.
produce_models(target_name='target3')

fitting model...
validating on fold 1...
0:	learn: 13.4280395	test: 13.4678011	best: 13.4678011 (0)	total: 55ms	remaining: 9m 9s
2500:	learn: 8.1412283	test: 9.2294239	best: 9.1556414 (492)	total: 4.11s	remaining: 12.3s
5000:	learn: 7.4943182	test: 9.3307951	best: 9.1556414 (492)	total: 8.04s	remaining: 8.04s
7500:	learn: 6.9766757	test: 9.4323321	best: 9.1556414 (492)	total: 12.2s	remaining: 4.07s
9999:	learn: 6.5370712	test: 9.5286139	best: 9.1556414 (492)	total: 16.3s	remaining: 0us

bestTest = 9.155641387
bestIteration = 492

Shrink model to first 493 iterations.
validating on fold 2...
0:	learn: 13.4316773	test: 13.4537413	best: 13.4537413 (0)	total: 1.9ms	remaining: 19s
2500:	learn: 8.1965842	test: 9.0545409	best: 8.9548229 (685)	total: 4.38s	remaining: 13.1s
5000:	learn: 7.5323515	test: 9.1953569	best: 8.9548229 (685)	total: 9.01s	remaining: 9.01s
7500:	learn: 7.0033688	test: 9.3035535	best: 8.9548229 (685)	total: 13.4s	remaining: 4.45s
9999:	learn: 6.5543670	test: 9.3944363	bes

[<catboost.core.CatBoostRegressor at 0x7f86f6c37a00>,
 <catboost.core.CatBoostRegressor at 0x7f86f6c37b80>,
 <catboost.core.CatBoostRegressor at 0x7f86f6c2c760>,
 <catboost.core.CatBoostRegressor at 0x7f86f6c373d0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb85b0>]

In [22]:
# Fit and save the per-fold models for target4 (5-feature table); the cell echoes the returned list.
produce_models(target_name='target4')

fitting model...
validating on fold 1...
0:	learn: 8.4139119	test: 8.4675934	best: 8.4675934 (0)	total: 4.76ms	remaining: 47.6s
2500:	learn: 6.7040603	test: 7.4624043	best: 7.4371550 (1252)	total: 4.48s	remaining: 13.4s
5000:	learn: 6.0980306	test: 7.5478761	best: 7.4371550 (1252)	total: 9.22s	remaining: 9.21s
7500:	learn: 5.6447238	test: 7.6197851	best: 7.4371550 (1252)	total: 14.1s	remaining: 4.71s
9999:	learn: 5.2740362	test: 7.6742381	best: 7.4371550 (1252)	total: 18.5s	remaining: 0us

bestTest = 7.437154986
bestIteration = 1252

Shrink model to first 1253 iterations.
validating on fold 2...
0:	learn: 8.4240275	test: 8.4331990	best: 8.4331990 (0)	total: 1.79ms	remaining: 17.9s
2500:	learn: 6.7023024	test: 7.4616509	best: 7.4320705 (1163)	total: 4.3s	remaining: 12.9s
5000:	learn: 6.1270330	test: 7.5418322	best: 7.4320705 (1163)	total: 8.54s	remaining: 8.54s
7500:	learn: 5.6736904	test: 7.6066402	best: 7.4320705 (1163)	total: 13.4s	remaining: 4.47s
9999:	learn: 5.3106063	test: 7.6615

[<catboost.core.CatBoostRegressor at 0x7f86f3fb8cd0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8340>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8b20>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8ac0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb87f0>]

In [23]:
# Fit and save the per-fold models for target5 (5-feature table); the cell echoes the returned list.
produce_models(target_name='target5')

fitting model...
validating on fold 1...
0:	learn: 17.4210240	test: 17.8629494	best: 17.8629494 (0)	total: 4.98ms	remaining: 49.8s
2500:	learn: 12.5472872	test: 13.9118578	best: 13.8667336 (997)	total: 4.22s	remaining: 12.7s
5000:	learn: 11.5002676	test: 14.0171521	best: 13.8667336 (997)	total: 8.8s	remaining: 8.79s
7500:	learn: 10.7142319	test: 14.1345800	best: 13.8667336 (997)	total: 13.7s	remaining: 4.56s
9999:	learn: 10.0283311	test: 14.2314836	best: 13.8667336 (997)	total: 17.9s	remaining: 0us

bestTest = 13.8667336
bestIteration = 997

Shrink model to first 998 iterations.
validating on fold 2...
0:	learn: 17.5093718	test: 17.5163627	best: 17.5163627 (0)	total: 1.74ms	remaining: 17.4s
2500:	learn: 12.4972491	test: 13.8435878	best: 13.7234748 (955)	total: 4.27s	remaining: 12.8s
5000:	learn: 11.3875768	test: 14.0239326	best: 13.7234748 (955)	total: 8.47s	remaining: 8.47s
7500:	learn: 10.5502320	test: 14.1725704	best: 13.7234748 (955)	total: 12.7s	remaining: 4.24s
9999:	learn: 9.869

[<catboost.core.CatBoostRegressor at 0x7f86f3fb8bb0>,
 <catboost.core.CatBoostRegressor at 0x7f86f6c376d0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8880>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8b50>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8d90>]

## Using only 3 features (gender, age, height)

As the training logs below show, the validation RMSE is higher when only these 3 features are used than with all 5 features (for example, target1, fold 1: best test RMSE 5.92 vs. 5.70).

In [31]:
# Predictor columns for the reduced setup, pulled out of `df` as raw value arrays
# (order of names on the left matches the order of column labels on the right).
gender, age, height = (
    df[column].values
    for column in (
        'ⓞ_02_성별',
        'ⓞ_06_나이_반올림',
        '①_003_키',
    )
)

# Target columns (the five measurements to be predicted), same pairing rule.
target1, target2, target3, target4, target5 = (
    df[column].values
    for column in (
        '①_094_얼굴수직길이',
        '①_115_머리너비',
        '①_093_머리수직길이',
        '①_117_아래턱사이너비',
        '①_111_머리둘레',
    )
)

In [32]:
# Encode gender as an integer feature: '남' (Korean for "male") -> 0, anything else -> 1.
# A single vectorized comparison replaces the element-by-element Python loop.
# NOTE(review): missing or unexpected labels are also mapped to 1 (same as before) —
# confirm the column only ever contains the two expected labels.
gender_encoded = np.where(np.asarray(gender, dtype=object) == '남', 0, 1)

In [33]:
# Modelling table for the reduced setup: three predictor columns, then the five targets.
# This rebinds the notebook-level name `train` that `produce_models` reads.
train = pd.DataFrame(dict(
    gender=gender_encoded,
    age=age,
    height=height,
    target1=target1,
    target2=target2,
    target3=target3,
    target4=target4,
    target5=target5,
))

In [34]:
def produce_models(target_name, data=None, prefix="3_features_", early_stopping_rounds=None):
    """Train and save one CatBoost regressor per cross-validation fold for one target.

    Rows with a missing value in any feature or in the chosen target are
    dropped. The remaining rows are split with a shuffled 5-fold ``KFold``
    (``random_state=888``) and a ``CatBoostRegressor`` is fitted on each
    training split, using the held-out split as ``eval_set``. Because
    ``use_best_model=True``, each model is kept at its best validation
    iteration. Every model is then saved under the relative file name
    ``<prefix><target_name>_model_<fold>`` with folds numbered from 1.

    Parameters
    ----------
    target_name : str
        Name of the target column to predict, e.g. ``'target3'``.
    data : pandas.DataFrame, optional
        Table holding the feature columns and the target columns. Defaults to
        the notebook-level ``train`` frame, which is what a one-argument call
        has always used. Passing the frame explicitly avoids depending on
        whichever ``train`` happens to be bound in the kernel.
    prefix : str, optional
        Text prepended to every saved file name. Defaults to
        ``"3_features_"``, the prefix this definition has always used.
    early_stopping_rounds : int, optional
        Forwarded to ``CatBoostRegressor.fit``. ``None`` (the default) keeps
        the original behaviour of always running all 10000 iterations.

    Returns
    -------
    list
        The fitted ``CatBoostRegressor`` objects, in fold order.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        data = train

    target_names = ['target1', 'target2', 'target3', 'target4', 'target5']
    # Everything that is not a target column is treated as a feature.
    feature_names = [
        column for column in data.columns
        if column not in target_names and column != target_name
    ]

    # Keep only rows where every feature and the chosen target are present.
    complete_rows = data[feature_names + [target_name]].dropna()
    x_train = complete_rows[feature_names]
    y_train = complete_rows[target_name]

    print("fitting model...")
    models = []

    kfold = KFold(n_splits=5, shuffle=True, random_state=888)
    for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
        print("validating on fold {}...".format(idx + 1))
        train_x, val_x = x_train.iloc[train_idx, :], x_train.iloc[val_idx, :]
        train_y, val_y = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model = CatBoostRegressor(iterations=10000,
                                  learning_rate=0.01,
                                  loss_function='RMSE',
                                  eval_metric='RMSE',
                                  verbose=2500,
                                  use_best_model=True,
                                  random_seed=999)

        # The validation fold doubles as the eval set used to pick the best iteration.
        model.fit(train_x, train_y,
                  eval_set=(val_x, val_y),
                  early_stopping_rounds=early_stopping_rounds)

        models.append(model)

    print("saving models...")
    for fold_number, model in enumerate(models, start=1):
        model.save_model(prefix + target_name + "_model_" + str(fold_number))

    return models


In [35]:
# Fit and save the per-fold models for target1 (3-feature table); the cell echoes the returned list.
produce_models(target_name='target1')

fitting model...
validating on fold 1...
0:	learn: 7.3117262	test: 7.1483959	best: 7.1483959 (0)	total: 3.62ms	remaining: 36.2s
2500:	learn: 5.7724295	test: 5.9883995	best: 5.9165456 (545)	total: 3.86s	remaining: 11.6s
5000:	learn: 5.5222065	test: 6.0603121	best: 5.9165456 (545)	total: 8.23s	remaining: 8.22s
7500:	learn: 5.3354762	test: 6.1203858	best: 5.9165456 (545)	total: 12.9s	remaining: 4.31s
9999:	learn: 5.1847166	test: 6.1728059	best: 5.9165456 (545)	total: 16.8s	remaining: 0us

bestTest = 5.916545626
bestIteration = 545

Shrink model to first 546 iterations.
validating on fold 2...
0:	learn: 7.2811446	test: 7.2842966	best: 7.2842966 (0)	total: 1.32ms	remaining: 13.3s
2500:	learn: 5.7422825	test: 6.1257898	best: 6.0995565 (623)	total: 3.92s	remaining: 11.8s
5000:	learn: 5.5002730	test: 6.1882705	best: 6.0995565 (623)	total: 9.68s	remaining: 9.68s
7500:	learn: 5.3227348	test: 6.2464049	best: 6.0995565 (623)	total: 14.8s	remaining: 4.92s
9999:	learn: 5.1796052	test: 6.3058866	best

[<catboost.core.CatBoostRegressor at 0x7f86f4c6fb50>,
 <catboost.core.CatBoostRegressor at 0x7f86f55dd760>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f0a0>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f8b0>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f580>]

In [36]:
# Fit and save the per-fold models for target2 (3-feature table); the cell echoes the returned list.
produce_models(target_name='target2')

fitting model...
validating on fold 1...
0:	learn: 7.1655091	test: 7.2740437	best: 7.2740437 (0)	total: 2.47ms	remaining: 24.7s
2500:	learn: 5.7172806	test: 6.2732165	best: 6.2011868 (515)	total: 4.13s	remaining: 12.4s
5000:	learn: 5.4887419	test: 6.3477185	best: 6.2011868 (515)	total: 8.47s	remaining: 8.46s
7500:	learn: 5.3169727	test: 6.4124890	best: 6.2011868 (515)	total: 12.8s	remaining: 4.27s
9999:	learn: 5.1782748	test: 6.4682401	best: 6.2011868 (515)	total: 17.5s	remaining: 0us

bestTest = 6.201186782
bestIteration = 515

Shrink model to first 516 iterations.
validating on fold 2...
0:	learn: 7.1990363	test: 7.1393045	best: 7.1393045 (0)	total: 1.25ms	remaining: 12.5s
2500:	learn: 5.7588042	test: 6.1468685	best: 6.0759751 (451)	total: 4.24s	remaining: 12.7s
5000:	learn: 5.5269141	test: 6.2195488	best: 6.0759751 (451)	total: 8.9s	remaining: 8.9s
7500:	learn: 5.3557246	test: 6.2751393	best: 6.0759751 (451)	total: 13.7s	remaining: 4.58s
9999:	learn: 5.2156638	test: 6.3260721	best: 

[<catboost.core.CatBoostRegressor at 0x7f86f4c6f940>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6ffd0>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6fd30>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6fbb0>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6fa00>]

In [37]:
# Fit and save the per-fold models for target3 (3-feature table); the cell echoes the returned list.
produce_models(target_name='target3')

fitting model...
validating on fold 1...
0:	learn: 13.4773520	test: 13.2554377	best: 13.2554377 (0)	total: 4.49ms	remaining: 44.9s
2500:	learn: 8.6290969	test: 9.3511924	best: 9.2933329 (637)	total: 4.6s	remaining: 13.8s
5000:	learn: 8.2970987	test: 9.4676043	best: 9.2933329 (637)	total: 9.2s	remaining: 9.2s
7500:	learn: 8.0476014	test: 9.5714813	best: 9.2933329 (637)	total: 13.3s	remaining: 4.44s
9999:	learn: 7.8467174	test: 9.6585181	best: 9.2933329 (637)	total: 17.3s	remaining: 0us

bestTest = 9.293332861
bestIteration = 637

Shrink model to first 638 iterations.
validating on fold 2...
0:	learn: 13.4500372	test: 13.3745390	best: 13.3745390 (0)	total: 1.62ms	remaining: 16.2s
2500:	learn: 8.6818985	test: 9.2217703	best: 9.0894962 (605)	total: 3.78s	remaining: 11.3s
5000:	learn: 8.3511991	test: 9.3463989	best: 9.0894962 (605)	total: 7.71s	remaining: 7.71s
7500:	learn: 8.1037954	test: 9.4579159	best: 9.0894962 (605)	total: 11.6s	remaining: 3.87s
9999:	learn: 7.9016805	test: 9.5537798	b

[<catboost.core.CatBoostRegressor at 0x7f86f3fb81f0>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb86a0>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f430>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f550>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6fc10>]

In [38]:
# Fit and save the per-fold models for target4 (3-feature table); the cell echoes the returned list.
produce_models(target_name='target4')

fitting model...
validating on fold 1...
0:	learn: 8.4237769	test: 8.4501850	best: 8.4501850 (0)	total: 3.37ms	remaining: 33.7s
2500:	learn: 7.5501281	test: 8.1597102	best: 8.0898616 (246)	total: 4.98s	remaining: 14.9s
5000:	learn: 7.2413237	test: 8.2291002	best: 8.0898616 (246)	total: 9.27s	remaining: 9.27s
7500:	learn: 7.0080565	test: 8.2918729	best: 8.0898616 (246)	total: 13.9s	remaining: 4.62s
9999:	learn: 6.8172460	test: 8.3511741	best: 8.0898616 (246)	total: 19.9s	remaining: 0us

bestTest = 8.089861593
bestIteration = 246

Shrink model to first 247 iterations.
validating on fold 2...
0:	learn: 8.4351759	test: 8.4084653	best: 8.4084653 (0)	total: 2.89ms	remaining: 28.9s
2500:	learn: 7.5787378	test: 7.9621265	best: 7.8465871 (453)	total: 6.79s	remaining: 20.4s
5000:	learn: 7.2442402	test: 8.0944569	best: 7.8465871 (453)	total: 11.4s	remaining: 11.4s
7500:	learn: 6.9990542	test: 8.1976876	best: 7.8465871 (453)	total: 15.7s	remaining: 5.23s
9999:	learn: 6.8009783	test: 8.2889423	best

[<catboost.core.CatBoostRegressor at 0x7f86f4c6f670>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8f40>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8a30>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8100>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8370>]

In [39]:
# Fit and save the per-fold models for target5 (3-feature table); the cell echoes the returned list.
produce_models(target_name='target5')

fitting model...
validating on fold 1...
0:	learn: 17.6557258	test: 16.9605483	best: 16.9605483 (0)	total: 3.98ms	remaining: 39.8s
2500:	learn: 14.2440975	test: 14.5273709	best: 14.3741722 (486)	total: 4.66s	remaining: 14s
5000:	learn: 13.6618326	test: 14.7324624	best: 14.3741722 (486)	total: 9.35s	remaining: 9.34s
7500:	learn: 13.2418089	test: 14.9111055	best: 14.3741722 (486)	total: 14s	remaining: 4.68s
9999:	learn: 12.9019974	test: 15.0648982	best: 14.3741722 (486)	total: 18.3s	remaining: 0us

bestTest = 14.37417222
bestIteration = 486

Shrink model to first 487 iterations.
validating on fold 2...
0:	learn: 17.4300992	test: 17.8695826	best: 17.8695826 (0)	total: 1.65ms	remaining: 16.5s
2500:	learn: 13.9968457	test: 15.5510066	best: 15.4493649 (647)	total: 3.86s	remaining: 11.6s
5000:	learn: 13.4376095	test: 15.6938189	best: 15.4493649 (647)	total: 8.11s	remaining: 8.1s
7500:	learn: 13.0103692	test: 15.8095685	best: 15.4493649 (647)	total: 12.3s	remaining: 4.11s
9999:	learn: 12.67024

[<catboost.core.CatBoostRegressor at 0x7f86f3fb8070>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f700>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6fd00>,
 <catboost.core.CatBoostRegressor at 0x7f86f3fb8700>,
 <catboost.core.CatBoostRegressor at 0x7f86f4c6f4f0>]