In [1]:
import numpy as np 
import pandas as pd 
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [3]:
# Load the measurement table used for both regression targets below.
# The path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='3_features_predict.csv')

In [5]:
# Preview the first five rows to check the column layout.
train.head(n=5)

Unnamed: 0,성별,나이,키,머리너비,얼굴수직길이
0,남,25,1736.0,159.0,118.0
1,남,28,1833.0,161.0,128.0
2,남,19,1744.0,162.0,127.0
3,남,20,1751.0,154.0,124.0
4,남,22,1851.0,158.0,117.0


### Fit model for face height (target: 얼굴수직길이)

In [22]:
# Features are every column except the two regression targets.
# 'Unnamed: 0' is the row index that was written into the CSV (it shows up as a
# column in the train.head() output); it is not a measurement, so it is removed
# too rather than being fed to the model as a numeric feature.
# errors='ignore' keeps this cell working if the CSV is later loaded with the
# index column already stripped.
x_train = (
    train
    .drop(columns=['머리너비', '얼굴수직길이'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
# Target for this section: the 얼굴수직길이 column.
y_train = train['얼굴수직길이']

In [23]:
# Columns handed to CatBoost as categorical features (passed via cat_features in fit).
cat_cols = ["성별"]

In [24]:
# Train one CatBoost regressor per cross-validation fold and collect them in
# height_models (in fold order). Each fold's held-out rows are passed as the
# eval_set, which use_best_model relies on to keep the best iteration.
height_models = []

# Hyperparameters shared by every fold's model.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.01,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=1000,  # print a progress line every 1000 iterations
    use_best_model=True,
    random_seed=999,
)

kfold = KFold(n_splits=5, shuffle=True, random_state=888)
for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    # Positional row selection for this fold's fit / validation partitions.
    train_x = x_train.iloc[train_idx, :]
    val_x = x_train.iloc[val_idx, :]
    train_y = y_train.iloc[train_idx]
    val_y = y_train.iloc[val_idx]

    model = CatBoostRegressor(**catboost_params)
    model.fit(
        train_x,
        train_y,
        eval_set=(val_x, val_y),
        cat_features=cat_cols,
    )

    height_models.append(model)
    


0:	learn: 9.0927769	test: 8.8660154	best: 8.8660154 (0)	total: 5.56ms	remaining: 55.6s
1000:	learn: 6.3664826	test: 6.4027045	best: 6.4026001 (994)	total: 2.38s	remaining: 21.4s
2000:	learn: 6.2782326	test: 6.4179707	best: 6.4021620 (1038)	total: 4.76s	remaining: 19s
3000:	learn: 6.2065077	test: 6.4342644	best: 6.4021620 (1038)	total: 7.39s	remaining: 17.2s
4000:	learn: 6.1472057	test: 6.4504070	best: 6.4021620 (1038)	total: 9.81s	remaining: 14.7s
5000:	learn: 6.0967070	test: 6.4650737	best: 6.4021620 (1038)	total: 12.1s	remaining: 12s
6000:	learn: 6.0524327	test: 6.4819336	best: 6.4021620 (1038)	total: 14.7s	remaining: 9.79s
7000:	learn: 6.0117416	test: 6.4980874	best: 6.4021620 (1038)	total: 17.3s	remaining: 7.39s
8000:	learn: 5.9738163	test: 6.5128367	best: 6.4021620 (1038)	total: 20.1s	remaining: 5.02s
9000:	learn: 5.9394492	test: 6.5260605	best: 6.4021620 (1038)	total: 22.5s	remaining: 2.5s
9999:	learn: 5.9072518	test: 6.5392049	best: 6.4021620 (1038)	total: 24.8s	remaining: 0us



In [25]:
# Persist each fold's model under a 1-based suffix
# ('3_feature_face_height_model1' ... ). No format argument is given, so
# CatBoost's default save format is used.
for idx, model in enumerate(height_models):
    model.save_model(f'3_feature_face_height_model{idx + 1}')

In [26]:
# Score every fold model on its own held-out fold and report the per-fold and
# averaged mean absolute error and mean absolute percentage error.
#
# The KFold below must use the same n_splits / shuffle / random_state as the
# training cell so that fold i here is the validation fold of height_models[i].
#
# NOTE: these folds were also passed as eval_set during training with
# use_best_model=True, so the best iteration was selected on the very data
# scored here; treat the numbers as optimistic.
fold_mae = []
fold_pe = []  # percentage error
kfold = KFold(n_splits = 5, shuffle = True, random_state = 888)
for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    model = height_models[idx]
    y_true = y_train.iloc[val_idx]
    val_x = x_train.iloc[val_idx,:]
    y_pred = model.predict(val_x)
    # Compute each metric once and reuse it for both printing and averaging.
    mae = mean_absolute_error(y_true, y_pred)
    pe = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print("model {}의 평균 절대 오차 = {:.2f} mm".format(idx+1, mae))
    print("model {}의 평균 percentage 오차 = {:.2f}%".format(idx+1, pe))
    fold_mae.append(mae)
    fold_pe.append(pe)

# Average over the number of folds actually evaluated instead of a hard-coded 5.0,
# so the summary stays correct if n_splits is changed.
total_mae = sum(fold_mae)
total_pe = sum(fold_pe)
n_folds = len(fold_mae)
print("전체 평균 절대 오차 = {:.2f} mm".format(total_mae / n_folds))
print("전체 평균 percentage 오차 = {:.2f}%".format(total_pe / n_folds))


model 1의 평균 절대 오차 = 5.08 mm
model 1의 평균 percentage 오차 = 4.52%
model 2의 평균 절대 오차 = 5.25 mm
model 2의 평균 percentage 오차 = 4.64%
model 3의 평균 절대 오차 = 4.99 mm
model 3의 평균 percentage 오차 = 4.43%
model 4의 평균 절대 오차 = 5.09 mm
model 4의 평균 percentage 오차 = 4.50%
model 5의 평균 절대 오차 = 5.01 mm
model 5의 평균 percentage 오차 = 4.43%
전체 평균 절대 오차 = 5.08 mm
전체 평균 percentage 오차 = 4.50%


### Fit model for face width (target: 머리너비)

In [13]:
# Features are every column except the two regression targets.
# 'Unnamed: 0' is the row index that was written into the CSV (it shows up as a
# column in the train.head() output); it is not a measurement, so it is removed
# too rather than being fed to the model as a numeric feature.
# errors='ignore' keeps this cell working if the CSV is later loaded with the
# index column already stripped.
x_train = (
    train
    .drop(columns=['머리너비', '얼굴수직길이'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
# Target for this section: the 머리너비 column.
y_train = train['머리너비']

In [14]:
# Columns handed to CatBoost as categorical features (passed via cat_features in fit).
cat_cols = ["성별"]

In [15]:
# Train one CatBoost regressor per cross-validation fold and collect them in
# width_models (in fold order). Each fold's held-out rows are passed as the
# eval_set, which use_best_model relies on to keep the best iteration.
width_models = []

# Hyperparameters shared by every fold's model.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.01,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=1000,  # print a progress line every 1000 iterations
    use_best_model=True,
    random_seed=999,
)

kfold = KFold(n_splits=5, shuffle=True, random_state=888)
for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    # Positional row selection for this fold's fit / validation partitions.
    train_x = x_train.iloc[train_idx, :]
    val_x = x_train.iloc[val_idx, :]
    train_y = y_train.iloc[train_idx]
    val_y = y_train.iloc[val_idx]

    model = CatBoostRegressor(**catboost_params)
    model.fit(
        train_x,
        train_y,
        eval_set=(val_x, val_y),
        cat_features=cat_cols,
    )

    width_models.append(model)
    


0:	learn: 7.3114902	test: 7.3039474	best: 7.3039474 (0)	total: 5.27ms	remaining: 52.7s
1000:	learn: 6.0256698	test: 6.1162738	best: 6.1162738 (1000)	total: 2.28s	remaining: 20.5s
2000:	learn: 5.9484985	test: 6.1268168	best: 6.1162738 (1000)	total: 4.6s	remaining: 18.4s
3000:	learn: 5.8867780	test: 6.1385320	best: 6.1162738 (1000)	total: 7.28s	remaining: 17s
4000:	learn: 5.8329462	test: 6.1517231	best: 6.1162738 (1000)	total: 9.99s	remaining: 15s
5000:	learn: 5.7865807	test: 6.1651892	best: 6.1162738 (1000)	total: 13s	remaining: 13s
6000:	learn: 5.7448377	test: 6.1776570	best: 6.1162738 (1000)	total: 15.9s	remaining: 10.6s
7000:	learn: 5.7056326	test: 6.1905104	best: 6.1162738 (1000)	total: 18.9s	remaining: 8.08s
8000:	learn: 5.6711746	test: 6.2029346	best: 6.1162738 (1000)	total: 21.6s	remaining: 5.39s
9000:	learn: 5.6388026	test: 6.2156278	best: 6.1162738 (1000)	total: 24.2s	remaining: 2.68s
9999:	learn: 5.6087964	test: 6.2262734	best: 6.1162738 (1000)	total: 26.6s	remaining: 0us

bes

In [16]:
# Persist each fold's model under a 1-based suffix
# ('3_feature_face_width_model1' ... ). No format argument is given, so
# CatBoost's default save format is used.
for idx, model in enumerate(width_models):
    model.save_model(f'3_feature_face_width_model{idx + 1}')

In [19]:
# Score every fold model on its own held-out fold and report the per-fold and
# averaged mean absolute error and mean absolute percentage error.
#
# The KFold below must use the same n_splits / shuffle / random_state as the
# training cell so that fold i here is the validation fold of width_models[i].
#
# NOTE: these folds were also passed as eval_set during training with
# use_best_model=True, so the best iteration was selected on the very data
# scored here; treat the numbers as optimistic.
fold_mae = []
fold_pe = []  # percentage error
kfold = KFold(n_splits = 5, shuffle = True, random_state = 888)
for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    model = width_models[idx]
    y_true = y_train.iloc[val_idx]
    val_x = x_train.iloc[val_idx,:]
    y_pred = model.predict(val_x)
    # Compute each metric once and reuse it for both printing and averaging.
    mae = mean_absolute_error(y_true, y_pred)
    pe = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print("model {}의 평균 절대 오차 = {:.2f} mm".format(idx+1, mae))
    print("model {}의 평균 percentage 오차 = {:.2f}%".format(idx+1, pe))
    fold_mae.append(mae)
    fold_pe.append(pe)

# Average over the number of folds actually evaluated instead of a hard-coded 5.0,
# so the summary stays correct if n_splits is changed.
total_mae = sum(fold_mae)
total_pe = sum(fold_pe)
n_folds = len(fold_mae)
print("전체 평균 절대 오차 = {:.2f} mm".format(total_mae / n_folds))
print("전체 평균 percentage 오차 = {:.2f}%".format(total_pe / n_folds))
    

model 1의 평균 절대 오차 = 4.75 mm
model 1의 평균 percentage 오차 = 3.04%
model 2의 평균 절대 오차 = 4.76 mm
model 2의 평균 percentage 오차 = 3.04%
model 3의 평균 절대 오차 = 4.74 mm
model 3의 평균 percentage 오차 = 3.04%
model 4의 평균 절대 오차 = 4.75 mm
model 4의 평균 percentage 오차 = 3.04%
model 5의 평균 절대 오차 = 4.83 mm
model 5의 평균 percentage 오차 = 3.09%
전체 평균 절대 오차 = 4.76 mm
전체 평균 percentage 오차 = 3.05%
