In [29]:
import numpy as np 
import pandas as pd 
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

### Cleaning Data 

In [2]:
# Load the raw measurement workbook; the path is relative to the working directory.
measurements_path = '2015_7차_직접측정 데이터.xlsx'
df = pd.read_excel(measurements_path)

In [3]:
# Pull the predictor columns out of the raw sheet as NumPy arrays.
# The labels are listed in the same order as the names they are unpacked into.
predictor_labels = (
    'ⓞ_02_성별',
    'ⓞ_06_나이_반올림',
    '①_003_키',
    '①_031_몸무게',
    '①_119_발직선길이',
)
gender, age, height, weight, shoe_size = (
    np.asarray(df[label]) for label in predictor_labels
)

In [4]:
# Extract the two measurements that are later used as regression targets.
# NOTE(review): the column labels read as "head width" (머리너비) and
# "face vertical length" (얼굴수직길이), while the variables are named
# face_width / lower_face_height — confirm these are the intended columns.
target_labels = ('①_115_머리너비', '①_094_얼굴수직길이')
face_width, lower_face_height = (
    np.asarray(df[label]) for label in target_labels
)

In [5]:
# Assemble the modelling table: five predictor columns followed by the
# two face measurements, under short English column names.
column_arrays = dict(
    gender=gender,
    age=age,
    height=height,
    weight=weight,
    shoe_size=shoe_size,
    face_width=face_width,
    lower_face_height=lower_face_height,
)
train = pd.DataFrame(column_arrays)

In [6]:
# Keep only rows in which every column has a value (dropna defaults spelled out).
train = train.dropna(axis=0, how='any')

In [7]:
# Show the cleaned table; the bare expression is rendered as the cell output.
train

Unnamed: 0,gender,age,height,weight,shoe_size,face_width,lower_face_height
0,남,25,1736.0,72.1,228.0,159.0,118.0
1,남,28,1833.0,106.2,269.0,161.0,128.0
2,남,19,1744.0,69.1,244.0,162.0,127.0
3,남,20,1751.0,68.4,231.0,154.0,124.0
4,남,22,1851.0,81.9,265.0,158.0,117.0
...,...,...,...,...,...,...,...
6415,여,20,1706.0,59.6,242.0,161.0,101.0
6416,여,20,1624.0,54.9,241.0,151.0,105.0
6417,여,44,1508.0,50.1,228.0,142.0,111.0
6418,여,20,1574.0,44.6,222.0,157.0,103.0


In [8]:
# Smallest and largest value of the height column after cleaning.
height_values = train['height']
height_values.min(), height_values.max()

(1354.0, 1915.0)

In [9]:
# Smallest and largest value of the age column after cleaning.
age_values = train['age']
age_values.min(), age_values.max()

(15, 69)

### Fit model for lower face height

In [10]:
# Predictors are every column except the two face measurements;
# the target for this section is lower_face_height.
face_measurement_columns = ['face_width', 'lower_face_height']
x_train = train.drop(face_measurement_columns, axis=1)
y_train = train['lower_face_height']

In [11]:
# Column names given to CatBoost as categorical features (cat_features in fit).
cat_cols = ['gender']

In [12]:
# Train one CatBoost regressor per cross-validation fold and collect the
# fitted models in height_models (one entry per fold, in fold order).
# Each fold's held-out rows are passed as eval_set; with use_best_model=True
# the fitted model is cut back to the best iteration on that eval_set.
# NOTE(review): no early stopping is configured, so all 10000 iterations run
# even after the eval metric stops improving — consider early_stopping_rounds.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.01,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=1000,
    use_best_model=True,
    random_seed=999,
)
kfold = KFold(n_splits=5, shuffle=True, random_state=888)

height_models = []
for fit_rows, eval_rows in kfold.split(x_train, y_train):
    fold_model = CatBoostRegressor(**catboost_params)
    fold_model.fit(
        x_train.iloc[fit_rows, :],
        y_train.iloc[fit_rows],
        eval_set=(x_train.iloc[eval_rows, :], y_train.iloc[eval_rows]),
        cat_features=cat_cols,
    )
    height_models.append(fold_model)
    


0:	learn: 7.3267974	test: 7.0949701	best: 7.0949701 (0)	total: 61.9ms	remaining: 10m 18s
1000:	learn: 5.6758208	test: 5.7020601	best: 5.7020601 (1000)	total: 3.43s	remaining: 30.8s
2000:	learn: 5.4120700	test: 5.6969026	best: 5.6963559 (1984)	total: 6.36s	remaining: 25.4s
3000:	learn: 5.1887909	test: 5.7102251	best: 5.6963559 (1984)	total: 9.06s	remaining: 21.1s
4000:	learn: 5.0029200	test: 5.7263026	best: 5.6963559 (1984)	total: 11.8s	remaining: 17.7s
5000:	learn: 4.8297122	test: 5.7485031	best: 5.6963559 (1984)	total: 14.7s	remaining: 14.7s
6000:	learn: 4.6758209	test: 5.7718964	best: 5.6963559 (1984)	total: 16.4s	remaining: 10.9s
7000:	learn: 4.5371215	test: 5.7942780	best: 5.6963559 (1984)	total: 18s	remaining: 7.7s
8000:	learn: 4.4110162	test: 5.8114691	best: 5.6963559 (1984)	total: 19.6s	remaining: 4.9s
9000:	learn: 4.2949202	test: 5.8339812	best: 5.6963559 (1984)	total: 21.2s	remaining: 2.35s
9999:	learn: 4.1875421	test: 5.8561174	best: 5.6963559 (1984)	total: 22.8s	remaining: 0

In [16]:
# Write every fold model to disk; file names are numbered starting at 1.
for fold_no, fold_model in enumerate(height_models, start=1):
    fold_model.save_model('face_height_model' + str(fold_no))

Calculating the mean absolute error (MAE) of the lower-face-height predictions on each validation fold

In [39]:
# Report the validation MAE of every fold model and the average across folds.
#
# The KFold below uses the same n_splits / shuffle / random_state as the
# training cell, so for the same number of rows it yields the same folds.
# This cell relies on x_train / y_train still holding the lower_face_height
# split; those names are reassigned in the face-width section.
#
# NOTE(review): every model was fitted with its validation fold as eval_set
# and use_best_model=True, so these rows influenced the selected iteration;
# the MAE printed here is likely optimistic — confirm on untouched data.
kfold = KFold(n_splits = 5, shuffle = True, random_state = 888)
n_folds = kfold.get_n_splits()
assert len(height_models) == n_folds, "expected exactly one fitted model per fold"

fold_maes = []
for idx, (_, val_idx) in enumerate(kfold.split(x_train, y_train)):
    y_true = y_train.iloc[val_idx]
    y_pred = height_models[idx].predict(x_train.iloc[val_idx, :])
    # Score once and reuse the value for the report line and the average.
    fold_mae = mean_absolute_error(y_true, y_pred)
    print("model {}의 평균 절대 오차 = {:.2f} mm".format(idx+1, fold_mae))
    fold_maes.append(fold_mae)

# Average over the actual number of folds instead of a hard-coded 5.0.
total_mae = sum(fold_maes)
print("전체 평균 절대 오차 = {:.2f} mm".format(total_mae / n_folds))
    
    

model 1의 평균 절대 오차 = 4.50 mm
model 2의 평균 절대 오차 = 4.69 mm
model 3의 평균 절대 오차 = 4.69 mm
model 4의 평균 절대 오차 = 4.44 mm
model 5의 평균 절대 오차 = 4.71 mm
전체 평균 절대 오차 = 4.61 mm


### Fit model for face width

In [40]:
# Same predictors as before (everything except the two face measurements);
# the target for this section is face_width.
face_measurement_columns = ['face_width', 'lower_face_height']
x_train = train.drop(face_measurement_columns, axis=1)
y_train = train['face_width']

In [41]:
# Column names given to CatBoost as categorical features (cat_features in fit).
cat_cols = ['gender']

In [43]:
# Train one CatBoost regressor per cross-validation fold and collect the
# fitted models in width_models (one entry per fold, in fold order).
# Each fold's held-out rows are passed as eval_set; with use_best_model=True
# the fitted model is cut back to the best iteration on that eval_set.
# NOTE(review): no early stopping is configured, so all 10000 iterations run
# even after the eval metric stops improving — consider early_stopping_rounds.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.01,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=1000,
    use_best_model=True,
    random_seed=999,
)
kfold = KFold(n_splits=5, shuffle=True, random_state=888)

width_models = []
for fit_rows, eval_rows in kfold.split(x_train, y_train):
    fold_model = CatBoostRegressor(**catboost_params)
    fold_model.fit(
        x_train.iloc[fit_rows, :],
        y_train.iloc[fit_rows],
        eval_set=(x_train.iloc[eval_rows, :], y_train.iloc[eval_rows]),
        cat_features=cat_cols,
    )
    width_models.append(fold_model)
    


0:	learn: 7.1851516	test: 7.1867080	best: 7.1867080 (0)	total: 4.65ms	remaining: 46.5s
1000:	learn: 5.7032682	test: 5.8482787	best: 5.8356518 (573)	total: 1.86s	remaining: 16.7s
2000:	learn: 5.4731773	test: 5.8889609	best: 5.8356518 (573)	total: 3.45s	remaining: 13.8s
3000:	learn: 5.2673452	test: 5.9225940	best: 5.8356518 (573)	total: 5.05s	remaining: 11.8s
4000:	learn: 5.0890601	test: 5.9522182	best: 5.8356518 (573)	total: 6.99s	remaining: 10.5s
5000:	learn: 4.9259943	test: 5.9781162	best: 5.8356518 (573)	total: 8.92s	remaining: 8.92s
6000:	learn: 4.7788398	test: 6.0083523	best: 5.8356518 (573)	total: 10.9s	remaining: 7.24s
7000:	learn: 4.6372077	test: 6.0377714	best: 5.8356518 (573)	total: 13s	remaining: 5.57s
8000:	learn: 4.5090648	test: 6.0639006	best: 5.8356518 (573)	total: 14.8s	remaining: 3.7s
9000:	learn: 4.3883036	test: 6.0846673	best: 5.8356518 (573)	total: 16.6s	remaining: 1.84s
9999:	learn: 4.2794779	test: 6.1050036	best: 5.8356518 (573)	total: 18.4s	remaining: 0us

bestTes

Calculating the mean absolute error (MAE) of the face-width predictions on each validation fold

In [46]:
# Write every fold model to disk; file names are numbered starting at 1.
for fold_no, fold_model in enumerate(width_models, start=1):
    fold_model.save_model('face_width_model' + str(fold_no))

In [44]:
# Report the validation MAE of every fold model and the average across folds.
#
# The KFold below uses the same n_splits / shuffle / random_state as the
# training cell, so for the same number of rows it yields the same folds.
# This cell relies on x_train / y_train holding the face_width split.
#
# NOTE(review): every model was fitted with its validation fold as eval_set
# and use_best_model=True, so these rows influenced the selected iteration;
# the MAE printed here is likely optimistic — confirm on untouched data.
kfold = KFold(n_splits = 5, shuffle = True, random_state = 888)
n_folds = kfold.get_n_splits()
assert len(width_models) == n_folds, "expected exactly one fitted model per fold"

fold_maes = []
for idx, (_, val_idx) in enumerate(kfold.split(x_train, y_train)):
    y_true = y_train.iloc[val_idx]
    y_pred = width_models[idx].predict(x_train.iloc[val_idx, :])
    # Score once and reuse the value for the report line and the average.
    fold_mae = mean_absolute_error(y_true, y_pred)
    print("model {}의 평균 절대 오차 = {:.2f} mm".format(idx+1, fold_mae))
    fold_maes.append(fold_mae)

# Average over the actual number of folds instead of a hard-coded 5.0.
total_mae = sum(fold_maes)
print("전체 평균 절대 오차 = {:.2f} mm".format(total_mae / n_folds))
    

model 1의 평균 절대 오차 = 4.52 mm
model 2의 평균 절대 오차 = 4.59 mm
model 3의 평균 절대 오차 = 4.59 mm
model 4의 평균 절대 오차 = 4.64 mm
model 5의 평균 절대 오차 = 4.72 mm
전체 평균 절대 오차 = 4.61 mm
