### Clustering 과 Prediction 모델을 결합한 (회귀)예측
- FAMD 나 PCA를 통한 차원축소 
- GMM을 통한 군집화 -> 결과를 확률로 반환. 데이터/클러스터별 소속확률
- 전체 데이터셋을 군집별로 나누기. 군집별 데이터셋을 train-test로 분리. 
- 각 군집별 데이터셋으로 개별 모델을 학습시키기 위함. (3번 군집데이터로는 3번 모델 학습)
- 전체 데이터셋 한번에 학습한 큰모델 vs 개별 군집별 데이터로 학습한 작은모델 -- 비교
- 군집별 소속확률과 군집별 개별학습모델간의 가중합 앙상블

#### 실험결과
전체 데이터셋을 CatBoost로 예측했을 때 MSE 0.158, 5개 군집으로 분류 후 군집별 개별 모델로 학습 후 예측했을 때 0.157, 5개 군집으로 분류 후 보팅앙상블 진행 시 MSE 0.114 MAPE 6.9

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import skew
import gc
from sklearn.linear_model import Ridge , LogisticRegression
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
import time
from tqdm import tqdm
#import torch
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [5]:
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)
file2 = r"cycletime_normalized.csv"
ct_df = pd.read_csv(file2)
print(ct_df.shape, 1)
ct_df.head()

(521779, 57)


Unnamed: 0,DR,PROCESS_ID,PART_ID,LOTTYPE,QTY,HOLD_FLAG,FLOORID,GRADE,COMPLETE_RATE,LOT_PURPOSE,DUE_DATE,WIPTURN,ROOM,PROCESS_GROUP,LAYER_TITLE,LAYER_GROUP,BLOCK_GROUP,EQPTYPE,MOVE_0,EQP_WORKLOAD_0,MOVE_1,EQP_WORKLOAD_1,MOVE_2,EQP_WORKLOAD_2,BATCH,Q_E,Q_P,L_E,L_P,Q_H,Q_R,Q_W,L_H,L_R,L_W,Q_2,Q_3,Q_4,Q_5,Q_7,Q_8,Q_10,L_2,L_3,L_4,L_5,L_7,L_8,L_10,WIP_WAITTIME,WIP_PRODUCT_NUM,SHIFT_TYPE,PRC_WAIT_MIN,PRC_WAIT_MAX,PRC_WAIT_MEDIAN,PRC_WAIT_STD,CYCLETIME
0,N4,BU,5P9865A02,PT,-1.702672,Y,S3,0.678814,0.338031,J,1.27278,-1.040581,PH,COATER_HSOH,L1,LG61,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.062575,1.735273,3.425849,2.386661,2.44778,0.895343,1.414618,3.472389,0.715582,2.764852,0.919482,1.423341,2.576896,1.137361,0.358854,-0.034845,-0.065732,4.58937,2.841138,3.120002,2.419251,2.512175,2.523029,-0.067493,0.76861,3.398804,WG,-0.365971,4.352058,-0.222606,1.407415,115.0
1,N4,BU,5P9865A02,PT,0.444506,Y,S3,-1.923327,1.156127,1,-1.377403,1.366344,PH,COATER_HSOH,D10,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.365971,4.352058,-0.222606,1.407415,285.0
2,N5,LJ,FK05M2X01,PP,0.444506,Y,S3,0.678814,0.815253,1,-1.010888,0.244717,PH,COATER_HSOH,D5,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.148212,0.126315,-0.187582,-0.098518,264.0
3,N5,LJ,FK05M2X01,PP,0.670525,Y,S3,-1.272792,0.235769,1,-0.70076,0.391539,PH,COATER_HSOH,CM,LG51,PU5,PARCCO,0.985945,-0.664618,0.701941,-0.684381,0.790913,-0.684201,N,2.463695,1.550781,3.265198,1.781555,2.633157,1.144039,1.018506,3.376843,0.940246,1.807379,-0.208179,1.7728,1.966763,1.08063,2.376441,0.590516,-0.065732,-0.230689,2.410432,2.527811,2.336403,3.994703,3.140679,-0.067493,0.927776,3.138018,WO,-0.148212,0.126315,-0.187582,-0.098518,379.0
4,N4,BU,5P9865A02,PT,0.557515,Y,S3-D,0.678814,0.781166,1,-1.222339,1.375972,PH,COATER_HSOH,FD,LG63,PU6,PARCCO,0.582931,-0.688329,0.619812,-0.677794,0.615034,-0.681567,N,2.747564,3.442551,3.678301,3.407778,3.541026,2.06868,2.873977,3.854574,1.838902,3.31198,-0.208179,2.906896,3.520256,2.818017,0.436453,0.282799,-0.065732,-0.230689,3.092383,3.830631,3.247732,3.253439,3.017149,-0.067493,0.531556,3.572662,WO,-0.365971,4.352058,-0.222606,1.407415,22.0


In [6]:
y_ct = ct_df.iloc[:,-1]
x_ct = ct_df.iloc[:,:-1]
y_ct = np.log1p(y_ct)

In [7]:
# 숫자형, 범주형 변수 분할 및 범주형 변수의 type을 'category'로 변경
numerical_list=[]
categorical_list=[]

for i in x_ct.columns :
  if x_ct[i].dtypes == 'O' :   # O 는 object 와 동일.
    categorical_list.append(i)
  else :
    numerical_list.append(i)

print("categorical_list {}:".format(len(categorical_list)), categorical_list)
print("numerical_list {}:".format(len(numerical_list)), numerical_list)

categorical_list 15: ['DR', 'PROCESS_ID', 'PART_ID', 'LOTTYPE', 'HOLD_FLAG', 'FLOORID', 'LOT_PURPOSE', 'ROOM', 'PROCESS_GROUP', 'LAYER_TITLE', 'LAYER_GROUP', 'BLOCK_GROUP', 'EQPTYPE', 'BATCH', 'SHIFT_TYPE']
numerical_list 41: ['QTY', 'GRADE', 'COMPLETE_RATE', 'DUE_DATE', 'WIPTURN', 'MOVE_0', 'EQP_WORKLOAD_0', 'MOVE_1', 'EQP_WORKLOAD_1', 'MOVE_2', 'EQP_WORKLOAD_2', 'Q_E', 'Q_P', 'L_E', 'L_P', 'Q_H', 'Q_R', 'Q_W', 'L_H', 'L_R', 'L_W', 'Q_2', 'Q_3', 'Q_4', 'Q_5', 'Q_7', 'Q_8', 'Q_10', 'L_2', 'L_3', 'L_4', 'L_5', 'L_7', 'L_8', 'L_10', 'WIP_WAITTIME', 'WIP_PRODUCT_NUM', 'PRC_WAIT_MIN', 'PRC_WAIT_MAX', 'PRC_WAIT_MEDIAN', 'PRC_WAIT_STD']


In [8]:
# x 데이터를 GMM 적용을 위해 FAMD를 통해 15차원 축소
'''
import prince
famd = prince.FAMD(n_components= 15, random_state=42)
famd = famd.fit(x_ct)
x_famd = famd.transform(x_ct)
x_famd.to_csv('cycletime_normalized_famd15.csv') '''

x_famd = pd.read_csv("cycletime_normalized_famd15.csv", index_col=0)
x_famd.shape

(521779, 15)

In [10]:
# GMM 을 통한 군집화
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4, random_state=42).fit(x_famd)
gmm_results = gmm.predict(x_famd)
gmm_prob_results = gmm.predict_proba(x_famd)

print(len(gmm_results), type(gmm_results))

521779 <class 'numpy.ndarray'>


In [18]:
# Cluster 결과정보를 포함한 데이터셋 생성
x_ct['cluster'] = gmm_results
xy = pd.concat([x_ct, y_ct], axis=1)
xy.head()

Unnamed: 0,DR,PROCESS_ID,PART_ID,LOTTYPE,QTY,HOLD_FLAG,FLOORID,GRADE,COMPLETE_RATE,LOT_PURPOSE,DUE_DATE,WIPTURN,ROOM,PROCESS_GROUP,LAYER_TITLE,LAYER_GROUP,BLOCK_GROUP,EQPTYPE,MOVE_0,EQP_WORKLOAD_0,MOVE_1,EQP_WORKLOAD_1,MOVE_2,EQP_WORKLOAD_2,BATCH,Q_E,Q_P,L_E,L_P,Q_H,Q_R,Q_W,L_H,L_R,L_W,Q_2,Q_3,Q_4,Q_5,Q_7,Q_8,Q_10,L_2,L_3,L_4,L_5,L_7,L_8,L_10,WIP_WAITTIME,WIP_PRODUCT_NUM,SHIFT_TYPE,PRC_WAIT_MIN,PRC_WAIT_MAX,PRC_WAIT_MEDIAN,PRC_WAIT_STD,cluster,CYCLETIME
0,N4,BU,5P9865A02,PT,-1.702672,Y,S3,0.678814,0.338031,J,1.27278,-1.040581,PH,COATER_HSOH,L1,LG61,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.062575,1.735273,3.425849,2.386661,2.44778,0.895343,1.414618,3.472389,0.715582,2.764852,0.919482,1.423341,2.576896,1.137361,0.358854,-0.034845,-0.065732,4.58937,2.841138,3.120002,2.419251,2.512175,2.523029,-0.067493,0.76861,3.398804,WG,-0.365971,4.352058,-0.222606,1.407415,2,4.75359
1,N4,BU,5P9865A02,PT,0.444506,Y,S3,-1.923327,1.156127,1,-1.377403,1.366344,PH,COATER_HSOH,D10,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.365971,4.352058,-0.222606,1.407415,2,5.655992
2,N5,LJ,FK05M2X01,PP,0.444506,Y,S3,0.678814,0.815253,1,-1.010888,0.244717,PH,COATER_HSOH,D5,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.148212,0.126315,-0.187582,-0.098518,2,5.57973
3,N5,LJ,FK05M2X01,PP,0.670525,Y,S3,-1.272792,0.235769,1,-0.70076,0.391539,PH,COATER_HSOH,CM,LG51,PU5,PARCCO,0.985945,-0.664618,0.701941,-0.684381,0.790913,-0.684201,N,2.463695,1.550781,3.265198,1.781555,2.633157,1.144039,1.018506,3.376843,0.940246,1.807379,-0.208179,1.7728,1.966763,1.08063,2.376441,0.590516,-0.065732,-0.230689,2.410432,2.527811,2.336403,3.994703,3.140679,-0.067493,0.927776,3.138018,WO,-0.148212,0.126315,-0.187582,-0.098518,1,5.940171
4,N4,BU,5P9865A02,PT,0.557515,Y,S3-D,0.678814,0.781166,1,-1.222339,1.375972,PH,COATER_HSOH,FD,LG63,PU6,PARCCO,0.582931,-0.688329,0.619812,-0.677794,0.615034,-0.681567,N,2.747564,3.442551,3.678301,3.407778,3.541026,2.06868,2.873977,3.854574,1.838902,3.31198,-0.208179,2.906896,3.520256,2.818017,0.436453,0.282799,-0.065732,-0.230689,3.092383,3.830631,3.247732,3.253439,3.017149,-0.067493,0.531556,3.572662,WO,-0.365971,4.352058,-0.222606,1.407415,2,3.135494


In [19]:
# Cluster 그룹별 데이터셋 따로 생성
grouped = xy.groupby('cluster')
xy = xy.drop('cluster', axis=1)
xy0 = grouped.get_group(0).drop('cluster', axis=1)
xy1 = grouped.get_group(1).drop('cluster', axis=1)
xy2 = grouped.get_group(2).drop('cluster', axis=1)
xy3 = grouped.get_group(3).drop('cluster', axis=1)
print(xy.shape, xy0.shape, xy1.shape, xy2.shape, xy3.shape)

xy_list = [xy, xy0, xy1, xy2, xy3]

(521779, 57) (75116, 57) (158505, 57) (210843, 57) (77315, 57)


In [21]:
# 데이터셋별 train-test, x-y 분리

def split_dataframes(df_list, target_column):
    train_test_sets = []
    for df in df_list:
        X = df.drop(columns=target_column)
        y = df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
        train_test_sets.append((X_train, X_test, y_train, y_test))
    return train_test_sets

train_test_sets = split_dataframes(xy_list, target_column='CYCLETIME')
X_train, X_test, y_train, y_test = train_test_sets[0]
X0_train, X0_test, y0_train, y0_test = train_test_sets[1]
X1_train, X1_test, y1_train, y1_test = train_test_sets[2]
X2_train, X2_test, y2_train, y2_test = train_test_sets[3]
X3_train, X3_test, y3_train, y3_test = train_test_sets[4]

In [24]:
# 군집별 적용할 CatBoost 예측모델 생성성
from catboost import CatBoostRegressor
cat_regressor = CatBoostRegressor(iterations=7, depth=5, learning_rate=0.07, loss_function='RMSE', cat_features=categorical_list, verbose=25)
cat_regressor0 = CatBoostRegressor(iterations=7, depth=5, learning_rate=0.07, loss_function='RMSE', cat_features=categorical_list, verbose=25)
cat_regressor1 = CatBoostRegressor(iterations=7, depth=5, learning_rate=0.07, loss_function='RMSE', cat_features=categorical_list, verbose=25)
cat_regressor2 = CatBoostRegressor(iterations=7, depth=5, learning_rate=0.07, loss_function='RMSE', cat_features=categorical_list, verbose=25)
cat_regressor3 = CatBoostRegressor(iterations=7, depth=5, learning_rate=0.07, loss_function='RMSE', cat_features=categorical_list, verbose=25)

def model_train_predict(model, x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    preds = model.predict(x_test)
    return preds

def mse(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

In [25]:
# 전체데이터셋 및 개별 군집별 서로다른 예측모델 학습 
preds = model_train_predict(cat_regressor, X_train,X_test,y_train,y_test)
preds0 = model_train_predict(cat_regressor0, X0_train, X0_test, y0_train, y0_test)
preds1 = model_train_predict(cat_regressor1, X1_train, X1_test, y1_train, y1_test)
preds2 = model_train_predict(cat_regressor2, X2_train, X2_test, y2_train, y2_test)
preds3 = model_train_predict(cat_regressor3, X3_train, X3_test, y3_train, y3_test)

0:	learn: 0.8435210	total: 99.1ms	remaining: 595ms
6:	learn: 0.6969333	total: 536ms	remaining: 0us
0:	learn: 0.8367285	total: 45.9ms	remaining: 276ms
6:	learn: 0.6754376	total: 227ms	remaining: 0us
0:	learn: 0.8560730	total: 47.5ms	remaining: 285ms
6:	learn: 0.7104595	total: 301ms	remaining: 0us
0:	learn: 0.8009915	total: 69ms	remaining: 414ms
6:	learn: 0.6456059	total: 333ms	remaining: 0us
0:	learn: 0.8982756	total: 37.4ms	remaining: 224ms
6:	learn: 0.7273625	total: 222ms	remaining: 0us


In [26]:
# 군집별 예측모델로 예측값 concat
preds_concat = np.concatenate((preds0, preds1, preds2, preds3))
y_test_concat = pd.concat([y0_test, y1_test, y2_test, y3_test])

In [27]:
# 예측성능비교 (전체데이터셋으로 단일모델 vs 군집별 개별모델 예측)
pred_list = [preds, preds0, preds1, preds2, preds3, preds_concat]
ytest_list = [y_test, y0_test, y1_test, y2_test,y3_test, y_test_concat]

mse_list = []
for predict, actual in zip(pred_list, ytest_list):
    loss = mse(actual, predict)
    mse_list.append(loss)

print(mse_list)

[0.48519235435639757, 0.45024441906528245, 0.5135712824790845, 0.419558922971386, 0.5332942790228329, 0.46938782292722764]


In [28]:
# GMM 군집별 소속확률을 concat한 데이터셋 생성 
gmm_prob_df = pd.DataFrame(gmm_prob_results, columns=['cluster0','cluster1','cluster2','cluster3'])
xy_new = pd.concat([xy, gmm_prob_df], axis=1)

print(xy_new.shape)
xy_new.head()

(521779, 61)


Unnamed: 0,DR,PROCESS_ID,PART_ID,LOTTYPE,QTY,HOLD_FLAG,FLOORID,GRADE,COMPLETE_RATE,LOT_PURPOSE,DUE_DATE,WIPTURN,ROOM,PROCESS_GROUP,LAYER_TITLE,LAYER_GROUP,BLOCK_GROUP,EQPTYPE,MOVE_0,EQP_WORKLOAD_0,MOVE_1,EQP_WORKLOAD_1,MOVE_2,EQP_WORKLOAD_2,BATCH,Q_E,Q_P,L_E,L_P,Q_H,Q_R,Q_W,L_H,L_R,L_W,Q_2,Q_3,Q_4,Q_5,Q_7,Q_8,Q_10,L_2,L_3,L_4,L_5,L_7,L_8,L_10,WIP_WAITTIME,WIP_PRODUCT_NUM,SHIFT_TYPE,PRC_WAIT_MIN,PRC_WAIT_MAX,PRC_WAIT_MEDIAN,PRC_WAIT_STD,CYCLETIME,cluster0,cluster1,cluster2,cluster3
0,N4,BU,5P9865A02,PT,-1.702672,Y,S3,0.678814,0.338031,J,1.27278,-1.040581,PH,COATER_HSOH,L1,LG61,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.062575,1.735273,3.425849,2.386661,2.44778,0.895343,1.414618,3.472389,0.715582,2.764852,0.919482,1.423341,2.576896,1.137361,0.358854,-0.034845,-0.065732,4.58937,2.841138,3.120002,2.419251,2.512175,2.523029,-0.067493,0.76861,3.398804,WG,-0.365971,4.352058,-0.222606,1.407415,4.75359,2.120738e-17,0.0001568149,0.9998431,1.336623e-07
1,N4,BU,5P9865A02,PT,0.444506,Y,S3,-1.923327,1.156127,1,-1.377403,1.366344,PH,COATER_HSOH,D10,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.365971,4.352058,-0.222606,1.407415,5.655992,3.6106600000000005e-17,7.270499e-07,0.9999993,5.608097e-10
2,N5,LJ,FK05M2X01,PP,0.444506,Y,S3,0.678814,0.815253,1,-1.010888,0.244717,PH,COATER_HSOH,D5,LG63,PU6,PARCCO,0.720655,-0.696233,0.981781,-0.66462,0.707444,-0.684201,N,2.247707,1.729416,3.356999,2.046289,2.937364,0.206645,1.586614,3.424616,0.378586,2.445695,-0.208179,1.838736,2.375083,0.846614,0.436453,0.461473,-0.065732,-0.230689,2.769354,2.646249,2.419251,3.253439,2.893619,-0.067493,0.881273,3.138018,WS,-0.148212,0.126315,-0.187582,-0.098518,5.57973,1.0052e-18,1.073648e-08,1.0,2.104432e-16
3,N5,LJ,FK05M2X01,PP,0.670525,Y,S3,-1.272792,0.235769,1,-0.70076,0.391539,PH,COATER_HSOH,CM,LG51,PU5,PARCCO,0.985945,-0.664618,0.701941,-0.684381,0.790913,-0.684201,N,2.463695,1.550781,3.265198,1.781555,2.633157,1.144039,1.018506,3.376843,0.940246,1.807379,-0.208179,1.7728,1.966763,1.08063,2.376441,0.590516,-0.065732,-0.230689,2.410432,2.527811,2.336403,3.994703,3.140679,-0.067493,0.927776,3.138018,WO,-0.148212,0.126315,-0.187582,-0.098518,5.940171,3.310768e-26,1.0,2.195018e-15,2.294039e-16
4,N4,BU,5P9865A02,PT,0.557515,Y,S3-D,0.678814,0.781166,1,-1.222339,1.375972,PH,COATER_HSOH,FD,LG63,PU6,PARCCO,0.582931,-0.688329,0.619812,-0.677794,0.615034,-0.681567,N,2.747564,3.442551,3.678301,3.407778,3.541026,2.06868,2.873977,3.854574,1.838902,3.31198,-0.208179,2.906896,3.520256,2.818017,0.436453,0.282799,-0.065732,-0.230689,3.092383,3.830631,3.247732,3.253439,3.017149,-0.067493,0.531556,3.572662,WO,-0.365971,4.352058,-0.222606,1.407415,3.135494,1.4248739999999998e-20,8.19832e-08,0.9999999,1.21928e-10


In [15]:
X_new = xy_new.drop('CYCLETIME', axis=1)
y_new = xy_new['CYCLETIME']
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=11)

In [16]:
# 4개 모델별 예측값 만들기
preds0_soft = cat_regressor0.predict(X_test)
preds1_soft = cat_regressor1.predict(X_test)
preds2_soft = cat_regressor2.predict(X_test)
preds3_soft = cat_regressor3.predict(X_test)

In [17]:
# 모델별 예측값을 concat한 데이터셋 생성
pred_soft_df = pd.DataFrame({'preds0': preds0_soft, 'preds1': preds1_soft, 'preds2': preds2_soft, 'preds3': preds3_soft})
print(pred_soft_df.shape)
pred_soft_df.head()

(104356, 4)


Unnamed: 0,preds0,preds1,preds2,preds3
0,3.634647,3.179486,3.371683,2.813513
1,3.055784,3.162715,3.569711,3.065728
2,3.278935,4.019439,3.79901,4.269928
3,3.414664,3.910313,3.822306,3.808492
4,3.567623,3.536182,3.806793,3.918414


In [18]:
# GMM의 soft clustering 결과(소속확률)를 concat한 데이터셋 생성 
gmm_prob_df = pd.DataFrame(X_test_new, columns=['cluster0','cluster1','cluster2','cluster3'])
gmm_prob_df = gmm_prob_df.reset_index(drop=True)
print(gmm_prob_df.shape)
gmm_prob_df.head()

(104356, 4)


Unnamed: 0,cluster0,cluster1,cluster2,cluster3
0,2.453215e-31,2.970962e-17,8.627271e-87,1.0
1,0.9702732,0.02972683,7.528252e-32,1.816852e-15
2,4.107131e-15,1.0,5.054375999999999e-38,8.218238e-18
3,6.508127e-08,1.192545e-16,0.9999999,7.393596e-29
4,1.303111e-12,2.707229e-07,0.9999997,4.793361e-11


In [19]:
soft_df = pd.concat([pred_soft_df, gmm_prob_df], axis=1)
soft_df['preds_soft'] = soft_df['preds0']*soft_df['cluster0'] + soft_df['preds1']*soft_df['cluster1'] \
                          + soft_df['preds2']*soft_df['cluster2']+ soft_df['preds3']*soft_df['cluster3']
soft_df.head()

Unnamed: 0,preds0,preds1,preds2,preds3,cluster0,cluster1,cluster2,cluster3,preds_soft
0,3.634647,3.179486,3.371683,2.813513,2.453215e-31,2.970962e-17,8.627271e-87,1.0,2.813513
1,3.055784,3.162715,3.569711,3.065728,0.9702732,0.02972683,7.528252e-32,1.816852e-15,3.058963
2,3.278935,4.019439,3.79901,4.269928,4.107131e-15,1.0,5.054375999999999e-38,8.218238e-18,4.019439
3,3.414664,3.910313,3.822306,3.808492,6.508127e-08,1.192545e-16,0.9999999,7.393596e-29,3.822306
4,3.567623,3.536182,3.806793,3.918414,1.303111e-12,2.707229e-07,0.9999997,4.793361e-11,3.806793


In [22]:
preds_soft = soft_df['preds_soft']
y_test_new = y_test_new.reset_index(drop=True)
print('mse : ', mse(y_test_new, preds_soft))

mse :  0.11440071756602763


In [23]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print('mape : ', mape(y_test_new, preds_soft))

mape :  6.912017861066007
