In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import OneHotEncoder, CatBoostEncoder, MEstimateEncoder
from sklearn.model_selection import StratifiedGroupKFold, train_test_split


from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn import set_config
import os
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.model_selection import StratifiedKFold
import optuna
from sklearn.compose import ColumnTransformer
from prettytable import PrettyTable

from sklearn.compose import make_column_transformer
from sklearn.base import clone
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
import optuna
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## load data

In [2]:
data = pd.read_csv("data/data.csv")
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
raw = pd.read_csv("data/ObesityDataSet_raw_and_data_sinthetic.csv")
submission = pd.read_csv("data/sample_submission.csv")

data.shape, train.shape, test.shape, test.shape, raw.shape, submission.shape

((50001, 18), (20758, 18), (13840, 17), (13840, 17), (2111, 17), (13840, 2))

## bmi 생성

In [3]:
train['BMI'] = train['Weight'] / (train['Height']**2)
test['BMI'] = test['Weight'] / (test['Height']**2)
data['BMI'] = data['Weight'] / (data['Height']**2)
raw['BMI'] = raw['Weight'] / (raw['Height']**2)

In [4]:
train.drop("id", axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

In [5]:
categories = train.columns[(train.dtypes=="object") & (train.columns != "NObeyesdad")]
for cat in categories:
    le = LabelEncoder()
    print(cat)
    if train[cat].dtypes == "object":
        le = le.fit(train[cat])
        train[cat] = le.transform(train[cat])

Gender
family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
CALC
MTRANS


In [6]:
rcats = raw.columns[(raw.dtypes=="object") & (raw.columns != "NObeyesdad")]
for rcat in rcats:
    le = LabelEncoder()
    print(rcat)
    if raw[rcat].dtypes == "object":
        le = le.fit(raw[rcat])
        raw[rcat] = le.transform(raw[rcat])

Gender
family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
CALC
MTRANS


In [7]:
tcats = test.columns[test.dtypes=="object"]
for tcat in tcats:
    le = LabelEncoder()
    print(tcat)
    if test[tcat].dtypes == "object":
        le = le.fit(test[tcat])
        test[tcat] = le.transform(test[tcat])

Gender
family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
CALC
MTRANS


In [8]:
train.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,Overweight_Level_II,28.259565


In [9]:
raw.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,Normal_Weight,24.386526


In [10]:
test.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,1,26.899886,1.848294,120.644178,1,1,2.938616,3.0,2,0,2.825629,0,0.8554,0.0,2,3,35.315411


In [11]:
data.drop("Unnamed: 0", axis=1, inplace=True)

In [12]:
data.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,6,28.259565


In [13]:
nobeyesdad_mapping = {'Insufficient_Weight': 0, 
                      'Normal_Weight': 1, 
                      'Obesity_Type_I': 2, 
                      'Obesity_Type_II': 3, 
                      'Obesity_Type_III': 4,
                      'Overweight_Level_I': 5, 
                      'Overweight_Level_II': 6} 


train['NObeyesdad'] = train['NObeyesdad'].map(nobeyesdad_mapping)
raw['NObeyesdad'] = raw['NObeyesdad'].map(nobeyesdad_mapping)

In [14]:
train.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,6,28.259565


In [15]:
raw.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1,24.386526


## 변수 분리

In [16]:
# 타겟 변수와 피처 분리
train_X = train.drop('NObeyesdad', axis=1)
train_Y = train['NObeyesdad']
data_X = data.drop('NObeyesdad', axis=1)
data_Y = data['NObeyesdad']
raw_X = raw.drop('NObeyesdad', axis=1)
raw_Y = raw['NObeyesdad']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=42)

In [18]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# 모델 정의
rf_shallow = RandomForestClassifier(max_depth=3, random_state=42)
rf_deep = RandomForestClassifier(max_depth=None, random_state=42) # 깊이가 깊은 버전은 max_depth를 기본값(None)으로 설정
xgb = XGBClassifier(max_depth=10, random_state=42)  # 깊은 트리
lgbm = LGBMClassifier(random_state=42)  # 중간 깊이의 트리
catboost = CatBoostClassifier(max_depth=6, verbose=0, random_state=42)  # 얕은 트리


# 스태킹 모델의 기본 모델
estimators = [
    ('rf_shallow', rf_shallow),
    ('rf_deep', rf_deep),
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('catboost', catboost)
]

# 스태킹을 위한 최종 모델 (메타 모델)
final_estimator = LogisticRegression(random_state=42, verbose=0)

In [19]:
# 스태킹 분류기 정의 및 학습
stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)
stacking_classifier.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002829 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2318
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 17
[LightGBM] [Info] Start training from score -1.953690
[LightGBM] [Info] Start training from score -1.939680
[LightGBM] [Info] Start training from score -1.952633
[LightGBM] [Info] Start training from score -1.940027
[LightGBM] [Info] Start training from score -1.941072
[LightGBM] [Info] Start training from score -1.947887
[LightGBM] [Info] Start training from score -1.946485
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2318
[LightGBM

In [20]:
# 테스트 데이터에 대한 성능 평가
from sklearn.metrics import accuracy_score

y_pred = stacking_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.6f}')

Accuracy: 0.939606


In [21]:
pred_stack = stacking_classifier.predict(train_X)
accuracy_stack = accuracy_score(train_Y, pred_stack)
print("스태킹 모델 학습 데이터 정확도(train):", accuracy_stack)

pred_cla = stacking_classifier.predict(raw_X)
accuracy_cla = accuracy_score(raw_Y, pred_cla)
print("스태킹 모델 학습 데이터 정확도(raw):", accuracy_cla)

스태킹 모델 학습 데이터 정확도(train): 0.9820310241834473
스태킹 모델 학습 데이터 정확도(raw): 0.9981051634296542
