In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the competition's training set, test set and sample-submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        '/kaggle/input/playground-series-s4e2/sample_submission.csv',
    )
)

## BMI 생성

In [4]:
# Add a BMI column (Weight divided by Height squared) to both frames.
# NOTE(review): presumably Weight is in kg and Height in metres - confirm.
for frame in (train, test):
    frame['BMI'] = frame['Weight'] / (frame['Height'] ** 2)

# Show the first row to confirm the new column is present.
train.head(1)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II,28.259565


## 순서형 변수 인코딩

In [5]:
# Remove the "id" column from both the train and test DataFrames.
# The result is rebound (instead of dropping in place) and errors="ignore" is
# passed so that re-running this cell is a no-op rather than a KeyError for
# the already-removed column.
train = train.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

In [6]:
# Ordinal encodings: each dict assigns an integer code to a category label.
# pandas' Series.map turns any label that is missing from the dict into NaN,
# so every label that can occur in train or test needs an entry here.

# Frequency scale for the CAEC column.
caec_mapping = {'no': 0,
                'Sometimes': 1,
                'Frequently': 2,
                'Always': 3}

# Frequency scale for the CALC column. 'Always' is included (same rank as in
# caec_mapping) so that such a value is encoded as 3 instead of becoming NaN.
# NOTE(review): the test split reportedly contains CALC == 'Always' although
# the train split does not - confirm with test['CALC'].unique().
calc_mapping = {'no': 0,
                'Sometimes': 1,
                'Frequently': 2,
                'Always': 3}

# Codes for the MTRANS column.
# NOTE(review): the ranking looks like "more physical activity = higher code";
# confirm that this ordering is intended.
mtrans_mapping = {'Walking': 4,
                  'Bike': 3,
                  'Public_Transportation': 2,
                  'Motorbike': 1,
                  'Automobile': 0}

# Codes for the target column NObeyesdad (0 = Insufficient_Weight ... 6 = Obesity_Type_III).
nobeyesdad_mapping = {'Insufficient_Weight': 0,
                      'Normal_Weight': 1,
                      'Overweight_Level_I': 2,
                      'Overweight_Level_II': 3,
                      'Obesity_Type_I': 4,
                      'Obesity_Type_II': 5,
                      'Obesity_Type_III': 6}

# Binary code for the Gender column.
gender_mapping = {'Female': 0, 'Male': 1}

# Feature columns present in both frames, paired with the mapping that encodes them.
shared_column_mappings = {
    'CAEC': caec_mapping,
    'CALC': calc_mapping,
    'MTRANS': mtrans_mapping,
    'Gender': gender_mapping,
}
for column_name, code_by_label in shared_column_mappings.items():
    train[column_name] = train[column_name].map(code_by_label)
    test[column_name] = test[column_name].map(code_by_label)

# The target column is encoded on the train frame only.
train['NObeyesdad'] = train['NObeyesdad'].map(nobeyesdad_mapping)

## 명목형 변수 인코딩

In [7]:
# Nominal columns are one-hot encoded; every other non-target column of
# `train` is treated as numeric and standardised.
categorical_features = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
non_numeric_columns = ['NObeyesdad'] + categorical_features
numerical_features = train.columns.drop(non_numeric_columns)

# Preprocessor: scale the numeric columns, one-hot encode the nominal ones.
numeric_step = ('num', StandardScaler(), numerical_features)
nominal_step = ('cat', OneHotEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, nominal_step])

# Hyper-parameters handed to the XGBoost classifier.
xgb_params = {
    'subsample': 0.7,
    'n_estimators': 900,
    'max_depth': 4,
    'learning_rate': 0.03,
    'colsample_bytree': 0.5,
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
}

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(**xgb_params)),
])

In [8]:
# Separate the target column from the feature columns.
target_column = 'NObeyesdad'
y = train[target_column]
X = train.drop(columns=target_column)

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((16606, 17), (4152, 17), (16606,), (4152,))

In [9]:
# Fit a label encoder on the training target, then encode both target splits.
# The target was already mapped to integer codes via nobeyesdad_mapping, so
# this step re-indexes those codes to the encoder's own class ids.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(target_split) for target_split in (y_train, y_test)
)

In [10]:
# Train the preprocessing + classifier pipeline on the training split.
pipeline.fit(X=X_train, y=y_train_encoded)

In [11]:
# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are repeatable.
cv = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

In [12]:
# Cross-validate the pipeline on the training split, scoring each fold by accuracy.
scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train_encoded,
    scoring='accuracy',
    cv=cv,
)

In [13]:
# Report the per-fold accuracies, then their mean.
mean_accuracy = scores.mean()
print("교차 검증 정확도:", scores)
print("평균 정확도:", mean_accuracy)

교차 검증 정확도: [0.90698374 0.90213791 0.91177356 0.91117133 0.90876242]
평균 정확도: 0.9081657944146503


In [14]:
# Predict on the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Recall, precision and F1 are macro-averaged; accuracy needs no averaging.
macro_average = {'average': 'macro'}
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, **macro_average)
recall = recall_score(y_test_encoded, y_pred, **macro_average)
f1 = f1_score(y_test_encoded, y_pred, **macro_average)

# Print the four scores: accuracy, recall, precision, F1.
print(f'정확도: {accuracy}')
print(f'재현율: {recall}')
print(f'정밀도: {precision}')
print(f'F1 점수: {f1}')

정확도: 0.9075144508670521
재현율: 0.8972691524116877
정밀도: 0.8979367720980559
F1 점수: 0.8975439667399768


In [15]:
# Reverse lookup for the target: integer code -> class name.
inverse_nobeyesdad_mapping = dict(
    zip(nobeyesdad_mapping.values(), nobeyesdad_mapping.keys())
)
inverse_nobeyesdad_mapping

{0: 'Insufficient_Weight',
 1: 'Normal_Weight',
 2: 'Overweight_Level_I',
 3: 'Overweight_Level_II',
 4: 'Obesity_Type_I',
 5: 'Obesity_Type_II',
 6: 'Obesity_Type_III'}

In [16]:
# Predict encoded class ids for the competition test frame.
preds = pipeline.predict(X=test)

In [17]:
# Convert the predicted numeric labels back to string class names.
# The classifier was trained on label_encoder's output, so the predictions are
# first passed through label_encoder.inverse_transform to recover the
# nobeyesdad_mapping codes, and only then looked up in the inverse mapping.
# Looking predictions up directly is only correct while the encoder happens to
# be the identity (i.e. every code 0..6 is present in y_train).
decoded_codes = label_encoder.inverse_transform(preds)
test_preds_labels = [inverse_nobeyesdad_mapping[code] for code in decoded_codes]

In [18]:
# Fill the submission template with the predicted class names and write it to disk.
submission = submission.assign(NObeyesdad=test_preds_labels)
submission.to_csv("submission_ver_3.csv", index=False)