In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import warnings
# Install a global "ignore" filter: every warning category is silenced for the
# rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings that may point at real problems — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [69]:
# Read the competition files in one pass: the labelled training set, the
# unlabelled test set, and the sample submission used as the output template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        "/kaggle/input/playground-series-s4e2/sample_submission.csv",
    )
)

In [41]:
# Transformer class used for feature engineering
class CustomFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two engineered columns.

    Added columns:
        BMI      -- ``Weight / Height ** 2``
                    (assumes Weight is in kg and Height in metres so that this
                    is a conventional body-mass index — TODO confirm units)
        AgeGroup -- ``Age`` floored to the start of its 10-year bucket
                    (e.g. 27 -> 20).

    The input must be a pandas DataFrame containing ``Weight``, ``Height``
    and ``Age`` columns.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new DataFrame with ``BMI`` and ``AgeGroup`` added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with ``Weight``, ``Height`` and ``Age`` columns.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` with the two engineered columns appended (or
            overwritten if they already exist).
        """
        # assign() builds a new frame, so the caller's DataFrame (the training
        # split, a CV fold, or the test set) is never modified as a side
        # effect of fitting or predicting.
        return X.assign(
            BMI=X['Weight'] / (X['Height'] ** 2),
            AgeGroup=(X['Age'] // 10) * 10,
        )
    
# Identify the numeric and the non-numeric (categorical) columns of the training frame
num_cols = train.select_dtypes(include=np.number).columns.tolist()
cat_cols = train.select_dtypes(exclude=np.number).columns.tolist()

In [42]:
# Columns routed to each preprocessing branch. 'BMI' and 'AgeGroup' do not
# exist in the raw data; they are produced by the CustomFeatures step.
num_cols_updated = [
    'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
    'BMI', 'AgeGroup',
]
cat_cols_updated = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS',
]

# Numeric branch: standardise each feature.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode, ignoring categories unseen during fit.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Preprocessor: apply each branch to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols_updated),
    ('cat', categorical_transformer, cat_cols_updated),
])

# Full model: feature engineering -> preprocessing -> XGBoost classifier.
pipeline = Pipeline(steps=[
    ('features', CustomFeatures()),
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42)),
])

In [45]:
# Separate the target column from the predictors.
y = train['NObeyesdad']
X = train.drop(columns='NObeyesdad')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Map the string class labels to integer codes. The encoder is fitted on the
# training labels only and then reused for the hold-out labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [46]:
# Fit the whole pipeline (feature engineering, preprocessing, XGBoost) on the
# training split, using the integer-encoded labels as the target.
pipeline.fit(X_train, y_train_encoded)

In [49]:
# Predict integer class codes for the hold-out split, then map them back to
# the original string labels.
y_pred_encoded = pipeline.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Report hold-out accuracy against the original string labels.
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {baseline_accuracy}')

Accuracy: 0.9089595375722543


In [51]:
# 5-fold cross-validation of the untuned pipeline on the training split
scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train_encoded,
    scoring='accuracy',
    cv=5,
)
print(f'Cross-validation accuracy: {scores.mean()}')

Cross-validation accuracy: 0.9045528781597267


In [54]:
# Hyper-parameter grid for the XGBoost step; keys follow the pipeline's
# ``<step name>__<parameter>`` convention.
param_grid = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__n_estimators': [100, 200],
}

# Exhaustive 5-fold search over all 12 combinations, scored by accuracy and
# run in parallel on every available core.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train_encoded)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")

Best parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
Best accuracy: 0.9062992675548536


In [57]:
# Predict the hold-out split with the tuned model from the grid search and
# decode the integer codes back to string labels.
y_pred_encoded = grid_search.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Hold-out accuracy of the tuned model
tuned_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {tuned_accuracy}')

Test Accuracy: 0.9060693641618497


In [59]:
# Compute the evaluation metrics on the encoded hold-out labels. Recall,
# precision and F1 are macro-averaged, i.e. every class counts equally.
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
recall, precision, f1 = (
    metric(y_test_encoded, y_pred_encoded, average='macro')
    for metric in (recall_score, precision_score, f1_score)
)

# Print the results, one metric per line, rounded to four decimals.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Recall: {recall:.4f}',
    f'Precision: {precision:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

Accuracy: 0.9061
Recall: 0.8959
Precision: 0.8964
F1 Score: 0.8961


In [60]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
import time

def convert_seconds_to_hms(seconds):
    """Split a duration in seconds into hours, minutes and leftover seconds.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds.

    Returns
    -------
    tuple
        ``(hours, minutes, seconds)``. ``hours`` and ``minutes`` are always
        ``int`` (whole units), even for float input; the last element keeps
        the input's type so fractional seconds are preserved.
    """
    # Floor division on a float yields a float (e.g. 0.0), which then prints
    # as "0.0"; hours and minutes are whole counts, so cast them to int.
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    return hours, minutes, seconds % 60

# Standalone LightGBM timing demo on the iris toy dataset.
# Every object here carries an ``iris_`` prefix so that this cell does not
# overwrite the obesity workflow's X, y, X_train, y_train or pipeline, which
# would make earlier cells fail or silently use the wrong data when re-run.
iris_data = load_iris()
iris_X = iris_data.data
iris_y = iris_data.target

iris_X_train, iris_X_val, iris_y_train, iris_y_val = train_test_split(
    iris_X, iris_y, test_size=0.2, random_state=42
)

iris_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', lgb.LGBMClassifier(random_state=42, verbose=-1))
])

# Measure the training time. perf_counter() is a monotonic high-resolution
# clock, so the interval is not affected by system clock adjustments.
start_time = time.perf_counter()
iris_pipeline.fit(iris_X_train, iris_y_train)
end_time = time.perf_counter()
elapsed = end_time - start_time
hours, minutes, seconds = convert_seconds_to_hms(elapsed)

# Final report: raw elapsed seconds, then the hours/minutes/seconds breakdown
print(f"학습시간 : {elapsed} 초")
print(f"{hours} 시간, {minutes} 분, {seconds} 초")

학습시간 : 0.15890264511108398 초
0.0 시간, 0.0 분, 0.15890264511108398 초


In [68]:
# Predict encoded classes for the competition test set with the tuned model,
# then decode them back to the original string labels.
preds = grid_search.predict(test)
test_preds = label_encoder.inverse_transform(preds)
# Bare expression: displays the decoded predictions as the cell output.
test_preds

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [70]:
# Peek at the first id in the sample submission.
submission['id'].head(1)

0    20758
Name: id, dtype: int64

In [71]:
# Fill the sample submission's target column with the decoded predictions and
# write it to the working directory without the index column.
# NOTE(review): this assumes the rows of `test` and `submission` are in the
# same order — confirm (e.g. by comparing their `id` columns).
submission['NObeyesdad'] = test_preds
submission.to_csv("submission_ver_1.csv", index=False)