In [45]:
# !pip install imblearn

In [114]:
# pip install xgboost lightgbm

# Import

In [139]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc

import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
# 윈도우 기본 한글 폰트 경로
from matplotlib import font_manager, rc
font_path = 'C:\\Windows\\Fonts\\malgun.ttf' 
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)


# ML

새로운 데이터 라벨 생성

In [3]:
# 1차 EDA 불러오기
df_diabetes = pd.read_csv('./data/df_diabetes.csv')

In [None]:
# diabetes_label 초기화 (0=비당뇨)
df_diabetes['diabetes_label'] = 0

# mask: 두 조건 중 하나라도 True면 당뇨병 환자(1)로 표시
mask = (
    ((df_diabetes['DE1_dg'] == 1) & (df_diabetes['HE_DM_HbA1c'] == 3)) |  # 두 컬럼 모두 당뇨
    ((df_diabetes['HE_DM_HbA1c'] == 3) & (df_diabetes['DE1_dg'] != 1))    # HbA1c 당뇨지만 의사 진단은 아님
)

# 당뇨병 환자 라벨링
df_diabetes.loc[mask, 'diabetes_label'] = 1

# 당뇨병 환자만 추출 (필요 시) diabetes_label == 1인 행만 필터링하여 변수명으로 저장
diabetes_label = df_diabetes[df_diabetes['diabetes_label'] == 1]

# 당뇨병 라벨 분포 확인
print(df_diabetes['diabetes_label'].value_counts())

diabetes_label
0    6069
1     860
Name: count, dtype: int64 



### SMOTE 기법 (로지스틱회귀 + SMOTE)

In [124]:
# 1. 입력 변수와 타겟 정의
features = [
    'DE1_dg', 'HE_DM_HbA1c', 'HE_glu', 'HE_HbA1c', 'HE_hsCRP',
    'HE_chol', 'HE_BUN', 'HE_TG', 'HE_HP', 'HE_BMI',
]

# 결측치 제거
df_model = df_diabetes[features + ['diabetes_label']].dropna()

X = df_model[features]
y = df_model['diabetes_label']

# 학습/테스트 데이터 분리 (층화 샘플링)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

SMOTE 오버샘플링 적용

In [126]:
smote = SMOTE(random_state=13)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)



모델학습(로지스틱 회귀 예시)

In [127]:
model = LogisticRegression(random_state=42)
model.fit(X_train_smote, y_train_smote)

예측 및 평가

In [128]:
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

In [129]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

Confusion Matrix:
[[769   0]
 [  0 145]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       769
           1       1.00      1.00      1.00       145

    accuracy                           1.00       914
   macro avg       1.00      1.00      1.00       914
weighted avg       1.00      1.00      1.00       914

ROC AUC Score: 1.0000


SMOTE + 스케일링 + 로지스틱 회귀를 5-폴드 교차검증 하는 예시 코드

In [110]:
# 피처와 라벨 (이미 준비된 상태라 가정)
X = df_model[features]
y = df_model['diabetes_label']

# Stratified K-Fold 설정
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

precisions, recalls, f1s, aucs = [], [], [], []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 파이프라인: SMOTE -> 스케일링 -> 로지스틱회귀
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=42))
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_proba))

print(f"Precision: {np.mean(precisions):.3f} ± {np.std(precisions):.3f}")
print(f"Recall: {np.mean(recalls):.3f} ± {np.std(recalls):.3f}")
print(f"F1-score: {np.mean(f1s):.3f} ± {np.std(f1s):.3f}")
print(f"ROC AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")

Precision: 1.000 ± 0.000
Recall: 1.000 ± 0.000
F1-score: 1.000 ± 0.000
ROC AUC: 1.000 ± 0.000




모든 폴드에서 Precision, Recall, F1, ROC AUC가 1.000으로 나왔다는 건, 모델이 모든 교차검증 단계에서 완벽하게 분류했다는 뜻<br>
데이터가 너무 적거나 단순해서 모델이 쉽게 분류한 것 같음

### SMOTE 기법 (랜덤포레스트 + SMOTE)

In [111]:
# 2. train/test 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. 결측치 평균값으로 채우기
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 4. SMOTE 적용 (훈련셋만)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_imputed, y_train)



In [112]:
# 1. 모델 학습
model_random = RandomForestClassifier(random_state=42)
model_random.fit(X_resampled, y_resampled)

# 2. 테스트셋 예측
y_pred = model_random.predict(X_test_imputed)

# 3. 성능 평가
print("정확도 (Accuracy):", accuracy_score(y_test, y_pred))
print("\n 분류 리포트:\n", classification_report(y_test, y_pred, target_names=['비당뇨(0)', '당뇨(1)']))
print("\n 혼동 행렬:\n", confusion_matrix(y_test, y_pred))

# [[1225   28]   → 28명은 당뇨 아닌데 당뇨라고 예측 (위양성)
#  [  4  129]]   → 4명은 실제 당뇨인데 놓쳤음 (위음성)

정확도 (Accuracy): 1.0

 분류 리포트:
               precision    recall  f1-score   support

      비당뇨(0)       1.00      1.00      1.00       769
       당뇨(1)       1.00      1.00      1.00       145

    accuracy                           1.00       914
   macro avg       1.00      1.00      1.00       914
weighted avg       1.00      1.00      1.00       914


 혼동 행렬:
 [[769   0]
 [  0 145]]


### 두 모델의 ROC Curve 비교