In [None]:
from google.colab import drive

# Mount Google Drive so the dataset under MyDrive is reachable from this runtime.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the preprocessed dataset from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/data_anay/Dataset_Final_Preprocessed.csv', encoding='UTF-8')

# Target: the column at position 6 (the labelling cell reads it as 'cad_icd_code').
# Take an explicit copy so later column assignments on `y` do not operate on a
# slice of `data` (which triggers pandas' SettingWithCopyWarning).
y = data.iloc[:, [6]].copy()

# Features: every column after the first eight.
X = data.drop(columns=data.columns[:8])

# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split, and nothing visible in this notebook consumes `X_scaled`
# (the models are trained on the unscaled `X`). If scaling is ever used, fit it
# on the training split only to avoid leaking test statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Preview the first rows of the feature matrix instead of dumping the whole frame.
X.head()

In [None]:
# Build the binary target: 1 when a CAD ICD code is recorded, 0 when it is missing.
# The guard makes the cell idempotent: once the raw column has been replaced,
# re-running the cell leaves `y` untouched instead of raising KeyError.
if 'cad_icd_code' in y.columns:
    selected_column_cad = y['cad_icd_code']
    binary_labels_cad = selected_column_cad.notna().astype(int)
    # Create a fresh single-column frame rather than assigning into a slice of
    # `data` and dropping the original column afterwards.
    y = binary_labels_cad.to_frame(name='binary_cad_icd_code')

In [None]:
# Show the class balance of the target rather than dumping every label row.
y.value_counts()

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

**XGBoost (baseline with `scale_pos_weight`)**

In [None]:
import xgboost as xgb

# Derive the positive-class weight from the training labels. XGBoost's
# documentation suggests sum(negative) / sum(positive) for imbalanced data; the
# previous hard-coded 0.1 shrank the positive class's weight instead.
train_labels = train_y.to_numpy().ravel()
n_positive = int((train_labels == 1).sum())
n_negative = int((train_labels == 0).sum())
# Fall back to a neutral weight if the training split has no positives.
pos_weight = n_negative / n_positive if n_positive else 1.0

# Initialise the XGBoost model.
xgb_model = xgb.XGBClassifier(random_state=0, n_estimators=400, scale_pos_weight=pos_weight)

# Fit on the training split.
xgb_model.fit(train_X, train_y)

# Hard class predictions on the held-out split.
y_pred = xgb_model.predict(test_X)

accuracy = accuracy_score(test_y, y_pred)
print("Accuracy:", accuracy)

# Probability of the positive class, used for the threshold-free ROC AUC.
y_pred_proba_xgb = xgb_model.predict_proba(test_X)[:, 1]
roc_auc = roc_auc_score(test_y, y_pred_proba_xgb)
print("AUC ROC:", roc_auc)

Accuracy: 0.8479381443298969
AUC ROC: 0.7425443223443224


XGBoost with manual undersampling (`sklearn.utils.resample`)

In [None]:
import numpy as np
from sklearn.utils import resample
from xgboost import XGBClassifier

In [None]:
# Split the training rows into majority (label 0) and minority (label 1) classes.
# `train_y` may be a one-column DataFrame; indexing `train_X` with a boolean
# DataFrame applies an element-wise `where` (yielding NaN-filled rows) instead of
# filtering rows, so flatten the labels to a 1-D mask first.
train_labels = np.ravel(train_y)
majority_class = train_X[train_labels == 0]
minority_class = train_X[train_labels == 1]

# Undersample the majority class down to the size of the minority class.
# Sampling without replacement keeps the retained majority rows distinct.
n_keep = min(len(majority_class), len(minority_class))
majority_class_downsampled = resample(majority_class, replace=False, n_samples=n_keep, random_state=0)

# Combine the downsampled majority rows with all minority rows; the labels are
# built in the same order (zeros first, then ones).
undersampled_X = pd.concat([majority_class_downsampled, minority_class])
undersampled_y = np.concatenate([
    np.zeros(len(majority_class_downsampled), dtype=int),
    np.ones(len(minority_class), dtype=int),
])

In [None]:
# Initialise an XGBoost classifier with a fixed seed and train it on the
# manually undersampled data.
xgb_params = {"random_state": 0}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(undersampled_X, undersampled_y)

In [None]:
# Hard class predictions on the held-out split.
predictions = xgb_model.predict(test_X)

accuracy = accuracy_score(test_y, predictions)
print("Accuracy:", accuracy)

# Probability of the positive class for the ROC AUC.
probs = xgb_model.predict_proba(test_X)[:, 1]
# Stored as `roc_auc` (not `auc`) so the `auc` function imported from
# sklearn.metrics is not overwritten by a float.
roc_auc = roc_auc_score(test_y, probs)
print("AUC:", roc_auc)

XGBoost with random undersampling (imbalanced-learn `RandomUnderSampler`, RUS)

In [None]:
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by random undersampling (seeded for reproducibility).
rus = RandomUnderSampler(random_state=0)
resampled_pair = rus.fit_resample(train_X, train_y)
train_X_resampled, train_y_resampled = resampled_pair

# Build the classifier and train it on the balanced data.
xgb_model = xgb.XGBClassifier(random_state=0)
xgb_model.fit(train_X_resampled, train_y_resampled)

# Score the untouched test split: hard labels for accuracy ...
y_pred = xgb_model.predict(test_X)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy:", accuracy)

# ... and positive-class probabilities (second column) for ROC AUC.
test_proba = xgb_model.predict_proba(test_X)
y_pred_proba_xgb = test_proba[:, 1]
roc_auc = roc_auc_score(test_y, y_pred_proba_xgb)
print("AUC ROC:", roc_auc)

Accuracy: 0.695360824742268
AUC ROC: 0.7652959706959708


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC operating points for the most recently computed positive-class scores,
# and the area under that curve.
fpr, tpr, thresholds = roc_curve(test_y, y_pred_proba_xgb)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve against the chance diagonal on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()