In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.combine import SMOTEENN

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score

In [None]:
# Let pandas render up to 150 columns and 150 rows before it truncates a displayed frame.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150

In [None]:
# Three-colour palette registered as the seaborn default for the plots that follow.
custom_palette = [
    "#33A4FF",
    "#FFB733",
    "#9FF767",
]
sns.set_palette(palette=custom_palette)

In [None]:
# Load the sample data set.
df = pd.read_csv("/content/drive/MyDrive/data_mining/sample_for_sme.csv")

# NOTE(review): the rendered preview of this frame lists an "Unnamed: 0" column, which
# looks like a row index written out by an earlier to_csv call — confirm against the CSV.
# The preview also shows the first rows labelled 0.0 and the last rows labelled 2.0, so a
# row counter could act as a proxy for the target if it were used as a feature.
# The column is therefore dropped when present; errors="ignore" makes this a no-op
# when the CSV has no such column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

Unnamed: 0,sex,age,sight_left,sight_right,DBP,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN,bmi,diabete
0,0.000000,3.000000,1.000000,1.000000,70.000000,153.000000,75.000000,65.000000,65.000000,11.800000,1.000000,0.500000,21.000000,14.000000,17.000000,1.000000,1.000000,16.649324,0.0
1,1.000000,2.000000,1.200000,1.200000,63.000000,173.000000,33.000000,107.000000,162.000000,16.800000,1.000000,0.900000,18.000000,13.000000,20.000000,3.000000,1.000000,20.202020,0.0
2,0.000000,2.000000,1.000000,1.200000,52.000000,192.000000,52.000000,125.000000,76.000000,13.600000,1.000000,0.800000,17.000000,12.000000,10.000000,1.000000,1.000000,19.531250,0.0
3,0.000000,1.000000,0.100000,0.100000,53.000000,154.000000,64.000000,77.000000,65.000000,14.500000,1.000000,0.700000,14.000000,11.000000,18.000000,2.000000,0.000000,19.531250,0.0
4,1.000000,2.000000,1.200000,0.900000,70.000000,173.000000,49.000000,106.000000,88.000000,16.300000,1.000000,0.800000,21.000000,28.000000,20.000000,3.000000,1.000000,29.387755,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87818,0.653541,2.653541,1.326770,1.292124,79.299138,138.157512,43.811053,50.425215,219.645598,15.472435,1.000000,0.834646,38.456651,44.803110,48.267703,2.307081,1.000000,28.544855,2.0
87819,1.000000,3.000000,1.075119,1.075119,87.399044,203.403822,45.352150,135.600956,107.155017,17.834259,1.000000,0.815024,52.201911,77.253583,139.854539,1.150239,1.000000,26.018666,2.0
87820,1.000000,3.000000,0.914258,1.157225,81.140674,243.861237,51.427753,133.005731,296.138763,13.886315,3.714832,0.628517,19.998090,26.285168,50.000000,3.000000,1.000000,24.809210,2.0
87821,0.000000,2.652235,0.965224,0.795671,63.477648,154.000000,51.956705,85.695530,79.000000,13.569553,1.000000,0.800000,19.261176,13.652235,9.956705,1.000000,0.347765,23.437500,2.0


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="diabete"),
    df["diabete"],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline model on the same rows it was fitted on.
y_train_pred = knn_classifier.predict(X_train)

# For single-label classification, micro-averaged precision, recall and F1 all collapse
# to plain accuracy, so the previous 'micro' setting printed the same number four times.
# Macro averaging scores every class separately and weights the classes equally, which
# keeps a poorly predicted class from being hidden by the others.
accuracy = accuracy_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred, average='macro')
f1 = f1_score(y_train, y_train_pred, average='macro')
precision = precision_score(y_train, y_train_pred, average='macro')

print(f'Accuracy: {accuracy:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')

In [None]:
# Score the baseline model on the held-out test split.
y_pred = knn_classifier.predict(X_test)

# For single-label classification, micro-averaged precision, recall and F1 all collapse
# to plain accuracy, so the previous 'micro' setting printed the same number four times.
# Macro averaging scores every class separately and weights the classes equally, which
# keeps a poorly predicted class from being hidden by the others.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')

print(f'Accuracy: {accuracy:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Collect every class that occurs in the true or the predicted labels, in sorted order.
# The matrix has one row/column per class, so the tick labels must come from the data:
# a hard-coded two-item list mislabels the axes (or makes matplotlib raise on the
# label-count mismatch) as soon as the target has more than two classes.
class_labels = sorted(set(y_test) | set(y_pred))

# Compute the confusion matrix with an explicit label order so rows/columns and
# tick labels are guaranteed to line up.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Render the confusion matrix as an annotated seaborn heatmap.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data set.
df = pd.read_csv("/content/drive/MyDrive/data_mining/sample_for_sme.csv")

# Separate the features from the target.
# NOTE(review): the preview of this CSV earlier in the notebook lists an "Unnamed: 0"
# column that looks like a saved row index — confirm. It is excluded from the features
# when present; errors="ignore" makes the extra drop a no-op otherwise.
X = df.drop("diabete", axis=1).drop(columns=["Unnamed: 0"], errors="ignore")
y = df["diabete"]

# Split BEFORE scaling. Fitting the scaler on the full data set lets the mean/variance
# of the held-out rows leak into the training features and inflates the test scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise: learn the statistics on the training split only, then apply them to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training-split scale; kept only so that any code which
# still reads X_scaled keeps working.
X_scaled = scaler.transform(X)

# Define the KNN model.
knn_classifier = KNeighborsClassifier()

# Candidate hyper-parameters for the grid search: k = 1, 3, 5, 7, 9.
param_grid = {'n_neighbors': range(1, 10, 2)}

# Run the grid search with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter.
best_k = grid_search.best_params_['n_neighbors']
print(f'Best Number of Neighbors: {best_k}')

# Retrain a model with the best hyper-parameter.
best_knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
best_knn_classifier.fit(X_train, y_train)

# Predict on the training data and compute accuracy.
y_train_pred = best_knn_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

# Predict on the test data and compute accuracy.
y_test_pred = best_knn_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

# Precision, recall and F1 on the training data.
# Macro averaging is used because micro-averaged precision/recall/F1 are identical to
# accuracy for single-label classification and would only repeat the number above.
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')

print(f'Training Precision: {train_precision:.2f}')
print(f'Training Recall: {train_recall:.2f}')
print(f'Training F1 Score: {train_f1:.2f}')

# Precision, recall and F1 on the test data (macro-averaged, see above).
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f'Test Precision: {test_precision:.2f}')
print(f'Test Recall: {test_recall:.2f}')
print(f'Test F1 Score: {test_f1:.2f}')


아래 코드에서 SMOTETomek 적용 시 이상적인 confusion matrix를 확인함


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data set.
df = pd.read_csv("/content/drive/MyDrive/data_mining/sample_for_sme.csv")

# Separate the features from the target.
# NOTE(review): the preview of this CSV earlier in the notebook lists an "Unnamed: 0"
# column that looks like a saved row index — confirm. It is excluded from the features
# when present; errors="ignore" makes the extra drop a no-op otherwise.
X = df.drop("diabete", axis=1).drop(columns=["Unnamed: 0"], errors="ignore")
y = df["diabete"]

# Split BEFORE scaling. Fitting the scaler on the full data set lets the mean/variance
# of the held-out rows leak into the training features and inflates the test scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise: learn the statistics on the training split only, then apply them to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training-split scale; kept only so that any code which
# still reads X_scaled keeps working.
X_scaled = scaler.transform(X)

# Define the KNN model.
knn_classifier = KNeighborsClassifier()

# Candidate hyper-parameters for the grid search.
param_grid = {'n_neighbors': range(1, 10, 2)}  # odd values from 1 to 9 (step 2)

# Run the grid search with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter.
best_k = grid_search.best_params_['n_neighbors']
print(f'Best Number of Neighbors: {best_k}')

# Retrain a model with the best hyper-parameter.
best_knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
best_knn_classifier.fit(X_train, y_train)

# Predict on the training data and compute accuracy.
y_train_pred = best_knn_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

# Predict on the test data and compute accuracy.
y_test_pred = best_knn_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

# Precision, recall and F1 on the training data.
# Macro averaging is used because micro-averaged precision/recall/F1 are identical to
# accuracy for single-label classification and would only repeat the number above.
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')

print(f'Training Precision: {train_precision:.2f}')
print(f'Training Recall: {train_recall:.2f}')
print(f'Training F1 Score: {train_f1:.2f}')

# Precision, recall and F1 on the test data (macro-averaged, see above).
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f'Test Precision: {test_precision:.2f}')
print(f'Test Recall: {test_recall:.2f}')
print(f'Test F1 Score: {test_f1:.2f}')

# Confusion-matrix visualisation
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Labels used for both the row index ("Actual") and the
            column index ("Predicted") of the matrix; there must be one per
            row/column of the computed matrix.
    """
    matrix_frame = pd.DataFrame(
        confusion_matrix(y_true, y_pred),
        index=class_names,
        columns=class_names,
    )
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix_frame, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Plot the test-set confusion matrix, labelling the axes with the sorted target values.
class_names = sorted(pd.unique(df["diabete"]))
plot_confusion_matrix(y_true=y_test, y_pred=y_test_pred, class_names=class_names)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data set.
df = pd.read_csv("/content/drive/MyDrive/data_mining/sample_for_sme.csv")

# Separate the features from the target.
# NOTE(review): the preview of this CSV earlier in the notebook lists an "Unnamed: 0"
# column that looks like a saved row index — confirm. It is excluded from the features
# when present; errors="ignore" makes the extra drop a no-op otherwise.
X = df.drop("diabete", axis=1).drop(columns=["Unnamed: 0"], errors="ignore")
y = df["diabete"]

# Split BEFORE any fitted preprocessing. Both the scaler and the feature selector learn
# from the data they are fitted on (the selector also reads the labels), so fitting them
# on the full data set leaks held-out information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Keep the 5 features with the highest ANOVA F-score, chosen on the training split only.
k_best = SelectKBest(f_classif, k=5)
X_train = k_best.fit_transform(X_train_scaled, y_train)
X_test = k_best.transform(X_test_scaled)

# Full-data versions on the training-split scale/selection; kept only so that any code
# which still reads X_scaled or X_selected keeps working.
X_scaled = scaler.transform(X)
X_selected = k_best.transform(X_scaled)

# Define the KNN model.
knn_classifier = KNeighborsClassifier()

# Candidate hyper-parameters for the grid search.
param_grid = {'n_neighbors': range(1, 10, 2)}  # odd values from 1 to 9 (step 2)

# Run the grid search with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter.
best_k = grid_search.best_params_['n_neighbors']
print(f'Best Number of Neighbors: {best_k}')

# Retrain a model with the best hyper-parameter.
best_knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
best_knn_classifier.fit(X_train, y_train)

# Predict on the training data and compute accuracy.
y_train_pred = best_knn_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

# Predict on the test data and compute accuracy.
y_test_pred = best_knn_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

# Precision, recall and F1 on the training data.
# Macro averaging is used because micro-averaged precision/recall/F1 are identical to
# accuracy for single-label classification and would only repeat the number above.
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')

print(f'Training Precision: {train_precision:.2f}')
print(f'Training Recall: {train_recall:.2f}')
print(f'Training F1 Score: {train_f1:.2f}')

# Precision, recall and F1 on the test data (macro-averaged, see above).
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f'Test Precision: {test_precision:.2f}')
print(f'Test Recall: {test_recall:.2f}')
print(f'Test F1 Score: {test_f1:.2f}')
