In [6]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

df = pd.read_csv('C:/ML_project_predict_heart_disease/data/heart_2020_final.csv')
X = df.drop(columns = 'HeartDisease').values
y = df['HeartDisease']

smote = SMOTE(random_state = 42)
X_smote, y_smote = smote.fit_resample(X,y)

In [13]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

def fold_K(X, y, model):    
    kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    acc_test_score = []
    acc_train_score = []
    rec_test_score = []
    rec_train_score = []
    
    for train_index, test_index in kf.split(X): # 5번
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_train = model.predict(X_train)

        acc_train_score.append(accuracy_score(y_train,y_pred_train))
        acc_test_score.append(accuracy_score(y_test,y_pred))
        
        rec_train_score.append(recall_score(y_train , y_pred_train))
        rec_test_score.append(recall_score(y_test , y_pred))

        
    print('정확도 : train score : {}'.format(np.array(acc_train_score).mean()))
    print('정확도 : test score : {}'.format(np.array(acc_test_score).mean()))
    # print('재현율 : train score : {}'.format(np.array(rec_train_score).mean()))
    # print('재현율 : test score : {}'.format(np.array(rec_test_score).mean()))

In [14]:
# CatBoost 모델을 만듭니다.
catboost = CatBoostClassifier(random_state = 42, bagging_temperature = 0, depth = 9, l2_leaf_reg = 3, learning_rate =0.1)
fold_K(X_smote, y_smote, catboost)

0:	learn: 0.5920657	total: 157ms	remaining: 2m 37s
1:	learn: 0.5208783	total: 313ms	remaining: 2m 36s
2:	learn: 0.4765884	total: 465ms	remaining: 2m 34s
3:	learn: 0.4508269	total: 611ms	remaining: 2m 32s
4:	learn: 0.4247100	total: 761ms	remaining: 2m 31s
5:	learn: 0.4028360	total: 914ms	remaining: 2m 31s
6:	learn: 0.3774237	total: 1.07s	remaining: 2m 31s
7:	learn: 0.3543441	total: 1.22s	remaining: 2m 30s
8:	learn: 0.3315434	total: 1.37s	remaining: 2m 30s
9:	learn: 0.3210527	total: 1.53s	remaining: 2m 31s
10:	learn: 0.3091600	total: 1.68s	remaining: 2m 30s
11:	learn: 0.2964201	total: 1.83s	remaining: 2m 31s
12:	learn: 0.2890671	total: 1.98s	remaining: 2m 30s
13:	learn: 0.2773908	total: 2.14s	remaining: 2m 30s
14:	learn: 0.2704469	total: 2.29s	remaining: 2m 30s
15:	learn: 0.2644709	total: 2.44s	remaining: 2m 29s
16:	learn: 0.2571079	total: 2.6s	remaining: 2m 30s
17:	learn: 0.2507298	total: 2.77s	remaining: 2m 31s
18:	learn: 0.2462979	total: 2.92s	remaining: 2m 30s
19:	learn: 0.2416816	to

KeyboardInterrupt: 

In [4]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


X_smote_train , X_smote_test , y_smote_train , y_smote_test = train_test_split(X_smote,y_smote,test_size = 0.2 , random_state=42)

# CatBoost 모델을 만듭니다.
catboost = CatBoostClassifier(random_state = 42)

# 하이퍼파라미터 후보군을 지정합니다.
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 6, 9],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 1, 10]
}

# 그리드 서치를 수행합니다.
grid_search = GridSearchCV(estimator=catboost , param_grid = param_grid, cv=3)
grid_search.fit(X_smote_train, y_smote_train)

0:	learn: 0.6866374	total: 185ms	remaining: 3m 4s
1:	learn: 0.6802290	total: 232ms	remaining: 1m 55s
2:	learn: 0.6741737	total: 273ms	remaining: 1m 30s
3:	learn: 0.6714850	total: 311ms	remaining: 1m 17s
4:	learn: 0.6659035	total: 355ms	remaining: 1m 10s
5:	learn: 0.6633554	total: 390ms	remaining: 1m 4s
6:	learn: 0.6608718	total: 435ms	remaining: 1m 1s
7:	learn: 0.6584693	total: 473ms	remaining: 58.7s
8:	learn: 0.6524429	total: 518ms	remaining: 57.1s
9:	learn: 0.6467812	total: 563ms	remaining: 55.7s
10:	learn: 0.6444829	total: 600ms	remaining: 53.9s
11:	learn: 0.6423027	total: 639ms	remaining: 52.6s
12:	learn: 0.6401189	total: 678ms	remaining: 51.5s
13:	learn: 0.6348355	total: 720ms	remaining: 50.7s
14:	learn: 0.6327384	total: 758ms	remaining: 49.8s
15:	learn: 0.6306710	total: 799ms	remaining: 49.1s
16:	learn: 0.6259606	total: 850ms	remaining: 49.2s
17:	learn: 0.6240460	total: 895ms	remaining: 48.8s
18:	learn: 0.6192352	total: 962ms	remaining: 49.7s
19:	learn: 0.6173885	total: 1.02s	rem

KeyboardInterrupt: 

In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/heart_2020_final.csv')
X = df.drop(columns = 'HeartDisease').values
y = df['HeartDisease']

smote = SMOTE(random_state = 42)
X_smote, y_smote = smote.fit_resample(X,y)

FileNotFoundError: [Errno 2] No such file or directory: '/content/heart_2020_final.csv'

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


X_smote_train , X_smote_test , y_smote_train , y_smote_test = train_test_split(X_smote,y_smote,test_size = 0.2 , random_state=42)

# CatBoost 모델을 만듭니다.
catboost = CatBoostClassifier(random_state = 42, task_type="GPU",)

# 하이퍼파라미터 후보군을 지정합니다.
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 6, 9],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 1, 10]
}

# 그리드 서치를 수행합니다.
grid_search = GridSearchCV(estimator=catboost , param_grid = param_grid, cv=3)
grid_search.fit(X_smote_train, y_smote_train)

In [1]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


X_smote_train , X_smote_test , y_smote_train , y_smote_test = train_test_split(X_smote,y_smote,test_size = 0.2 , random_state=42)

# CatBoost 모델을 만듭니다.
catboost = CatBoostClassifier(random_state = 42, task_type="GPU",)

# 하이퍼파라미터 후보군을 지정합니다.
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 6, 9],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 1, 10]
}

# # 그리드 서치를 수행합니다.
# grid_search = GridSearchCV(estimator=catboost , param_grid = param_grid, cv=3)
# grid_search.fit(X_smote_train, y_smote_train)

NameError: name 'train_test_split' is not defined

In [None]:
# grid_search.best_params_
accuracy_score(y_smote_train, grid_search.best_estimator_.predict(X_smote_train))
accuracy_score(y_smote_test, grid_search.best_estimator_.predict(X_smote_test))

In [None]:

# {'bagging_temperature': 0, 'depth': 9, 'l2_leaf_reg': 3, 'learning_rate': 0.1}