In [9]:
import pandas as pd

df = pd.read_csv('C:/ML_project_predict_heart_disease/data/heart_2020_cleaned.csv')

In [10]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [11]:
# 중복행 확인 : 18078개
print(f'중복행 개수 : {df.duplicated().sum()}')

# 중복행 제거
df = df.drop_duplicates().reset_index(drop = True)

중복행 개수 : 18078


In [12]:
from sklearn.preprocessing import LabelEncoder
import re

### 라벨 인코딩(컬럼 내 유니크 값 두 개)
label_features = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke',
                'DiffWalking', 'Sex','PhysicalActivity',
                'Asthma', 'KidneyDisease', 'SkinCancer']

for feature in label_features:
    encoder = LabelEncoder()
    item = df[feature].unique()
    encoder.fit(item)
    df[feature] = encoder.transform(df[feature])

### 원핫 인코딩(컬럼 내 유니크 값 세 개 이상)
onehot_features = ['AgeCategory', 'Race', 'GenHealth', 'Diabetic']
df = pd.get_dummies(df, columns= onehot_features)
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

df.to_csv('heart_2020_final.csv', index = False)

In [14]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex',
       'PhysicalActivity', 'SleepTime', 'Asthma', 'KidneyDisease',
       'SkinCancer', 'AgeCategory_1824', 'AgeCategory_2529',
       'AgeCategory_3034', 'AgeCategory_3539', 'AgeCategory_4044',
       'AgeCategory_4549', 'AgeCategory_5054', 'AgeCategory_5559',
       'AgeCategory_6064', 'AgeCategory_6569', 'AgeCategory_7074',
       'AgeCategory_7579', 'AgeCategory_80orolder',
       'Race_AmericanIndianAlaskanNative', 'Race_Asian', 'Race_Black',
       'Race_Hispanic', 'Race_Other', 'Race_White', 'GenHealth_Excellent',
       'GenHealth_Fair', 'GenHealth_Good', 'GenHealth_Poor',
       'GenHealth_Verygood', 'Diabetic_No', 'Diabetic_Noborderlinediabetes',
       'Diabetic_Yes', 'Diabetic_Yesduringpregnancy'],
      dtype='object')

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

def get_SMOTE(X_train, y_train):
    # SMOTE train 함수
    from imblearn.over_sampling import SMOTE
    smote = SMOTE()
    return smote.fit_resample(X_train,y_train)

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    
    # ROC-AUC 추가 
    # roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    # print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},/
    # # F1: {3:.4f}'.format(accuracy, precision, recall, f1))


X = df.drop(columns = 'HeartDisease')
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X,y)

cat = CatBoostClassifier()
# X_train, y_train = get_SMOTE(X_train, y_train)
cat.fit(X_train, y_train)
cat.predict(X_test)

cat.save_model('C:/ML_project_predict_heart_disease/data/cat.pkl')
# joblib.dump(cat, 'C:/ML_project_predict_heart_disease/data/logistic.pkl')

# get_clf_eval(y_train, cat.predict(X_train))
# get_clf_eval(y_test, cat.predict(X_test))
        

Learning rate set to 0.104321
0:	learn: 0.5826975	total: 178ms	remaining: 2m 57s
1:	learn: 0.5009975	total: 208ms	remaining: 1m 43s
2:	learn: 0.4397092	total: 255ms	remaining: 1m 24s
3:	learn: 0.3981732	total: 281ms	remaining: 1m 10s
4:	learn: 0.3636361	total: 313ms	remaining: 1m 2s
5:	learn: 0.3393245	total: 339ms	remaining: 56.2s
6:	learn: 0.3198043	total: 371ms	remaining: 52.6s
7:	learn: 0.3049465	total: 401ms	remaining: 49.7s
8:	learn: 0.2923331	total: 431ms	remaining: 47.5s
9:	learn: 0.2831951	total: 476ms	remaining: 47.1s
10:	learn: 0.2752960	total: 508ms	remaining: 45.7s
11:	learn: 0.2686631	total: 535ms	remaining: 44.1s
12:	learn: 0.2638202	total: 578ms	remaining: 43.9s
13:	learn: 0.2602023	total: 603ms	remaining: 42.5s
14:	learn: 0.2575488	total: 626ms	remaining: 41.1s
15:	learn: 0.2552162	total: 654ms	remaining: 40.2s
16:	learn: 0.2528698	total: 696ms	remaining: 40.3s
17:	learn: 0.2507169	total: 724ms	remaining: 39.5s
18:	learn: 0.2493235	total: 756ms	remaining: 39s
19:	learn

In [24]:
y_train

62546     0
21930     0
11143     0
199016    1
32601     0
         ..
81580     0
46215     1
270769    0
211676    0
1673      0
Name: HeartDisease, Length: 226287, dtype: int32

In [16]:
X

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,SleepTime,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Verygood,Diabetic_No,Diabetic_Noborderlinediabetes,Diabetic_Yes,Diabetic_Yesduringpregnancy
0,16.60,1,0,0,3.0,30.0,0,0,1,5.0,...,1,0,0,0,0,1,0,0,1,0
1,20.34,0,0,1,0.0,0.0,0,0,1,7.0,...,1,0,0,0,0,1,1,0,0,0
2,26.58,1,0,0,20.0,30.0,0,1,1,8.0,...,1,0,1,0,0,0,0,0,1,0
3,24.21,0,0,0,0.0,0.0,0,0,0,6.0,...,1,0,0,1,0,0,1,0,0,0
4,23.71,0,0,0,28.0,0.0,1,0,1,8.0,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301712,27.41,1,0,0,7.0,0.0,1,1,0,6.0,...,0,0,1,0,0,0,0,0,1,0
301713,29.84,1,0,0,0.0,0.0,0,1,1,5.0,...,0,0,0,0,0,1,1,0,0,0
301714,24.24,0,0,0,0.0,0.0,0,0,1,6.0,...,0,0,0,1,0,0,1,0,0,0
301715,32.81,0,0,0,0.0,0.0,0,0,0,12.0,...,0,0,0,1,0,0,1,0,0,0


In [96]:
cat.save_model('cat.pkl',
            format="cbm",
            export_parameters=None,
            pool=None)

In [None]:
from catboost import CatBoostClassifier
from_file = CatBoostClassifier()
from_file.load_model('')

In [95]:
import joblib
joblib.dump(cat, 'cat.pkl')

['cat.pkl']

In [94]:
from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix

confusion_matrix(y_train, cat.predict(X_train))
confusion_matrix(y_test, cat.predict(X_test))

array([[68031,   615],
       [ 6126,   658]], dtype=int64)

In [97]:
reg = CatBoostClassifier()
reg.load_model('C:/ML_project_predict_heart_disease/data/cat.pkl')

<catboost.core.CatBoostClassifier at 0x2300b83b490>

In [101]:
reg.predict_proba(X_test)[0]

array([0.99388174, 0.00611826])

In [103]:
reg.predict(X_test)[0]

0