In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler,NearMiss
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV

from sklearn.metrics import recall_score

import pickle

In [32]:
data= pd.read_csv('heart.csv')

In [33]:
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


**Cek MV**

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Gunakan Supervised (Classifikasi)

**Cek Balencing Data**

In [35]:
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

**Cek Outlier**

In [36]:
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
IQR = q3-q1

In [37]:
((data < (q1-1.5*IQR)) | (data > (q3+1.5*IQR))).sum()/len(data)*100

age          0.000000
sex          0.000000
cp           0.000000
trestbps     2.970297
chol         1.650165
fbs         14.851485
restecg      0.000000
thalach      0.330033
exang        0.000000
oldpeak      1.650165
slope        0.000000
ca           8.250825
thal         0.660066
target       0.000000
dtype: float64

terdapat outlier yg cukup tinggi di column fasting blood sugar (fbs), sehingga gunakan metode yg tidak sensitif thdp outlier

**Model Benchmark**

In [38]:
X= data.drop(['target'],axis=1)
y = data['target']

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=2000,test_size=0.2)

In [40]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier(random_state = 2000)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state = 2000)

In [41]:
logreg_pipe = Pipeline([
    ('scaler',RobustScaler()),
    ('logreg',logreg)
])

tree_pipe= Pipeline([
    ('tree',tree)
])

knn_pipe =Pipeline([
    ('scaler',RobustScaler()),
    ('knn',knn)
])

rf_pipe = Pipeline([
    ('rf',rf)
])

**validation**

In [42]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric)
    return model_cv

In [43]:
logreg_pipe_cv = model_evaluation(logreg_pipe, 'recall')
tree_pipe_cv = model_evaluation(tree_pipe, 'recall')
knn_pipe_cv = model_evaluation(knn_pipe, 'recall')
rf_pipe_cv = model_evaluation(rf_pipe, 'recall')

In [44]:
for model in [logreg_pipe, tree_pipe, knn_pipe, rf_pipe]:
    model.fit(X_train, y_train)

In [45]:
score_mean = [logreg_pipe_cv.mean(), tree_pipe_cv.mean(), knn_pipe_cv.mean(),
              rf_pipe_cv.mean()]
score_std = [logreg_pipe_cv.std(), tree_pipe_cv.std(), knn_pipe_cv.std(),
             rf_pipe_cv.std()]
score_recall_score = [recall_score(y_test, logreg_pipe.predict(X_test)),
            recall_score(y_test, tree_pipe.predict(X_test)), 
            recall_score(y_test, knn_pipe.predict(X_test)), 
            recall_score(y_test, rf_pipe.predict(X_test))]
method_name = ['Logistic Regression', 'Decision Tree Classifier',
              'KNN Classifier', 'Random Forest Classifier']
cv_result = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
cv_result

Unnamed: 0,method,mean score,std score,recall score
0,Logistic Regression,0.909402,0.051033,0.939394
1,Decision Tree Classifier,0.772934,0.040528,0.818182
2,KNN Classifier,0.901425,0.039265,0.878788
3,Random Forest Classifier,0.833048,0.06732,0.878788


**Tunning**

In [46]:
estimator= Pipeline([
    ('scaler',RobustScaler()),
    ('model',logreg)
])


hyperparam_space = {
    'model__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'model__solver': ['liblinear', 'newton-cg']
}

random = RandomizedSearchCV(
                estimator,
                param_distributions = hyperparam_space,
                cv = StratifiedKFold(n_splits = 5),
                scoring = 'recall',
                n_iter = 10,
                n_jobs = -1)

random.fit(X_train, y_train)

print('best score', random.best_score_)
print('best param', random.best_params_)

best score 1.0
best param {'model__solver': 'newton-cg', 'model__C': 0.001}


In [48]:
logreg_best=random.best_estimator_
logreg_best.fit(X_train, y_train)
print(classification_report(y_test, logreg_best.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.04      0.07        28
           1       0.55      1.00      0.71        33

    accuracy                           0.56        61
   macro avg       0.78      0.52      0.39        61
weighted avg       0.76      0.56      0.42        61



In [None]:
dengan menggunakan model Logreg yang ditunning, bisa mendapatkan nilai recall 1.