In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression   
from sklearn.ensemble import RandomForestClassifier   
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report,r2_score


In [2]:
df=pd.read_csv('heart_attack_prediction_indonesia.csv')

In [3]:
df.head()

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,60,Male,Rural,Middle,0,1,211,0,83,0,...,62,173,48,121,101,Normal,0,0,0,0
1,53,Female,Urban,Low,0,0,208,0,106,1,...,76,70,58,83,138,Normal,1,0,1,0
2,62,Female,Urban,Low,0,0,231,1,112,1,...,74,118,69,130,171,Abnormal,0,1,0,1
3,73,Male,Urban,Low,1,0,202,0,82,1,...,65,98,52,85,146,Normal,0,1,1,0
4,52,Male,Urban,Middle,1,0,232,0,89,0,...,75,104,59,127,139,Normal,1,0,1,1


In [4]:
df.shape

(158355, 28)

In [5]:
df.isnull().sum()

age                                   0
gender                                0
region                                0
income_level                          0
hypertension                          0
diabetes                              0
cholesterol_level                     0
obesity                               0
waist_circumference                   0
family_history                        0
smoking_status                        0
alcohol_consumption               94848
physical_activity                     0
dietary_habits                        0
air_pollution_exposure                0
stress_level                          0
sleep_hours                           0
blood_pressure_systolic               0
blood_pressure_diastolic              0
fasting_blood_sugar                   0
cholesterol_hdl                       0
cholesterol_ldl                       0
triglycerides                         0
EKG_results                           0
previous_heart_disease                0


In [6]:
df["alcohol_consumption"] = df["alcohol_consumption"].fillna(df["alcohol_consumption"].mode()[0])


In [7]:
df.isnull().sum()

age                               0
gender                            0
region                            0
income_level                      0
hypertension                      0
diabetes                          0
cholesterol_level                 0
obesity                           0
waist_circumference               0
family_history                    0
smoking_status                    0
alcohol_consumption               0
physical_activity                 0
dietary_habits                    0
air_pollution_exposure            0
stress_level                      0
sleep_hours                       0
blood_pressure_systolic           0
blood_pressure_diastolic          0
fasting_blood_sugar               0
cholesterol_hdl                   0
cholesterol_ldl                   0
triglycerides                     0
EKG_results                       0
previous_heart_disease            0
medication_usage                  0
participated_in_free_screening    0
heart_attack                

In [8]:
df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
158350    False
158351    False
158352    False
158353    False
158354    False
Length: 158355, dtype: bool

In [9]:
df.describe()

Unnamed: 0,age,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,sleep_hours,blood_pressure_systolic,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
count,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0,158355.0
mean,54.543778,0.299069,0.199804,199.533264,0.249901,93.268504,0.300218,6.480064,129.515772,79.490809,110.736762,49.491478,129.569916,149.715885,0.200575,0.49977,0.601029,0.401004
std,11.910897,0.457851,0.399854,39.737565,0.432957,16.382205,0.458354,1.425398,15.005641,10.002964,27.673445,9.982634,34.913318,49.023473,0.400432,0.500002,0.489688,0.490103
min,25.0,0.0,0.0,100.0,0.0,20.0,0.0,3.0,61.0,37.0,70.0,8.0,-19.0,50.0,0.0,0.0,0.0,0.0
25%,46.0,0.0,0.0,172.0,0.0,82.0,0.0,5.492985,119.0,73.0,89.0,43.0,106.0,116.0,0.0,0.0,0.0,0.0
50%,55.0,0.0,0.0,199.0,0.0,93.0,0.0,6.507461,130.0,80.0,109.0,49.0,130.0,149.0,0.0,0.0,1.0,0.0
75%,63.0,1.0,0.0,226.0,0.0,104.0,1.0,7.52064,140.0,86.0,130.0,56.0,153.0,183.0,0.0,1.0,1.0,1.0
max,90.0,1.0,1.0,350.0,1.0,173.0,1.0,9.0,199.0,127.0,230.0,93.0,282.0,380.0,1.0,1.0,1.0,1.0


In [10]:
X = df.drop("heart_attack", axis=1)
y = df["heart_attack"]

In [11]:
print(" Features shape:", X.shape)
print(" Target shape:", y.shape)


 Features shape: (158355, 27)
 Target shape: (158355,)


In [12]:
categorical = X.select_dtypes(include=["object"]).columns
numeric = X.select_dtypes(exclude=["object"]).columns

In [13]:
preprocessor = ColumnTransformer([
    ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("scaler", StandardScaler(), numeric)
])

In [14]:
models = [
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42))
    
]

models_accuracy = []
modelsR2_score = []

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y  
)

In [16]:
result=[]

In [17]:
for name, model in models:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)   
    ])

    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    y_pred_train = pipeline.predict(x_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print("the accuracy for train",accuracy_train)
    accuracy = accuracy_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} → Accuracy: {accuracy:.4f}, R2: {r2:.4f}")

    models_accuracy.append(accuracy)
    modelsR2_score.append(r2)


the accuracy for train 0.7874632944965426
KNN → Accuracy: 0.6842, R2: -0.3149
the accuracy for train 0.7318209087177544
Logistic Regression → Accuracy: 0.7273, R2: -0.1355
the accuracy for train 1.0
Decision Tree → Accuracy: 0.6563, R2: -0.4310
the accuracy for train 1.0
Random Forest → Accuracy: 0.7316, R2: -0.1176


In [18]:
print(models_accuracy)

[0.6841590098197089, 0.7272583751697136, 0.6562786145053835, 0.7315525243913991]


In [19]:
print(modelsR2_score)

[-0.31491627295204383, -0.1354840313665655, -0.4309885581681445, -0.11760653330383652]


In [20]:
log_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))])


In [21]:
log_pipe.fit(x_train, y_train)

In [22]:
   
y_pred = log_pipe.predict(x_test) 


In [23]:
accuracy=accuracy_score(y_test,y_pred)


In [24]:
print(accuracy)

0.7315525243913991


In [25]:
result.append({
        "Model": model,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_pred)
    })

In [26]:
print(result)

[{'Model': RandomForestClassifier(random_state=42), 'Accuracy': 0.7315525243913991, 'Precision': 0.694243938552656, 'Recall': 0.5907086614173228, 'F1 Score': 0.6383051135880201, 'ROC-AUC': np.float64(0.7082740502806396)}]


In [27]:
print(f"\n{model} Report:\n")
print(classification_report(y_test, y_pred))



RandomForestClassifier(random_state=42) Report:

              precision    recall  f1-score   support

           0       0.75      0.83      0.79     18971
           1       0.69      0.59      0.64     12700

    accuracy                           0.73     31671
   macro avg       0.72      0.71      0.71     31671
weighted avg       0.73      0.73      0.73     31671



In [29]:

pickle.dump(log_pipe, open('heart_attack_prediction_randomforest.pkl', 'wb'))


In [30]:

model = pickle.load(open('heart_attack_prediction_randomforest.pkl', 'rb'))




In [None]:
y_pred = model.predict(x_test)


In [33]:
y_pred[1:2]

array([0])