In [None]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart-disease CSV into the `df` DataFrame and preview the first rows.
DATA_PATH = "/kaggle/input/heart-disease-uci/heart.csv"
df = pd.read_csv(DATA_PATH)
df.head()

## EDA

In [None]:
# Count the missing (NaN) entries in every column of `df`.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Bar chart of how many columns share each dtype (e.g. int64, float64).
# The dtype names are passed through `x=` explicitly: recent seaborn releases
# treat the first positional argument of countplot as `data`, so the bare
# positional call no longer maps the dtype names onto the x axis.
dtype_names = df.dtypes.map(str)
plt.figure(figsize=(12, 8))
dtype_ax = sns.countplot(x=dtype_names)
dtype_ax.set(title="Number of columns per dtype", xlabel="dtype", ylabel="columns")
plt.show()

In [None]:
# Boolean mask: True for rows whose age is below 20.
# NOTE(review): if the dataset has no patients under 20, only the False group
# will appear in the chart -- confirm that 20 is the intended threshold.
df_under_twenty = df['age'] < 20

# Sum of `target` for every (under-twenty flag, sex) combination, drawn as bars.
target_by_group = df.groupby([df_under_twenty, 'sex'])['target'].sum()
target_by_group.plot(kind='bar')

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column.
summary_stats = df.describe()
summary_stats

In [None]:
# Switch on seaborn's default theme; this affects every plot drawn afterwards.
sns.set_theme()

# Distribution (histogram) of patient age.
# NOTE: displot is a figure-level function and returns a FacetGrid, so `ax`
# here is not a matplotlib Axes despite its name.
x = df['age']
ax = sns.displot(x)

In [None]:
# Pairwise correlation between all columns, rendered as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# Side-by-side class-balance charts for the label and two binary features.
# A wider figure plus tight_layout keeps the three panels from overlapping.
fig, ax = plt.subplots(1, 3, figsize=(15, 4))
for panel, column in zip(ax, ['target', 'sex', 'fbs']):
    sns.countplot(data=df, x=column, ax=panel)
    panel.set_title(f"Count of {column}")
fig.tight_layout()
# plt.show() instead of fig.show(): on the notebook's non-GUI inline backend
# Figure.show() cannot display anything and only emits a UserWarning.
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Separate the features from the `target` label.
y = df['target']
X = df.drop('target', axis=1)

# Hold out 20% of the rows for testing; stratifying on `y` keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Display the feature column names available in the training split.
X_train.columns

In [None]:
# Columns routed through the scaler. ColumnTransformer only forwards the
# columns listed here to the model (its default is to drop the rest).
scaled_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

# Standardise each listed column.
numeric_pipeline = Pipeline(steps=[('scaling', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[('numeric', numeric_pipeline, scaled_columns)]
)

# Full model: preprocessing followed by a random forest. The step name 'algo'
# is what the grid-search parameter keys ('algo__...') refer to.
pipeline = Pipeline(
    steps=[
        ('pre', preprocessor),
        ('algo', RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Fit the pipeline on the training split; the fitted pipeline is echoed as the cell output.
pipeline.fit(X_train, y_train)

In [None]:
# List every tunable parameter of the pipeline; the keys use the
# '<step>__<param>' naming needed for the grid-search dictionary.
pipeline.get_params()

In [None]:
# Hyper-parameter grid for the random forest step ('algo') of the pipeline.
# Keys follow scikit-learn's '<step>__<param>' convention.
parameter = {
    'algo__max_depth': [20, 40, 60, 80],
    'algo__n_estimators': [50, 100, 150, 200],
    # 'auto' was removed from max_features in scikit-learn 1.3 and, for
    # classifiers, was only an alias of 'sqrt' (so it duplicated every fit).
    # None (= consider all features at each split) gives a genuinely
    # different second option.
    'algo__max_features': ['sqrt', None],
    'algo__min_samples_leaf': [1, 2, 4],
    'algo__min_samples_split': [2, 5, 10],
}

In [None]:
# Exhaustive search over `parameter` with 3-fold cross-validation on the
# training split, using all CPU cores (n_jobs=-1) and brief progress output.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

In [None]:
# Report the tuned model's score on both splits. With no `scoring` passed to
# GridSearchCV this is the classifier's default score (mean accuracy).
# Two separate statements are used: joining the prints with a comma built a
# tuple, which the notebook echoed as a stray `(None, None)` output.
print(f"Train accuracy: {model.score(X_train, y_train):.4f}")
print(f"Test accuracy:  {model.score(X_test, y_test):.4f}")