In [216]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load
import numpy as np

In [217]:
# Read the raw CSV once and keep that frame untouched as `df_master`;
# every later step works on the independent copy `df`.
df_master = pd.read_csv('../../data/heart_disease_data.csv')
df = df_master.copy()

In [218]:
# Remove the 'Id' column so it is not carried into the feature matrix
# (every column except the target is later used as a feature).
# errors='ignore' keeps this cell re-runnable: without it, executing the
# cell a second time raises KeyError because 'Id' has already been dropped.
df = df.drop(columns=['Id'], errors='ignore')

In [277]:

# Translate the dataset's original column headers into the snake_case
# names that the rest of the notebook refers to.
column_mapping = {
    'Gender': 'gender',
    'ChestPainType': 'chest_pain_type',
    'RestingECG': 'resting_ecg',
    'ExerciseAngina': 'exercise_angina',
    'Cholesterol': 'cholesterol',
    'RestingBP': 'resting_bp',
    'MaxHR': 'max_hr',
    'Oldpeak': 'old_peak',
    'ST_Slope': 'st_slope',
    'FastingBS': 'fasting_bs',
    'HeartDisease': 'heart_disease',
    'Age': 'age'
}

# Rebind the result instead of using inplace=True: the cell then produces a
# new frame from its input rather than mutating an object in place.
# Keys missing from the frame are ignored by rename, so re-running is safe.
df = df.rename(columns=column_mapping)


In [278]:
# Show how many rows fall under each value of the renamed 'gender' column.
print(df['gender'].value_counts())

gender
M    578
F    156
Name: count, dtype: int64


In [249]:
# categorical_cols = ['gender', 'chest_pain_type', 'resting_ecg', 'exercise_angina', 'st_slope']
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [250]:
# Display the working frame after the rename; the notebook shows only the
# first and last rows of a long frame.
df

Unnamed: 0,age,gender,chest_pain_type,resting_bp,cholesterol,fasting_bs,resting_ecg,max_hr,exercise_angina,old_peak,st_slope,heart_disease
0,42,M,ATA,120,198,0,Normal,155,N,0.0,Up,0
1,54,M,ASY,140,239,0,Normal,160,N,1.2,Up,0
2,60,M,NAP,141,316,1,ST,122,Y,1.7,Flat,1
3,54,M,ASY,124,266,0,LVH,109,Y,2.2,Flat,1
4,55,M,ATA,160,292,1,Normal,143,Y,2.0,Flat,1
...,...,...,...,...,...,...,...,...,...,...,...,...
729,50,F,NAP,120,219,0,Normal,158,N,1.6,Flat,0
730,55,M,ASY,140,229,0,Normal,110,Y,0.5,Flat,0
731,60,M,ASY,130,0,1,ST,130,Y,1.1,Down,1
732,64,M,ASY,110,0,1,Normal,114,Y,1.3,Down,1


In [251]:
# df['cholesterol'] = df['cholesterol'].replace(0, np.nan)

In [252]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='median')
# df['cholesterol'] = imputer.fit_transform(df[['cholesterol']])

In [253]:
# NOTE: StandardScaler is already imported in the first cell, and the
# instance created here is never fitted: `scaler` is reassigned to a fresh
# StandardScaler in a later cell before it is fitted on X_train.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Select continuous columns
# continuous_cols = ['age', 'resting_bp', 'cholesterol', 'max_hr', 'old_peak']
# df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

In [254]:
# df['heart_disease'] = df['heart_disease'].replace({1: 1, 0: 0})

In [255]:
# Separate the label from the predictors, then hold out 20% of the rows
# for evaluation. The fixed seed makes the split reproducible.
y = df['heart_disease']
X = df.drop(columns=['heart_disease'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [256]:
# Names of the object-dtype (string-valued) feature columns that must be
# encoded before modelling.
# Taken from the feature frame X rather than df, so the target column can
# never land in this list regardless of its dtype; select_dtypes also avoids
# leaving a stray loop variable in the notebook namespace.
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

In [257]:
# Persist the list of encoded columns; the evaluation section loads it back
# to apply the encoder to the same columns.
dump(categorical_columns, 'models/categorical_columns.joblib')

['models/categorical_columns.joblib']

In [258]:
# Encoder that replaces each category with an integer code. With the default
# settings (handle_unknown='error') transform() raises on a category that was
# not present when the encoder was fitted.
ordinal = OrdinalEncoder()

In [259]:
# Learn the category -> code mapping from the training split only, and
# overwrite the categorical columns of X_train with the resulting codes.
X_train[categorical_columns] = ordinal.fit_transform(X_train[categorical_columns])

In [260]:
# Fit the scaler on the training split alone (all feature columns, including
# the encoded categoricals), then replace X_train with its scaled version.
# The same fitted scaler is applied to the test split later.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [261]:
# Baseline classifier, created with scikit-learn's default hyper-parameters.
model = LogisticRegression()

In [262]:
# Train the baseline on the encoded and scaled training split.
model.fit(X_train, y_train)

In [263]:
# Persist the fitted encoder, the fitted scaler and the trained model; the
# evaluation section reloads all three to score the test split.
dump(ordinal, 'models/Ordinal_Encoder.joblib')
dump(scaler, 'models/Standard_Scaler.joblib')
dump(model, 'models/Logistic_Regression.joblib')

['models/Logistic_Regression.joblib']

# Eval

In [264]:
# Reload every artifact from disk so the evaluation below runs against what
# was actually saved rather than the in-memory objects.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
scaler = load('models/Standard_Scaler.joblib')
ordinal = load('models/Ordinal_Encoder.joblib')
model = load('models/Logistic_Regression.joblib')
categorical_columns = load('models/categorical_columns.joblib')

In [265]:
# Encode the test split with the mapping learned on the training split.
# NOTE: not re-runnable; the columns are overwritten in place, so a second
# execution would hand already-encoded values to the encoder.
X_test[categorical_columns] = ordinal.transform(X_test[categorical_columns])

In [266]:
# Scale the test split with the statistics learned on the training split.
# NOTE: not re-runnable; X_test is replaced by its scaled version, so a
# second execution would scale it again.
X_test = scaler.transform(X_test)

In [267]:
# Predicted labels of the reloaded logistic-regression model on the test split.
y_pred = model.predict(X_test)

In [268]:
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8435374149659864


In [269]:
# Confusion matrix for the baseline: rows are the true classes, columns are
# the predicted classes (scikit-learn's convention), in label order 0, 1.
print("metrix:", confusion_matrix(y_test, y_pred))

metrix: [[54 10]
 [13 70]]


In [270]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [271]:
# Second candidate model: a random forest with hand-picked settings.
# NOTE: this rebinds `model`, which until now referred to the logistic
# regression (that model remains available on disk from the earlier dump).
model = RandomForestClassifier(
    n_estimators=100,     # number of trees
    max_depth=6,          # maximum depth of each tree
    min_samples_split=2,  # minimum samples required to split a node
    min_samples_leaf=1,   # minimum samples required at a leaf node
    max_features='sqrt',  # number of features considered at each split
    bootstrap=True,       # use bootstrap sampling
    random_state=42       # fixed seed for reproducible results
)

In [272]:
# Train the random forest on the same encoded, scaled training split.
model.fit(X_train, y_train)

In [273]:
# Predict the test split with the random forest (overwrites the baseline's y_pred).
y_pred = model.predict(X_test)

# Evaluate the model: share of correct predictions, printed to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.87


In [274]:
# Persist the trained random forest alongside the other saved artifacts.
dump(model, 'models/Random_Forest.joblib')

['models/Random_Forest.joblib']

In [275]:
# Per-class precision, recall and F1 for the random forest's test predictions.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.83      0.85        64
           1       0.87      0.90      0.89        83

    accuracy                           0.87       147
   macro avg       0.87      0.87      0.87       147
weighted avg       0.87      0.87      0.87       147



In [276]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for every tuned random-forest hyper-parameter; the grid
# search tries each combination.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 7, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['log2', 'sqrt'],
    'bootstrap': [True, False]
}

# Exhaustive search over the grid: a seeded forest per candidate, ranked by
# accuracy under 3-fold cross-validation, with progress output and all
# available cores in use.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best parameters found: ", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Random Forest Accuracy: {accuracy}")


Fitting 3 folds for each of 1080 candidates, totalling 3240 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Random Forest Accuracy: 0.8639455782312925


In [279]:
# Recompute the tuned forest's test predictions; this is the same estimator
# the grid-search cell bound to `best_rf_model`.
y_pred = grid_search.best_estimator_.predict(X_test)

In [280]:
# Test accuracy of the tuned forest, printed to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86
