In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load
import numpy as np

In [24]:
df_master = pd.read_csv('../../data/heart_disease_data.csv')
df = df_master.copy()

In [25]:
df = df.drop(columns=['Id'])

In [29]:
print(df['Gender'].value_counts())

Gender
M    578
F    156
Name: count, dtype: int64


In [31]:
categorical_cols = ['Gender', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [34]:
df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)

In [35]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df['Cholesterol'] = imputer.fit_transform(df[['Cholesterol']])

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Select continuous columns
continuous_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

In [37]:
df['HeartDisease'] = df['HeartDisease'].replace({1: 1, 0: 0})

In [38]:
X = df.drop(columns=['HeartDisease'])  # Features
y = df['HeartDisease']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
categorical_columns = []
for i in df.columns:
    if df[i].dtype == 'object':
        categorical_columns.append(i)

In [40]:
dump(categorical_columns, 'models/categorical_columns.joblib')

['models/categorical_columns.joblib']

In [41]:
ordinal = OrdinalEncoder()

In [42]:
ordinal.fit(X_train[categorical_columns])

X_train[categorical_columns] = ordinal.transform(X_train[categorical_columns])

In [43]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)

In [44]:
model = LogisticRegression()

In [45]:
model.fit(X_train, y_train)

In [46]:
dump(ordinal, 'models/Ordinal_Encoder.joblib')
dump(scaler, 'models/Standard_Scaler.joblib')
dump(model, 'models/Logistic_Regression.joblib')

['models/Logistic_Regression.joblib']

# Eval

In [47]:
scaler = load('models/Standard_Scaler.joblib')
ordinal = load('models/Ordinal_Encoder.joblib')
model = load('models/Logistic_Regression.joblib')
categorical_columns = load('models/categorical_columns.joblib')

In [48]:
X_test[categorical_columns] = ordinal.transform(X_test[categorical_columns])

In [49]:
X_test = scaler.transform(X_test)

In [50]:
y_pred = model.predict(X_test)

In [51]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8503401360544217


In [52]:
print("metrix:", confusion_matrix(y_test, y_pred))

metrix: [[54 10]
 [12 71]]


In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [57]:
model = RandomForestClassifier(
    n_estimators=100,   
    max_depth=6,        
    min_samples_split=2,
    min_samples_leaf=1, 
    max_features='sqrt',
    bootstrap=True,     
    random_state=42     
)

In [58]:
model.fit(X_train, y_train)

In [59]:
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86


In [60]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.86      0.84        64
           1       0.89      0.86      0.87        83

    accuracy                           0.86       147
   macro avg       0.85      0.86      0.86       147
weighted avg       0.86      0.86      0.86       147



In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [3, 4, 5, 7, 8],                  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],               # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],                 # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt'],              # Number of features to consider at each split
    'bootstrap': [True, False]                     # Whether to use bootstrap sampling
}

# Create a GridSearchCV object for Random Forest
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,                  # 3-fold cross-validation
                           verbose=1,             # Show the process
                           n_jobs=-1)             # Use all available cores

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate performance with best model
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Random Forest Accuracy: {accuracy}")


Fitting 3 folds for each of 1080 candidates, totalling 3240 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best Random Forest Accuracy: 0.8707482993197279


1620 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
780 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/saadkhalid/.local/share/virtualenvs/dsp-final-project-673cRAb2/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/saadkhalid/.local/share/virtualenvs/dsp-final-project-673cRAb2/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/saadkhalid/.local/share/virtualenvs/dsp-final-project-673cRAb2/lib/python3.9/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_

In [63]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [64]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.87
