# Data Collection & Preprocessing

## Normalization

In [34]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the raw Cleveland heart-disease CSV. Every column except the last is
# treated as a feature; the last column is carried through as the label.
first_heart_csv = "Heart_disease_cleveland_new.csv"
df = pd.read_csv(first_heart_csv)
X1, y1 = df.iloc[:, :-1], df.iloc[:, -1]

# Rescale each feature column independently into the [0, 1] interval.
# NOTE(review): the scaler is fit on every row, and the train/test split only
# happens later on the saved CSV, so test-row min/max values leak into the
# training features. Consider fitting the scaler on the training split only.
scaleMinMax_1 = MinMaxScaler(feature_range=(0, 1))
X = scaleMinMax_1.fit_transform(X1)

# fit_transform returns a bare array, so restore the column names and
# re-attach the (unscaled) label under the name 'target'.
X1_normalized = pd.DataFrame(X, columns=X1.columns).assign(target=y1)

# Quick sanity check: first rows plus per-column summary statistics.
print(X1_normalized.head())
print(X1_normalized.describe().round(3))

# Persist the normalized table; the modelling cells reload it from this file.
X1_normalized.to_csv("heart_normalized.csv", index=False)

print("\nThe whole dataset:\n", X1_normalized)

        age  sex        cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333  1.0  0.000000  0.481132  0.244292  1.0      1.0  0.603053    0.0   
1  0.791667  1.0  1.000000  0.622642  0.365297  0.0      1.0  0.282443    1.0   
2  0.791667  1.0  1.000000  0.245283  0.235160  0.0      1.0  0.442748    1.0   
3  0.166667  1.0  0.666667  0.339623  0.283105  0.0      0.0  0.885496    0.0   
4  0.250000  0.0  0.333333  0.339623  0.178082  0.0      1.0  0.770992    0.0   

    oldpeak  slope        ca  thal  target  
0  0.370968    1.0  0.000000   0.5       0  
1  0.241935    0.5  1.000000   0.0       1  
2  0.419355    0.5  0.666667   1.0       1  
3  0.564516    1.0  0.000000   0.0       0  
4  0.225806    0.0  0.000000   0.0       0  
           age      sex       cp  trestbps     chol      fbs  restecg  \
count  303.000  303.000  303.000   303.000  303.000  303.000  303.000   
mean     0.530    0.680    0.719     0.356    0.276    0.149    0.495   
std      0.188    0.467 

# Model Selection & Training

## Random Forest 

### Using RandomizedSearchCV for hyperparameter tuning

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import joblib

# Reload the normalized table written by the preprocessing cell.
df1 = pd.read_csv("heart_normalized.csv")
print(df1.head())

# Features are every column except the label; the label is "target".
X1 = df1.drop("target", axis=1)
y1 = df1["target"]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)
print("Number of rows and columns in train and test data ", X_train.shape, X_test.shape)

# Base estimator whose hyperparameters are tuned below.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values that RandomizedSearchCV samples combinations from.
param_dist = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Try 10 sampled combinations, each scored with 3-fold cross-validation,
# using all available cores.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Runs the search; the best estimator is exposed on the search object afterwards.
random_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = random_search.best_params_
print(f"Best Parameters Found: {best_params}")

# Best estimator found by the search.
rf_model_updated = random_search.best_estimator_

# Class predictions on the held-out rows.
y_pred_updated = rf_model_updated.predict(X_test)
print("Predicted values (First five): ", y_pred_updated[:5])

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred_updated)
precision = precision_score(y_test, y_pred_updated)
recall = recall_score(y_test, y_pred_updated)
f1 = f1_score(y_test, y_pred_updated)

# Per-class probabilities for the same held-out rows.
y_proba_updated = rf_model_updated.predict_proba(X_test)
print("Predicted Probabilities for the first 5 samples:")
print(y_proba_updated[:5])

# Evaluation summary.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")

# Persist the tuned model to disk.
joblib.dump(rf_model_updated, 'random_forest_model_RandomizedSearchCV.pkl')
print("THe Random Forest model has been saved successfully!")

        age  sex        cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333  1.0  0.000000  0.481132  0.244292  1.0      1.0  0.603053    0.0   
1  0.791667  1.0  1.000000  0.622642  0.365297  0.0      1.0  0.282443    1.0   
2  0.791667  1.0  1.000000  0.245283  0.235160  0.0      1.0  0.442748    1.0   
3  0.166667  1.0  0.666667  0.339623  0.283105  0.0      0.0  0.885496    0.0   
4  0.250000  0.0  0.333333  0.339623  0.178082  0.0      1.0  0.770992    0.0   

    oldpeak  slope        ca  thal  target  
0  0.370968    1.0  0.000000   0.5       0  
1  0.241935    0.5  1.000000   0.0       1  
2  0.419355    0.5  0.666667   1.0       1  
3  0.564516    1.0  0.000000   0.0       0  
4  0.225806    0.0  0.000000   0.0       0  
Number of rows and columns in train and test data  (242, 13) (61, 13)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters Found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20}
Pre

### Using GridSearchCV for an exhaustive hyperparameter search

In [36]:
from sklearn.model_selection import GridSearchCV

# Search space for the exhaustive grid: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Fresh, unfitted base estimator for this search.
rf_model = RandomForestClassifier(random_state=42)

# Score every combination by 5-fold cross-validated accuracy on all cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Every combination is fitted on every fold, so this is much slower than the
# randomized search above. Keep it in a separate notebook cell so it is not
# re-run unnecessarily.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}


In [37]:
# Evaluate the best estimator found by GridSearchCV on the held-out test rows.
rf_model_grid_search_updated = grid_search.best_estimator_

# Predict with the GridSearchCV model only. These predictions must not be
# replaced by rf_model_updated (the RandomizedSearchCV model): the metrics
# below are reported for the GridSearchCV model and have to match the
# probabilities and the model that is saved at the end of this cell.
y_pred_updated = rf_model_grid_search_updated.predict(X_test)

print("The test input values:\n", X_test[:5])

# Look up the true labels of the first five test rows by their original index.
print("Actual result values:\n", df1.loc[X_test.index[:5], "target"])

print("Predicted values (First five): ", y_pred_updated[:5])

# Score the GridSearchCV model's predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred_updated)
precision = precision_score(y_test, y_pred_updated)
f1 = f1_score(y_test, y_pred_updated)
recall = recall_score(y_test, y_pred_updated)

# Get the predicted probabilities for the updated model
y_proba_updated = rf_model_grid_search_updated.predict_proba(X_test)

# Print the predicted probabilities for the first few samples of the updated model
print("Predicted Probabilities for the first 5 samples :")
print(y_proba_updated[:5])

# Evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")

# Save the updated model if needed
joblib.dump(rf_model_grid_search_updated, 'random_forest_model_GridSearchCV.pkl')
print("Updated Random Forest model using GridSearchCV saved successfully.")


The test input values:
           age  sex        cp  trestbps      chol  fbs  restecg   thalach  \
179  0.500000  1.0  0.666667  0.339623  0.273973  1.0      1.0  0.778626   
228  0.520833  1.0  1.000000  0.150943  0.182648  0.0      1.0  0.282443   
111  0.562500  1.0  1.000000  0.292453  0.280822  1.0      1.0  0.557252   
246  0.604167  1.0  1.000000  0.056604  0.246575  0.0      0.0  0.648855   
60   0.458333  0.0  1.000000  0.339623  0.408676  0.0      0.0  0.541985   

     exang   oldpeak  slope        ca  thal  
179    0.0  0.000000    0.0  1.000000   0.0  
228    1.0  0.000000    0.5  0.333333   0.0  
111    1.0  0.193548    0.5  0.333333   0.0  
246    0.0  0.016129    0.0  0.333333   1.0  
60     1.0  0.193548    0.5  0.000000   1.0  
Actual result values:
 179    0
228    1
111    1
246    1
60     1
Name: target, dtype: int64
Predicted values (First five):  [0 1 1 1 1]
Predicted Probabilities for the first 5 samples :
[[0.678343   0.321657  ]
 [0.29140335 0.70859665]
 [0.

## XGBoost

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np
# These names are used below but were previously only available if the
# Random Forest cell had been run first; import them here so this cell does
# not depend on that hidden state.
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib

# Reload the normalized table and separate the features from the label.
df = pd.read_csv("heart_normalized.csv")
x=df.drop(columns=['target'])
y=df["target"]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Number of rows and columns in train and test data ",X_train.shape, X_test.shape)

# Gradient-boosted tree classifier with fixed (untuned) hyperparameters.
model = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

print("The test input values (First five):\n", X_test[:5])

# Look up the true labels of the first five test rows by their original index.
print("Actual result values:\n", df.loc[X_test.index[:5], "target"])

y_pred = model.predict(X_test)

print("Predicted values (First five): ", y_pred[:5])

# Score the XGBoost predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")


# Persist the XGBoost model trained in this cell. Dumping
# rf_model_grid_search_updated here would write a Random Forest into
# 'xgboost_model.pkl' instead.
joblib.dump(model, 'xgboost_model.pkl')
print("The xgboost model has been saved successfully.")

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Number of rows and columns in train and test data  (242, 13) (61, 13)
The test input values (First five):
           age  sex        cp  trestbps      chol  fbs  restecg   thalach  \
179  0.500000  1.0  0.666667  0.339623  0.273973  1.0      1.0  0.778626   
228  0.520833  1.0  1.000000  0.150943  0.182648  0.0      1.0  0.282443   
111  0.562500  1.0  1.000000  0.292453  0.280822  1.0      1.0  0.557252   
246  0.604167  1.0  1.000000  0.056604  0.246575  0.0      0.0  0.648855   
60   0.458333  0.0  1.000000  0.339623  0.408676  0.0      0.0  0.541985   

     exang   oldpeak  slope        ca  thal  
179    0.0  0.000000    0.0  1.000000   0.0  
228    1.0  0.000000    0.5  0.333333   0.0  
111    1.0  0.193548    0.5  0.333333   0.0  
246    0.0  0.016129    0.0  0.333333   1.0  
60     1.0  0.193548    0.5  0.000000   1.0  
Actual

Exception ignored in: <function ResourceTracker.__del__ at 0x714d1cc79c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=150; total time=   0.4s


Exception ignored in: <function ResourceTracker.__del__ at 0x72ffafb71c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.3s


Exception ignored in: <function ResourceTracker.__del__ at 0x7be473b81c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=150; total time=   0.4s


Exception ignored in: <function ResourceTracker.__del__ at 0x7f2abff7dc60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=150; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=150; total time=   0.3s


Exception ignored in: <function ResourceTracker.__del__ at 0x7f8f4ea71c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.2s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s


Exception ignored in: <function ResourceTracker.__del__ at 0x73b0fe67dc60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.1s


Exception ignored in: <function ResourceTracker.__del__ at 0x709d7237dc60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=150; total time=   0.4s


Exception ignored in: <function ResourceTracker.__del__ at 0x7788ce179c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=150; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.3s


Exception ignored in: <function ResourceTracker.__del__ at 0x72062897dc60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s


Exception ignored in: <function ResourceTracker.__del__ at 0x74a718079c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.3s


Exception ignored in: <function ResourceTracker.__del__ at 0x7747fc479c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=150; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=150; total time=   0.3s


Exception ignored in: <function ResourceTracker.__del__ at 0x7e2a06579c60>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
