In [12]:
pip install tpot h2o scikit-learn pandas numpy




In [13]:
import pandas as pd

# Load the Heart Disease dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
]
data = pd.read_csv(url, names=columns)

# Replace '?' with NaN and drop missing values
data = data.replace('?', pd.NA).dropna()

# Separate features (X) and target (y)
X = data.iloc[:, :-1]
y = (data['target'] > 0).astype(int)  # 0: No disease, 1: Disease present
# Target is 0 (no disease) or 1+ (presence of disease)


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
from tpot import TPOTClassifier

# Initialize TPOT with evolutionary algorithm settings
tpot = TPOTClassifier(
    generations=10,
    population_size=50,
    max_time_mins=30,  # Limits total run time to 30 minutes
    verbosity=2,
    random_state=42
)
tpot.fit(X_train, y_train)

# Print the best model pipeline
print(f"Best Pipeline: {tpot.fitted_pipeline_}")

# Evaluate performance on the test set
test_score = tpot.score(X_test, y_test)
print(f"Test Accuracy: {test_score:.4f}")

# Export the best pipeline to a Python script
tpot.export('best_heart_disease_pipeline.py')


Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8311170212765958

Generation 2 - Current best internal CV score: 0.8311170212765958

Generation 3 - Current best internal CV score: 0.8311170212765958

Generation 4 - Current best internal CV score: 0.8352836879432625

Generation 5 - Current best internal CV score: 0.839450354609929

Generation 6 - Current best internal CV score: 0.839450354609929

Generation 7 - Current best internal CV score: 0.839450354609929

Generation 8 - Current best internal CV score: 0.839450354609929

Generation 9 - Current best internal CV score: 0.839450354609929

Generation 10 - Current best internal CV score: 0.839450354609929

Best pipeline: ExtraTreesClassifier(ZeroCount(GaussianNB(input_matrix)), bootstrap=False, criterion=gini, max_features=0.7000000000000001, min_samples_leaf=4, min_samples_split=7, n_estimators=100)
Best Pipeline: Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=GaussianNB())),
                ('zero



In [19]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Convert Pandas DataFrame to H2OFrame
data_h2o = h2o.H2OFrame(data)
X_cols = data_h2o.columns[:-1]
y_col = 'target'

# Train AutoML with a maximum of 20 models
aml = H2OAutoML(max_models=20, seed=42)
aml.train(x=X_cols, y=y_col, training_frame=data_h2o)

# Display the leaderboard and the best model
print(aml.leaderboard)
print("Best Model:", aml.leader)



Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp9pzuv49u
  JVM stdout: /tmp/tmp9pzuv49u/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp9pzuv49u/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 18 days
H2O_cluster_name:,H2O_from_python_unknownUser_iri656
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                     rmse       mse       mae     rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_1_20241017_154637  0.852753  0.727188  0.632011  0.397607                  0.727188
GLM_1_AutoML_1_20241017_154637                           0.861485  0.742156  0.65195   0.4139                    0.742156
StackedEnsemble_AllModels_1_AutoML_1_20241017_154637     0.862491  0.743891  0.6409    0.402765                  0.743891
DRF_1_AutoML_1_20241017_154637                           0.870549  0.757856  0.626854  0.398821                  0.757856
GBM_4_AutoML_1_20241017_154637                           0.872888  0.761934  0.631845  0.39408                   0.761934
GBM_2_AutoML_1_20241017_154637                           0.874886  0.765426  0

In [20]:
from sklearn.metrics import classification_report

# Predict on the test set
y_pred = tpot.predict(X_test)

# Print a detailed classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.84      0.88      0.86        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60





In [21]:
# Predict on the test data using H2O.ai's best model
pred = aml.leader.predict(data_h2o)
print(pred.head())


stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
  predict
0.335389
2.2852
2.13812
0.587785
0.0425415
0.0221973
2.52481
0.0657148
1.9679
1.64805
[10 rows x 1 column]



In [22]:
h2o.shutdown(prompt=False)

H2O session _sid_963e closed.


  h2o.shutdown(prompt=False)
