In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


In [None]:
# Load the dataset
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
dataset_splits = load_dataset("marianeft/heart-diease-dataset")

# `load_dataset` without `split=` returns a DatasetDict keyed by split name,
# not a table. The following cells assign columns (`data['sex'] = ...`) and
# call `data.drop(...)`, which require a pandas DataFrame, so one split is
# converted here. "train" is the usual default split name; fall back to the
# first available split if it is absent.
split_name = "train" if "train" in dataset_splits else next(iter(dataset_splits))
data = dataset_splits[split_name].to_pandas()
print(data)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  target  
0         2   2     3       0  
1         0

In [3]:
# Convert non-numeric data to numeric data.
# A single encoder is refit for every listed column, so after the loop `le`
# only holds the classes of the last column processed ('thal').
le = LabelEncoder()
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Label column on its own; every remaining column is used as a predictor.
y = data['target']
X = data.drop(columns=['target'])

In [5]:
# --- Hold-out split: 80% train / 20% test, fixed seed for repeatability ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Missing values: column means are learned on the training part only ---
# and then applied unchanged to the test part.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# --- Rank features with a tree fitted on all columns ---
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
feature_importances = dt.feature_importances_

# Pair each importance score with its column name (X is the original DataFrame)
feature_names = X.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)

# Highest importance first; the first 7 names are kept
ranked_features = feature_importance_df.sort_values(by='Importance', ascending=False)
top_features = ranked_features.head(7)['Feature'].values

print("Top 7 Selected Features:", top_features)

# --- Restrict both partitions to the selected columns ---
# The imputed data is re-wrapped in DataFrames so columns can be picked by name.
X_train_selected = pd.DataFrame(X_train, columns=feature_names).loc[:, top_features]
X_test_selected = pd.DataFrame(X_test, columns=feature_names).loc[:, top_features]

# --- Final model: a fresh tree trained on the reduced feature set ---
dt_selected = DecisionTreeClassifier(random_state=42)
dt_selected.fit(X_train_selected, y_train)

# --- 5-fold cross-validation on the training partition ---
# NOTE(review): the 7 features were chosen using this same training data, so
# these fold scores may be optimistic — confirm whether that is acceptable.
cv_scores = cross_val_score(
    dt_selected, X_train_selected, y_train, cv=5, scoring='accuracy'
)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy with Top 7 Features: {cv_scores.mean():.4f}")

# --- Hold-out evaluation ---
y_pred = dt_selected.predict(X_test_selected)
print("Classification Report on Test Data:")
print(classification_report(y_test, y_pred))

Top 7 Selected Features: ['cp' 'thal' 'ca' 'chol' 'oldpeak' 'age' 'thalach']
Cross-Validation Accuracy Scores: [0.98780488 0.9695122  0.98780488 0.98780488 0.93292683]
Mean CV Accuracy with Top 7 Features: 0.9732
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [6]:
import pickle

# Persist the tree trained on the selected features to disk.
with open("heart_model.pkl", "wb") as model_file:
    pickle.Pickler(model_file).dump(dt_selected)

**How to use the saved model**

In [None]:
import pickle
import numpy as np
import pandas as pd

# Column order the model was trained with (see the "Top 7 Selected Features"
# output of the training cell). Inputs must follow exactly this order.
FEATURE_ORDER = ['cp', 'thal', 'ca', 'chol', 'oldpeak', 'age', 'thalach']

# Feature reference:
#   age     : Age of the patient (years). Older individuals have a higher risk
#             of heart disease.
#   cp      : Chest Pain Type. 0 = Typical Angina, 1 = Atypical Angina,
#             2 = Non-anginal Pain, 3 = Asymptomatic. Certain types of chest
#             pain are strong indicators of heart problems.
#   chol    : Serum Cholesterol, cholesterol level in mg/dL. High cholesterol
#             can lead to plaque buildup in arteries, increasing heart disease
#             risk.
#   thalach : Maximum Heart Rate Achieved, maximum recorded heart rate during
#             stress testing. A lower max heart rate could indicate heart
#             disease.
#   oldpeak : ST Depression, ST segment depression in ECG. Higher values may
#             indicate ischemia (reduced blood flow to the heart).
#   ca      : Number of Major Vessels Colored by Fluoroscopy, 0–3. Higher
#             values indicate blocked arteries, increasing heart disease risk.
#   thal    : Thalassemia. 0 = Normal, 1 = Fixed Defect, 2 = Reversible Defect.
#             Blood disorder that can affect oxygen transport and heart
#             function.
#             NOTE(review): the data preview earlier in this notebook also
#             shows thal = 3, which this list does not cover — confirm coding.

# SECURITY: unpickling can execute arbitrary code. Only load a model file that
# was produced by a trusted source (e.g. the save cell of this notebook).
with open("heart_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# One patient, values given in FEATURE_ORDER.
sample_values = np.array([[1, 2, 0, 210, 0.7, 34, 192]])

# The model was fitted on a DataFrame with named columns. Passing a DataFrame
# with the same names avoids scikit-learn's "X does not have valid feature
# names" warning and lets it reject a wrongly named or ordered input instead
# of silently predicting on shuffled values.
input_data = pd.DataFrame(sample_values, columns=FEATURE_ORDER)

# Make a prediction
y_pred = loaded_model.predict(input_data)

print("Predicted Output:", y_pred)

Predicted Output: [1]


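Note: pickle files can execute arbitrary code when loaded and may not load across scikit-learn versions. See the scikit-learn notes on the security and maintainability limitations of model persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations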


**Computing `oldpeak` from ECG values: converting ADC readings to voltage**

If you use a 12-bit ADC (e.g., ESP32, STM32):

The ADC range is 0 to 4095.
Reference voltage (Vref) is typically 3.3V or 5V.
The conversion formula:
$$V_{\text{analog}} = \frac{\text{ADC Value} \times V_{\text{ref}}}{4095}$$
Example: If ADC reads 2048 and Vref is 3.3V,
$$V_{\text{analog}} = \frac{2048 \times 3.3\,\text{V}}{4095} \approx 1.65\,\text{V}$$