# Heart Disease Prediction



![image.png](https://t3.ftcdn.net/jpg/06/06/29/34/360_F_606293479_9iTncv5OBYwY2RBMsFa6yTmIedXjR1VZ.jpg)

### Heart disease is one of the leading causes of death worldwide. Early detection can significantly improve the chances of effective treatment and management. This project aims to develop a machine learning model to predict the likelihood of heart disease based on various health metrics.

# Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from joblib import dump, load

# Loading the dataset

In [2]:
# Read the heart-disease CSV into a DataFrame and preview its first five rows.
heart_csv_path = '/content/heart.csv'
data = pd.read_csv(heart_csv_path)

data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Data Preprocessing

## Checking for Null values

In [3]:
# Count missing entries per column (`isna` is the pandas alias of `isnull`).
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## One-Hot encoding and feature scaling

In [5]:
# One-hot encode the categorical variables: each listed column is replaced by
# one indicator column per observed code (e.g. `sex_0`, `sex_1`).
data_encoded = pd.get_dummies(data, columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])

# Feature scaling: standardize every feature column (indicator columns
# included); the target column is excluded and left unscaled.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test rows contribute to the
# scaling statistics. Fitting on the training rows only would avoid this
# leakage, but requires moving the split ahead of this cell.
scaler = StandardScaler()
features = data_encoded.drop('target', axis=1)
scaled_features = scaler.fit_transform(features)

# Create a DataFrame for the scaled features. The original row index is kept
# explicitly: `fit_transform` returns a bare array, and without `index=` the
# new frame would get a fresh RangeIndex, so the concat below would silently
# misalign rows (and introduce NaNs) whenever `data` has a non-default index.
scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns, index=features.index)

# Combine the scaled features with the (unscaled) target, aligned on the index
processed_data = pd.concat([scaled_features_df, data_encoded['target']], axis=1)

In [6]:
# Preview the first five rows of the scaled, one-hot-encoded table.
processed_data.head(n=5)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3,target
0,-0.268437,-0.377636,-0.659332,0.821321,-0.060888,-0.661504,0.661504,1.030715,-0.441179,-0.619085,...,-1.13713,-0.53184,2.578615,-0.268655,-0.133697,-0.082923,-0.258065,-1.063474,1.224745,0
1,-0.158157,0.479107,-0.833861,0.255968,1.727137,-0.661504,0.661504,1.030715,-0.441179,-0.619085,...,0.879407,-0.53184,-0.387805,-0.268655,-0.133697,-0.082923,-0.258065,-1.063474,1.224745,0
2,1.716595,0.764688,-1.396233,-1.048692,1.301417,-0.661504,0.661504,1.030715,-0.441179,-0.619085,...,0.879407,-0.53184,-0.387805,-0.268655,-0.133697,-0.082923,-0.258065,-1.063474,1.224745,0
3,0.724079,0.936037,-0.833861,0.5169,-0.912329,-0.661504,0.661504,1.030715,-0.441179,-0.619085,...,-1.13713,1.880265,-0.387805,-0.268655,-0.133697,-0.082923,-0.258065,-1.063474,1.224745,0
4,0.834359,0.364875,0.930822,-1.874977,0.705408,1.511706,-1.511706,1.030715,-0.441179,-0.619085,...,-1.13713,-0.53184,-0.387805,3.72224,-0.133697,-0.082923,-0.258065,0.940314,-0.816497,0


# Splitting the dataset

In [7]:
# Separate the label column from the predictor columns
target_column = 'target'
y = processed_data[target_column]
X = processed_data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Show the shape of each of the four pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((820, 30), (205, 30), (820,), (205,))

# Apply PCA for dimensionality reduction

In [8]:
# Fit the PCA projection on the training features only, then apply that same
# fitted projection to the test features so both live in the same component space.
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance.
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train)  # learn the components and project the training rows
X_test_pca = pca.transform(X_test)  # project the test rows with the already-fitted components

# Training and evaluating Gradient Boosting

In [10]:
# Train a gradient-boosting classifier on the PCA-reduced training features.
gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train_pca, y_train)

# Hard class predictions feed the threshold-based metrics below.
y_pred_gradient_boosting = gradient_boosting.predict(X_test_pca)

# ROC AUC is a ranking metric and must be computed from continuous scores, not
# from hard 0/1 labels. Column 1 of `predict_proba` is the probability of the
# positive class (the classes of a 0/1 target are ordered [0, 1]).
y_proba_gradient_boosting = gradient_boosting.predict_proba(X_test_pca)[:, 1]

gradient_boosting_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_gradient_boosting),
    'precision': precision_score(y_test, y_pred_gradient_boosting),
    'recall': recall_score(y_test, y_pred_gradient_boosting),
    'f1_score': f1_score(y_test, y_pred_gradient_boosting),
    'roc_auc': roc_auc_score(y_test, y_proba_gradient_boosting)
}

# NOTE(review): the recorded run reports 1.0 for every metric on the held-out
# rows. Perfect scores are suspicious; check whether identical rows appear in
# both the training and the test split before trusting them - TODO confirm.
print("Gradient Boosting Metrics:", gradient_boosting_metrics)

Gradient Boosting Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': 1.0}


# Saving the model to disk

In [11]:
# Destination file for the fitted classifier
model_file_path = 'gradient_boosting_model.joblib'

# Serialize the classifier with joblib; the call returns the list of file
# names it wrote, which the notebook shows as the cell output.
# Only the classifier is written here - `scaler` and `pca` are not persisted.
dump(value=gradient_boosting, filename=model_file_path)

['gradient_boosting_model.joblib']

## Predicting using user's input

In [13]:
# Load the saved model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files you created yourself or otherwise trust.
model_file_path = 'gradient_boosting_model.joblib'
gradient_boosting = load(model_file_path)

# NOTE: only the classifier is restored from disk. `scaler`, `pca` and
# `features` are not reloaded here, so the preprocessing cells above must have
# been run in the current kernel session.

# Columns that were one-hot encoded during training
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Simulate user input
user_input = {
    'age': int(input("Enter your age: ")),
    'sex': int(input("Enter your sex (1: male, 0: female): ")),
    'cp': int(input("Enter chest pain type (0: typical angina, 1: atypical angina, 2: non-anginal pain, 3: asymptomatic): ")),
    'trestbps': int(input("Enter resting blood pressure (in mm Hg): ")),
    'chol': int(input("Enter serum cholesterol in mg/dl: ")),
    'fbs': int(input("Enter fasting blood sugar > 120 mg/dl (1: true, 0: false): ")),
    'restecg': int(input("Enter resting electrocardiographic results (0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy): ")),
    'thalach': int(input("Enter maximum heart rate achieved: ")),
    'exang': int(input("Enter exercise induced angina (1: yes, 0: no): ")),
    'oldpeak': float(input("Enter ST depression induced by exercise relative to rest: ")),
    'slope': int(input("Enter the slope of the peak exercise ST segment (0: upsloping, 1: flat, 2: downsloping): ")),
    'ca': int(input("Enter number of major vessels colored by fluoroscopy (0-3): ")),
    'thal': int(input("Enter thalassemia (1: normal, 2: fixed defect, 3: reversable defect): "))
}

# Reject category codes that never occurred in the training data. Without this
# check the unknown indicator column would be silently discarded when the
# columns are aligned below, leaving every indicator of that group at 0 and
# producing a prediction for an input the model has no representation of.
for column in categorical_columns:
    indicator_name = f"{column}_{user_input[column]}"
    if indicator_name not in features.columns:
        seen_values = sorted(
            name[len(column) + 1:]
            for name in features.columns
            if name.startswith(f"{column}_")
        )
        raise ValueError(
            f"Unsupported value {user_input[column]!r} for '{column}'; "
            f"values seen during training: {', '.join(seen_values)}"
        )

# Convert user input to a one-row DataFrame
input_df = pd.DataFrame(user_input, index=[0])

# One-hot encode the user input to match the training data
input_df_encoded = pd.get_dummies(input_df, columns=categorical_columns)

# Align input with the training columns (features.columns): indicator columns
# for the categories the user did not pick are added as 0, and the column
# order is made identical to the order the scaler was fitted with.
input_df_encoded = input_df_encoded.reindex(columns=features.columns, fill_value=0)

# Scale the user input with the scaler fitted on the training features
scaled_input = scaler.transform(input_df_encoded)

# Apply PCA to the scaled user input. The PCA was fitted on a DataFrame, so the
# scaled row is wrapped in a DataFrame with the same column names to keep the
# feature names consistent between fit and transform.
scaled_input_df = pd.DataFrame(scaled_input, columns=features.columns)
pca_input = pca.transform(scaled_input_df)

# Make prediction
user_prediction = gradient_boosting.predict(pca_input)
user_prediction_proba = gradient_boosting.predict_proba(pca_input)

# Display the result as a percentage (column 1 = probability of class 1)
prediction_percentage = user_prediction_proba[0][1] * 100

if user_prediction[0] == 1:
    print(f"The model predicts that the user is at risk of heart disease with a probability of {prediction_percentage:.2f}%.")
else:
    print(f"The model predicts that the user is not at risk of heart disease with a probability of {100 - prediction_percentage:.2f}%.")


Enter your age: 58
Enter your sex (1: male, 0: female): 0
Enter chest pain type (0: typical angina, 1: atypical anginaValue, 2: non-anginal pain, 3: asymptomatic): 0
Enter resting blood pressure (in mm Hg): 100
Enter serum cholesterol in mg/dl: 248
Enter fasting blood sugar > 120 mg/dl (1: true, 0: false): 0
Enter resting electrocardiographic results (0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy): 0
Enter maximum heart rate achieved: 122
Enter exercise induced angina (1: yes, 0: no): 0
Enter ST depression induced by exercise relative to rest: 1
Enter the slope of the peak exercise ST segment (0: upsloping, 1: flat 2: downsloping): 1
Enter number of major vessels colored by fluoroscopy (0-3): 0
Enter thalassemia (1: normal, 2: fixed defect, 3: reversable defect): 2
The model predicts that the user is at risk of heart disease with a probability of 96.38%.


