# HEARTWISE - A HEART DISEASE PREDICTOR ML MODEL

## Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import pickle

: 

## Import the dataset

In [None]:
# Load the heart-disease records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='heart.csv')

# Preview the first ten rows.
df.head(n=10)

: 

In [None]:
# Print the column dtypes and non-null counts (info() writes straight to stdout).
df.info()
# Summary statistics per column; as the last expression of the cell this is
# presumably what the notebook renders as the cell output.
df.describe()

: 

## Clean the Data: Infinite and Missing Values

In [None]:
# Replace infinity values with NaN.
# np.nan is used instead of pd.NA: pd.NA is the marker for pandas' nullable
# extension dtypes, and writing it into NumPy-backed numeric columns can turn
# them into object dtype, which breaks the numeric plots and model fitting.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values in 'cp' and 'target'.
# NOTE(review): NaNs in any other column are kept; the estimators fitted on
# this frame will reject them, so confirm the remaining columns are complete.
df.dropna(subset=['cp', 'target'], inplace=True)

: 

## Split the Dataset into Train and Test Datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Report the (rows, columns) shape of each partition.
for caption, subset in (("Training set size:", train_df), ("Test set size:", test_df)):
    print(caption, subset.shape)

: 

## Exploratory Data Analysis (EDA)

In [None]:
# Pairwise correlations between all columns of the training split.
plt.figure(figsize=(15, 10))
corr_matrix = train_df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='magma', center=0)
heatmap_ax.set_title('Correlation Heatmap of Training Dataset')
plt.show()

: 

In [None]:
# Cholesterol distribution in the training split, with a KDE overlay.
plt.figure(figsize=(10, 6))
chol_ax = sns.histplot(data=train_df, x='chol', bins=30, kde=True)
chol_ax.set_title('Histogram of Cholesterol Levels')
chol_ax.set_xlabel('Cholesterol')
chol_ax.set_ylabel('Frequency')
plt.show()

: 

In [None]:
# Age distribution over the full dataset, with a KDE overlay.
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(data=df, x='age', bins=30, kde=True, color='skyblue')
age_ax.set_title('Distribution of Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

: 

In [None]:
# Cholesterol spread for each value of the target column.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df, x='target', y='chol', palette='viridis')
box_ax.set_title('Boxplot of Cholesterol Levels by Heart Disease')
box_ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
box_ax.set_ylabel('Cholesterol')
plt.show()

: 

In [None]:
# Number of records for each chest-pain type.
plt.figure(figsize=(10, 6))
cp_ax = sns.countplot(data=df, x='cp', palette='Set2')
cp_ax.set_title('Count Plot of Chest Pain Types')
cp_ax.set_xlabel('Chest Pain Type')
cp_ax.set_ylabel('Count')
plt.show()

: 

In [None]:
# Age against maximum heart rate, coloured by the target column.
plt.figure(figsize=(10, 6))
thalach_ax = sns.scatterplot(data=df, x='age', y='thalach', hue='target', palette='coolwarm')
thalach_ax.set_title('Scatter Plot of Age vs Max Heart Rate Achieved')
thalach_ax.set_xlabel('Age')
thalach_ax.set_ylabel('Max Heart Rate Achieved')
plt.show()

: 

In [None]:
# Grid of pairwise feature plots over the full dataset, split by target.
sns.pairplot(data=df, hue='target', palette='coolwarm', markers=["o", "s"])
# pairplot leaves its figure current, so the super-title is attached to it;
# y=1.02 places the title just above the top of the grid.
plt.gcf().suptitle('Pair Plot of Features', y=1.02)
plt.show()

: 

In [None]:
# Age against cholesterol in the training split, coloured by target.
age_chol_ax = sns.scatterplot(data=train_df, x='age', y='chol', hue='target')
age_chol_ax.set_title('Scatter Plot of Age and Cholesterol with Target')
plt.show()

: 

In [None]:
# Age against resting blood pressure in the training split, coloured by target.
age_bp_ax = sns.scatterplot(data=train_df, x='age', y='trestbps', hue='target')
age_bp_ax.set_title('Scatter Plot of Age and Resting Blood Pressure')
plt.show()

: 

In [None]:
# Class balance: number of records for each value of the target column.
plt.figure(figsize=(10, 6))
target_ax = sns.countplot(data=df, x='target', palette='Set1')
target_ax.set_title('Distribution of the Target Variable')
target_ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
target_ax.set_ylabel('Count')
plt.show()

: 

## Importing Necessary Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

: 

In [None]:
# Separate the predictors (a_*) from the label column (b_*) for each split.
a_train, b_train = train_df.drop(columns=['target']), train_df['target']
a_test, b_test = test_df.drop(columns=['target']), test_df['target']

: 

## Training Models

In [None]:
# Logistic regression baseline; the iteration cap is set to 1500 so the solver
# has room to converge on these features.
model_lr = LogisticRegression(max_iter=1500)
model_lr.fit(X=a_train, y=b_train)

: 

In [None]:
# Random forest classifier. The seed matches the one used for the train/test
# split so that the reported metrics and the pickled model are reproducible
# from run to run; an unseeded forest is rebuilt differently each time.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(a_train, b_train)

: 

In [None]:
# Logistic-regression class predictions for the held-out test features.
b_pred = model_lr.predict(X=a_test)

: 

In [None]:
# Random-forest class predictions for the held-out test features.
Brf_pred = model_rf.predict(X=a_test)

: 

## Compare the Models

In [None]:
# Classification report on the test split for each model, each preceded by a
# heading naming the model.
print("LOGISTIC REGRESSION")
print(classification_report(y_true=b_test, y_pred=b_pred))
print("RANDOM FOREST CLASSIFIER")
print(classification_report(y_true=b_test, y_pred=Brf_pred))

: 

In [None]:
# Test-set accuracy: logistic regression first, then random forest.
for test_predictions in (b_pred, Brf_pred):
    print(accuracy_score(y_true=b_test, y_pred=test_predictions))

: 

In [None]:
# Confusion matrix for Logistic Regression (rows: true label, columns: predicted).
conf_matrix_lr = confusion_matrix(y_true=b_test, y_pred=b_pred)

lr_tick_labels = ['No Heart Disease', 'Heart Disease']

plt.figure(figsize=(8, 6))
lr_cm_ax = sns.heatmap(
    conf_matrix_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=lr_tick_labels,
    yticklabels=lr_tick_labels,
)
lr_cm_ax.set_title('Confusion Matrix - Logistic Regression Model')
lr_cm_ax.set_xlabel('Predicted Label')
lr_cm_ax.set_ylabel('True Label')
plt.show()

: 

In [None]:
# Confusion matrix for random forest (rows: true label, columns: predicted).
conf_matrix_rf = confusion_matrix(y_true=b_test, y_pred=Brf_pred)

rf_tick_labels = ['No Heart Disease', 'Heart Disease']

plt.figure(figsize=(8, 6))
rf_cm_ax = sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=rf_tick_labels,
    yticklabels=rf_tick_labels,
)
rf_cm_ax.set_title('Confusion Matrix - Random Forest Model')
rf_cm_ax.set_xlabel('Predicted Label')
rf_cm_ax.set_ylabel('True Label')
plt.show()

: 

In [None]:
# Score both fitted models on the held-out test split.
b_pred_lr = model_lr.predict(a_test)
b_pred_rf = model_rf.predict(a_test)
accuracy_lr = accuracy_score(y_true=b_test, y_pred=b_pred_lr)
accuracy_rf = accuracy_score(y_true=b_test, y_pred=b_pred_rf)

# One bar per model.
labels = ['Logistic Regression', 'Random Forest Classifier']
accuracies = [accuracy_lr, accuracy_rf]

accuracy_ax = plt.gca()
accuracy_ax.bar(labels, accuracies, color=['blue', 'green'])
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Comparison of Model Accuracies on test data')
# Accuracy is a fraction, so pin the axis to the full 0-1 range.
accuracy_ax.set_ylim(0, 1)
plt.show()

: 

## Saving the Random Forest Model as a PKL File

In [None]:
# Serialise the fitted random forest to heart_disease.pkl in the working directory.
with open('heart_disease.pkl', mode='wb') as pkl_file:
    pickle.Pickler(pkl_file).dump(model_rf)

: 

## Testing the Model with User Inputs

In [None]:
# One entry per model feature, in the order they are asked:
# (feature name, prompt shown to the user, converter applied to the typed text).
feature_prompts = (
    ('age', "enter age: ", int),
    ('sex', "enter sex(0 female 1 male): ", int),
    ('cp', "enter cp (0-3):", int),
    ('trestbps', "enter trestbps: ", int),
    ('chol', "enter chol: ", int),
    ('fbs', "enter fbs: ", int),
    ('restecg', "enter restecg: ", int),
    ('thalach', "enter thalach: ", int),
    ('exang', "enter exang: ", int),
    ('oldpeak', "enter oldpeak: ", float),
    ('slope', "enter slope: ", int),
    ('ca', "enter ca(0-3): ", int),
    ('thal', "enter thal: ", int),
)

# Prompt for every feature in turn; a value that cannot be converted raises
# ValueError from int()/float().
answers = {
    name: convert(input(prompt))
    for name, prompt, convert in feature_prompts
}

# Expose each answer under its own name as well.
(age, sex, cp, trestbps, chol, fbs, restecg,
 thalach, exang, oldpeak, slope, ca, thal) = answers.values()

# Single-row frame: every feature becomes a one-element column.
user_input = {name: [value] for name, value in answers.items()}
user_DF = pd.DataFrame(user_input)

: 

In [None]:
def heart_prediction(user_DF, model=None):
    """Turn a one-row feature frame into a human-readable verdict.

    Args:
        user_DF: DataFrame holding exactly one row of model features.
        model: Optional fitted classifier exposing ``predict``. When omitted,
            the notebook-level ``model_rf`` is used.

    Returns:
        The "No heart disease" message when the predicted class is 0, or the
        "Heart disease" message when it is 1.

    Raises:
        ValueError: If the model does not return exactly one prediction, or
            if the predicted class is neither 0 nor 1.
    """
    # Fall back to the notebook's trained random forest when no model is given.
    classifier = model_rf if model is None else model

    # predict() returns an array; pull out the single label explicitly instead
    # of truth-testing the array itself, which only works for one-element
    # arrays and is ambiguous otherwise.
    predictions = list(classifier.predict(user_DF))
    if len(predictions) != 1:
        raise ValueError(
            f"heart_prediction expects exactly one row, got {len(predictions)} predictions"
        )
    label = predictions[0]

    if label == 0:
        return "No heart disease\nYou're healthy"
    if label == 1:
        return "Heart disease\nTake precautions"
    # Fail loudly rather than silently returning None for an unknown class.
    raise ValueError(f"Unexpected class label from model: {label!r}")
    
# Classify the row collected from the user and show the verdict.
result = heart_prediction(user_DF=user_DF)
print(result)

: 

: 