In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the heart attack prediction dataset
dataset = pd.read_csv('heart.csv')

# Split the dataset into training and testing sets
X = dataset.drop('target', axis=1)
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize different machine learning algorithms
logreg = LogisticRegression(random_state=42, max_iter=60000)
naive_bayes = GaussianNB()
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Train and evaluate the algorithms on the training and testing datasets
for clf in [logreg, naive_bayes, decision_tree, random_forest]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f'{clf.__class__.__name__}:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-score: {f1:.2f}')
    print(f'ROC AUC: {roc_auc:.2f}')
    print('-'*50)

# Determine the algorithm with the highest accuracy
accuracies = [accuracy_score(y_test, clf.predict(X_test)) for clf in [logreg, naive_bayes, decision_tree, random_forest]]
best_clf_index = accuracies.index(max(accuracies))
best_clf_name = [clf.__class__.__name__ for clf in [logreg, naive_bayes, decision_tree, random_forest]][best_clf_index]
print(f'The algorithm with the highest accuracy is {best_clf_name} with an accuracy of {max(accuracies):.2f}')

###############################################################################################################################
# read in the two data frames
df1 = pd.read_csv('framingham.csv')
df2 = pd.read_csv('heart.csv')

# merge the two datasets based on Age and Sex columns
combined_df = pd.merge(df1, df2, on=['Age', 'Sex'])

# write the combined data frame to a new csv file
combined_df.to_csv('combined.csv', index=False)

# Load the combined.csv file into a pandas dataframe
combined_df = pd.read_csv('combined.csv')

# Drop any rows with empty values
cleaned_df = combined_df.dropna()

# Save the cleaned dataframe to a new csv file
cleaned_df.to_csv('cleaned_combined.csv', index=False)

# load the cleaned_combined.csv file
df = pd.read_csv('cleaned_combined.csv')

# specify the features to be used for prediction
features = ['Sex', 'Age', 'Education', 'Current Smoker', 'Cigs Per Day', 'BPMeds', 'Prevalent Stroke', 'Prevalent Hyp', 'Diabetes', 'TotChol', 'SysBP',
            'DiaBP', 'BMI', 'HeartRate', 'Glucose', 'Cp', 'Trestbps', 'Chol', 'Fbs', 'Restecg', 'Thalach', 'Exang', 'Oldpeak', 'slope', 'ca', 'thal']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df['TenYearCHD'], test_size=0.2, random_state=42)

# get input from user for the features
print(" Enter the following details: ")

# basic details
sex = int(input(" Sex (1=male, 0=female): "))
age = int(input(" Age: "))
edu = int(input(" Education (0=less than high school, 1=high school, 2=some college, 3=college): "))

# smoking details
smoker = int(input(" Current Smoker (1=yes, 0=no): "))
cigs = int(input(" Cigarettes per day: "))

# blood pressure details
bpmeds = int(input(" BP Meds (1=yes, 0=no): "))
sysbp = float(input(" Systolic Blood Pressure: "))
diabp = float(input(" Diastolic Blood Pressure: "))
trestbps = float(input(" Resting Blood Pressure: "))
hyp = int(input(" Prevalent Hypertension (1=yes, 0=no): "))

# cholestrol details
totchol = float(input(" Total Cholesterol: "))
chol = float(input(" Serum Cholesterol: "))
hdl_chol = int(input(" HDL cholesterol: "))

# heart details
hr = float(input(" Heart Rate: "))
thalach = float(input(" Maximum Heart Rate Achieved: "))
stroke = int(input(" Prevalent Stroke (1=yes, 0=no): "))

# chest pain details
cp = int(input(" Chest Pain Type (1=typical angina, 2=atypical angina, 3=non-anginal pain, 4=asymptomatic): "))
exang = int(input(" Exercise Induced Angina (1=yes, 0=no): "))

# ecg details
restecg = int(input(" Resting ECG (0=normal, 1=ST-T wave abnormality, 2=left ventricular hypertrophy): "))
oldpeak = float(input(" ST Depression Induced by Exercise Relative to Rest: "))
slope = int(input(" Slope of the Peak Exercise ST Segment (1=upsloping, 2=flat, 3=downsloping): "))
ca = float(input(" Number of Major Vessels Colored by Flourosopy (0-3):"))

# general health details
bmi = float(input(" BMI: "))
glucose = float(input(" Glucose: "))
fbs = float(input(" Fasting Blood Sugar > 120 mg/dl (1=yes, 0=no): "))
thal = float(input(" Thalassemia (0=normal, 1=fixed defect, 2=reversable defect): "))
diabetes = int(input(" Diabetes (1=yes, 0=no): "))

# health score
def get_health_score(age, bmi, sysbp, diabp, chol, hdl):
    # Calculate the ratio of systolic blood pressure to diastolic blood pressure
    bp_ratio = sysbp / diabp
    
    # Calculate the ratio of total cholesterol to HDL cholesterol
    chol_ratio = chol / hdl_chol
    
    # Calculate the health score based on age, BMI, blood pressure ratio, and cholesterol ratio
    health_score = age - bmi + bp_ratio + chol_ratio
    
    return health_score

# bp ratio
def calculate_blood_pressure_ratio(sysbp, diabp):
    return sysbp / diabp

# calculation
bp_ratio = calculate_blood_pressure_ratio(sysbp, diabp)
user_input = [[sex, age, edu, smoker, cigs, bpmeds, stroke, hyp, diabetes, totchol,
sysbp, diabp, bmi, hr, glucose, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]]
health_score = get_health_score(age, bmi, sysbp, diabp, chol, hdl_chol)

if(best_clf_name == "LogisticRegression"):
        # create the logistic regression model
    lr = LogisticRegression(random_state=42)
    # train the model on the training data
    lr.fit(X_train, y_train)
    prediction = lr.predict(user_input)[0]
    if prediction == 1:
    # predict the probability of having a heart attack
        probability = lr.predict_proba(user_input)[0][1] * 100
        print(" Based on the given inputs, the patient is at risk of having a heart attack with a probability of {:.2f}%".format(probability))
    else:
        print(" Based on the given inputs, the patient is not at risk of having a heart attack.")
elif(best_clf_name == "GaussianNB"):
    # create the logistic regression model
    gn = GaussianNB()
    # train the model on the training data
    gn.fit(X_train, y_train)
    prediction = gn.predict(user_input)[0]
    if prediction == 1:
    # predict the probability of having a heart attack
        probability = gn.predict_proba(user_input)[0][1] * 100
        print(" Based on the given inputs, the patient is at risk of having a heart attack with a probability of {:.2f}%".format(probability))
    else:
        print(" Based on the given inputs, the patient is not at risk of having a heart attack.")
elif(best_clf_name == "DecisionTreeClassifier"):
    # create the logistic regression model
    dtc = DecisionTreeClassifier(random_state=42)
    # train the model on the training data
    dtc.fit(X_train, y_train)
    prediction = dtc.predict(user_input)[0]
    if prediction == 1:
    # predict the probability of having a heart attack
        probability = dtc.predict_proba(user_input)[0][1] * 100
        print(" Based on the given inputs, the patient is at risk of having a heart attack with a probability of {:.2f}%".format(probability))
    else:
        print(" Based on the given inputs, the patient is not at risk of having a heart attack.")
elif(best_clf_name == "RandomForestClassifier"):
    # create the logistic regression model
    rfc = RandomForestClassifier(random_state=42)
    # train the model on the training data
    rfc.fit(X_train, y_train)
    prediction = rfc.predict(user_input)[0]
    if prediction == 1:
    # predict the probability of having a heart attack
        probability = rfc.predict_proba(user_input)[0][1] * 100
        print(" Based on the given inputs, the patient is at risk of having a heart attack with a probability of {:.2f}%".format(probability))
    else:
        print(" Based on the given inputs, the patient is not at risk of having a heart attack.")   

        
# print the health score and bp ratio
print(f'The bp ratio is {bp_ratio:.2f}')
print(f'The health score is {health_score:.2f}')

LogisticRegression:
Accuracy: 0.80
Precision: 0.76
Recall: 0.87
F1-score: 0.81
ROC AUC: 0.79
--------------------------------------------------
GaussianNB:
Accuracy: 0.80
Precision: 0.75
Recall: 0.89
F1-score: 0.82
ROC AUC: 0.80
--------------------------------------------------
DecisionTreeClassifier:
Accuracy: 0.99
Precision: 1.00
Recall: 0.97
F1-score: 0.99
ROC AUC: 0.99
--------------------------------------------------
RandomForestClassifier:
Accuracy: 0.99
Precision: 1.00
Recall: 0.97
F1-score: 0.99
ROC AUC: 0.99
--------------------------------------------------
The algorithm with the highest accuracy is DecisionTreeClassifier with an accuracy of 0.99
 Enter the following details: 
 Sex (1=male, 0=female): 1
 Age: 22
 Education (0=less than high school, 1=high school, 2=some college, 3=college): 2
 Current Smoker (1=yes, 0=no): 1
 Cigarettes per day: 20
 BP Meds (1=yes, 0=no): 0
 Systolic Blood Pressure: 23.3
 Diastolic Blood Pressure: 23
 Resting Blood Pressure: 4
 Prevalent Hy

