In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [32]:
data = pd.read_csv('heart_disease_df.csv')
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
293,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
294,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
295,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1


In [33]:
print("Percentage with heart disease:",(len(data[data['target'] == 1]))/ len(data) *100, "%")

Percentage with heart disease: 46.12794612794613 %


In [34]:
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

X = data[feature_columns]
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Count the number of 1's (heart disease patients)
num_ones = y_test.sum()
print(f"Percentage of people with heart disease in test set: {num_ones/len(y_test) * 100:.2f}%")

model = LogisticRegression(max_iter=5000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix: \n", cm)

AUROC = roc_auc_score(y_test, y_pred)
print(f'\nAUROC: {AUROC:.2f}')

# Save the linear predictors (coefficients) and outcomes (predictions)
coefficients = model.coef_

results = pd.DataFrame(coefficients, columns=feature_columns)
results['true_outcomes'] = y_test.reset_index(drop=True)

results.to_csv('hd_linear_predictors_and_true_outcomes.csv', index=False)

Percentage of people with heart disease in test set: 45.56%

Accuracy: 0.88

Confusion matrix: 
 [[45  4]
 [ 7 34]]

AUROC: 0.87
