In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Make every figure in this notebook default to a large 15x15 canvas.
plt.rcParams["figure.figsize"] = (15, 15)

In [None]:
# Location of the heart-attack dataset relative to the notebook.
heart_csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first rows with the dataset's original, abbreviated column names.
df.head()

I'm going to rename the abbreviated columns to longer, more descriptive names.

In [None]:
# Map the dataset's terse column codes to readable labels.
# NOTE: the spellings "agnia" and "cholestoral" are referenced verbatim by the
# later cells (including the model formulas), so they are kept unchanged here.
readable_names = {
    'exng': "exercise induced agnia",
    'caa': 'number of major vessels',
    'cp': 'chest pain type',
    'trtbps': 'resting blood pressure',
    'chol': 'cholestoral',
    'fbs': 'high fasting blood sugar',
    'restecg': 'resting_ecg',
    'thalachh': 'maximum heart rate',
}
df = df.rename(columns=readable_names)

In [None]:
# Confirm the renamed columns took effect.
df.head()

In [None]:
# Summary statistics for the full dataset, before it is split by heart rate.
df.describe()

I want to simulate forecasting, so rather than randomly splitting the data into train/test sets, I'm going to pretend we've been given data only on patients with a low maximum heart rate and want to predict what happens as heart rate increases. The cutoff of 165 is chosen arbitrarily.

In [None]:
# Split on maximum heart rate: rows at or below 165 form the group the model is
# fitted on, rows above 165 form the group we later predict for.
max_rate = df['maximum heart rate']
low_heart_rate = df.loc[max_rate.le(165)]
high_heart_rate = df.loc[max_rate.gt(165)]

In [None]:
# Pairwise correlations between all columns within the low heart rate group.
low_heart_rate.corr()

The pairwise correlations don't show any obvious multicollinearity in our training data (the low heart rate group).

In [None]:
# Summary statistics for the low heart rate group only.
low_heart_rate.describe()

In [None]:
# Preview the first rows of the low heart rate group.
low_heart_rate.head()

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Full specification: every available predictor, with the multi-level coded
# columns wrapped in C(...) so they are treated as categorical. Q(...) quotes
# column names that contain spaces.
full_formula = """output ~ age + sex + C(Q("chest pain type")) + Q('resting blood pressure') 
+ cholestoral + Q('high fasting blood sugar') + C(resting_ecg) 
+  Q('maximum heart rate') + Q('exercise induced agnia') + Q('oldpeak') 
+ C(slp) +  Q('number of major vessels') + C(thall)"""
full_model = smf.logit(full_formula, data=low_heart_rate)
res = full_model.fit()

In [None]:
# Coefficient table and fit statistics for the full model.
res.summary()

Let's run joint F-tests on the categorical variables, rather than relying on the p-values of their individual dummy coefficients.

In [None]:
# Joint hypothesis: all three chest-pain-type dummy coefficients are zero.
chest_pain_null = ', '.join(
    f'(C(Q("chest pain type"))[T.{level}] = 0)' for level in (1, 2, 3)
)
res.f_test(chest_pain_null)

In [None]:
# Joint hypothesis: both resting-ECG dummy coefficients are zero.
resting_ecg_null = ', '.join(
    f'(C(resting_ecg)[T.{level}] = 0)' for level in (1, 2)
)
res.f_test(resting_ecg_null)

In [None]:
# Joint hypothesis: both slp dummy coefficients are zero.
slp_null = ', '.join(
    f'(C(slp)[T.{level}] = 0)' for level in (1, 2)
)
res.f_test(slp_null)

In [None]:
# Joint hypothesis: all three thall dummy coefficients are zero.
thall_null = ', '.join(
    f'(C(thall)[T.{level}] = 0)' for level in (1, 2, 3)
)
res.f_test(thall_null)

We've got quite a few variables with very high p-values, so let's remove them from the model.

In [None]:
# Reduced specification: compared with the full model this drops age,
# cholestoral, high fasting blood sugar, resting_ecg, slp and thall.
reduced_formula = """output ~ sex + Q('resting blood pressure') + C(Q("chest pain type"))
+  Q('maximum heart rate') + Q('exercise induced agnia') + Q('oldpeak') 
+  Q('number of major vessels')"""
reduced_model = smf.logit(reduced_formula, data=low_heart_rate)
res = reduced_model.fit()

In [None]:
# Coefficient table and fit statistics for the reduced model.
res.summary()

In [None]:
# Final specification: same as the previous reduced model, but with
# 'maximum heart rate' removed as well. `res` now refers to this fit, which is
# the one used for the predictions below.
final_formula = """output ~ sex + C(Q("chest pain type")) + Q('resting blood pressure') + Q('exercise induced agnia') + Q('oldpeak') 
+  Q('number of major vessels')"""
final_model = smf.logit(final_formula, data=low_heart_rate)
res = final_model.fit()

In [None]:
# Coefficient table and fit statistics for the final model (without maximum heart rate).
res.summary()

We do lose a small amount of pseudo-R², but this is to be expected whenever we remove variables.

In [None]:
# Apply the model fitted on the low heart rate group to the unseen high heart
# rate group; the result is rounded to a 0/1 class in a later cell.
predicted = res.predict(high_heart_rate)

In [None]:
# Attach the model output as a 'predicted' column, aligned on the row index
# (inner join, so only rows that received a prediction are kept).
# Any 'predicted' column left over from an earlier run of this cell is dropped
# first: without that, re-running the cell would produce 'predicted_x' /
# 'predicted_y' columns and break the later cells that read
# `high_heart_rate.predicted`.
high_heart_rate = (
    high_heart_rate
    .drop(columns='predicted', errors='ignore')
    .join(predicted.rename('predicted'), how='inner')
)

In [None]:
# Turn the predicted values into a 0/1 class by rounding to the nearest integer.
high_heart_rate['predicted_cat'] = high_heart_rate['predicted'].round()

In [None]:
# Preview the high heart rate group with the new 'predicted' and 'predicted_cat' columns.
high_heart_rate.head()

In [None]:
from sklearn.metrics import accuracy_score

# Share of high heart rate patients whose rounded prediction matches the true
# label, shown as a whole-number percentage.
holdout_accuracy = accuracy_score(high_heart_rate.output, predicted.round())
'{0:.0%}'.format(holdout_accuracy)

Far from ideal but not bad for such a simple model.

In [None]:
# Summary statistics restricted to the rows the model got wrong.
wrong_prediction = high_heart_rate['output'].ne(high_heart_rate['predicted_cat'])
high_heart_rate.loc[wrong_prediction].describe()

In [None]:
# Summary statistics for the whole high heart rate group, for comparison with
# the misclassified subset.
high_heart_rate.describe()

In [None]:
# Number of high heart rate patients whose predicted class differs from the true label.
wrong_prediction = high_heart_rate['output'].ne(high_heart_rate['predicted_cat'])
high_heart_rate.loc[wrong_prediction].shape[0]

At a glance, it looks like the model is performing poorly for patients with a high number of major blood vessels.

In [None]:
# Count the misclassified patients that have at least one major vessel.
has_major_vessels = high_heart_rate['number of major vessels'].gt(0)
wrong_prediction = high_heart_rate['output'].ne(high_heart_rate['predicted_cat'])
high_heart_rate.loc[has_major_vessels & wrong_prediction].shape[0]

They make up about half of our incorrect predictions.

In [None]:
# List the misclassified patients that have at least one major vessel.
has_major_vessels = high_heart_rate['number of major vessels'].gt(0)
wrong_prediction = high_heart_rate['output'].ne(high_heart_rate['predicted_cat'])
high_heart_rate.loc[has_major_vessels & wrong_prediction]

In [None]:
# Flag the patients in each group that have at least one major vessel.
lhr_has_vessels = low_heart_rate['number of major vessels'] > 0
hhr_has_vessels = high_heart_rate['number of major vessels'] > 0

# Share of each group with at least one major vessel (the mean of a boolean
# mask is the fraction of True values; an empty group yields NaN rather than
# raising ZeroDivisionError).
lhr_vessels = lhr_has_vessels.mean()
hhr_vessels = hhr_has_vessels.mean()

# Rate of output == 1 *within* the patients that have at least one major
# vessel. Previously these were divided by the size of the whole group, which
# gives a joint proportion that cannot be compared between groups whose vessel
# prevalence differs, and the values were never reported.
lhr_attack_rate = low_heart_rate.loc[lhr_has_vessels, 'output'].eq(1).mean()
hhr_attack_rate = high_heart_rate.loc[hhr_has_vessels, 'output'].eq(1).mean()

# The percentages are shares of patients, not vessel counts, so the message
# says so explicitly.
print(f"The share of patients with at least one major vessel is a lot higher among low heart rate patients than among high heart rate patients: {lhr_vessels:.0%} vs {hhr_vessels:.0%}. \nMaybe there's some condition among people with large numbers of blood vessels we're missing.")
print(f"Heart attack rate among patients with at least one major vessel: {lhr_attack_rate:.0%} (low heart rate) vs {hhr_attack_rate:.0%} (high heart rate).")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the high heart rate group, comparing
# the true labels with the rounded predictions.
holdout_report = classification_report(high_heart_rate.output, predicted.round())
print(holdout_report)

It looks like most of the room for improvement is in false positives: we have a lot of cases where we're predicting a heart attack in patients who will not experience one.