In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression



In [2]:
cn = ['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heartrate','exang','oldpeak','slope','num_mjr_vess','thal','dx']


df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', names=cn)

In [3]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
df.describe() # need to preprocess the data!

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,dx
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [5]:
df['dx'].value_counts()

0    164
1     55
2     36
3     35
4     13
Name: dx, dtype: int64

In [6]:
df.dtypes

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess        object
thal                object
dx                   int64
dtype: object

In [7]:
df['thal'] = pd.to_numeric(df['thal'], errors='coerce')
df['num_mjr_vess'] = pd.to_numeric(df['num_mjr_vess'], errors='coerce')

df.dtypes





age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess       float64
thal               float64
dx                   int64
dtype: object

In [8]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [9]:
df['heart_dx'] = df['dx'].apply(lambda x: 1 if x >= 1 else 0)

df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0


In [10]:
df.describe()

# thal and num_mjr_vess have less columns than others. Need to know which columns to drop from others, 
# should I use dropna?


Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0,1.0


In [11]:
df.dropna(inplace=True)

In [12]:
df['sex'].value_counts()

1.0    201
0.0     96
Name: sex, dtype: int64

In [13]:
df['Man'] = df['sex'] == 1 # Males

df['Woman'] = df['sex'] == 0 # Females

# df.head()

In [14]:
df['Man'] = pd.to_numeric(df['Man'], errors='coerce')

df['Woman'] = pd.to_numeric(df['Woman'], errors='coerce')

### All Data Log Regression Models Comparing Different Attributes

In [15]:
model_all = 'heart_dx ~ age + sex + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_all = smf.logit(formula=model_all, data=df).fit()
print(result_all.summary())

# lets say alpha = 0.05

Optimization terminated successfully.
         Current function value: 0.420514
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      285
Method:                           MLE   Df Model:                           11
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3907
Time:                        20:45:49   Log-Likelihood:                -124.89
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 1.239e-28
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.0037      2.522     -2.777      0.005     -11.947      -2.061
age           

In [16]:
model_1 = 'heart_dx ~ sex + chest_pain_type + restbps +  max_heartrate + exang + oldpeak'

result_1 = smf.logit(formula=model_1, data=df).fit()
print(result_1.summary())

Optimization terminated successfully.
         Current function value: 0.435038
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      290
Method:                           MLE   Df Model:                            6
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3696
Time:                        20:45:49   Log-Likelihood:                -129.21
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 3.667e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.5124      1.827     -1.923      0.054      -7.092       0.068
sex           

In [17]:
model_2 = 'heart_dx ~ sex + chest_pain_type + restbps +  max_heartrate + exang + oldpeak'

result_2 = smf.logit(formula=model_2, data=df).fit()
print(result_2.summary())

Optimization terminated successfully.
         Current function value: 0.435038
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      290
Method:                           MLE   Df Model:                            6
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3696
Time:                        20:45:49   Log-Likelihood:                -129.21
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 3.667e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.5124      1.827     -1.923      0.054      -7.092       0.068
sex           

In [18]:
model_3 = 'heart_dx ~ sex + chest_pain_type + restbps +  max_heartrate'

result_3 = smf.logit(formula=model_3, data=df).fit()
print(result_3.summary())

Optimization terminated successfully.
         Current function value: 0.477560
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3080
Time:                        20:45:50   Log-Likelihood:                -141.84
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 2.436e-26
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -2.4002      1.742     -1.378      0.168      -5.814       1.014
sex           

### Attempting to compare Males Only against other attribtues

In [19]:
model_Men_all = 'heart_dx ~ age + Man + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_men_all = smf.logit(formula=model_Men_all, data=df).fit()
print(result_men_all.summary())

# lets say alpha = 0.05


#Same inital results

Optimization terminated successfully.
         Current function value: 0.420514
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      285
Method:                           MLE   Df Model:                           11
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3907
Time:                        20:45:50   Log-Likelihood:                -124.89
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 1.239e-28
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.0037      2.522     -2.777      0.005     -11.947      -2.061
Man[T.True]   

In [20]:
model_Men1 = 'heart_dx ~ age + Man + chest_pain_type + chol'

result_men1 = smf.logit(formula=model_Men1, data=df).fit()
print(result_men1.summary())



Optimization terminated successfully.
         Current function value: 0.519765
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.2469
Time:                        20:45:50   Log-Likelihood:                -154.37
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 5.445e-21
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -9.5610      1.421     -6.729      0.000     -12.346      -6.776
Man[T.True]   

In [21]:
model_Men2 = 'heart_dx ~ age + Man + chol + blood_sugar'

result_men2 = smf.logit(formula=model_Men2, data=df).fit()
print(result_men2.summary())

Optimization terminated successfully.
         Current function value: 0.606936
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.1206
Time:                        20:45:50   Log-Likelihood:                -180.26
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.757e-10
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -5.9432      1.109     -5.358      0.000      -8.117      -3.769
Man[T.True]     1.6369    

### Attempt with Women

In [22]:
model_W_all = 'heart_dx ~ age + Woman + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_W_all = smf.logit(formula=model_W_all, data=df).fit()
print(result_W_all.summary())

# lets say alpha = 0.05

Optimization terminated successfully.
         Current function value: 0.420514
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      285
Method:                           MLE   Df Model:                           11
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.3907
Time:                        20:45:50   Log-Likelihood:                -124.89
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 1.239e-28
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -5.0709      2.419     -2.096      0.036      -9.812      -0.329
Woman[T.True] 

In [23]:
model_W1 = 'heart_dx ~ age + Woman + chest_pain_type + chol'

result_W1 = smf.logit(formula=model_W1, data=df).fit()
print(result_W1.summary())

Optimization terminated successfully.
         Current function value: 0.519765
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.2469
Time:                        20:45:50   Log-Likelihood:                -154.37
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 5.445e-21
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.7260      1.268     -6.094      0.000     -10.211      -5.241
Woman[T.True] 

In [24]:
model_W2 = 'heart_dx ~ age + Woman + chol + blood_sugar'

result_W2 = smf.logit(formula=model_W2, data=df).fit()
print(result_W2.summary())

Optimization terminated successfully.
         Current function value: 0.606936
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.1206
Time:                        20:45:50   Log-Likelihood:                -180.26
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.757e-10
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -4.3062      0.986     -4.366      0.000      -6.239      -2.373
Woman[T.True]    -1.

In [25]:
# So far all results are exactly the same. Maybe We should try a different approach?

### Nothing really...wasn't sure if I should have been trying to standardize the data or not.

In [26]:
X = df[['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heartrate','slope','exang','oldpeak','num_mjr_vess','thal']]

y = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=55) # if we don't specify wha the test and train set is (% wise) the default is the test set will be 25% of the data.

X_test.shape

(75, 13)