In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [2]:
cn = ['age','sex','chest_pain_type','restbps','chol','blood_sugar','restecg','max_heartrate','exang','oldpeak','slope','num_mjr_vess','thal','dx']


df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', names=cn)

In [3]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
df.describe() # need to preprocess the data!

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,dx
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [5]:
df.isnull().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heartrate      0
exang              0
oldpeak            0
slope              0
num_mjr_vess       0
thal               0
dx                 0
dtype: int64

In [6]:
df.isna().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heartrate      0
exang              0
oldpeak            0
slope              0
num_mjr_vess       0
thal               0
dx                 0
dtype: int64

In [7]:
df.shape

(303, 14)

In [8]:
df['dx'].value_counts()

0    164
1     55
2     36
3     35
4     13
Name: dx, dtype: int64

In [9]:
df.dtypes

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess        object
thal                object
dx                   int64
dtype: object

In [10]:
# df['sex'] = df['sex'].astype(object)
# df['chest_pain_type'] = df['chest_pain_type'].astype(object)
# df['restecg'] = df['restecg'].astype(object)
# df['exang'] = df['exang'].astype(object)
# df['slope'] = df['slope'].astype(object)


In [11]:
df.dtypes

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess        object
thal                object
dx                   int64
dtype: object

In [12]:
df['thal'] = pd.to_numeric(df['thal'], errors='coerce')
df['num_mjr_vess'] = pd.to_numeric(df['num_mjr_vess'], errors='coerce')

df.dtypes


# Should I onehotencode these instead? 
# If so, do I add normal attributes or do I add the name of the new onehotencoder function?

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess       float64
thal               float64
dx                   int64
dtype: object

In [13]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [None]:
# add dropna

In [14]:
df['heart_dx'] = df['dx'].apply(lambda x: 1 if x >= 1 else 0)

df.tail()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0,0


In [15]:
df['heart_dx'].value_counts(normalize=True)

0    0.541254
1    0.458746
Name: heart_dx, dtype: float64

### Below, I thought I should scale the enitre data set, except the diagnosis column, however, I don't think this would work. That is because there are also categorical columns within this data set that need to be preprocessed.

In [16]:
# X = df.iloc[:,13]

# scaler = StandardScaler()

# X_scaled = scaler.fit_transform(X)

# X_scaled = pd.DataFrame(X_scaled, columns=cnn)

# X_scaled.head()

---

In [17]:
df.shape

(303, 15)

In [18]:
df['sex'].value_counts()

1.0    206
0.0     97
Name: sex, dtype: int64

## Should probably use OneHotEncoder instead, however, I can't get simple imputer to remove the '?' or get the smf results function to run using the categorical attributes. Thoughts? Help?

In [19]:
df['Men'] = df['sex'] == 1 # Males

df['Women'] = df['sex'] == 0 # Females

# df.head()

In [20]:
df['Men'] = pd.to_numeric(df['Men'], errors='coerce')

df['Women'] = pd.to_numeric(df['Women'], errors='coerce')

In [70]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women,Age_low,Age_med,Age_high
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0,True,False,False,False,True
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1,True,False,False,False,True
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1,True,False,False,False,True
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,True,False,True,False,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0,False,True,True,False,False


In [22]:
# df.head(170)
# # didn't remove '?'!!!!

### All Data Log Regression Models Comparing Different Attributes

In [23]:
model_all = 'heart_dx ~ age + sex + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_all = smf.logit(formula=model_all, data=df).fit()
print(result_all.summary())

# lets say alpha = 0.05
# so far sex is only showing for True/False or 1 or 0, therefore I excluded from each model not specific to sex.


# this model yielded potentially usefuly results for chest paint type, exang, oldpeak. Is this acceptable?
# This is totally different when adding sex as a factor (attribute).
# Would have to remove the categorical data here becasue onehotencoder didn't work?! 
# Should we keep this or scrap this?



Optimization terminated successfully.
         Current function value: 0.416553
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3961
Time:                        14:51:52   Log-Likelihood:                -126.22
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 9.700e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.1919      2.511     -2.864      0.004     -12.113      -2.271
age           

In [24]:
model_1 = 'heart_dx ~ chest_pain_type + slope + num_mjr_vess + thal + restbps + oldpeak'

result_1 = smf.logit(formula=model_1, data=df).fit()
print(result_1.summary())


Optimization terminated successfully.
         Current function value: 0.385960
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      290
Method:                           MLE   Df Model:                            6
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.4408
Time:                        14:51:52   Log-Likelihood:                -114.63
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 2.426e-36
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -9.0183      1.634     -5.518      0.000     -12.222      -5.815
chest_pain_typ

### Pipeline for ALL Genders and ALL Ages

In [25]:
df[df['thal'] == '?']

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women


In [26]:
df[df['num_mjr_vess'] == '?']

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women


### Attempting to compare Males Only against other attribtues

In [27]:
model_Men_all = 'heart_dx ~ age + Men + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_men_all = smf.logit(formula=model_Men_all, data=df).fit()
print(result_men_all.summary())

# lets say alpha = 0.05
# add an interaction term to determine the signifcance 

#Same inital results

Optimization terminated successfully.
         Current function value: 0.416553
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3961
Time:                        14:51:52   Log-Likelihood:                -126.22
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 9.700e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.1919      2.511     -2.864      0.004     -12.113      -2.271
Men[T.True]   

In [28]:
model_Men1 = 'heart_dx ~ Men + chest_pain_type + restbps + max_heartrate + exang + oldpeak'

result_men1 = smf.logit(formula=model_Men1, data=df).fit()
print(result_men1.summary())

#age seems obsolete in this model. Changed it for the lowest p values from previous model.

Optimization terminated successfully.
         Current function value: 0.431285
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      296
Method:                           MLE   Df Model:                            6
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3747
Time:                        14:51:52   Log-Likelihood:                -130.68
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 3.071e-31
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.6460      1.819     -2.004      0.045      -7.211      -0.081
Men[T.True]   

In [29]:
model_Men2 = 'heart_dx ~ Men + chest_pain_type + restbps + max_heartrate + exang + oldpeak'

result_men2 = smf.logit(formula=model_Men2, data=df).fit()
print(result_men2.summary())

#changed again, based on p values from previous model

Optimization terminated successfully.
         Current function value: 0.431285
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      296
Method:                           MLE   Df Model:                            6
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3747
Time:                        14:51:52   Log-Likelihood:                -130.68
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 3.071e-31
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.6460      1.819     -2.004      0.045      -7.211      -0.081
Men[T.True]   

### Attempt with Women

In [30]:
model_W_all = 'heart_dx ~ age + Women + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_W_all = smf.logit(formula=model_W_all, data=df).fit()
print(result_W_all.summary())

# lets say alpha = 0.05
# model with all attributes

Optimization terminated successfully.
         Current function value: 0.416553
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3961
Time:                        14:51:52   Log-Likelihood:                -126.22
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 9.700e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -5.2361      2.410     -2.172      0.030      -9.960      -0.512
Women[T.True] 

In [31]:
model_W1 = 'heart_dx ~ Women + chest_pain_type + restbps + max_heartrate + exang + oldpeak'

result_W1 = smf.logit(formula=model_W1, data=df).fit()
print(result_W1.summary())

#age seems obsolete in this model. Changed it for the lowest p values from previous model.

Optimization terminated successfully.
         Current function value: 0.431285
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      296
Method:                           MLE   Df Model:                            6
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3747
Time:                        14:51:52   Log-Likelihood:                -130.68
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 3.071e-31
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -2.0077      1.787     -1.123      0.261      -5.510       1.495
Women[T.True] 

In [32]:
# So far all results are exactly the same. Maybe We should try a different approach?

### Checking for Age Range Volume

In [33]:
df[df['age'].between(29,45)].shape[0]

63

In [34]:
df[df['age'].between(46, 60)].shape[0]

161

In [35]:
df[df['age'].between(61,77)].shape[0]

79

In [36]:
df[df['age'].between(62, 77)].shape[0]

71

In [37]:
df['Age_low'] = df['age'].between(29, 45)

In [38]:
df['Age_med'] = df['age'].between(46, 61) #smallest range but biggest volume

In [39]:
df['Age_high'] = df['age'].between(62, 77)

In [40]:
df.tail()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women,Age_low,Age_med,Age_high
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,1,True,False,True,False,False
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,1,True,False,False,False,True
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,1,True,False,False,True,False
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,1,False,True,False,True,False
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0,0,True,False,True,False,False


### First Age Group 29 - 45

In [41]:
model_Age1 = 'heart_dx ~ Age_low + sex + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_Age1 = smf.logit(formula=model_Age1, data=df).fit() 
print(result_Age1.summary())

# alpha = 0.05

Optimization terminated successfully.
         Current function value: 0.415389
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3978
Time:                        14:51:52   Log-Likelihood:                -125.86
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 6.945e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -5.8522      2.140     -2.735      0.006     -10.047      -1.658
Age_low[T.True

In [42]:
# age is yielding slightly different results. The top identifiers at the moment are oldpeak, exang, chest_pain_type,
# and max_heartrate based on 0.05 AS alpha.


In [43]:
model_Age1a = 'heart_dx ~ Age_low + sex + chest_pain_type + restbps +  max_heartrate + exang + oldpeak'

result_Age1a = smf.logit(formula=model_Age1a, data=df).fit()
print(result_Age1a.summary())

# alpha = 0.05

Optimization terminated successfully.
         Current function value: 0.427456
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            7
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3803
Time:                        14:51:52   Log-Likelihood:                -129.52
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 5.351e-31
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.8011      1.843     -2.062      0.039      -7.414      -0.189
Age_low[T.True

In [44]:
# model_Age1b = 'heart_dx ~ Age_low + restbps + chol + blood_sugar + restecg + slope'

# result_Age1b = smf.logit(formula=model_Age1b, data=df).fit()
# print(result_Age1b.summary())

# alpha = 0.05

#tried something different. Used the attributes with the highest p-value to run against agre groups to see if there 
# were different attributes to consider. There were not useful results from this model. The pvalue for age was it's
# lowest though.


In [45]:
model_Age2 = 'heart_dx ~ Age_med + sex + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_Age2 = smf.logit(formula=model_Age2, data=df).fit()
print(result_Age2.summary())

# results are still pretty similar

Optimization terminated successfully.
         Current function value: 0.415122
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3981
Time:                        14:51:52   Log-Likelihood:                -125.78
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 6.434e-30
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -6.0693      2.154     -2.818      0.005     -10.290      -1.848
Age_med[T.True

In [46]:
model_Age2a = 'heart_dx ~ Age_med + sex + chest_pain_type + restbps +  max_heartrate + exang + oldpeak'

result_Age2a = smf.logit(formula=model_Age2a, data=df).fit()
print(result_Age2a.summary())

# results are still pretty similar

Optimization terminated successfully.
         Current function value: 0.428938
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            7
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3781
Time:                        14:51:52   Log-Likelihood:                -129.97
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 8.267e-31
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.7489      1.824     -2.056      0.040      -7.323      -0.174
Age_med[T.True

In [47]:
model_Age3 = 'heart_dx ~ Age_high + sex + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak'

result_Age3 = smf.logit(formula=model_Age3, data=df).fit()
print(result_Age3.summary())

# this model yielded the most unique results. While the restbps and chol is higher than 0.05 (alpha), it is much closer 
# than the others. Though, the primary attributes are the same as the overall model. Likewise, the p-value for Age
# is highest amongst the 3 age groupings.


Optimization terminated successfully.
         Current function value: 0.418208
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3937
Time:                        14:51:52   Log-Likelihood:                -126.72
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 1.559e-29
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -5.6177      2.131     -2.636      0.008      -9.794      -1.441
Age_high[T.

In [48]:
model_Age3a = 'heart_dx ~ Age_high + sex + chest_pain_type + restbps + chol + max_heartrate + exang + oldpeak'

result_Age3a = smf.logit(formula=model_Age3a, data=df).fit()
print(result_Age3a.summary())
#maybe this age groups attributes are different. I left chol and restbps and their p-values went down significantly.
# Does the p-value of Age_high matter?

Optimization terminated successfully.
         Current function value: 0.423121
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  303
Model:                          Logit   Df Residuals:                      294
Method:                           MLE   Df Model:                            8
Date:                Wed, 30 Nov 2022   Pseudo R-squ.:                  0.3865
Time:                        14:51:52   Log-Likelihood:                -128.21
converged:                       True   LL-Null:                       -208.99
Covariance Type:            nonrobust   LLR p-value:                 7.506e-31
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -4.8525      1.965     -2.469      0.014      -8.704      -1.001
Age_high[T.

In [49]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women,Age_low,Age_med,Age_high
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0,True,False,False,False,True
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1,True,False,False,False,True
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1,True,False,False,False,True
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,True,False,True,False,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0,False,True,True,False,False


In [50]:
# Need to determine which attributes fit all models ran. Then run the test_train_split function on each model 
# (all, by gender, and by age group) model. 

# Then perform cross val score to determine accuracy of the models selected attributes. 
#Then add some visualizations!


In [51]:
Xo = df[['age','sex','chest_pain_type','restbps','max_heartrate','exang','oldpeak']]

yo = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xo, yo, random_state=40) 

X_test.shape


model = LogisticRegression()

model.fit(X_train, y_train)

y_proba = model.predict_proba(X_test)[:,1]

y_pred = model.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
from sklearn import metrics

cm = metrics.confusion_matrix(y_test, y_pred) 

df_cm = pd.DataFrame(data=cm, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,37,5
true: Heart Disease,5,29


In [53]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        42
           1       0.85      0.85      0.85        34

    accuracy                           0.87        76
   macro avg       0.87      0.87      0.87        76
weighted avg       0.87      0.87      0.87        76



In [54]:
Xm = df[['age','Men','chest_pain_type','restbps','max_heartrate','exang','oldpeak']]

ym = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xm, ym, random_state=50) 

X_test.shape


model2 = LogisticRegression()

model2.fit(X_train, y_train)

y_proba2 = model2.predict_proba(X_test)[:,1]

y_pred2 = model2.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
cm2 = metrics.confusion_matrix(y_test, y_pred2) 

df_cm2 = pd.DataFrame(data=cm2, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm2

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,32,5
true: Heart Disease,12,27


In [56]:
print(metrics.classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        37
           1       0.84      0.69      0.76        39

    accuracy                           0.78        76
   macro avg       0.79      0.78      0.78        76
weighted avg       0.79      0.78      0.77        76



In [57]:
Xw = df[['age','Women','chest_pain_type','restbps','max_heartrate','exang','oldpeak']]

yw = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xw, yw, random_state=35) 

X_test.shape


model3 = LogisticRegression()

model3.fit(X_train, y_train)

y_proba3 = model3.predict_proba(X_test)[:,1]

y_pred3 = model3.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
cm3 = metrics.confusion_matrix(y_test, y_pred3) 

df_cm3 = pd.DataFrame(data=cm3, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm3

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,34,5
true: Heart Disease,11,26


In [59]:
print(metrics.classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.76      0.87      0.81        39
           1       0.84      0.70      0.76        37

    accuracy                           0.79        76
   macro avg       0.80      0.79      0.79        76
weighted avg       0.80      0.79      0.79        76



In [60]:
Xa1 = df[['Age_low','sex','chest_pain_type','restbps','max_heartrate','exang','oldpeak']]

ya1 = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xa1, ya1, random_state=15) 


model4 = LogisticRegression()

model4.fit(X_train, y_train)

y_proba4 = model4.predict_proba(X_test)[:,1]

y_pred4 = model4.predict(X_test)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
cm4 = metrics.confusion_matrix(y_test, y_pred4) 

df_cm4 = pd.DataFrame(data=cm4, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm4

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,38,5
true: Heart Disease,8,25


In [62]:
print(metrics.classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85        43
           1       0.83      0.76      0.79        33

    accuracy                           0.83        76
   macro avg       0.83      0.82      0.82        76
weighted avg       0.83      0.83      0.83        76



In [63]:
Xa2 = df[['Age_med','sex','chest_pain_type','restbps','max_heartrate','exang','oldpeak']]

ya2 = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xa2, ya2, random_state=25) 

X_test.shape


model5 = LogisticRegression()

model5.fit(X_train, y_train)

y_proba5 = model5.predict_proba(X_test)[:,1]

y_pred5 = model5.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
cm5 = metrics.confusion_matrix(y_test, y_pred5) 

df_cm5 = pd.DataFrame(data=cm5, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm5

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,33,4
true: Heart Disease,6,33


In [65]:
print(metrics.classification_report(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87        37
           1       0.89      0.85      0.87        39

    accuracy                           0.87        76
   macro avg       0.87      0.87      0.87        76
weighted avg       0.87      0.87      0.87        76



In [66]:
Xa3 = df[['Age_med','sex','chest_pain_type','restbps','chol','max_heartrate','exang','oldpeak']]

ya3 = df['heart_dx']


X_train, X_test, y_train, y_test = train_test_split(Xa3, ya3, random_state=88) 

X_test.shape


model6 = LogisticRegression()

model6.fit(X_train, y_train)

y_proba6 = model6.predict_proba(X_test)[:,1]

y_pred6 = model6.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
cm6 = metrics.confusion_matrix(y_test, y_pred6) 

df_cm6 = pd.DataFrame(data=cm6, columns=['predict: No Heart Disease', 'predict: Heart Disease'],
                    index=['true: No Heart Disease', 'true: Heart Disease'])

df_cm6

Unnamed: 0,predict: No Heart Disease,predict: Heart Disease
true: No Heart Disease,36,6
true: Heart Disease,9,25


In [68]:
print(metrics.classification_report(y_test, y_pred6))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83        42
           1       0.81      0.74      0.77        34

    accuracy                           0.80        76
   macro avg       0.80      0.80      0.80        76
weighted avg       0.80      0.80      0.80        76



In [69]:
# Need to perform cross val score to determine accuracy of the models selected attributes. 
#Then add some visualizations!