# Analysis of Factors Affecting Cardiovascular Health Among Men and Women

## 1. Explore Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Column names applied to the CSV, in file order.
cn = [
    'age', 'sex', 'chest_pain_type', 'restbps', 'chol', 'blood_sugar',
    'restecg', 'max_heartrate', 'exang', 'oldpeak', 'slope',
    'num_mjr_vess', 'thal', 'dx',
]

# Because names= is given, every line of the file is read as a data row.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
    names=cn,
)
df.head(3)


Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(303, 14)

In [4]:
# dtype pandas inferred for each column; object means the column was not parsed as numeric.
df.dtypes

age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess        object
thal                object
dx                   int64
dtype: object

In [6]:
      # Show every row that holds the '?' placeholder in any column.
      # Comparing the whole frame at once covers all columns (blood_sugar
      # included) and avoids repeating or forgetting a column in a
      # hand-written chain of per-column tests.
      df[(df == '?').any(axis=1)]

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [7]:
# Overwrite every cell equal to the '?' placeholder with NaN, in place.
df[(df=='?')] = np.nan

In [8]:
   # Sanity check: list any row that still holds the '?' placeholder in any
   # column. Comparing the whole frame at once covers all columns
   # (blood_sugar included); an empty result means no placeholder is left.
   df[(df == '?').any(axis=1)]

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx


In [9]:
# Number of missing (NaN) values in each column.
df.isna().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heartrate      0
exang              0
oldpeak            0
slope              0
num_mjr_vess       4
thal               2
dx                 0
dtype: int64

## 2. Preprocess the data

Since there are NaN values in the columns num_mjr_vess and thal, we drop the rows that contain them before preprocessing.

In [10]:
# With its default arguments dropna removes every ROW that contains at least
# one NaN; inplace=True applies the change to df itself.
df.dropna(inplace=True)

In [11]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

age                0
sex                0
chest_pain_type    0
restbps            0
chol               0
blood_sugar        0
restecg            0
max_heartrate      0
exang              0
oldpeak            0
slope              0
num_mjr_vess       0
thal               0
dx                 0
dtype: int64

Converting thal and num_mjr_vess to a numeric datatype.

In [12]:
# thal and num_mjr_vess are object-typed; parse them as numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
for col in ('thal', 'num_mjr_vess'):
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.dtypes


age                float64
sex                float64
chest_pain_type    float64
restbps            float64
chol               float64
blood_sugar        float64
restecg            float64
max_heartrate      float64
exang              float64
oldpeak            float64
slope              float64
num_mjr_vess       float64
thal               float64
dx                   int64
dtype: object

dx: diagnosis of heart disease (angiographic disease status).
This column gives the severity of the patient's heart disease on a scale from 0 to 4, where 0 is the absence of heart disease and 4 is the highest severity of heart disease.


In [13]:
# Number of rows for each distinct dx value.
df['dx'].value_counts()

0    160
1     54
2     35
3     35
4     13
Name: dx, dtype: int64

To simplify our analysis, we create another column named "heart_dx" based on "dx", whose values are 0 and 1.
0 = No heart disease
1 = Has heart disease (dx of 1, 2, 3, or 4)

In [15]:
# Binary target: 1 when dx is at least 1, otherwise 0.
df['heart_dx'] = (df['dx'] >= 1).astype('int64')
df['heart_dx'].value_counts()

0    160
1    137
Name: heart_dx, dtype: int64

Creating specific columns for Men and Women to use in later analysis

In [16]:
# Boolean indicator columns derived from the sex code (1 = male, 0 = female).
# They are deliberately left as booleans: the comparisons already yield
# True/False, and pd.to_numeric would return a boolean Series unchanged, so
# no numeric conversion is applied here.
df['Men'] = df['sex'] == 1    # Males
df['Women'] = df['sex'] == 0  # Females
df.head()

Unnamed: 0,age,sex,chest_pain_type,restbps,chol,blood_sugar,restecg,max_heartrate,exang,oldpeak,slope,num_mjr_vess,thal,dx,heart_dx,Men,Women
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0,True,False
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1,True,False
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1,True,False
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,True,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0,False,True


## 3. Logistic Regression Model Based on Sex

### model_all - Logistic regression model comparing all attributes

In [17]:
# Formula for the full model: the listed predictors plus an age:sex interaction.
model_all = 'heart_dx ~ \
                 age + sex + chest_pain_type + restbps + chol + blood_sugar + \
                 restecg + max_heartrate + slope + exang + oldpeak + age:sex'

# Fit the logit on the whole frame and print the summary table.
result_all = smf.logit(model_all, df).fit()
print(result_all.summary())

Optimization terminated successfully.
         Current function value: 0.420448
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      284
Method:                           MLE   Df Model:                           12
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3908
Time:                        19:50:54   Log-Likelihood:                -124.87
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.781e-28
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.3797      3.174     -2.325      0.020     -13.601      -1.159
age           

We used the above model to determine the significant attributes at alpha = 0.05. Based on the results, we found that the significant factors are chest_pain_type, slope, num_mjr_vess, thal, restbps, and oldpeak.

### model_1 - Logistic regression model after removing age, sex, chol, blood_sugar, restecg, slope, and the age:sex interaction


In [18]:
# Reduced formula: five predictors, no interaction term.
model_1 = 'heart_dx ~ chest_pain_type + restbps + max_heartrate + exang + oldpeak'

# Fit the logit on the whole frame and print the summary table.
result_1 = smf.logit(model_1, df).fit()
print(result_1.summary())

Optimization terminated successfully.
         Current function value: 0.473520
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      291
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3139
Time:                        19:56:51   Log-Likelihood:                -140.64
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.545e-26
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -2.2496      1.760     -1.278      0.201      -5.699       1.199
chest_pain_typ

From the above results, restbps is no longer a significant factor. Therefore, it was removed from the next model, model_2.

### model_2 - Logistic regression model on chest_pain_type, max_heartrate, exang, oldpeak

In [19]:
# Same formula as model_1 with restbps left out.
model_2 = 'heart_dx ~ chest_pain_type + max_heartrate + exang + oldpeak'

# Fit the logit on the whole frame and print the summary table.
result_2 = smf.logit(model_2, df).fit()
print(result_2.summary())

Optimization terminated successfully.
         Current function value: 0.479995
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            4
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3045
Time:                        20:01:49   Log-Likelihood:                -142.56
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.964e-26
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.0356      1.329     -0.027      0.979      -2.640       2.569
chest_pain_typ

### model_Men_all - Logistic regression model on Male comparing all attributes

In [20]:
# Full formula using the boolean Men indicator and a Men:age interaction.
model_Men_all = 'heart_dx ~ age + Men + \
                            chest_pain_type + restbps + chol + blood_sugar + \
                            restecg + max_heartrate + slope + exang + oldpeak + Men:age'

# Fitted on the whole frame (both sexes); Men enters as a predictor.
result_men_all = smf.logit(model_Men_all, df).fit()
print(result_men_all.summary())


Optimization terminated successfully.
         Current function value: 0.420448
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      284
Method:                           MLE   Df Model:                           12
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3908
Time:                        20:08:27   Log-Likelihood:                -124.87
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.781e-28
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.3797      3.174     -2.325      0.020     -13.601      -1.159
Men[T.True]   

The above model is used to determine the significant factors causing heart disease for men, considering alpha = 0.05.
Based on the results, we found that the significant factors are the same as in the above models: chest pain type, max heartrate, restbps, exang, and oldpeak.
Likewise, instead of an interaction effect between age and sex, I used Men (males) and age as the interaction effect.
It proved to have no significance either.
It is also noticed that age and sex are not significant factors, just as in model_all.

### model_Men1 - Logistic regression model on men after removing age, Men, chol, blood_sugar, restecg, slope, and the Men:age interaction

In [24]:
# Reduced formula: five predictors, no sex or interaction term.
model_Men1 = 'heart_dx ~ chest_pain_type + restbps + max_heartrate + exang + oldpeak'

# Men is a boolean column, so it serves directly as the row mask:
# the model is fitted on the male rows only.
result_men1 = smf.logit(model_Men1, df.loc[df['Men']]).fit()
print(result_men1.summary())

Optimization terminated successfully.
         Current function value: 0.477626
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  201
Model:                          Logit   Df Residuals:                      195
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3043
Time:                        20:28:39   Log-Likelihood:                -96.003
converged:                       True   LL-Null:                       -138.00
Covariance Type:            nonrobust   LLR p-value:                 1.219e-16
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.9222      2.229      0.414      0.679      -3.447       5.292
chest_pain_typ

In the above model, all the insignificant factors, including the interaction effect, are removed.
That left the following factors to be considered significant: chest pain type, max heartrate, exang, and oldpeak. These are the same results as in model_1. We therefore remove restbps as a factor before running the final model, model_Men2.


### model_Men2 - Logistic regression model on men using chest_pain_type, max_heartrate, exang, oldpeak

In [27]:
# Same formula as model_Men1 with restbps left out.
model_Men2 = 'heart_dx ~ chest_pain_type + max_heartrate + exang + oldpeak'

# Men is a boolean column, so it serves directly as the row mask:
# the model is fitted on the male rows only.
result_men2 = smf.logit(model_Men2, df.loc[df['Men']]).fit()
print(result_men2.summary())

Optimization terminated successfully.
         Current function value: 0.481420
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  201
Model:                          Logit   Df Residuals:                      196
Method:                           MLE   Df Model:                            4
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.2988
Time:                        20:30:46   Log-Likelihood:                -96.765
converged:                       True   LL-Null:                       -138.00
Covariance Type:            nonrobust   LLR p-value:                 5.202e-17
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           2.7314      1.691      1.615      0.106      -0.583       6.046
chest_pain_typ

### model_W_all - Logistic regression model on Female comparing all attributes

In [28]:
# Full formula using the boolean Women indicator and a Women:age interaction.
model_W_all = 'heart_dx ~ age + Women + chest_pain_type + restbps + chol + blood_sugar + restecg + max_heartrate + slope + exang + oldpeak + Women:age'

# Fitted on the whole frame (both sexes); Women enters as a predictor.
result_W_all = smf.logit(model_W_all, df).fit()
print(result_W_all.summary())

Optimization terminated successfully.
         Current function value: 0.420448
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                  297
Model:                          Logit   Df Residuals:                      284
Method:                           MLE   Df Model:                           12
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.3908
Time:                        20:34:51   Log-Likelihood:                -124.87
converged:                       True   LL-Null:                       -204.97
Covariance Type:            nonrobust   LLR p-value:                 4.781e-28
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -4.9552      2.488     -1.992      0.046      -9.832      -0.078
Women[T.

### model_W1 - Logistic regression model on women after removing age, Women, chol, blood_sugar, restecg, slope, and the Women:age interaction

In [29]:
# Reduced formula: five predictors, no sex or interaction term.
model_W1 = 'heart_dx ~ chest_pain_type + restbps + max_heartrate + exang + oldpeak'

# Women is a boolean column, so it serves directly as the row mask:
# the model is fitted on the female rows only.
result_W1 = smf.logit(model_W1, df.loc[df['Women']]).fit()
print(result_W1.summary())


Optimization terminated successfully.
         Current function value: 0.312217
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                   96
Model:                          Logit   Df Residuals:                       90
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.4556
Time:                        20:36:14   Log-Likelihood:                -29.973
converged:                       True   LL-Null:                       -55.055
Covariance Type:            nonrobust   LLR p-value:                 1.282e-09
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -8.4017      3.941     -2.132      0.033     -16.126      -0.678
chest_pain_typ

### model_W2 - Logistic regression model on women using chest_pain_type, max_heartrate, exang, oldpeak

In [31]:
# Same formula as model_W1 with restbps left out.
model_W2 = 'heart_dx ~ chest_pain_type + max_heartrate + exang + oldpeak'

# Women is a boolean column, so it serves directly as the row mask:
# the model is fitted on the female rows only.
result_W2 = smf.logit(model_W2, df.loc[df['Women']]).fit()
print(result_W2.summary())

Optimization terminated successfully.
         Current function value: 0.333756
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               heart_dx   No. Observations:                   96
Model:                          Logit   Df Residuals:                       91
Method:                           MLE   Df Model:                            4
Date:                Thu, 08 Dec 2022   Pseudo R-squ.:                  0.4180
Time:                        20:37:11   Log-Likelihood:                -32.041
converged:                       True   LL-Null:                       -55.055
Covariance Type:            nonrobust   LLR p-value:                 2.429e-09
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -3.8285      2.977     -1.286      0.198      -9.664       2.007
chest_pain_typ

## 4. Logistic Regression model based on Age group