### Data Exploration & Analysis on Heart Failure Dataset from UCI: https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records#

#### <ins>Import Data</ins>

In [2]:
# Load the UCI heart-failure clinical records into a data frame.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# when running elsewhere.
HeartFail <- read.csv(
  file = 'C:/Users/Mia/OneDrive/Documents/GitHub/Final_Project/Datasets/heart_failure_clinical_records_dataset.csv'
)

In [3]:
# Preview the first six rows of the freshly imported data.
utils::head(HeartFail, n = 6L)

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
Unnamed: 0_level_1,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>
1,75,0,582,0,20,1,265000,1.9,130,1,0,4,1
2,55,0,7861,0,38,0,263358,1.1,136,1,0,6,1
3,65,0,146,0,20,0,162000,1.3,129,1,1,7,1
4,50,1,111,0,20,0,210000,1.9,137,1,0,7,1
5,65,1,160,1,20,0,327000,2.7,116,0,0,8,1
6,90,1,47,0,40,1,204000,2.1,132,1,1,8,1


#### <ins>Load Libraries</ins>

In [11]:
library("caret")
library("gmodels")
library("MASS")
library("dplyr")
library("magrittr")
library("dplyr")
library("tidyr")
library("lmtest")
library("popbio")
library("e1071")


Attaching package: 'popbio'


The following object is masked from 'package:caret':

    sensitivity




In [10]:
# One-time setup: install the 'popbio' package into the user library.
# This must have been run once before library("popbio") can succeed.
install.packages("popbio")

Installing package into 'C:/Users/Mia/OneDrive/Documents/R/win-library/4.1'
(as 'lib' is unspecified)



package 'popbio' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Mia\AppData\Local\Temp\Rtmpugg6wc\downloaded_packages


#### <ins><font color="blue">Which of the Continuous Independent Variables have the most influence on heart failure?</font></ins>

#### - age: age of the patient (years)
#### - creatinine phosphokinase (CPK): level of the CPK enzyme in the blood (mcg/L)
#### - ejection fraction: percentage of blood leaving the heart at each contraction (percentage)
#### - platelets: platelets in the blood (kiloplatelets/mL)
#### - serum creatinine: level of serum creatinine in the blood (mg/dL)
#### - serum sodium: level of serum sodium in the blood (mEq/L)
#### - time: follow-up period (days)

#### <ins><font color="blue">Setting up Logistic Regression</font></ins>
#### Data Wrangling: Dependent Variable of death event is already coded to 1s and 0s so no recoding necessary

#### <ins>Testing Assumptions For Sample Size</ins>
#### Running base model for each IV

In [13]:
# Single-predictor logistic regressions: each model regresses DEATH_EVENT on
# exactly one continuous independent variable (binomial family, logit link).
Base_Model_age <- glm(
  formula = DEATH_EVENT ~ age,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_CP <- glm(
  formula = DEATH_EVENT ~ creatinine_phosphokinase,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_EF <- glm(
  formula = DEATH_EVENT ~ ejection_fraction,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_platelets <- glm(
  formula = DEATH_EVENT ~ platelets,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_SC <- glm(
  formula = DEATH_EVENT ~ serum_creatinine,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_SS <- glm(
  formula = DEATH_EVENT ~ serum_sodium,
  family = binomial(link = "logit"),
  data = HeartFail
)
Base_Model_time <- glm(
  formula = DEATH_EVENT ~ time,
  family = binomial(link = "logit"),
  data = HeartFail
)

In [16]:
#### Predicting Risk of Heart Failure

In [19]:
# Step 1: fitted probabilities of DEATH_EVENT from each single-predictor
# model. No newdata is supplied, so predict() returns the values for the rows
# the models were fitted on.
probabilities_age <- predict(Base_Model_age, type = "response")
probabilities_CP <- predict(Base_Model_CP, type = "response")
probabilities_EF <- predict(Base_Model_EF, type = "response")
probabilities_platelets <- predict(Base_Model_platelets, type = "response")
probabilities_SC <- predict(Base_Model_SC, type = "response")
probabilities_SS <- predict(Base_Model_SS, type = "response")
probabilities_time <- predict(Base_Model_time, type = "response")

# Step 2: classify at a 0.5 cut-off (1 = predicted death, 0 = predicted
# survival).
# NOTE(review): each assignment below overwrites the original measurement
# column with the 0/1 prediction, so the raw predictor values are no longer
# available in HeartFail after this cell. The confusion-matrix cells further
# down read these same column names as predictions, so this cannot be changed
# here in isolation.
HeartFail$age <- ifelse(probabilities_age > 0.5, 1, 0)
HeartFail$creatinine_phosphokinase <- ifelse(probabilities_CP > 0.5, 1, 0)
HeartFail$ejection_fraction <- ifelse(probabilities_EF > 0.5, 1, 0)
HeartFail$platelets <- ifelse(probabilities_platelets > 0.5, 1, 0)
HeartFail$serum_creatinine <- ifelse(probabilities_SC > 0.5, 1, 0)
HeartFail$serum_sodium <- ifelse(probabilities_SS > 0.5, 1, 0)
HeartFail$time <- ifelse(probabilities_time > 0.5, 1, 0)

#### Confusion Matrix for each IV

In [20]:
# Compact column-by-column overview of HeartFail (types and leading values).
dplyr::glimpse(x = HeartFail)

Rows: 299
Columns: 14
$ age                      [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ~
$ anaemia                  [3m[90m<int>[39m[23m 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ~
$ creatinine_phosphokinase [3m[90m<dbl>[39m[23m 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ diabetes                 [3m[90m<int>[39m[23m 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ~
$ ejection_fraction        [3m[90m<dbl>[39m[23m 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ high_blood_pressure      [3m[90m<int>[39m[23m 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, ~
$ platelets                [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ serum_creatinine         [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ~
$ serum_sodium             [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ sex                      [3m[90m<int>[39m[23m 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,

In [21]:
# Preview the first six rows of HeartFail in its current state.
utils::head(HeartFail, n = 6L)

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,creatinine_Phosphokinase
Unnamed: 0_level_1,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>
1,0,0,0,0,1,1,0,0,0,1,0,1,1,0
2,0,0,1,0,0,0,0,0,0,1,0,1,1,1
3,0,0,0,0,1,0,0,0,0,1,1,1,1,0
4,0,1,0,0,1,0,0,0,0,1,0,1,1,0
5,0,1,0,1,1,0,0,1,1,0,0,1,1,0
6,1,1,0,0,0,1,0,0,0,1,1,1,1,0


In [22]:
# Show the structure (column names, types, leading values) of HeartFail.
utils::str(object = HeartFail)

'data.frame':	299 obs. of  14 variables:
 $ age                     : num  0 0 0 0 0 1 0 0 0 1 ...
 $ anaemia                 : int  0 0 0 1 1 1 1 1 0 1 ...
 $ creatinine_phosphokinase: num  0 1 0 0 0 0 0 0 0 0 ...
 $ diabetes                : int  0 0 0 0 1 0 0 1 0 0 ...
 $ ejection_fraction       : num  1 0 1 1 1 0 1 0 0 0 ...
 $ high_blood_pressure     : int  1 0 0 0 0 1 0 0 0 1 ...
 $ platelets               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ serum_creatinine        : num  0 0 0 0 1 0 0 0 0 1 ...
 $ serum_sodium            : num  0 0 0 0 1 0 0 0 0 0 ...
 $ sex                     : int  1 1 1 1 0 1 1 1 0 1 ...
 $ smoking                 : int  0 0 1 0 0 1 0 1 0 1 ...
 $ time                    : num  1 1 1 1 1 1 1 1 1 1 ...
 $ DEATH_EVENT             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ creatinine_Phosphokinase: num  0 1 0 0 0 0 0 0 0 0 ...


#### I made an error with creatinine_phosphokinase: I capitalized the P above, which created an extra column (creatinine_Phosphokinase). I had to drop that column, so from here on I am using HeartFail1, as you see below.

In [23]:
# Build HeartFail1 as HeartFail without the accidentally created,
# mis-capitalised duplicate column `creatinine_Phosphokinase`.
# The column is dropped by name instead of by position (1:13) so the result is
# the same whether or not the stray column exists, and it does not depend on
# the column order.
HeartFail1 <- HeartFail[
  ,
  names(HeartFail) != "creatinine_Phosphokinase",
  drop = FALSE
]

In [24]:
# Preview the first six rows of the cleaned data frame.
utils::head(HeartFail1, n = 6L)

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
Unnamed: 0_level_1,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>
1,0,0,0,0,1,1,0,0,0,1,0,1,1
2,0,0,1,0,0,0,0,0,0,1,0,1,1
3,0,0,0,0,1,0,0,0,0,1,1,1,1
4,0,1,0,0,1,0,0,0,0,1,0,1,1
5,0,1,0,1,1,0,0,1,1,0,0,1,1
6,1,1,0,0,0,1,0,0,0,1,1,1,1


In [29]:
# Age model: cross-tabulate the values in the `age` column (used as
# predictions) against the observed DEATH_EVENT.
# caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["age"]] <- as.factor(HeartFail1[["age"]])
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["age"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat


Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 193  78
         1  10  18
                                          
               Accuracy : 0.7057          
                 95% CI : (0.6505, 0.7567)
    No Information Rate : 0.6789          
    P-Value [Acc > NIR] : 0.1767          
                                          
                  Kappa : 0.17            
                                          
 Mcnemar's Test P-Value : 9.183e-13       
                                          
            Sensitivity : 0.9507          
            Specificity : 0.1875          
         Pos Pred Value : 0.7122          
         Neg Pred Value : 0.6429          
             Prevalence : 0.6789          
         Detection Rate : 0.6455          
   Detection Prevalence : 0.9064          
      Balanced Accuracy : 0.5691          
                                          
       'Positive' Class : 0               
                              

#### The balanced accuracy is 56.9% for age predicting heart failure
#### Meets assumption for sample size

In [31]:
# Creatinine phosphokinase model: cross-tabulate the values in the
# `creatinine_phosphokinase` column (used as predictions) against the observed
# DEATH_EVENT. caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["creatinine_phosphokinase"]] <- as.factor(
  HeartFail1[["creatinine_phosphokinase"]]
)
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["creatinine_phosphokinase"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 203  94
         1   0   2
                                          
               Accuracy : 0.6856          
                 95% CI : (0.6297, 0.7378)
    No Information Rate : 0.6789          
    P-Value [Acc > NIR] : 0.4291          
                                          
                  Kappa : 0.0281          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 1.00000         
            Specificity : 0.02083         
         Pos Pred Value : 0.68350         
         Neg Pred Value : 1.00000         
             Prevalence : 0.67893         
         Detection Rate : 0.67893         
   Detection Prevalence : 0.99331         
      Balanced Accuracy : 0.51042         
                                          
       'Positive' Class : 0               
                              

#### The balanced accuracy is 51% for creatinine phosphokinase predicting heart failure
#### This does not meet the assumption for sample size

In [33]:
# Ejection fraction model: cross-tabulate the values in the
# `ejection_fraction` column (used as predictions) against the observed
# DEATH_EVENT. caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["ejection_fraction"]] <- as.factor(
  HeartFail1[["ejection_fraction"]]
)
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["ejection_fraction"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 200  76
         1   3  20
                                         
               Accuracy : 0.7358         
                 95% CI : (0.682, 0.7849)
    No Information Rate : 0.6789         
    P-Value [Acc > NIR] : 0.01921        
                                         
                  Kappa : 0.2421         
                                         
 Mcnemar's Test P-Value : 5.467e-16      
                                         
            Sensitivity : 0.9852         
            Specificity : 0.2083         
         Pos Pred Value : 0.7246         
         Neg Pred Value : 0.8696         
             Prevalence : 0.6789         
         Detection Rate : 0.6689         
   Detection Prevalence : 0.9231         
      Balanced Accuracy : 0.5968         
                                         
       'Positive' Class : 0              
                                         

#### Balanced accuracy for ejection fraction is 59.7%; however, this does not meet the sample size requirement

In [34]:
# Platelets model: cross-tabulate the values in the `platelets` column (used
# as predictions) against the observed DEATH_EVENT.
#
# Every value in `platelets` is 0 here, so as.factor() would create a factor
# with the single level "0". caret::confusionMatrix() then warns that the
# levels do not match the reference and re-factors the data itself. Declaring
# both levels (0 and 1) up front keeps prediction and reference aligned and
# avoids the warning; the resulting table is unchanged.
HeartFail1$platelets <- factor(HeartFail1$platelets, levels = c(0, 1))
HeartFail1$DEATH_EVENT <- factor(HeartFail1$DEATH_EVENT, levels = c(0, 1))
conf_mat <- caret::confusionMatrix(
  data = HeartFail1$platelets,
  reference = HeartFail1$DEATH_EVENT
)
conf_mat

"Levels are not in the same order for reference and data. Refactoring data to match."


Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 203  96
         1   0   0
                                          
               Accuracy : 0.6789          
                 95% CI : (0.6228, 0.7315)
    No Information Rate : 0.6789          
    P-Value [Acc > NIR] : 0.5276          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 1.0000          
            Specificity : 0.0000          
         Pos Pred Value : 0.6789          
         Neg Pred Value :    NaN          
             Prevalence : 0.6789          
         Detection Rate : 0.6789          
   Detection Prevalence : 1.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : 0               
                              

In [35]:
# Serum creatinine model: cross-tabulate the values in the `serum_creatinine`
# column (used as predictions) against the observed DEATH_EVENT.
# caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["serum_creatinine"]] <- as.factor(
  HeartFail1[["serum_creatinine"]]
)
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["serum_creatinine"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 192  79
         1  11  17
                                          
               Accuracy : 0.699           
                 95% CI : (0.6435, 0.7505)
    No Information Rate : 0.6789          
    P-Value [Acc > NIR] : 0.2491          
                                          
                  Kappa : 0.1511          
                                          
 Mcnemar's Test P-Value : 1.636e-12       
                                          
            Sensitivity : 0.9458          
            Specificity : 0.1771          
         Pos Pred Value : 0.7085          
         Neg Pred Value : 0.6071          
             Prevalence : 0.6789          
         Detection Rate : 0.6421          
   Detection Prevalence : 0.9064          
      Balanced Accuracy : 0.5614          
                                          
       'Positive' Class : 0               
                              

#### Serum Creatinine has 56% balanced accuracy and meets the assumption for sample size

In [37]:
# Serum sodium model: cross-tabulate the values in the `serum_sodium` column
# (used as predictions) against the observed DEATH_EVENT.
# caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["serum_sodium"]] <- as.factor(HeartFail1[["serum_sodium"]])
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["serum_sodium"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 199  89
         1   4   7
                                         
               Accuracy : 0.689          
                 95% CI : (0.6331, 0.741)
    No Information Rate : 0.6789         
    P-Value [Acc > NIR] : 0.381          
                                         
                  Kappa : 0.0694         
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.98030        
            Specificity : 0.07292        
         Pos Pred Value : 0.69097        
         Neg Pred Value : 0.63636        
             Prevalence : 0.67893        
         Detection Rate : 0.66555        
   Detection Prevalence : 0.96321        
      Balanced Accuracy : 0.52661        
                                         
       'Positive' Class : 0              
                                         

#### Serum sodium has a 52.7% balanced accuracy 

In [38]:
# Follow-up time model: cross-tabulate the values in the `time` column (used
# as predictions) against the observed DEATH_EVENT.
# caret::confusionMatrix() requires both inputs to be factors.
HeartFail1[["DEATH_EVENT"]] <- as.factor(HeartFail1[["DEATH_EVENT"]])
HeartFail1[["time"]] <- as.factor(HeartFail1[["time"]])
conf_mat <- caret::confusionMatrix(
  data = HeartFail1[["time"]],
  reference = HeartFail1[["DEATH_EVENT"]]
)
conf_mat

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 190  35
         1  13  61
                                          
               Accuracy : 0.8395          
                 95% CI : (0.7929, 0.8792)
    No Information Rate : 0.6789          
    P-Value [Acc > NIR] : 2.136e-10       
                                          
                  Kappa : 0.6081          
                                          
 Mcnemar's Test P-Value : 0.002437        
                                          
            Sensitivity : 0.9360          
            Specificity : 0.6354          
         Pos Pred Value : 0.8444          
         Neg Pred Value : 0.8243          
             Prevalence : 0.6789          
         Detection Rate : 0.6355          
   Detection Prevalence : 0.7525          
      Balanced Accuracy : 0.7857          
                                          
       'Positive' Class : 0               
                              

#### Time has a 78.6% balanced accuracy and meets sample size requirements