In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Purpose
Fit a logistic regression model to predict a subject's 10-year risk of coronary heart disease (CHD).
The target variable is `TenYearCHD` (1 = YES, 0 = NO).

### Data description
[Framingham Heart Disease Dataset](https://www.kaggle.com/amanajmera1/framingham-heart-study-dataset)

### Load data

In [2]:
# Load the raw Framingham CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/framingham.csv')

In [3]:
# Preview the first rows to sanity-check the column names and values.
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


#### Original shape of the dataframe

In [17]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4240, 16)

In [15]:
df.dtypes # storage dtype of each column

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

#### Check whether there are missing values in each variable

In [16]:
# Per column: True if at least one entry is missing.
df.isna().any()

male               False
age                False
education           True
currentSmoker      False
cigsPerDay          True
BPMeds              True
prevalentStroke    False
prevalentHyp       False
diabetes           False
totChol             True
sysBP              False
diaBP              False
BMI                 True
heartRate           True
glucose             True
TenYearCHD         False
dtype: bool

### Find missing values in each row

In [21]:
# Number of non-missing entries in each row, via the vectorized DataFrame.count
# rather than a row-wise apply. An already-present 'full_cnt' column is excluded
# first so that re-running this cell does not count the helper column itself
# (which would inflate every row's count by one).
df['full_cnt'] = df.drop(columns='full_cnt', errors='ignore').count(axis=1)

In [29]:
# Keep only the rows in which every column is populated (complete cases).
# The filter is derived from the data itself instead of a hard-coded column
# count, so it stays correct if columns are added or removed upstream.
# All columns of df, including 'full_cnt', are retained.
df_cdh = df[df.notna().all(axis=1)]

In [30]:
df_cdh.shape  # complete-case rows remaining (3658 in the recorded run)

(3658, 17)

In [31]:
# Fraction of the original rows that survive the complete-case filter,
# computed from the frames themselves rather than hand-copied row counts.
round(len(df_cdh) / len(df), 2)

0.86

#### Check whether the new dataframe has missing values

In [32]:
# Per column: True if any entry is still missing after the row filter.
df_cdh.isna().any()

male               False
age                False
education          False
currentSmoker      False
cigsPerDay         False
BPMeds             False
prevalentStroke    False
prevalentHyp       False
diabetes           False
totChol            False
sysBP              False
diaBP              False
BMI                False
heartRate          False
glucose            False
TenYearCHD         False
full_cnt           False
dtype: bool

In [38]:
# Preview the frame without the response column.
df_cdh.drop(columns=['TenYearCHD']).head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,full_cnt
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,16
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,16
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,16
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,16
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,16


In [40]:
# Response: the 0/1 CHD indicator.
df_y = df_cdh.loc[:, 'TenYearCHD']
# Predictors: everything except the response and the row-count helper column.
df_x = df_cdh.drop(columns=['TenYearCHD', 'full_cnt'])

### Split the data into 70% training and 30% test samples

In [33]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=100,
)

### Fit a logistic regression

In [51]:
from sklearn.linear_model  import LogisticRegression

In [55]:
# Pin the solver instead of relying on the library default, which differs between
# scikit-learn releases. 'liblinear' is the solver shown in the recorded model
# repr, so pinning it keeps the fitted coefficients reproducible on re-runs.
lr = LogisticRegression(solver='liblinear')
# fit() is left as the last expression so the fitted estimator is displayed.
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### The fitted coefficients

In [53]:
# Fitted coefficients: one value per predictor column of x_train.
print(lr.coef_)

[[ 0.53237364  0.05109603 -0.10861581 -0.16791442  0.02063026  0.14861338
   0.64507278  0.55000253  0.07677307  0.00145946  0.01283651 -0.01058918
  -0.0294604  -0.00770306  0.00520004]]


#### The fitted intercept

In [54]:
# Fitted intercept (constant term) of the model.
print(lr.intercept_)

[-5.02192261]


### Using test data to predict `TenYearCHD`

In [110]:
# Predicted class label (0 or 1) for every row of the held-out test set.
y_pred_TenYearCHD = lr.predict(x_test)

To compare the predicted results with the original test data, we need a confusion matrix, which is defined as:

### <center> Confusion Matrix </center>
|           |Observe YES      |Observe NO        |
|-----------|-----------------|------------------|
|Predict YES|True Positive  (TP)|False Positive  (FP)|
|Predict  NO|False Negative (FN)|True Negative   (TN)|

### Making confusion matrix manually

In [123]:
# Convert the response from a pandas Series to a plain NumPy array so it is
# addressed by position (0..n-1) rather than by the shuffled row labels the
# Series carries over from the train/test split.
y_test = np.array(y_test)

In [125]:
# Tally the four confusion-matrix cells with element-wise boolean masks
# (observed vs. predicted) instead of walking the samples one index at a time.
observed_yes = (y_test == 1)
observed_no = (y_test == 0)
predicted_yes = (y_pred_TenYearCHD == 1)
predicted_no = (y_pred_TenYearCHD == 0)

cnt_tp = int(np.count_nonzero(observed_yes & predicted_yes))  # true positives
cnt_fp = int(np.count_nonzero(observed_no & predicted_yes))   # false positives
cnt_fn = int(np.count_nonzero(observed_yes & predicted_no))   # false negatives
cnt_tn = int(np.count_nonzero(observed_no & predicted_no))    # true negatives

In [126]:
# Sanity check: the four cells together should account for every test sample.
sum((cnt_tp, cnt_fp, cnt_fn, cnt_tn))

1098

### <center> Result confusion matrix </center>
|           |Observe YES      |Observe NO        |
|-----------|-----------------|------------------|
|Predict YES|                9|                 3|
|Predict  NO|              164|               922|
|      Total|              173|               925|

### Compute Accuracy
The formula for computing the accuracy of the fitted logistic regression model is:

$$Accuracy = \frac{TP+TN}{N}$$

where $N$ is the sample size of the test data.

In [127]:
# Accuracy = (TP + TN) / N, taken from the confusion-matrix counters rather than
# hand-copied numbers, so the value cannot drift from the counts it summarizes.
round((cnt_tp + cnt_tn) / len(y_test), 3)

0.848

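The prediction accuracy of our fitted model is about 84.8%. Note, however, that the classes are imbalanced: always predicting NO would already reach 925/1098 ≈ 84.2% accuracy, and the model identifies only 9 of the 173 observed CHD cases (recall ≈ 5.2%), so accuracy alone overstates how useful the model is.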