In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then list them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Intro

**Dataset** The Framingham Heart Study dataset available on Kaggle is a collection of over 4000 observations of subjects who are (or are not) at 10-year risk of coronary heart disease (CHD). The dataset includes 15 demographic, behavioural and medical variables, plus the variable 'TenYearCHD' indicating the 10-year risk of CHD.

**Analysis** Logistic regression will be used for binary classification of the 10-year risk / no risk of CHD (dependent variable). The independent variables will be the 15 numerical and categorical variables of this dataset. The accuracy of logistic regression will be compared to that of other linear approaches.

### Pipeline
- **Import** of python packages and of the dataset
- **Exploration and visualization** of the dataset 
- **Preprocessing**: missing data, feature engineering, normalization of continuous variables  
- **Logistic regression analysis and comparison** with other linear approaches

## Import packages and modules

In [None]:
# data handling 
import numpy as np 
import pandas as pd

from statistics import mode, median

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings 
warnings.simplefilter('ignore') 
%matplotlib inline 

In [None]:
# machine learning
# data preprocessing
from sklearn.preprocessing import MinMaxScaler, Normalizer

# linear models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# models metrics
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# drop it from the next line when running on a newer scikit-learn.
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# models validation
from sklearn.model_selection import train_test_split, KFold, cross_val_score

### Import data

In [None]:
# Location of the Framingham CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-disease-prediction-using-logistic-regression/framingham.csv'
df = pd.read_csv(DATA_PATH)

## Explore 

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# descriptive statistics for the columns of df
df.describe()

### Split variables 
between categorical(binary/ordinal)   
and numeric(continuous/discrete)

In [None]:
# Group the column names by measurement type.
bin_vars = [  # binary variables (includes the dependent variable TenYearCHD)
    'male',
    'currentSmoker',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'TenYearCHD',
]
ord_vars = ['education']  # ordinal
con_vars = ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']  # continuous
dis_vars = ['age', 'cigsPerDay']  # discrete

### Visualize categorical
Several binary variables are unbalanced, with only a few positive observations (value = 1).

In [None]:
# Percentage of observations in each category, one column per binary variable.
bin_count = pd.DataFrame(columns=bin_vars)
for name in bin_vars:
    column = df[name]
    bin_count[name] = round(column.value_counts() / column.count() * 100)

# One stacked bar per variable.
ax = bin_count.T.plot(kind="bar", stacked=True, color=['cadetblue', 'indianred'])
ax.legend(loc='lower left')
ax.set_ylabel("Percentage", fontsize=12)
plt.show()

In [None]:
# Pie chart of the share of subjects in each education level.
education_counts = df.education.value_counts()
education_counts.plot(
    kind='pie',
    startangle=90,
    figsize=(3, 3),
    autopct='%1.1f%%',
    legend=False,
    colors=['indianred', 'darkorange', 'darkolivegreen', 'forestgreen'],
)
plt.tight_layout()

### Visualize numeric
Some numeric variables appear to be skewed (glucose) and some highly correlated (sysBP and diaBP)

In [None]:
# All numeric variables: continuous first, then discrete.
num_vars = [*con_vars, *dis_vars]

In [None]:
# One box per numeric variable, all on the same axes.
numeric_values = df[num_vars]
sns.boxplot(data=numeric_values)
plt.show()

In [None]:
# Overlay the distribution of every numeric variable on a single set of axes.
fig, ax = plt.subplots()
for name in num_vars:
    sns.distplot(df[name], bins=5, label=name, axlabel=False, ax=ax)
ax.legend()
plt.show()

In [None]:
# Skewness of each numeric variable, shown as a bar chart.
skewness = df[num_vars].skew()
skewness.plot(
    kind='bar',
    ylabel="Skewness",
    title='Skewness of variables',
    figsize=(5, 2),
    legend=False,
    fontsize=12,
    use_index=True,
    rot=45,
    color='cadetblue',
)
plt.show()

In [None]:
# Pearson correlation between the numeric variables.
corr = df[num_vars].corr(method='pearson')

# Hide the upper triangle (diagonal included) so each pair is shown only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(corr, mask=mask, annot=True, fmt=".1f", cmap="YlGnBu")
plt.show()

In [None]:
# Pairs of variables (x, y) to inspect with a scatter plot and a fitted regression line.
combis = (('sysBP', 'diaBP'), ('sysBP', 'BMI'), ('sysBP', 'age'))

# One panel per pair, laid out in a single row.
fig, axes = plt.subplots(1, len(combis), figsize=(7, 3))

for ax, (x_var, y_var) in zip(axes, combis):
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally (the positional form raises TypeError there).
    sns.regplot(x=df[x_var], y=df[y_var],
                line_kws={'color': 'indianred'},
                scatter_kws={'color': 'cadetblue'},
                ax=ax)

fig.tight_layout()

## Preprocessing

### Missing data
Missing values will be replaced by central tendency statistics:
- continuous vars --> mean
- binary and ordinal vars --> mode
- discrete vars --> median


No variables will be dropped because missing percent < 15%

In [None]:
# Map of missing values: one row per observation, one column per variable.
missing_mask = df.isna()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='summer')
plt.show()

In [None]:
# Percentage of missing values per column, largest first.
missing_pct = df.isna().sum() / len(df) * 100
md = missing_pct.sort_values(ascending=False)

# Bar chart restricted to the columns that actually contain missing values.
md[md.values > 0].plot(
    kind='bar',
    ylabel='Missing (%)',
    title="Percentage of missing values per variable",
    figsize=(5, 2),
    legend=False,
    rot=45,
    color='darkolivegreen',
)
plt.show()

In [None]:
# Replace missing data with a central-tendency statistic chosen by variable type.
#
# The statistics are computed with pandas, whose mean/median/mode skip NaN.
# The stdlib `statistics.median` sorts the raw values, and NaN breaks the sort
# order, so it can return a wrong median for a column that still has gaps.
#
# The result of `fillna` is assigned back to the column instead of using
# `df[var].fillna(..., inplace=True)`: the in-place form operates on a temporary
# object and does not update `df` under pandas Copy-on-Write.

for var in md.get(md.values > 0).index:
    if var in con_vars:  # continuous --> mean
        avg = df[var].mean()
        df[var] = df[var].fillna(avg)
        print(f'{var}, average: {round(avg,2)}')
    elif var in bin_vars or var in ord_vars:  # binary/ordinal --> mode
        # Series.mode() returns every mode in sorted order; take the first one.
        mod = df[var].mode().iloc[0]
        df[var] = df[var].fillna(mod)
        print(f'{var}, mode: {mod}')
    elif var in dis_vars:  # discrete --> median
        med = df[var].median()
        df[var] = df[var].fillna(med)
        print(f'{var}, median: {med}')

In [None]:
#check: the map should no longer show any missing value
remaining_missing = df.isna()
sns.heatmap(remaining_missing, yticklabels=False, cbar=False, cmap='summer')
plt.show()

### Feature engineering
The variables currentSmoker and cigsPerDay are redundant.   
I will create a multi-level categorical variable where:
- 0 are non-smokers, 
- 1 are light smokers (1-20 cigsPerDay) 
- 2 are heavy smokers (> 20 cigsPerDay)

In [None]:
def smoker(sub):
    """Map a subject's daily cigarette count to an ordinal smoking level.

    Parameters
    ----------
    sub : mapping-like (e.g. a DataFrame row)
        Must provide a numeric ``'cigsPerDay'`` entry.

    Returns
    -------
    int or None
        0 for non-smokers (0 cigarettes per day),
        1 for light smokers (more than 0, up to and including 20 per day),
        2 for heavy smokers (more than 20 per day).
        ``None`` when the count is negative or NaN and cannot be classified.
    """
    cigs = sub['cigsPerDay']
    if cigs == 0:
        return 0
    # The upper bound is "<= 20" so that a fractional count such as 20.5
    # falls in the heavy group, as "> 20" requires.
    if 0 < cigs <= 20:
        return 1
    if cigs > 20:
        return 2
    # Negative or NaN counts match none of the comparisons above.
    return None

# Classify every subject (row) with smoker() and store the level in a new column.
df['smoker'] = df.apply(smoker, axis=1)

In [None]:
# drop the two original smoking variables
redundant_columns = ['currentSmoker', 'cigsPerDay']
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# update the variable groups: currentSmoker and cigsPerDay are gone, smoker is new
bin_vars = [  # binary (TenYearCHD is kept last)
    'male',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'TenYearCHD',
]
ord_vars = ['education', 'smoker']  # ordinal
con_vars = ['totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']  # continuous
dis_vars = ['age']  # discrete

In [None]:
# reorder variables: binary, ordinal, discrete, continuous, then TenYearCHD last
predictor_columns = bin_vars[:5] + ord_vars + dis_vars + con_vars  # first five binary vars
new_order = predictor_columns + ['TenYearCHD']
df = df[new_order]
df.head(2)

### Normalization
of numerical variables

In [None]:
# Numeric variables after feature engineering: continuous first, then discrete.
num_vars = [*con_vars, *dis_vars]

In [None]:
# same scale btw 0 and 1
df[num_vars] = MinMaxScaler(feature_range=(0, 1)).fit_transform(df[num_vars])

# normalize
# Fit and transform on the same columns. Fitting on con_vars (6 columns) and then
# transforming num_vars (7 columns) raises a feature-count ValueError on recent
# scikit-learn. Normalizer keeps no fitted parameters, so the values produced
# are the same as before wherever the old call ran.
df[num_vars] = Normalizer().fit_transform(df[num_vars])

In [None]:
#check: box plots of the numeric variables after scaling
scaled_values = df[num_vars]
sns.boxplot(data=scaled_values)
plt.show()

In [None]:
#check: distributions of the numeric variables after scaling, on shared axes
fig, ax = plt.subplots()
for name in num_vars:
    sns.distplot(df[name], bins=5, label=name, axlabel=False, ax=ax)
ax.legend()
plt.show()

## Logistic regression classification


In [None]:
# Inputs are every column except the last; the output is the last column.
array = df.to_numpy()
X, Y = array[:, :-1], array[:, -1]

# Hold out a third of the observations for testing (shuffled, fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=7, shuffle=True
)

# Fit the logistic regression on the training set.
LR = LogisticRegression(solver='liblinear', random_state=0)
LR.fit(X_train, Y_train)

# accuracy: correct predictions divided by the number of test observations
test_accuracy = LR.score(X_test, Y_test)
print(f"Accuracy: {round(test_accuracy * 100.0, 2)}")

In [None]:
# Per-class precision, recall and F1 on the test set.
test_predictions = LR.predict(X_test)
report = classification_report(Y_test, test_predictions)
print('Classification report:\n', report)

In [None]:
# Confusion matrices of the logistic regression on the test set:
# raw counts, and counts normalised over the true class (rows).
#
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `confusion_matrix` + `ConfusionMatrixDisplay` draw the same figure and have
# been available since 0.22.

Y_pred = LR.predict(X_test)  # predict once and reuse for both matrices

#confusion matrix (raw counts)
cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])

# Plot confusion matrix
titles = [("Not normalized confusion matrix", None),
          ("Normalized confusion matrix", 'true')]
for title, normalize in titles:
    if normalize is None:
        matrix = cm
    else:
        matrix = confusion_matrix(Y_test, Y_pred, labels=[0, 1], normalize=normalize)
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=['No risk (0)', 'Risk (1)'])
    disp.plot(cmap=plt.cm.Greens)
    disp.ax_.set_title(title)
    disp.ax_.grid(False)  # no grid lines over the matrix cells
    print(title)
    print(disp.confusion_matrix)

plt.show()

### Different models

Comparison between different linear classification algorithms: 

- Logistic regression LR
- Perceptron
- Linear support vector machine SVM
- Gaussian naive bayes classifier GNB
- Linear discriminant analysis LDA

In [None]:
# Candidate linear classifiers, keyed by the short label used in the printout.
candidates = {
    'logReg': LogisticRegression(),
    'Perc': Perceptron(tol=1e-3, random_state=0),
    'linearSVC': LinearSVC(),
    'GauNaiBay': GaussianNB(),
    'linearDA': LinearDiscriminantAnalysis(),
}
names = list(candidates)
mods = list(candidates.values())

# Fit each model on the training set and report its accuracy on the test set.
for label, model in candidates.items():
    model.fit(X_train, Y_train)
    accuracy = model.score(X_test, Y_test)
    print(f"Accuracy {label}: {round(accuracy * 100, 2)}")

### Conclusions

The logistic regression model (like the linear SVC) shows an accuracy of nearly 85%; however, it produces many false negatives, i.e. it misses subjects who are at risk. 

**The dataset**
- The amount of missing data is limited
- Some categorical variables have few observations in one of the categories
- Some continuous variables show a non-normal distribution
- Some continuous variables show correlation

**The analysis**
- Logistic regression is the algorithm showing the best accuracy
- However, the sensitivity is very low, which results in many false negatives 

**Suggestions**  
- Use an extended version of the database (more balanced variables with a more normal distribution)
- Try non-linear models 
- Try metrics other than accuracy to evaluate the models 
- Try different preprocessing approaches and feature selection 