In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing required libraries for visualization
import warnings
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / seaborn / scikit-learn.
# Consider narrowing the filter so that breaking API changes stay visible.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the framingham.csv data set from the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-disease-prediction-using-logistic-regression/framingham.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Preview the first five rows of the data set.
df.head(n=5)

Performing EDA

Step 1: Handling missing values

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

Our dataset has 4,238 rows and 16 columns.

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

The columns education, cigsPerDay, BPMeds, totChol, BMI, heartRate and glucose have missing values that need to be handled. The output below shows how many values are missing in each column.

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Handle the missing values by filling each column with its mean, median, or mode.

In [None]:
# Impute the missing values column by column and assign the result back to df.
# The previous form, df[col].fillna(..., inplace=True), modifies a temporary
# column selection: it is deprecated in pandas and, with Copy-on-Write enabled,
# leaves df unchanged without raising an error.
fill_values = {
    'education': 1,                            # fixed value (presumably the mode - TODO confirm)
    'cigsPerDay': df['cigsPerDay'].median(),
    'BPMeds': 0,                               # fixed value (presumably the mode - TODO confirm)
    'totChol': df['totChol'].mean(),
    'BMI': df['BMI'].mean(),
    'heartRate': df['heartRate'].mean(),
    'glucose': df['glucose'].mean(),
}
# A dict passed to DataFrame.fillna fills only the listed columns.
df = df.fillna(value=fill_values)

In [None]:
# Re-count the missing values per column after imputation.
df.isna().sum()

All the missing values have been handled.

Step 2: Handling Outliers

In [None]:
# Box plot of heartRate; whis=3 extends the whiskers to 3x the IQR so only
# extreme points are drawn as outliers.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x='heartRate', whis=3, ax=ax)
plt.show()

In [None]:
# Show the rows whose heart rate is above 125 (the outlier candidates).
is_heart_rate_outlier = df['heartRate'] > 125
df.loc[is_heart_rate_outlier]

In [None]:
# Drop the heart-rate outliers by condition instead of by hard-coded row labels
# (previously 339, 358 and 3142 - presumably the rows with heartRate > 125;
# verify against the preceding cell's output). Selecting by condition keeps the
# cell correct if the data changes and lets it be re-run without a KeyError.
HEART_RATE_OUTLIER_THRESHOLD = 125
outlier_index = df.index[df['heartRate'] > HEART_RATE_OUTLIER_THRESHOLD]
df = df.drop(index=outlier_index)

Step 3: Handling Skewness

In [None]:
# Split the frame into numerical and categorical parts.
# .copy() makes both parts independent frames: df_num is modified in later
# cells, and writing to a plain column selection of df would be an ambiguous
# write to a slice (SettingWithCopyWarning).
numerical_columns = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
categorical_columns = ['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD']
df_num = df[numerical_columns].copy()
df_cat = df[categorical_columns].copy()

In [None]:
from scipy.stats import skew

# Print the skewness of every numerical column and plot its distribution.
for col in df_num:
    try:
        print(col, "=", skew(df_num[col]))
        # histplot(kde=True, stat="density") is the replacement for the
        # deprecated sns.distplot (histogram plus KDE on a density scale).
        sns.histplot(df_num[col], kde=True, stat="density")
        plt.show()
    except Exception as exc:
        # Stay best-effort per column, but report the failure instead of
        # silently swallowing it.
        print("Could not analyse", col, ":", exc)
    finally:
        print("**********************************************")

All the numerical columns are positively skewed and have low correlation with the target variable.
None of the columns contains negative values, so square-root and log transforms can be applied safely to reduce the skewness.

In [None]:
# Transform applied to each skewed column: square root for most columns,
# natural log for sysBP.
skew_transforms = {
    'cigsPerDay': np.sqrt,
    'totChol': np.sqrt,
    'sysBP': np.log,
    'diaBP': np.sqrt,
    'BMI': np.sqrt,
}
for column_name, transform in skew_transforms.items():
    df_num[column_name] = transform(df_num[column_name])

The glucose column is highly skewed. Because glucose is strongly correlated with the diabetes feature, the diabetes feature is kept for model prediction and glucose is excluded from the dataset.

In [None]:
# Remove the glucose column from the numerical features.
df_num = df_num.drop(columns=['glucose'])

Concatenating the categorical and numerical datasets

In [None]:
# Put the numerical and categorical columns back side by side (aligned on the row index).
feature_parts = [df_num, df_cat]
df_new = pd.concat(feature_parts, axis='columns')

Performing scaling over the dataset using Min-Max Scaler

In [None]:
# First five rows of the combined data set before scaling.
df_new.head(n=5)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each column independently to [0, 1], so fitting one
# scaler on the whole frame gives the same values as one scaler per column.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# (and also on the TenYearCHD target column); fitting on the training split
# only would avoid leaking test-set statistics - consider restructuring.
scaler = MinMaxScaler()
df_new = pd.DataFrame(
    scaler.fit_transform(df_new),
    columns=df_new.columns,
    index=df_new.index,
)
df_new.head()

Modelling and Feature Selection

In [None]:
# Divide the data set into train (70%) and test (30%) parts.
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is TenYearCHD.
x = df_new.drop('TenYearCHD', axis=1)
y = df_new['TenYearCHD']

# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible across "Restart & Run All".
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

Performing Logistic Regression on the dataset

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression model on the training split
# (fit returns the estimator itself) ...
logr = LogisticRegression().fit(x_train, y_train)

# ... and predict the class labels of the held-out test split.
y_hat = logr.predict(x_test)

In [None]:
# Import the metrics used to check model performance.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of the logistic-regression predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)

In [None]:
# Print the confusion matrix currently stored in cm.
print(cm)

In [None]:
# Report the standard classification metrics for the test-set predictions.
metric_table = (
    ("Accuracy Score: ", accuracy_score),
    ("Recall Score: ", recall_score),
    ("Precision Score: ", precision_score),
    ("F1 Score: ", f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(y_test, y_hat))

Plotting ROC-AUC Curve

In [None]:
from sklearn.metrics import roc_auc_score

# ROC-AUC measures how well the model ranks positives above negatives, so it
# must be computed from the predicted probability of the positive class.
# Passing the hard 0/1 predictions (y_hat) evaluates a single threshold only
# and understates the score.
y_score = logr.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_test, y_score))

In [None]:
from sklearn.metrics import roc_curve

# Build the curve from the predicted probability of the positive class so that
# every decision threshold contributes a point; with hard 0/1 predictions the
# "curve" has only one operating point between (0, 0) and (1, 1).
y_score = logr.predict_proba(x_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, 'r-', label="Logistic Model")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()

ROC-AUC score for Logistic regression is pretty low 

Checking the performance using Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and report its test accuracy.
# random_state is fixed because tree construction involves randomness,
# so the score would otherwise change from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

Performing feature selection using the ANOVA test

In [None]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression

# Keep the 10 features with the highest ANOVA F-score against the class label.
# f_classif is the ANOVA F-test intended for a categorical target;
# f_regression (used previously) is meant for continuous targets.
annova = SelectKBest(score_func=f_classif, k=10)
annova.fit(x_train, y_train)
x_train_annova = annova.transform(x_train)
x_test_annova = annova.transform(x_test)

# Refit logistic regression on the selected features only.
lr = LogisticRegression()
lr.fit(x_train_annova, y_train)
y_hat_annova = lr.predict(x_test_annova)

# LogisticRegression.score returns mean accuracy, so the numbers are labelled
# as train/test accuracy (they are not bias or variance estimates).
print("Train accuracy = ", lr.score(x_train_annova, y_train))
print("Test accuracy = ", lr.score(x_test_annova, y_test))


In [None]:
# Confusion matrix for the model trained on the selected features
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_hat_annova)

In [None]:
# Print the confusion matrix currently stored in cm.
print(cm)