In [None]:
# Data Loading and Numerical Operations
import pandas as pd
import numpy as np
# Data Visualizations
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Data Resampling
from sklearn.utils import resample
# Data Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Data Splitting
from sklearn.model_selection import train_test_split
# Data Scaling
from sklearn.preprocessing import MinMaxScaler
# Data Modeling
from sklearn.linear_model import LogisticRegression

from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV


# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load framingham.csv from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/heart-disease-prediction-using-logistic-regression/framingham.csv"
)

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# (number of rows, number of columns) of the freshly loaded dataset.
data.shape

In [None]:
# Column names, non-null counts and dtypes. The first positional argument of
# DataFrame.info is `verbose`, so the truthy value is spelled out explicitly.
data.info(verbose=True)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep="first").sum()

In [None]:
# Most frequent glucose value (the first one when several values tie).
print(data["glucose"].mode().iloc[0])



Filling the missing values of the glucose column with
the mode of the data (Mode = 75) to reduce the amount of missing data in our dataset.


In [None]:
# Impute missing glucose readings with the column's most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data["glucose"]` selection: with pandas
# Copy-on-Write that chained in-place call operates on a temporary object and
# leaves `data` unchanged (pandas 2.x already emits a FutureWarning for it).
glucose_mode = data["glucose"].mode()[0]
data["glucose"] = data["glucose"].fillna(glucose_mode)

In [None]:
# Drop every row that still has at least one missing value, then confirm
# that no column has missing entries left.
data.dropna(axis=0, how="any", inplace=True)
data.isna().sum()

In [None]:
# One box per column on a wide, white-background canvas to spot outliers.
fig, ax = plt.subplots(figsize=(30, 10), facecolor='w')
sns.boxplot(data=data, ax=ax)
plt.show()



Removable Outliers are detected in totChol and sysBP columns of our dataset. 
Outliers in all other numerical columns are important and thus cannot be removed.
The Outlier present in totChol is 600.
The Outlier present in sysBP is 295.


In [None]:
# Largest totChol and sysBP readings, one per line.
print(data['totChol'].max(), data['sysBP'].max(), sep='\n')

In [None]:
# Keep only rows whose totChol is below 600 and whose sysBP is below 295.
# The original cell began with a bare `data.shape`, which is never displayed
# because it is not the last expression of the cell; the shape is shown
# after filtering instead so the effect of the filter is visible.
within_limits = (data['totChol'] < 600.0) & (data['sysBP'] < 295.0)
data = data[within_limits]
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap;
# the correlation table itself is displayed as the cell output.
cor = data.corr()
fig, ax = plt.subplots(figsize=(20, 10), facecolor='w')
sns.heatmap(
    cor,
    annot=True,
    xticklabels=cor.columns,
    yticklabels=cor.columns,
    ax=ax,
)
ax.set_title("Correlation among all the Variables of the Dataset", size=20)
cor

Compared to all the other independent variables, the correlation coefficient
between education and the target
variable TenYearCHD is very low and actually negative.

In [None]:
# Columns treated as categorical in the analysis below.
categorical_features = [
    'male',
    'education',
    'currentSmoker',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
]

In [None]:
# For each categorical column: its name, the frequency of every level, and a separator.
for feature in categorical_features:
    level_counts = data[feature].value_counts()
    print(feature, ':')
    print(level_counts, "-----------------", sep="\n")

In [None]:
# Grid of count plots, one per categorical feature.
num_plots = len(categorical_features)
total_cols = 2
# Ceiling division gives exactly enough rows for every plot. The previous
# `num_plots // total_cols + 1` added a completely empty row whenever the
# number of plots was a multiple of the column count.
total_rows = max(1, -(-num_plots // total_cols))
# squeeze=False keeps `axs` two-dimensional even for a single-row grid, so
# the [row][pos] indexing below always works.
fig, axs = plt.subplots(nrows=total_rows, ncols=total_cols,
                        figsize=(7*total_cols, 7*total_rows), facecolor='w',
                        constrained_layout=True, squeeze=False)
for i, var in enumerate(categorical_features):
    row, pos = divmod(i, total_cols)
    sns.countplot(x=var, data=data, ax=axs[row][pos])
# Hide grid cells that received no plot instead of showing empty axes.
for unused_ax in axs.flat[num_plots:]:
    unused_ax.set_visible(False)




Among the categorical features:

    BPMeds, prevalentStroke and diabetes are highly imbalanced.
    There are four levels of education, whereas the rest of the categorical features are all binary.
    The number of smokers and non-smokers in currentSmoker is almost the same.



In [None]:

# Numerical features: one histogram with a density curve per column.
numeric_features = ['cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
for feature in numeric_features:
    fig, ax = plt.subplots(figsize=(18, 10), facecolor='w')
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is seaborn's documented replacement (histogram plus KDE on a density scale).
    sns.histplot(data[feature], kde=True, stat="density", ax=ax)
    ax.set_title('{} Distribution'.format(feature), fontsize=20)
    plt.show()





Among the numerical features:

    totChol, sysBP, diaBP and BMI have a uniform distribution and the rest are unevenly distributed.
    cigsPerDay has a highly uneven distribution, with most of the data at 0.
    cigsPerDay shows considerable right skewness and sysBP shows slight right skewness.



In [None]:


# Distribution of the outcome variable, TenYearCHD: count plot (left) and pie chart (right).
plt.figure(figsize=(12, 10), facecolor='w')
plt.subplots_adjust(right=1.5)
plt.subplot(121)
sns.countplot(x="TenYearCHD", data=data)
plt.title("Count distribution of TenYearCHD", size=20)
plt.subplot(122)
# value_counts() orders classes by frequency, not by label, so hard-coded
# labels [0, 1] are only right while class 0 is the majority. Sorting by
# class label and taking the labels from the index keeps wedges, labels and
# colours aligned whatever the class balance is.
chd_counts = data["TenYearCHD"].value_counts().sort_index()
labels = chd_counts.index.tolist()
plt.pie(chd_counts, autopct="%1.1f%%", labels=labels, colors=["grey","green"])
plt.show()





The distribution is highly imbalanced:
the number of negative cases far outweighs the number of positive cases.
This would lead to a class imbalance problem while fitting our models.
Therefore, this problem needs to be addressed before modelling.


In [None]:
# Bivariate analysis
# Total number of TenYearCHD cases for each gender (0 is female, 1 is male).
graph_2 = data.groupby("male", as_index=False)["TenYearCHD"].sum()

# Bar chart of the per-gender totals computed above.
plt.figure(facecolor='w', figsize=(12,8))
sns.barplot(data=graph_2, x="male", y="TenYearCHD")
plt.title("Graph showing which gender has more risk of coronary heart disease CHD", size=20)
plt.ylabel("TenYearCHD cases", size=20)
plt.xlabel("Gender\n0 is female and 1 is male",size=20)
plt.xticks(size=12)
plt.yticks(size=12)

In [None]:
# Number of records per TenYearCHD outcome, split by cigarettes smoked per day.
plt.figure(facecolor='w', figsize=(30,12))
sns.countplot(data=data, x="TenYearCHD", hue="cigsPerDay")
plt.legend(fontsize='large', title='cigsPerDay')
plt.title("Graph showing the relation between cigsPerDay and risk of coronary heart disease.", size=30)
plt.ylabel("Count of TenYearCHD", size=20)
plt.xlabel("Risk of TenYearCHD", size=20)
plt.yticks(size=12)
plt.xticks(size=12)
plt.show()

In [None]:
# Multivariate analysis
# Line plot of the mean cigsPerDay, totChol and glucose at each age.

graph_5 = data.groupby("age").cigsPerDay.mean()
graph_6 = data.groupby("age").totChol.mean()
graph_7 = data.groupby("age").glucose.mean()

plt.figure(figsize=(16,10), facecolor='w')
for series, series_label in ((graph_5, "cigsPerDay"), (graph_6, "totChol"), (graph_7, "glucose")):
    sns.lineplot(data=series, label=series_label)
# Three series are plotted and each is a per-age mean, so the title now names
# glucose as well and the y-axis is labelled as a mean rather than a count.
plt.title("Graph showing mean cigsPerDay, totChol and glucose in every age group.", size=20)
plt.xlabel("age", size=20)
plt.ylabel("mean value", size=20)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()

In [None]:
# Resampling: separate the positive (TenYearCHD == 1) and negative (TenYearCHD == 0) rows.
chd_flag = data['TenYearCHD']
target1 = data[chd_flag == 1]
target0 = data[chd_flag == 0]

In [None]:
# Oversample the positive class with replacement until it has as many rows
# as the negative class, then stack both classes into the working dataset.
# NOTE(review): this runs before the train/test split further down, and the
# duplicated rows keep the index label of the row they were copied from —
# confirm the split keeps copies of one row on the same side.
target1 = resample(target1, n_samples=target0.shape[0], replace=True, random_state=40)
target = pd.concat([target0, target1])
data = target
np.shape(data)



In [None]:
# Distribution of the outcome variable in the balanced dataset:
# count plot (left) and pie chart (right).
plt.figure(figsize=(12, 10), facecolor='w')
plt.subplots_adjust(right=1.5)
plt.subplot(121)
sns.countplot(x="TenYearCHD", data=data)
plt.title("Count of TenYearCHD column", size=20)
plt.subplot(122)
# With both classes equally frequent, value_counts() gives no guarantee about
# which class comes first, so hard-coded labels [0, 1] may be attached to the
# wrong wedge. Sorting by class label and reading the labels from the index
# keeps wedges, labels and colours aligned.
chd_counts = data["TenYearCHD"].value_counts().sort_index()
labels = chd_counts.index.tolist()
plt.pie(chd_counts, autopct="%1.1f%%", labels=labels, colors=["yellow","grey"])
plt.show()

In [None]:
# To identify the features that contribute most to the outcome variable, TenYearCHD:
# the first 15 columns are the candidate features, the last column is the target.
X, y = data.iloc[:, :15], data.iloc[:, -1]
print("X - ", X.shape, "\ny - ", y.shape)



In [None]:
# Score every feature against the target with the chi-squared statistic
# (selector configured for the top 10 features).
best = SelectKBest(k=10, score_func=chi2)
fit = best.fit(X, y)
data_columns = pd.DataFrame(X.columns)
data_scores = pd.DataFrame(fit.scores_)

# Put feature names and scores side by side in one two-column table,
# then print the eleven highest-scoring features.
scores = pd.concat([data_columns, data_scores], axis=1).set_axis(['Feature', 'Score'], axis=1)
print(scores.nlargest(11, 'Score'))

In [None]:


# Visualize the feature scores as a bar chart, highest score first.
scores = scores.sort_values("Score", ascending=False)
fig, ax = plt.subplots(figsize=(20, 7), facecolor='w')
sns.barplot(data=scores, x='Feature', y='Score', palette='BuGn_r', ax=ax)
ax.set_title("Plot showing the best features in descending order", size=20)
plt.show()



In [None]:
# Select the 10 highest-scoring features plus the target column.
# The original cell computed `features` but then ignored it in favour of a
# hand-typed column list, which silently goes stale if the scores change.
# nlargest is used so the selection does not depend on `scores` having been
# sorted by an earlier cell.
features = scores.nlargest(10, 'Score')['Feature'].tolist()
data = data[features + ['TenYearCHD']]
data.head()

In [None]:
# Imported here so this cell runs on its own.
from sklearn.model_selection import GroupShuffleSplit

y = data['TenYearCHD']
X = data.drop(['TenYearCHD'], axis=1)

# The positive class was oversampled with replacement earlier in the notebook,
# and every copy keeps the index label of the row it was drawn from. A plain
# random split can therefore put one copy of a row in the training set and
# another in the test set, which leaks test rows into training and inflates
# every score reported below. Grouping on the index label keeps all copies of
# a row on the same side of the split.
# Note: test_size now refers to the share of distinct original rows (groups),
# so the resulting row counts differ slightly from a plain 60/40 row split.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=1)
train_idx, test_idx = next(splitter.split(X, y, groups=X.index.to_numpy()))
train_x, test_x = X.iloc[train_idx], X.iloc[test_idx]
train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
# Learn the min-max scaling from the training features only, then apply the
# same scaling to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(train_x)
train_x, test_x = scaler.transform(train_x), scaler.transform(test_x)

# Different Predictive Models

In [None]:


# Logistic regression: fit on the training set, predict the test set and
# report the confusion matrix, accuracy (in percent) and per-class metrics.
m1 = 'LogisticRegression'
lr = LogisticRegression(max_iter=1000, random_state=1)
model = lr.fit(train_x, train_y)
lr_predict = lr.predict(test_x)
lr_acc_score = accuracy_score(test_y, lr_predict)
lr_conf_matrix = confusion_matrix(test_y, lr_predict)
print("confusion matrix", lr_conf_matrix, "\n", sep="\n")
print("Accuracy of Logistic Regression:", lr_acc_score*100, '\n')
print(classification_report(test_y, lr_predict))



In [None]:
# Gradient boosting: fit on the training set, predict the test set and
# report the confusion matrix, accuracy (in percent) and per-class metrics.
m2 = 'Gradient Boosting Classifier'
# A fixed random_state makes the run reproducible, in line with the other
# seeded models in this notebook; the original left it unset.
gvc = GradientBoostingClassifier(random_state=1)
gvc.fit(train_x,train_y)
gvc_predicted = gvc.predict(test_x)
gvc_conf_matrix = confusion_matrix(test_y, gvc_predicted)
gvc_acc_score = accuracy_score(test_y, gvc_predicted)
print("confusion matrix")
print(gvc_conf_matrix)
print("\n")
print("Accuracy of Gradient Boosting Classifier:",gvc_acc_score*100,'\n')
print(classification_report(test_y,gvc_predicted))



In [None]:
# LightGBM: fit on the training set, predict the test set and report the
# confusion matrix, accuracy (in percent) and per-class metrics.
m3 = 'LGBMClassifier'
# NOTE(review): 'accuracy' does not appear among LightGBM's documented metric
# names, and is_unbalance=True is applied to data that was balanced earlier —
# confirm both settings are intended.
lg = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='accuracy',
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=35,
    colsample_bytree=0.7,
    reg_alpha=3,
    reg_lambda=3,
    is_unbalance=True,
    random_state=500,
    n_jobs=-1,
)
lg.fit(train_x, train_y)
lg_predicted = lg.predict(test_x)
lg_acc_score = accuracy_score(test_y, lg_predicted)
lg_conf_matrix = confusion_matrix(test_y, lg_predicted)
print("confusion matrix", lg_conf_matrix, "\n", sep="\n")
print("LGBMClassifier:", lg_acc_score*100, '\n')
print(classification_report(test_y, lg_predicted))


In [None]:
# XGBoost: fit on the training set, predict the test set and report the
# confusion matrix, accuracy (in percent) and per-class metrics.
m4 = 'XGBClassifier'
xg = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    gamma=1,
    subsample=0.9,
    colsample_bytree=0.1,
    random_state=42,
)
xg.fit(train_x, train_y)
xg_predicted = xg.predict(test_x)
xg_acc_score = accuracy_score(test_y, xg_predicted)
xg_conf_matrix = confusion_matrix(test_y, xg_predicted)
print("confusion matrix", xg_conf_matrix, "\n", sep="\n")
print("XGBClassifier:", xg_acc_score*100, '\n')
print(classification_report(test_y, xg_predicted))

In [None]:
# Multi-layer perceptron: fit on the training set, predict the test set and
# report the confusion matrix, accuracy (in percent) and per-class metrics.
m5 = 'MLPClassifier'
mlp = MLPClassifier(
    activation="relu",
    solver='adam',
    learning_rate='adaptive',
    learning_rate_init=0.0005,
    max_iter=3000,
    random_state=10,
)
mlp.fit(train_x, train_y)
mlp_predicted = mlp.predict(test_x)
mlp_acc_score = accuracy_score(test_y, mlp_predicted)
mlp_conf_matrix = confusion_matrix(test_y, mlp_predicted)
print("confusion matrix", mlp_conf_matrix, "\n", sep="\n")
print("MLPClassifier:", mlp_acc_score*100, '\n')
print(classification_report(test_y, mlp_predicted))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [None]:
# k-nearest neighbours with a single neighbour: fit on the training set,
# predict the test set and report the confusion matrix, accuracy (in
# percent) and per-class metrics.
m6 = 'KNeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(train_x, train_y)
knn_predict = knn.predict(test_x)
knn_acc_score = accuracy_score(test_y, knn_predict)
knn_conf_matrix = confusion_matrix(test_y, knn_predict)
print("confusion matrix", knn_conf_matrix, "\n", sep="\n")
print("Accuracy of k-NN Classification:", knn_acc_score*100, '\n')
print(classification_report(test_y, knn_predict))

In [None]:
# Random forest: fit on the training set, predict the test set and report
# the confusion matrix, accuracy (in percent) and per-class metrics.
m7 = 'Random Forest Classfier'
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=0,
)
rf.fit(train_x, train_y)
rf_predicted = rf.predict(test_x)
rf_acc_score = accuracy_score(test_y, rf_predicted)
rf_conf_matrix = confusion_matrix(test_y, rf_predicted)
print("confusion matrix", rf_conf_matrix, "\n", sep="\n")
print("Accuracy of Random Forest:", rf_acc_score*100, '\n')
print(classification_report(test_y, rf_predicted))

In [None]:
# Decision tree (entropy criterion): fit on the training set, predict the
# test set and report the confusion matrix, accuracy (in percent) and
# per-class metrics.
m8 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=30,
    random_state=0,
)
dt.fit(train_x, train_y)
dt_predicted = dt.predict(test_x)
dt_acc_score = accuracy_score(test_y, dt_predicted)
dt_conf_matrix = confusion_matrix(test_y, dt_predicted)
print("confusion matrix", dt_conf_matrix, "\n", sep="\n")
print("Accuracy of DecisionTreeClassifier:", dt_acc_score*100, '\n')
print(classification_report(test_y, dt_predicted))

In [None]:
# Support vector classifier with a linear kernel: fit on the training set,
# predict the test set and report the confusion matrix, accuracy (in
# percent) and per-class metrics.
m9 = 'Support Vector Classifier'
svc = SVC(kernel='linear')
svc.fit(train_x, train_y)
svc_predicted = svc.predict(test_x)
svc_acc_score = accuracy_score(test_y, svc_predicted)
svc_conf_matrix = confusion_matrix(test_y, svc_predicted)
print("confusion matrix", svc_conf_matrix, "\n", sep="\n")
print("Support Vector Classifier:", svc_acc_score*100, '\n')
print(classification_report(test_y, svc_predicted))

# Kernels noted for experimentation: 'linear', 'poly', 'rbf', 'sigmoid'

In [None]:
# Gaussian naive Bayes: fit on the training set, predict the test set and
# report the confusion matrix, accuracy (in percent) and per-class metrics.
m10 = 'Naive Bayes Classifier'
nbc = GaussianNB()
nbc.fit(train_x, train_y)
nbc_predicted = nbc.predict(test_x)
nbc_acc_score = accuracy_score(test_y, nbc_predicted)
nbc_conf_matrix = confusion_matrix(test_y, nbc_predicted)
print("confusion matrix", nbc_conf_matrix, "\n", sep="\n")
print("Naive Bayes Classifier:", nbc_acc_score*100, '\n')
print(classification_report(test_y, nbc_predicted))

# ROC Curve to compare all the classifiers

In [None]:
# ROC inputs for every fitted classifier.
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. Passing the hard 0/1 predictions (as the original did) collapses
# every curve to a single operating point joined to the corners. The
# probability of the positive class (column 1 of predict_proba) is used
# instead; SVC was fitted without probability estimates, so its signed
# distance to the decision boundary (decision_function) is used as the score.
lr_false_positive_rate,lr_true_positive_rate,lr_threshold = roc_curve(test_y, lr.predict_proba(test_x)[:, 1])
knn_false_positive_rate,knn_true_positive_rate,knn_threshold = roc_curve(test_y, knn.predict_proba(test_x)[:, 1])
rf_false_positive_rate,rf_true_positive_rate,rf_threshold = roc_curve(test_y, rf.predict_proba(test_x)[:, 1])
dt_false_positive_rate,dt_true_positive_rate,dt_threshold = roc_curve(test_y, dt.predict_proba(test_x)[:, 1])
gvc_false_positive_rate,gvc_true_positive_rate,gvc_threshold = roc_curve(test_y, gvc.predict_proba(test_x)[:, 1])
svc_false_positive_rate,svc_true_positive_rate,svc_threshold = roc_curve(test_y, svc.decision_function(test_x))
nbc_false_positive_rate,nbc_true_positive_rate,nbc_threshold = roc_curve(test_y, nbc.predict_proba(test_x)[:, 1])
lg_false_positive_rate,lg_true_positive_rate,lg_threshold = roc_curve(test_y, lg.predict_proba(test_x)[:, 1])
xg_false_positive_rate,xg_true_positive_rate,xg_threshold = roc_curve(test_y, xg.predict_proba(test_x)[:, 1])
mlp_false_positive_rate,mlp_true_positive_rate,mlp_threshold = roc_curve(test_y, mlp.predict_proba(test_x)[:, 1])

In [None]:

# Overlay the ROC curve of every classifier on one figure.
sns.set_style('whitegrid')
plt.figure(figsize=(15,15), facecolor='w')
# Title and the decision-tree legend entry were misspelled in the original
# ("Reciever Operating Characterstic", "Desion Tree").
plt.title('Receiver Operating Characteristic Curve')
# (false positive rates, true positive rates, legend label) per classifier
roc_results = [
    (lr_false_positive_rate, lr_true_positive_rate, 'Logistic Regression'),
    (knn_false_positive_rate, knn_true_positive_rate, 'K-Nearest Neighbor'),
    (rf_false_positive_rate, rf_true_positive_rate, 'Random Forest'),
    (dt_false_positive_rate, dt_true_positive_rate, 'Decision Tree'),
    (gvc_false_positive_rate, gvc_true_positive_rate, 'Gradient Boosting Classifier'),
    (svc_false_positive_rate, svc_true_positive_rate, 'Support Vector Classifier'),
    (nbc_false_positive_rate, nbc_true_positive_rate, 'Naive Bayes Classifier'),
    (lg_false_positive_rate, lg_true_positive_rate, 'LGBMClassifier'),
    (xg_false_positive_rate, xg_true_positive_rate, 'XGBClassifier'),
    (mlp_false_positive_rate, mlp_true_positive_rate, 'MLPClassifier'),
]
for false_positive_rate, true_positive_rate, model_label in roc_results:
    plt.plot(false_positive_rate, true_positive_rate, label=model_label)
# Reference lines: dashed diagonal from (0, 0) to (1, 1), then grey lines
# along the left edge (x = 0) and the top edge (y = 1).
plt.plot([0,1],ls='--')
plt.plot([0,0],[1,0],c='.5')
plt.plot([1,1],c='.5')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend()
plt.show()



# Bravo!