In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from statistics import mode, mean

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Location of the Framingham CSV, relative to the notebook's working directory.
csv_path = "../input/heart-disease-prediction-using-logistic-regression/framingham.csv"

# Read the whole file into a DataFrame, then preview its first ten rows.
data = pd.read_csv(csv_path)
data.head(10)

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
corr_matrix = data.corr()
plt.figure(figsize=(16, 9))  # wide canvas so the cell annotations stay legible
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Count the missing values in each column (isna is the canonical name for isnull).
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# thresh=15 keeps only the rows that have at least 15 non-missing values;
# rows with fewer populated fields are discarded.
data = data.dropna(axis=0, thresh=15)

# Re-count the remaining gaps per column after the row filter.
data.isna().sum()

In [None]:
# Impute the remaining gaps column by column.
#
# NOTE(review): every fill value below is computed on the full dataset, before
# the train/test split performed later in the notebook, so rows that end up in
# the test set influence the imputed values.

# Columns filled with their most frequent value. Series.mode() ignores NaN by
# default, so the missing entries themselves are never counted as candidates
# (statistics.mode iterates over them and its tie handling differs between
# Python versions). If several values tie, the smallest one is used.
for column in ("education", "BPMeds"):
    most_frequent = data[column].mode().iloc[0]
    data[column] = data[column].fillna(most_frequent)

# Columns filled with their mean; Series.mean() skips NaN when averaging.
for column in ("cigsPerDay", "totChol", "BMI", "heartRate", "glucose"):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Re-count missing values per column to see what the imputation left behind.
data.isna().sum()

In [None]:
# Grid of pairwise relationships between the columns of the frame.
# pairplot is a figure-level function that builds its own figure, so it is not
# preceded by plt.figure(): that call would only leave an empty extra figure
# behind and would not size the grid. Use pairplot's `height` / `aspect`
# arguments if the panels need resizing.
sns.pairplot(data)

In [None]:
# Remove the 'currentSmoker' column from the frame.
# NOTE(review): presumably dropped as redundant with 'cigsPerDay' — confirm.
data = data.drop("currentSmoker", axis="columns")

In [None]:
# Predictor columns fed to the model, in the order they appear in the matrix.
feature_columns = [
    'male', 'age', 'education', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
    'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose',
]
x = data[feature_columns]

# Target column, wrapped in its own Series.
y = pd.Series(data['TenYearCHD'])

In [None]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible between runs.
split_parts = train_test_split(x, y, test_size=0.4, random_state=1)
train_x, test_x, train_y, test_y = split_parts

In [None]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both splits so that no
# statistics from the test rows reach the scaler.
# NOTE: train_x / test_x are overwritten here, so re-running this cell without
# re-running the split would scale already-scaled data.
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the
# scaled training features; fit() returns the estimator itself.
model = LogisticRegression().fit(train_x, train_y)
model

In [None]:
# Hard class labels predicted by the fitted model for the test features.
pred = model.predict(test_x)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(test_y, pred)
print(test_accuracy)

In [None]:
# Confusion matrix of the test predictions: rows are the true classes,
# columns are the predicted classes.
test_confusion = confusion_matrix(test_y, pred)
print(test_confusion)

In [None]:
# Confusion matrix as a heatmap. scikit-learn puts the true classes on the
# rows and the predicted classes on the columns, hence the axis labels.
# fmt="d" prints each cell as a whole number; seaborn's default ".2g" format
# would render counts of 100 or more in scientific notation (e.g. 1.4e+03).
cm_ax = sns.heatmap(confusion_matrix(test_y, pred), annot=True, fmt="d")
cm_ax.set(xlabel="Predicted label", ylabel="True label")

In [None]:
# Per-class precision, recall, F1 and support for the test predictions.
report_text = classification_report(test_y, pred)
print(report_text)

In [None]:
# ROC curve and AUC for the fitted model on the test split.
#
# Both metrics are computed from the predicted probability of the second class
# (column 1 of predict_proba; presumably the CHD-positive label — confirm
# against model.classes_). Scoring the AUC on the hard 0/1 predictions instead
# would collapse the curve to a single operating point, and the value in the
# legend would not match the curve drawn.
positive_proba = model.predict_proba(test_x)[:, 1]
logit_roc_auc = roc_auc_score(test_y, positive_proba)
fpr, tpr, thresholds = roc_curve(test_y, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line: a random classifier
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the curve's top edge is not clipped
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# savefig must come before show(), which may clear the current figure.
plt.savefig('Log_ROC')
plt.show()