In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Introduction** 

Every year, 15 million people worldwide suffer a heart attack; 5 million of them die, and another 5 million remain permanently incapacitated, which places a burden on families and society. Heart attacks are uncommon in people under the age of 40, and when they do occur the main cause is high blood pressure. However, attacks also occur in about 8% of children with sickle cell disease.

So, what if we could predict who has a higher chance of having a heart attack based on their medical history?

I will try to build a logistic regression model that predicts a patient's chance of having a heart attack based on their medical history.

**Dataset Description**

age : Age of the patient (Continuous)

sex : Male or Female (Nominal)

cp : Chest pain type (Nominal)

trtbps : Resting blood pressure in mm Hg (Continuous)

chol : Cholesterol in mg/dl (Continuous)

fbs : Fasting blood sugar > 120 mg/dl (1 = true, 0 = false) (Nominal)

restecg : Electrocardiographic results (Nominal)

thalachh : Maximum heart rate achieved (Continuous)

exang: exercise induced angina (Nominal)

oldpeak : ST depression induced by exercise relative to rest (Continuous)

slp : Slope (Nominal)

caa: Number of major vessels (Nominal)

output : 0= less chance of heart attack 1= more chance of heart attack (Nominal)

**Importing the required packages**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve
import pickle
from sklearn.model_selection import train_test_split, cross_val_score 

Importing our dataset (heart.csv)

In [None]:
# Read the heart-attack dataset from the Kaggle input directory into a DataFrame.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(csv_path)

# Bare last expression: the notebook renders the frame as this cell's output.
data

Rename the columns to more descriptive names for clarity

In [None]:
# Replace the column labels positionally: the i-th name below becomes the
# label of the i-th column of `data` (14 names in total).
descriptive_names = [
    'Age',
    'Sex',
    'Chest Pain Type',
    'Blood Pressure',
    'Cholesterol',
    'Blood Sugar',
    'ECG',
    'Max. Heart Rate',
    'Induced Angina',
    'ST Depression',
    'ST Slope',
    'Num. Major Blood Vessels',
    'Thalassemia',
    'Output',
]
data.columns = descriptive_names

Check whether there are any missing values in our dataset

In [None]:
# Count the missing entries in each column (`isna` is the same check as `isnull`).
data.isna().sum()

**Exploratory data analysis**

In [None]:
def _plot_continuous(ax, column, kde_label, title, xlabel=None, hist_label=None):
    """Draw the distribution of a continuous column of ``data`` on ``ax``.

    A filled KDE curve is always drawn. When ``hist_label`` is given, a red
    density-normalised histogram with its own KDE line is drawn on top, which
    is the replacement for the deprecated ``sns.distplot(..., kde=True)``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    column : name of the column in ``data`` to plot.
    kde_label : legend label for the filled KDE curve.
    title : panel title.
    xlabel : x-axis label; when None the label set by seaborn is kept.
    hist_label : legend label for the histogram; None skips the histogram.
    """
    # `fill=True` replaces the deprecated `shade=True` argument.
    sns.kdeplot(data=data, x=column, fill=True, label=kde_label, ax=ax)
    if hist_label is not None:
        sns.histplot(data=data, x=column, stat='density', kde=True,
                     color='red', label=hist_label, ax=ax)
    ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.legend()


def _plot_categorical(ax, column, label, title, xlabel, color=None):
    """Draw a bar chart of the value counts of a categorical column of ``data``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    column : name of the column in ``data`` to count.
    label : legend label for the bars.
    title : panel title.
    xlabel : x-axis label.
    color : bar colour; when None the pandas default colour is used.
    """
    bar_kwargs = {} if color is None else {'color': color}
    data[column].value_counts().plot(kind='bar', label=label, ax=ax, **bar_kwargs)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.legend()


# One figure holding a 4x4 grid; only the first 13 slots are used (one per feature).
# Each panel gets an explicit Axes instead of rebinding `fig` to an Axes object.
fig = plt.figure(figsize=(20, 20))

_plot_continuous(fig.add_subplot(4, 4, 1), 'Age',
                 kde_label=' Age', hist_label='Age',
                 title='Distribution of Age')

_plot_categorical(fig.add_subplot(4, 4, 2), 'Sex',
                  label=' sex', color="lightgreen",
                  xlabel='sex', title='Distribution of sex')

_plot_categorical(fig.add_subplot(4, 4, 3), 'Chest Pain Type',
                  label='pain',
                  xlabel='cp', title='Distribution of chest pain')

_plot_continuous(fig.add_subplot(4, 4, 4), 'Blood Pressure',
                 kde_label='blood pressure', hist_label='Blood Pressure',
                 title='Distribution of blood pressure', xlabel='blood pressure')

_plot_continuous(fig.add_subplot(4, 4, 5), 'Cholesterol',
                 kde_label='cholestrol',
                 title='Distribution of cholestrol', xlabel='cholestrol')

_plot_categorical(fig.add_subplot(4, 4, 6), 'Blood Sugar',
                  label='Blood sugar', color="red",
                  xlabel='blood sugar', title='Distribution of blood sugar')

_plot_categorical(fig.add_subplot(4, 4, 7), 'ECG',
                  label='Electrocardiographic result',
                  xlabel='electrocardiographic result',
                  title='Distribution of electrocardiographic result')

_plot_continuous(fig.add_subplot(4, 4, 8), 'Max. Heart Rate',
                 kde_label='Maximum heart rate', hist_label='Maximum heart rate',
                 title='Distribution of maximum heart rate', xlabel='maximum heart rate')

_plot_categorical(fig.add_subplot(4, 4, 9), 'Induced Angina',
                  label='Induced engina', color="darkblue",
                  xlabel='Induced Angina', title='Distribution of induced engina')

_plot_continuous(fig.add_subplot(4, 4, 10), 'ST Depression',
                 kde_label='ST Depression',
                 title='Distribution of old peak', xlabel='ST Depression')

_plot_categorical(fig.add_subplot(4, 4, 11), 'ST Slope',
                  label='ST Slope', color="darkgreen",
                  xlabel='ST Slope', title='Distribution of slope')

_plot_categorical(fig.add_subplot(4, 4, 12), 'Num. Major Blood Vessels',
                  label='Major vessels',
                  xlabel='Num. Major Blood Vessels',
                  title='Distribution of no. of major vessels')

_plot_categorical(fig.add_subplot(4, 4, 13), 'Thalassemia',
                  label='Thalassemia',
                  xlabel='Thalassemia', title='Distribution of Thalassemia')

plt.show()

Check heart attacks over Blood Pressure, age, and Heart rate. 

In [None]:
# Compare the distribution of three continuous features between the two
# target classes (Output == 0 vs Output == 1), one panel per feature.
# `sns.histplot(stat='density', kde=True)` replaces the deprecated
# `sns.distplot`; each class is normalised on its own, as before.
class_styles = [
    (0, 'blue', 'No heart Disease'),
    (1, 'red', 'Heart Disease'),
]
panels = [
    ("Blood Pressure", 'Heart Attack distibution over Blood pressure '),
    ("Age", 'Heart Attack distibution over Age '),
    ("Max. Heart Rate", 'Heart Attack distibution over Heart Rate '),
]

fig = plt.figure(figsize=(20, 20))
for position, (column, title) in enumerate(panels, start=1):
    # Same 2x3 grid as before; only the first three slots are filled.
    ax = fig.add_subplot(2, 3, position)
    for output_value, color, label in class_styles:
        class_values = data.loc[data['Output'] == output_value, column]
        sns.histplot(x=class_values, stat='density', kde=True,
                     color=color, label=label, ax=ax)
    ax.set_title(title, fontsize=12)
    ax.legend()

plt.show()

Check the distribution of our target variable 

In [None]:
# Bar chart of how many rows fall in each target class (0 in blue, 1 in red).
ax = sns.countplot(x='Output', data=data, palette=['blue', 'red'])
# Swap the raw 0/1 tick labels for readable class descriptions.
ax.set_xticklabels(['less chance of heart attack', 'more chance of heart attack'])
ax.set_title("Distribution of Target")
# Hide the tick marks on the bottom axis.
ax.tick_params(bottom=False)

In [None]:
# Heatmap of the pairwise correlations between all columns of `data`.
fig = plt.figure(figsize=(15, 6))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, cmap='Blues')

# Write the figure to disk before showing it.
plt.savefig('heatmap.png')
plt.show()

In [None]:
# Pairwise Pearson correlation table (Pearson is the default method of `corr`).
data.corr(method='pearson')

From the above figure and table for the correlation we can notice that:

1. Heart attack has a positive correlation with chest pain type, maximum heart rate, and ST slope.
2. Heart attack has a negative correlation with age, induced angina, and the number of major vessels.


**Modeling**
 

Split our dataset into training dataset (90%) and test dataset (10%)

In [None]:
# Feature matrix: every column except the target.
feature_columns = [
    'Age', 'Sex', 'Chest Pain Type', 'Blood Pressure',
    'Cholesterol', 'Blood Sugar', 'ECG', 'Max. Heart Rate',
    'Induced Angina', 'ST Depression', 'ST Slope',
    'Num. Major Blood Vessels', 'Thalassemia',
]
X = data[feature_columns]

# Target vector.
y = data['Output']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.10
)

Build our logistic regression model and fit it to the training dataset

In [None]:
# Create a logistic regression classifier with default settings and train it.
# NOTE(review): the features are unscaled and the iteration limit is the default;
# if a ConvergenceWarning appears, consider scaling or raising max_iter.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# `fit` returns the estimator itself, so `model` and `lr` are the same object.
model = lr

Test our model and display the confusion matrix

In [None]:
# Predict class labels for the held-out rows.
model_predict = lr.predict(X_test)

# Score the predictions against the true labels.
model_accuracy = accuracy_score(y_test, model_predict)
model_conf_matrix = confusion_matrix(y_test, model_predict)

print("confussion matrix")
print(model_conf_matrix)
# Accuracy is reported as a percentage.
print("model accuracy = ", model_accuracy * 100)

In [None]:
# Per-class precision, recall, and F1 on the test set.
report_text = classification_report(y_test, model_predict)
print(report_text)

So far so good: our LR model achieves 87% accuracy.

Let's draw our ROC curve

In [None]:
# A ROC curve needs a continuous score per sample so the decision threshold
# can be swept. Hard 0/1 predictions give only one operating point, so use
# the predicted probability of the positive class (column 1) instead.
positive_class_scores = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)

plt.plot(fpr, tpr)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

Now, we can save our model

In [None]:
# Serialise the trained model to disk.
filename = 'Heart_model.sav'
# The context manager flushes and closes the file handle even if pickling
# fails; the previous bare `open(...)` left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Later on, let's try loading our saved model.

In [None]:
# Restore the model saved earlier and check that it still scores the test set.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Mean accuracy of the restored model on the held-out data.
result = loaded_model.score(X_test, y_test)
print(result)

Great !!

> > 