# Heart Disease Prediction

I collected the dataset from Kaggle and use machine learning to predict whether or not a person is suffering from heart disease.

--------------------------------------------------------------------------------------------------------------------------------

importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Data Description

age: The person's age in years

sex: The person's sex (1 = male, 0 = female)

cp: chest pain type
-- Value 0: typical angina
-- Value 1: atypical angina
-- Value 2: non-anginal pain
-- Value 3: asymptomatic

trtbps: The person's resting blood pressure (mm Hg on admission to the hospital)

chol: The person's cholesterol measurement in mg/dl

fbs: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)

restecg: resting electrocardiographic results
-- Value 0: normal
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh: The person's maximum heart rate achieved

exng: Exercise induced angina (1 = yes; 0 = no)

oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot)

slp-(slope): the slope of the peak exercise ST segment
-- Value 0: downsloping
-- Value 1: flat
-- Value 2: upsloping

caa: The number of major vessels (0-3)

thall
Results of the blood flow observed via the radioactive dye.
-- Value 0: NULL (dropped from the dataset previously)
-- Value 1: fixed defect (no blood flow in some part of the heart)
-- Value 2: normal blood flow
-- Value 3: reversible defect (a blood flow is observed but it is not normal)

target (stored as `output` in the CSV and renamed to `target` during pre-processing): 0 = disease, 1 = no disease

reading dataset

In [3]:
# Load the dataset from a CSV file addressed relative to the notebook's working directory.
df= pd.read_csv("heart_attack_dataset.csv")

In [4]:
#df.info()

In [5]:
#df.describe()

In [6]:
#df.isnull().sum()

In [7]:
#df.duplicated().sum()

In [8]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [9]:
#df.duplicated().sum()

## Data Analysis

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every raw feature against the label with a chi-squared test.
data = df.copy()
X = data.iloc[:, :13]   # first 13 columns: the predictors
y = data.iloc[:, -1]    # last column: the label
# selector configured to keep the 10 highest-scoring features
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# one row per feature: its name next to its chi-squared score
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
#print(featureScores.nlargest(12,'Score'))  # show the 12 highest-scoring features

In [11]:

from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble on the raw features to obtain one importance
# value per column. The trees are built with randomness, so random_state is
# pinned (same seed as the train/test split) to make the importances
# reproducible after Restart Kernel -> Run All.
model = ExtraTreesClassifier(random_state=0)
model.fit(X,y)
#print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
# Label each importance with its column name so it can be sorted and plotted.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
#feat_importances.nlargest(13).plot(kind='barh')
#plt.show()

In [12]:
#df.groupby('output').mean()

In [13]:
#df.corr()

In [14]:
# Pairwise correlation matrix of the columns of df (DataFrame.corr with its default settings).
cm=df.corr()
# Row labels of the correlation matrix; only referenced by the disabled heatmap below.
top_corr_features=cm.index
#plt.figure(figsize=(20,20))
#g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="RdBu")


In [15]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [16]:
#import pandas_profiling as pp
#pp.ProfileReport(df)

## Data visualization

countplot is used when a column has a limited number of categories (few distinct values).
distplot is used when the values are floats or otherwise continuous.

In [17]:
# Plot appearance for the seaborn charts: 'darkgrid' axes style and the 'Set2' colour palette.
sns.set_style('darkgrid')
sns.set_palette('Set2')

In [18]:
# Separate copy for plotting: the label decoding applied to df2 in the next cells leaves df untouched.
df2 = df.copy()

In [19]:
def chng(sex):
    """Translate the numeric `sex` code into a readable label.

    Parameters
    ----------
    sex : int or str
        0 means female; any other code is treated as male. A value that is
        already 'female' or 'male' is returned unchanged.

    Returns
    -------
    str
        'female' or 'male'.
    """
    # Already decoded (e.g. the cell was executed a second time on the same
    # frame): pass the label through instead of letting 'female' fall into
    # the catch-all 'male' branch.
    if isinstance(sex, str) and sex in ('female', 'male'):
        return sex
    return 'female' if sex == 0 else 'male'
# Decode the sex codes of the plotting copy element by element.
df2['sex'] = df2['sex'].map(chng)

In [20]:
def chng2(prob):
    """Translate the numeric label code into a readable diagnosis.

    Parameters
    ----------
    prob : int or str
        0 means heart disease; any other code means no heart disease. A value
        that is already one of the two diagnosis labels is returned unchanged.

    Returns
    -------
    str
        'Heart Disease' or 'No Heart Disease'.
    """
    # Already decoded (cell re-run on the same frame): keep the label rather
    # than turning 'Heart Disease' into 'No Heart Disease' via the else branch.
    if isinstance(prob, str) and prob in ('Heart Disease', 'No Heart Disease'):
        return prob
    return 'Heart Disease' if prob == 0 else 'No Heart Disease'
# Decode the label codes of the plotting copy element by element.
df2['output'] = df2['output'].map(chng2)

In [21]:
#plt.figure(figsize=(10,6))
#sns.countplot(data= df2, x='sex',hue='output')
#plt.title('Gender v/s output\n',size=15)

In [22]:
#plt.figure(figsize=(10,6))
#sns.countplot(data= df2, x='cp',hue='output')
#plt.title('Chest Pain Type v/s output\n',size=15)

In [23]:
#plt.figure(figsize=(10,6))
#sns.countplot(df["output"])
#plt.title("Attack",size=15)
#plt.show()
#sns.countplot(x='output',data=df)

In [24]:
#plt.figure(figsize=(10,6))
#sns.countplot(data= df2, x='sex',hue='thall')
#plt.title('Gender v/s Thalassemia\n',size=15)

In [25]:
#plt.figure(figsize=(10,6))
#sns.countplot(data= df2, x='slp',hue='output')
#plt.title('Slope v/s Output\n',size=15)

In [26]:
#plt.figure(figsize=(10,6))
#sns.countplot(data= df2, x='exng',hue='thall')
#plt.title('exercise induced angina v/s Thalassemia\n',size=15)

In [27]:
#plt.figure(figsize=(20,8))
#sns.distplot(df[df["output"]==0]["age"],kde=False,bins=50)
#plt.title("Age of Heart Diseased Patients\n",size=15)
#plt.show()

In [28]:
#plt.figure(figsize=(20,8))
#sns.distplot(df[df['output']==0]['chol'],kde=False,bins=40)
#plt.title('Cholestrol of Heart Diseased Patients\n',size=15)

In [29]:
#plt.figure(figsize=(20,8))
#sns.distplot(df[df['output']==0]['thalachh'],kde=False,bins=40)
#plt.title('max heart rate achieved(thalachh) of Heart Diseased Patients\n',size=15)

## Data pre-processing

In [30]:
#df.info()

In [31]:
# Positional rename: the i-th name below replaces the i-th column of df.
# The trailing comments give the abbreviation listed at that position in the
# Data Description section -- verify against the CSV header.
df.columns = [
    'age',
    'sex',
    'chest_pain_type',          # cp
    'resting_blood_pressure',   # trtbps
    'cholesterol',              # chol
    'fasting_blood_sugar',      # fbs
    'rest_ecg_type',            # restecg
    'max_heart_rate_achieved',  # thalachh
    'exercise_induced_angina',  # exng
    'st_depression',            # oldpeak
    'st_slope_type',            # slp
    'num_major_vessels',        # caa
    'thalassemia_type',         # thall
    'target',                   # output
]

#df.columns

In [32]:
#df.info()

We have 4 Categorical columns as seen in Data Description using pandas profiling:

cp - chest_pain_type

restecg - rest_ecg_type

slp - st_slope_type

thall - thalassemia_type

In [33]:
# Decode the integer codes of the four categorical columns into readable
# labels, so that the one-hot encoding further down produces self-describing
# column names.
#
# Each column is rebuilt as a whole with Series.map instead of writing strings
# into an integer column through .loc: that in-place write relies on implicit
# dtype upcasting, which pandas deprecates (the warning is hidden by the
# global warnings filter at the top of the notebook).
category_labels = {
    # cp
    'chest_pain_type': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    # restecg
    'rest_ecg_type': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    # slope
    'st_slope_type': {
        0: 'downsloping',
        1: 'flat',
        2: 'upsloping',
    },
    # thal
    'thalassemia_type': {
        0: 'nothing',
        1: 'fixed defect',
        2: 'normal',
        3: 'reversable defect',
    },
}

for column, labels in category_labels.items():
    # Values without an entry (including labels that were already decoded by a
    # previous run of this cell) are kept unchanged, as with the .loc version.
    df[column] = df[column].map(lambda code, labels=labels: labels.get(code, code))



In [34]:
#df.head()

In [35]:
# One-hot encode df keeping every category level. This frame is only used by
# the next cell to pull out one indicator column; `data` is reassigned afterwards.
data = pd.get_dummies(df, drop_first=False)
#data.columns

In [36]:
# Save the 'fixed defect' indicator so it can be re-attached after the
# drop_first=True encoding (presumably because that encoding drops this level -- confirm).
df_temp = data['thalassemia_type_fixed defect']

In [37]:
# One-hot encode again, this time omitting the first level of every encoded column.
data = pd.get_dummies(df, drop_first=True)

In [38]:
#data.head()

In [39]:
# Append the saved 'thalassemia_type_fixed defect' indicator as the last column
# of the encoded frame (column-wise concatenation, rows matched on the index).
frames = [data, df_temp]
result = pd.concat(frames,axis=1)

#result.head()

In [40]:
# Remove the 'nothing' indicator of thalassemia_type from the encoded frame.
result = result.drop(columns='thalassemia_type_nothing')
# Independent snapshot of the encoded frame.
resultc = result.copy()

In [41]:
#result.columns

In [42]:
# Feature matrix: every encoded column except the label.
X = result.drop(columns='target')


In [43]:
# Label vector taken from the 'target' column of the encoded frame.
y = result['target']

## Data standardization

In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
# Fit the scaler on the whole feature matrix X.
# NOTE(review): this fit happens before the train/test split, so it also sees
# the rows that later end up in the test partition.
scaler = StandardScaler()
scaler.fit(X)

StandardScaler()

In [46]:
# Scaled version of X produced by the fitted scaler.
# NOTE(review): no later cell visible in this notebook reads `standardized_data`.
standardized_data = scaler.transform(X)
#print(standardized_data)

##  Train Test Split

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# predict() standardises the user's answers with `scaler` before calling the
# model, so the model has to be trained and evaluated on standardised features
# as well. Refit the scaler on the training rows only (keeps test-set
# statistics out of the scaling) and scale both partitions with it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [67]:
#X.info()

## Training Model

In [50]:
from sklearn.linear_model import LogisticRegression

In [51]:
# Logistic-regression classifier with scikit-learn's default settings.
logre = LogisticRegression()

In [52]:
 # Train the classifier on the training partition.
 logre.fit(X_train,y_train)

LogisticRegression()

In [53]:
# Predicted labels for the held-out test partition.
y_pred = logre.predict(X_test)

In [54]:
# Materialise the true labels and the model's predictions as two plain lists.
actual = list(y_test)
predcition = list(y_pred)

# column name -> values, ready to be turned into a comparison table
dic = dict(Actual=actual, Prediction=predcition)

In [55]:
# Side-by-side table of actual vs. predicted labels.
# NOTE(review): this rebinds `result`, which until here held the encoded
# feature frame used to build X and y; re-running those cells afterwards will
# not find a 'target' column. `resultc` still holds a copy of the encoded frame.
result  = pd.DataFrame(dic)

In [56]:
#!pip install plotly
#!pip install cufflinks

In [57]:
#from plotly import graph_objs as go
 
#fig = go.Figure()
 
    #fig.add_trace(go.Scatter(x=np.arange(0,len(y_test)), y=y_test,
 #                   mode='markers+lines',
  #                  name='Test'))

#fig.add_trace(go.Scatter(x=np.arange(0,len(y_test)), y=y_pred,
   #                 mode='markers',
    #                name='Pred'))

## Model Evaluation

In [58]:
from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test,y_pred))

In [59]:
from sklearn.metrics import classification_report
#print(classification_report(y_test,y_pred))


In [60]:
from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_test,y_pred))
#sns.heatmap(confusion_matrix(y_test,y_pred),annot=True)

In [61]:
from sklearn.metrics import roc_curve
#fpr, tpr, thresholds = roc_curve(y_test, y_pred)
#plt.plot(fpr,tpr)
#plt.xlim([0.0, 1.0])
#plt.ylim([0.0, 1.0])
#plt.title('ROC curve for Heart disease classifier')
#plt.xlabel('False positive rate (1-Specificity)')
#plt.ylabel('True positive rate (Sensitivity)')
#plt.grid(True)

In [62]:
import sklearn
#sklearn.metrics.roc_auc_score(y_test,y_pred)

## Result

In [66]:
def _read_answer(prompt, convert, read):
    """Ask `prompt` through `read` until the reply can be parsed by `convert`."""
    while True:
        reply = read(prompt)
        try:
            return convert(reply)
        except ValueError:
            # int()/float() rejected the text: say so and ask the same question again
            print("invalid value, please try again")


def predict(model=None, feature_scaler=None, read=None):
    """Interactively collect one patient's features and print the prediction.

    The 19 answers are requested in the column order of the feature matrix
    the model was trained on: nine numeric fields followed by the one-hot
    indicator columns. The scaler is applied before predicting, so `model`
    must have been fitted on features standardised by that same scaler.

    Parameters
    ----------
    model : optional
        Object with a ``predict(rows)`` method. Defaults to the notebook's
        ``logre``.
    feature_scaler : optional
        Object with a ``transform(rows)`` method. Defaults to the notebook's
        ``scaler``.
    read : callable, optional
        Function taking the prompt text and returning the reply as a string.
        Defaults to the built-in ``input``.

    Returns
    -------
    None
        The verdict is printed: label 0 means heart disease.
    """
    # (prompt, parser) for every feature, in training-column order.
    # Two prompts differ from the earlier version of this cell:
    # - resting blood pressure used to show the rest-ECG category list, although
    #   the field is a pressure reading in mm Hg;
    # - fasting blood sugar did not say that the feature is a 0/1 flag.
    questions = [
        ("enter age(in years):", int),
        ("enter sex(enter 1 if male \n enter 0 if female):", int),
        ("enter resting blood pressure (in mm Hg):", int),
        ("enter cholesterol :", int),
        ("enter fasting blood sugar(enter 1 if > 120 mg/dl\n enter 0 if not):", int),
        ("enter  max heart rate achieved:", int),
        ("enter  exercise induced angina (enter 1 if yes\n enter 0 if no):", int),
        ("enter st depression( induced by exercise relative to rest) :", float),
        ("enter major vessels((0-3) colored by flourosopy):", int),
        ("chest pain type-atypical angina(enter 1 if yes\n enter 0 if no):", int),
        ("chest pain type-non-anginal pain(enter 1 if yes\n enter 0 if no):", int),
        ("chest pain type-typical angina(enter 1 if yes\n enter 0 if no):", int),
        ("rest ecg type-left ventricular hypertrophy(enter 1 if yes\n enter 0 if no):", int),
        ("rest ecg type-normal(enter 1 if yes\n enter 0 if no):", int),
        ("st slope type-flat(enter 1 if yes\n enter 0 if no):", int),
        ("st slope type-upsloping(enter 1 if yes\n enter 0 if no):", int),
        ("thalassemia type-normal(enter 1 if yes\n enter 0 if no):", int),
        ("thalassemia type-reversable defect(enter 1 if yes\n enter 0 if no):", int),
        ("thalassemia type-fixed defect(enter 1 if yes\n enter 0 if no):", int),
    ]

    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not supply replacements.
    read = input if read is None else read
    model = logre if model is None else model
    feature_scaler = scaler if feature_scaler is None else feature_scaler

    print("enter input:")
    answers = [_read_answer(prompt, convert, read) for prompt, convert in questions]

    # One sample with 19 features: shape (1, 19)
    input_reshaped = np.asarray(answers, dtype=float).reshape(1, -1)

    # Standardise the sample with the scaler before predicting
    std_data = feature_scaler.transform(input_reshaped)
    prediction = model.predict(std_data)

    if prediction[0] == 0:
        print('you may have heart disease')
    else:
        print("you don't have a heart disease")
        
# Run the interactive questionnaire once.
predict()
# Answers typed in the recorded run below, in prompt order:
#lst=[43,0,2,140,192,173,1,1.3,0,0,0,1,0,0,0,1,1,0,0]


enter input:
enter age(in years):43
enter sex(enter 1 if male 
 enter 0 if female):0
enter resting blood pressure (enter 0 if normal 
 enter 1 if having ST-T wave abnormality 
enter 2 if showing probable or definite left ventricular hypertrophy by Estes' criteria) :2
enter cholesterol :140
enter fasting blood sugar:192
enter  max heart rate achieved:173
enter  exercise induced angina (enter 1 if yes
 enter 0 if no):1
enter st depression( induced by exercise relative to rest) :1.3
enter major vessels((0-3) colored by flourosopy):0
chest pain type-atypical angina(enter 1 if yes
 enter 0 if no):0
chest pain type-non-anginal pain(enter 1 if yes
 enter 0 if no):0
chest pain type-typical angina(enter 1 if yes
 enter 0 if no):1
rest ecg type-left ventricular hypertrophy(enter 1 if yes
 enter 0 if no):0
rest ecg type-normal(enter 1 if yes
 enter 0 if no):0
st slope type-flat(enter 1 if yes
 enter 0 if no):0
st slope type-upsloping(enter 1 if yes
 enter 0 if no):1
thalassemia type-normal(enter 