In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into `df`.
df = pd.read_csv(filepath_or_buffer='../input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

# Data contains:
 1.  age - age in years
 2.  sex - (1 = male; 0 = female)
 3.  cp - chest pain type (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)
 4.  trestbps - resting blood pressure (in mm Hg on admission to the hospital)
 5.  chol - serum cholesterol in mg/dl
 6.  fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
 7.  restecg - resting electrocardiographic results
 8.  thalach - maximum heart rate achieved
 9.  exang - exercise induced angina (1 = yes; 0 = no)
 10. oldpeak - ST depression induced by exercise relative to rest
 11. slope - the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)
 12. ca - number of major vessels (0-3) colored by fluoroscopy
 13. thal - a blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect)
 14. target - have disease or not (1 = yes, 0 = no)

Note: the recoding cell later in this notebook maps `cp` as 0-3, `slope` as 0-2 and `thal` as 0-3, i.e. zero-based codes that differ from the value numbers listed above; verify with `df[col].unique()` before relying on these value labels.

In [None]:
# Report the frame's (rows, columns) tuple, then display summary statistics
# as the cell output.
print('Number of rows and columns :', (len(df.index), len(df.columns)))
df.describe()

In [None]:
# Missing-value count per column of the freshly loaded frame.
df.isnull().sum()

In [None]:
# Print the DataFrame.info() summary for the loaded frame.
df.info()

In [None]:
# Feature correlation: annotated heatmap of the pairwise correlation matrix.
plt.figure(figsize=(20, 10))
sns.heatmap(
    data=df.corr(),
    annot=True,
    cmap="RdYlGn",
)
plt.title('Heatmap for the Dataset')

In [None]:
# Count the rows in each class of the `target` column and report both totals
# (label 1 = has heart disease, label 0 = does not).
target_value = df['target'].value_counts()
print('Number of patients have heart disease:{}'.format(target_value.loc[1]))
print("Number of patients haven't heart disease:{}".format(target_value.loc[0]))

In [None]:
# Bar chart of how many rows fall in each `target` class.
sns.countplot(data=df, x='target')

In the target column, class 0 denotes patients who do not have heart disease and class 1 denotes patients who do.

In [None]:
# Kernel-density estimate of the `age` column across all patients.
sns.kdeplot(df['age'])
plt.xlabel('Age')

The above graph shows that most patients in this dataset are in the 55-65 age group, while patients in the 20-30 age group are very rare. (The plot covers all patients, with and without heart disease, so on its own it does not show how the likelihood of heart disease varies with age.)

In [None]:
# Age vs target: per-age counts of each target class, drawn as grouped bars.
(
    df.groupby('age')['target']
    .value_counts()
    .unstack()
    .plot.bar(figsize=(20, 6))
)
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Heart Disease Frequency for Ages')
plt.legend(["Haven't Disease",'Have Disease'])

In [None]:
# Patient count per sex; the tick labels replace the 0/1 codes.
LABELS = ['Female', 'Male']
sns.countplot(data=df, x='sex')
plt.xticks([0, 1], LABELS)

From the above bar graph, we can see that there are roughly twice as many male patients as female patients.

In [None]:
# Counts of each target class within each sex, with readable tick labels.
LABELS = ['Female', 'Male']
(
    df.groupby('sex')['target']
    .value_counts()
    .unstack()
    .plot.bar()
)
plt.title('Heart Disease Frequency for Sex')
plt.xticks([0, 1], LABELS, rotation=360)
plt.legend(["Haven't Disease",'Have Disease'])

In [None]:
# Counts of each target class within each chest-pain code, as grouped bars.
(
    df.groupby('cp')['target']
    .value_counts()
    .unstack()
    .plot.bar()
)
plt.legend(["Haven't Disease",'Have Disease'])
plt.xlabel('Chest pain')
plt.ylabel('Frequency')

In [None]:
# Distribution of resting blood pressure (`trestbps`) for each target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError on positional x/y vectors.
sns.boxplot(x='target', y='trestbps', data=df, palette='viridis')
plt.title('Relation between trestbps and target')

The plot of trestbps against target shows that patients who do not have heart disease have slightly higher resting blood pressure than patients who do.

In [None]:
# Distribution of serum cholesterol (`chol`) for each target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError on positional x/y vectors.
sns.boxplot(x='target', y='chol', data=df, palette='viridis')
plt.title('Relation between Cholestrol and Target')

The bivariate graph above suggests that patients who suffer from heart disease have higher cholesterol levels than patients who do not.

In [None]:
# Counts of each resting-ECG code within each target class; the bar series
# are coloured with three evenly spaced samples of the rainbow colormap.
(
    df.groupby('target')['restecg']
    .value_counts()
    .unstack()
    .plot.bar(color=plt.cm.rainbow(np.linspace(0, 1, 3)))
)
plt.title('Relation between ECG and Target')

In [None]:
# Distribution of the ST-segment slope code (`slope`) for each target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError on positional x/y vectors.
sns.boxplot(x='target', y='slope', data=df, palette='viridis')
plt.title('Relation between Slope and Target')

This plot shows that patients who suffer from heart disease mostly have slope value 1, and that slope value 2 is mostly seen in patients who are more likely to suffer from heart disease.

In [None]:
# Distribution of the number of major vessels (`ca`) for each target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError on positional x/y vectors.
sns.boxenplot(x='target', y='ca', data=df, palette='Reds')
plt.title('Relation between number of major vessels and target')

The number of major vessels is helpful in determining heart disease: the greater the number of vessels, the greater the chance of suffering from heart disease.

In [None]:
# Distribution of maximum heart rate achieved (`thalach`) for each target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError on positional x/y vectors.
sns.boxplot(x='target', y='thalach', data=df, palette='Reds')
plt.title('Relation between heart rate and target')

This plot shows that people who suffer from heart disease have a higher maximum heart rate than people who do not. So it is very important to keep our heart rate low.

In [None]:
# Give the 14 columns descriptive names. The names are assigned by position,
# so their order must match the column order of the loaded frame.
df.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

df.columns

In [None]:
# Replace the integer category codes with readable labels so that the later
# pd.get_dummies call yields self-describing indicator columns.
# NOTE(review): the fasting-blood-sugar labels say 'mg/ml' while the data
# dictionary earlier in the notebook gives mg/dl; the text is left unchanged
# because it becomes part of the dummy-column names.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain_type': {0: 'typical angina', 1: 'atypical angina', 2: 'non-anginal pain', 3: 'asymptomatic'},
    'fasting_blood_sugar': {0: 'lower than 120mg/ml', 1: 'greater than 120mg/ml'},
    'rest_ecg': {0: 'normal', 1: 'ST-T wave abnormality', 2: 'left ventricular hypertrophy'},
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {0: 'upsloping', 1: 'flat', 2: 'downsloping'},
    'thalassemia': {0: '0', 1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in category_labels.items():
    # Decode only columns that still hold numeric codes. Without this guard,
    # re-running the cell would apply .map to the already-decoded strings and
    # silently turn every value into NaN.
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(labels)

In [None]:
# Feature matrix (every column except the label) and the label vector.
x = df.drop(columns='target')
y = df.target

In [None]:
# Missing-value count per column after the recoding step.
df.isnull().sum()

In [None]:
# Preview the first five rows of the recoded frame.
df.head(n=5)

In [None]:
# One-hot encode the categorical feature columns, dropping the first level
# of each so the indicator columns are not redundant.
x = pd.get_dummies(data=x, drop_first=True)

In [None]:
# Preview the first five rows of the encoded feature matrix.
x.head(n=5)

In [None]:
# Show the (rows, columns) shape of the encoded feature matrix.
x.shape

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable. The metric helpers imported here are used by the model cells.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Modelling
#Randomforest Classifier
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's internal randomness so the metrics below are
# the same on every Restart & Run All; without it they change from run to run.
model = RandomForestClassifier(n_estimators = 50, max_depth = 5, random_state = 0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# confusion matrix of true vs predicted classes on the held-out split
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot = True,  cmap = 'Blues',cbar=None)
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')

# classification report (per-class precision / recall / f1)
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

# The features are not scaled, so the default cap of 100 iterations is likely
# to stop the solver before it converges (ConvergenceWarning). A higher cap
# lets it finish. NOTE(review): standardising the numeric columns would be
# the more thorough fix.
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)
pred_lr= lr.predict(x_test)

# confusion matrix of true vs predicted classes on the held-out split
cm_lr = confusion_matrix(y_test, pred_lr)
sns.heatmap(cm_lr, annot = True,  cmap = 'Blues',cbar=None)
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')

# classification report (per-class precision / recall / f1)
cr_lr = classification_report(y_test, pred_lr)
print(cr_lr)

In [None]:
# Gaussian naive Bayes: fit on the training split, evaluate on the test split.
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB().fit(x_train, y_train)
pred_nb = nb.predict(x_test)

# Confusion matrix of true vs predicted classes, drawn as a heatmap.
cm_nb = confusion_matrix(y_test, pred_nb)
sns.heatmap(cm_nb, annot=True, cmap='Blues', cbar=None)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')

# Text summary of the per-class metrics.
cr_nb = classification_report(y_test, pred_nb)
print(cr_nb)


In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Sweep k = 1..20 and record the test-set accuracy for each value.
# NOTE(review): k is being chosen against the test split, so the accuracy of
# the selected k is optimistically biased; cross-validating on the training
# split would avoid that.
k_values = range(1, 21)
knn_score = []
for i in k_values:
    dt = KNeighborsClassifier(n_neighbors=i)
    dt.fit(x_train, y_train)
    pred = dt.predict(x_test)
    knn_score.append(accuracy_score(y_test, pred))

# The plot and the ticks share one range, so every evaluated k gets a tick.
# Previously the ticks stopped at 19 while the curve ran to k = 20.
plt.plot(k_values, knn_score)
plt.xlabel('n_neighbors (k)')
plt.ylabel('Test accuracy')
plt.title('KNN accuracy for different k')
plt.xticks(list(k_values))

In [None]:
# Final k-nearest-neighbours model with k = 13, evaluated on the test split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=13).fit(x_train, y_train)
pred_knn = knn.predict(x_test)

# Confusion matrix of true vs predicted classes, drawn as a heatmap.
cm_knn = confusion_matrix(y_test, pred_knn)
sns.heatmap(cm_knn, annot=True, cmap='Blues', cbar=None)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')

# Text summary of the per-class metrics.
cr_knn = classification_report(y_test, pred_knn)
print(cr_knn)