# Heart disease analysis

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The dataset contains clinical records for a number of patients. The goal is to predict whether or not each patient has heart disease.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
from pandas.plotting import andrews_curves

# Load the data

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into the DataFrame `df`,
# which every later cell of this notebook works on.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")

In [None]:
# Preview the first rows of the dataset (the notebook renders the returned frame).
df.head()

# Data Description

 **1. age**: The person's age in years

 **2. sex**: The person's sex (1 = male, 0 = female)

 **3. cp**: Chest pain type 
            * 0: Typical angina: chest pain related decrease blood supply to the heart
            * 1: Atypical angina: chest pain not related to heart
            * 2: Non-anginal pain: typically esophageal spasms (non heart related)
            * 3: Asymptomatic: chest pain not showing signs of disease
            
**4. trestbps**: resting blood pressure (in mm Hg on admission to the hospital)

**5. chol**: The person's cholesterol measurement in mg/dl

**6. fbs**: Whether the person's fasting blood sugar is > 120 mg/dl (1 = true; 0 = false)

**7. restecg**: Resting electrocardiographic results

     0: Nothing to note
     1: ST-T Wave abnormality
        * can range from mild symptoms to severe problems
        * signals non-normal heart beat
     2: Possible or definite left ventricular hypertrophy
        * Enlarged heart's main pumping chamber
        
**8. thalach**: Maximum heart rate achieved

**9. exang**: Exercise induced angina (1 = yes; 0 = no)

**10. oldpeak**: ST depression induced by exercise relative to rest. It reflects the stress on the heart during exercise; an unhealthy heart will stress more.

**11. slope**: - the slope of the peak exercise ST segment
    * 0: Upsloping: better heart rate with exercise (uncommon)
    * 1: Flatsloping: minimal change (typical healthy heart)
    * 2: Downsloping: signs of unhealthy heart
    
**12. ca**: Number of major vessels (0-3) colored by fluoroscopy
    * colored vessel means the doctor can see the blood passing through
    * the more blood movement the better (no clots)
    
**13. thal**: Thallium stress result
    * 1,3: normal
    * 6: fixed defect: used to be defect but ok now
    * 7: reversible defect: no proper blood movement when exercising
    
**14. target**: Whether the person has heart disease (1 = yes, 0 = no); this is the attribute to predict

In [None]:
# Summary statistics for the columns of df (rendered by the notebook).
df.describe()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Count missing values per column; the bare name on the last line is what the notebook renders.
print('Data Sum of Null Values \n')
null_counts = df.isnull().sum()
null_counts

# Data Visualization

In [None]:
# Bar chart of how many patients fall into each `target` class.
plt.figure(figsize=(8, 6))
sns.set(style='whitegrid')
df.target.value_counts().plot(kind="bar", color=["green", "blue"])
plt.title("Disease vs non-Disease person")
# The bars are the target classes, not the patients' sex, so label the axis accordingly.
plt.xlabel("Target (0 = no disease, 1 = disease)")
plt.ylabel("Count")

### Percentage of patients

In [None]:
# Share of female (sex == 0) and male (sex == 1) patients, as a percentage of all rows.
total_patients = len(df.sex)
countFemale = len(df[df.sex == 0])
countMale = len(df[df.sex == 1])
female_pct = countFemale / total_patients * 100
male_pct = countMale / total_patients * 100
print(f"Percentage of Female Patients: {female_pct:.2f}%")
print(f"Percentage of Male Patients: {male_pct:.2f}%")

## Mean, median & max value of all columns

In [None]:
# Column-wise mean of df, kept in `mean` and printed.
mean = df.mean()
print(mean)

In [None]:
# Column-wise median of df, kept in `median` and printed.
median = df.median()
print(median)

In [None]:
# Column-wise maximum of df.
# Stored as `max_values` rather than `max`, so the builtin max() is not shadowed
# for the cells that run after this one.
max_values = df.max()
print(max_values)

In [None]:
# Mean of every other column, computed separately for each target class.
df.groupby('target').mean()

In [None]:
# Median of every other column, computed separately for each target class.
df.groupby('target').median()

In [None]:
# Maximum of every other column, computed separately for each target class.
df.groupby('target').max()

## Making a Heatmap

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
# `corr_matrix` is reused by the clustermap cell further down.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 15))
# Draw explicitly on the axes created above instead of relying on pyplot's current axes.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="inferno", ax=ax)  # bone cmap type
# plt.title('Heatmap for the Dataset', fontsize = 20)
# Set after the heatmap has been drawn, so the larger font scale is picked up by
# figures created from here on.
sns.set(font_scale=1.4)
#plt.savefig('Heatmap.pdf')

All pairwise correlation values between the columns are shown in the heatmap above. The aim of listing them is to let different parts of the analysis reuse these values. In a p-value-style procedure a hypothesis is fixed and every feature is tested against it; here the class column (`target`) is taken as the hypothesis, and its relationship to each of the other features is checked, which yields a different number for each feature. What matters is that these numbers are not close to 1.00: a value close to 1.00 between two features means they carry almost the same information, which is undesirable.

The heat map above shows the correlations among the attributes of the dataset. Almost all of the features are only weakly correlated with each other. This implies we should keep all of the features, since a feature can only be eliminated when its correlation with one or more other features is very high.

We observe a positive correlation between target and cp, thalach, and slope,
and a negative correlation between target and sex, exang, ca, thal, and oldpeak.

In [None]:
import matplotlib as mpl
from matplotlib.pyplot import cm

# Clustered version of the correlation heatmap, built from the same `corr_matrix`.
cg = sns.clustermap(corr_matrix, 
                    #cbar_kws={'label': 'Christmas colorbar'},
                    
                    annot=True,
                    fmt=".2f",
                    cmap ="inferno",
                    figsize=(20,16)); 

# Keep the row labels of the heatmap panel horizontal.
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation = 0) 
# NOTE(review): plt.xticks/plt.yticks act on pyplot's *current* axes, which may not be
# cg.ax_heatmap in a clustermap figure — confirm these calls change the intended labels.
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
#sns.set(font_scale=1.7)
# Write the figure to the working directory before showing it.
plt.savefig('Heatmap2.png', bbox_inches = "tight")  
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df.
sns.pairplot(df)
plt.show()

In [None]:
# One histogram per column, arranged on a 4x4 grid; the trailing ';' suppresses the cell's text output.
df.hist(figsize= (18,12), layout=(4,4));

# Analysis using Age

In [None]:
# Grouped bar chart: for every age, the number of patients in each target class.
age_target_counts = pd.crosstab(df.age, df.target)
axis_font = {'fontsize': 12, 'fontname': 'Times New Roman'}

age_target_counts.plot.bar(figsize=(20, 6), color=['blue', '#ffa600'])
#plt.title('Heart Disease Frequency for Ages')
plt.grid(False)
plt.xlabel('Age', **axis_font)
plt.ylabel('Frequency', **axis_font)
# Save to the working directory before showing the figure.
plt.savefig('AgeVsTarget.pdf')
plt.show()

In [None]:
# Smallest, largest and average age in the dataset.
minAge = min(df.age)
maxAge = df.age.max()
meanAge = df.age.mean()
age_summary = (
    ('Min Age :', minAge),
    ('Max Age :', maxAge),
    ('Mean Age :', meanAge),
)
for caption, age_value in age_summary:
    print(caption, age_value)

In [None]:
# Split the patients into three age bands and report the size of each band.
young_ages = df[(df.age >= 29) & (df.age < 40)]
middle_ages = df[(df.age >= 40) & (df.age < 55)]
# Use >= so that 55-year-olds are counted: with a strict '>' they fell into the gap
# between the middle band (< 55) and the elderly band and were dropped entirely.
elderly_ages = df[df.age >= 55]
print('Young Ages :', len(young_ages))
print('Middle Ages :', len(middle_ages))
print('Elderly Ages :', len(elderly_ages))

 - Here I use 29-39 as Young age

 - 40-54 as Middle age and 

 - older than 55 as Elderly age

In [None]:
# Bar chart comparing the number of patients in the three age bands.
band_names = ['young ages', 'middle ages', 'elderly ages']
band_sizes = [len(band) for band in (young_ages, middle_ages, elderly_ages)]
band_hues = ['Young ages', 'Middle ages', 'Elderly ages']

plt.figure(figsize=(8, 6))
sns.barplot(x=band_names, y=band_sizes, palette="CMRmap", hue=band_hues)
plt.title('Ages State in Dataset')
plt.xlabel('Age Range')
plt.ylabel('Age Counts')
plt.show()

# Analysis using Sex

In [None]:
# Number of patients per value of `sex`.
df.sex.value_counts()

In [None]:
# Donut chart of the gender distribution, annotated with the raw counts.
size = df['sex'].value_counts()
colors = ['blue', 'green']
# Derive the wedge labels from the codes actually present in `size`, in the same order,
# instead of assuming the most frequent code is always "Male".
sex_names = {1: "Male", 0: "Female"}
labels = [sex_names.get(code, str(code)) for code in size.index]
# One explode offset per wedge; plt.pie requires the lengths to match.
explode = [0, 0.01][:len(size)]

# White disc drawn over the centre of the pie to turn it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color = 'white')

plt.rcParams['figure.figsize'] = (7, 7)
plt.pie(size, colors = colors, labels = labels, shadow = True, explode = explode, autopct = '%.2f%%')
plt.title('Distribution of Gender', fontsize = 20)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()

# Raw counts, written next to the chart as a caption column and a value column.
total_genders_count = len(df.sex)
male_count = len(df[df['sex'] == 1])
female_count = len(df[df['sex'] == 0])
plt.text(1, 1, 'Total Genders:\nMale Count:\nFemale Count  :')
plt.text(1.55, 1.15, total_genders_count)
plt.text(1.55, 1.07, male_count)
plt.text(1.55, .97, female_count)

plt.show()

In [None]:
# Grouped bar chart: number of patients in each target class, split by sex.
sex_target_counts = pd.crosstab(df.sex, df.target)
sex_target_counts.plot.bar(figsize=(15, 6), color=['#1CA53B', '#AA1111'])
plt.title('Heart Disease Frequency for Sex')
plt.xlabel('Sex (0 = Female, 1 = Male)')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.legend(["Haven't Disease", "Have Disease"])
plt.show()

# Analysis using Chest Pain Type

In [None]:
# Number of patients per chest-pain type (`cp`); the author notes that there are 4 types.
df.cp.value_counts()

In [None]:
# Number of patients per chest-pain type.
plt.figure(figsize=(10, 6))
# Pass the column as `x=`: the first positional parameter of countplot is `data` in
# current seaborn releases, so a positional Series is no longer read as the x variable.
sns.countplot(x=df.cp, palette="magma_r")
plt.xlabel('Chest Pain Type')
plt.ylabel('Count')
plt.title('Chest Type vs Count State')
plt.show()

In [None]:
# Grouped bar chart: number of patients in each target class, per chest-pain type.
cp_target_counts = pd.crosstab(df.cp, df.target)
cp_target_counts.plot.bar(figsize=(15, 6), color=['#11A5AA', '#AA1190'])
plt.title('Heart Disease Frequency According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Frequency of Disease or Not')
plt.xticks(rotation=0)
plt.show()

# Analysis Using fbs
The person's fasting blood sugar > 120 mg/dl. (1 = true; 0 = false)

In [None]:
# Grouped bar chart: number of patients in each target class, split by the fasting-blood-sugar flag.
fbs_target_counts = pd.crosstab(df.fbs, df.target)
fbs_target_counts.plot.bar(figsize=(15, 6), color=['#FFC300', '#581845'])
plt.title('Heart Disease Frequency According To FBS')
plt.xlabel('FBS - (Fasting Blood Sugar > 120 mg/dl) (1 = true; 0 = false)')
plt.ylabel('Frequency of Disease or Not')
plt.xticks(rotation=0)
plt.legend(["Haven't Disease", "Have Disease"])
plt.show()

# Analysis Using thalach
Maximum heart rate achieved

In [None]:
# Show only the first 20 entries of the `thalach` value counts.
df.thalach.value_counts()[:20]

In [None]:
# Bar chart of the first 20 entries of the `thalach` value counts.
top_thalach = df.thalach.value_counts()[:20]

plt.figure(figsize=(12, 6))
sns.barplot(x=top_thalach.index, y=top_thalach.values)
plt.title('Thalach Counts')
plt.xlabel('Thalach')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Average `thalach` for every distinct age, in ascending age order.
# `age_unique` and `mean_thalach` are plotted against each other in the next cell.
age_unique = sorted(df.age.unique())
age_thalach_values = df.groupby('age')['thalach'].count().values
mean_thalach = [
    sum(df[df['age'] == age_value].thalach) / rows_for_age
    for age_value, rows_for_age in zip(age_unique, age_thalach_values)
]

In [None]:
# Point plot of the average `thalach` per age, using the lists built in the previous cell.
blue_text = {'fontsize': 15, 'color': 'blue'}

plt.figure(figsize=(15, 5))
sns.pointplot(x=age_unique, y=mean_thalach, color='red', alpha=0.8)
plt.title('Age vs Thalach', **blue_text)
plt.xlabel('Age', **blue_text)
plt.ylabel('Thalach', **blue_text)
plt.xticks(rotation=45)
plt.grid()
plt.show()

In [None]:
# Scatter of age against maximum heart rate, one series per target class.
with_disease = df.target == 1
without_disease = df.target == 0
axis_font = {'fontsize': 11, 'fontname': 'Times New Roman'}

plt.figure(figsize=(8, 6))
# Drawn in this order so the legend entries below line up with the two series.
for class_mask in (with_disease, without_disease):
    plt.scatter(x=df.age[class_mask], y=df.thalach[class_mask])
#plt.title("Age vs Max Heart rate", fontsize=11, fontname = 'Times New Roman')
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age", **axis_font)
plt.ylabel("Maximum Heart Rate", **axis_font)
plt.savefig('HeartRate vs Age.pdf')
plt.show()

# Analysis using slope
the slope of the peak exercise ST segment

* 0: Upsloping: better heart rate with exercise (uncommon)
* 1: Flatsloping: minimal change (typical healthy heart)
* 2: Downsloping: signs of unhealthy heart

In [None]:
# Grouped bar chart: number of patients in each target class, per `slope` value.
slope_target_counts = pd.crosstab(df.slope, df.target)
slope_target_counts.plot.bar(figsize=(15, 6), color=['lightgreen', '#FF5733'])
plt.title('Heart Disease Frequency for Slope')
plt.xlabel('The Slope of The Peak Exercise ST Segment ')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.show()

# Thal Analysis
Thallium stress result

* 1,3: normal
* 6: fixed defect: used to be defect but ok now
* 7: reversible defect: no proper blood movement when exercising

In [None]:
# Number of patients per `thal` value.
df.thal.value_counts()

In [None]:
# Number of patients per `thal` value.
plt.figure(figsize=(8, 6))
# Pass the column as `x=`: the first positional parameter of countplot is `data` in
# current seaborn releases, so a positional Series is no longer read as the x variable.
sns.countplot(x=df.thal, palette="cubehelix_r")
plt.show()

In [None]:
# Target distribution among the patients whose `thal` value is 1.
# Computed once and reused, replacing a discarded expression statement and two recomputations.
thal_one_targets = df[df['thal'] == 1].target.value_counts()

plt.figure(figsize=(8, 6))
sns.barplot(
    x=thal_one_targets.index,
    y=thal_one_targets.values,
    palette="Oranges_r",
    # Name each bar after the target value it actually holds; value_counts orders by
    # frequency, so a fixed ['0', '1'] could be swapped or have the wrong length.
    hue=[str(target_value) for target_value in thal_one_targets.index],
)
# The x axis holds the target classes (within thal == 1), not thal values.
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Counter for Thal')
plt.show()

# Analysis Target
Whether the person has heart disease (1 = yes, 0 = no); this is the attribute to predict

In [None]:
# Number of patients in each target class.
plt.figure(figsize=(8, 5))
# Pass the column as `x=`: the first positional parameter of countplot is `data` in
# current seaborn releases, so a positional Series is no longer read as the x variable.
sns.countplot(x=df.target, palette="Purples_r")
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Target Counter 1 & 0')
plt.show()

In [None]:
# Number of patients in each target class, split by sex.
# Pass the column as `x=`: the first positional parameter of countplot is `data` in
# current seaborn releases, so a positional Series is no longer read as the x variable.
sns.countplot(x=df.target, hue=df.sex)
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Target & Sex Counter 1 & 0')
plt.show()

# **Please upvote if it helps you**