In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing
# the full path of each one.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-disease CSV into the working frame `dt`, then preview it.
dt = pd.read_csv(
    '/kaggle/input/heart-disease-uci/heart.csv',
)
print('A view of the top 5 rows of the datatset')
# Keep `head()` as the final expression so the notebook renders it richly.
dt.head()


## This dataset has only 303 rows, 13 attributes and 1 target column

In [None]:
# print('The different columns along with their original names: ')
# dt.columns

1. ### age      >>   age
1. ### sex      >>   sex
1. ### cp       >>   chest pain type (4 values)
1. ### trestbps >>   resting blood pressure
1. ### chol     >>   serum cholesterol in mg/dl
1. ### fbs      >>   fasting blood sugar > 120 mg/dl
1. ### restecg  >>   resting electrocardiographic results (values 0,1,2)
1. ### thalach  >>   maximum heart rate achieved
1. ### exang    >>   exercise induced angina
1. ### oldpeak  >>   ST depression induced by exercise relative to rest
1. ### slope    >>   the slope of the peak exercise ST segment
1. ### ca       >>   number of major vessels (0-3) colored by fluoroscopy
1. ### thal     >>   3 = normal; 6 = fixed defect; 7 = reversible defect
1. ### target   >>   0 indicates healthy and 1 indicates illness

## The different columns present in the dataset, along with their meanings, are shown above

In [None]:
# Print the full per-column summary of the frame (verbose form, so no
# columns are elided from the listing).
dt.info(
    verbose=True,
)

## It's good that the dataset has no null values as it will save our efforts and time in handling those null values. Thus we can directly jump to exploration without wasting much time on that part. 

In [None]:
# Report how many distinct values each column holds.
for column_name in dt.columns:
    distinct_count = len(dt[column_name].unique())
    print(column_name, 'has', distinct_count, 'unique values.')

## Some features like sex, cp, fbs, restecg, exang, slope, ca and thal appear to be categorical variables and thus can be used for dividing the data into clusters (segregation) to find relevant relations and dependencies among different variables

In [None]:
# Recode the 0/1-coded columns into human-readable labels.
#
# `Series.replace` with an explicit mapping is used instead of
# `np.where(col == 0, a, b)` so that this cell is idempotent: running it a
# second time leaves the already-labelled strings untouched. With the
# `np.where` form, a re-run compares strings against 0, nothing matches, and
# every row silently becomes the "else" label ('male', 'yes', 'diseased',
# '>120(Abnormal)').
#
# Values that are not in a mapping are left as they are rather than being
# folded into the second label.
BINARY_LABELS = {
    'sex': {0: 'female', 1: 'male'},
    'exang': {0: 'no', 1: 'yes'},
    'target': {0: 'healthy', 1: 'diseased'},
    'fbs': {0: '<120(Normal)', 1: '>120(Abnormal)'},
}
for binary_column, label_map in BINARY_LABELS.items():
    dt[binary_column] = dt[binary_column].replace(label_map)

## We converted some variables from numeric to string data type so that we can have a clear understanding of the values that they represent

In [None]:
# For every column with fewer than 10 distinct values, show the count and
# the values themselves.
for column_name in dt.columns:
    distinct_values = dt[column_name].unique()
    if len(distinct_values) >= 10:
        continue
    print(column_name, 'has', len(distinct_values), 'unique values', 'i,e.', distinct_values)

## This represents the unique value that these variables are having. It will help us in clearly visualizing their relationship with other independent variables.

In [None]:
# Summary statistics, transposed so each feature is a row.
summary_stats = dt.describe()
summary_stats.T

## This table describes the different variables along with their mean, median and quartile distributions. It can give us a rough idea about the distribution of these variables, the presence of outliers and the mean and median deviations.

In [None]:
# Correlation heatmap of the numeric features.
#
# By this point `sex`, `exang`, `target` and `fbs` hold string labels, so
# the frame is no longer all-numeric. Calling `dt.corr()` directly relies on
# pandas silently dropping non-numeric columns, which newer pandas releases
# no longer do (they raise instead). Selecting the numeric columns
# explicitly works on every pandas version.
numeric_features = dt.select_dtypes(include='number')
plt.figure(figsize=(12,10))
sns.heatmap(data=numeric_features.corr(), annot=True)

## This correlation matrix reflects how strongly or weakly the variables in our dataset are related to one another. The values inside the cells display the strength and direction of each relationship on a scale from -1 to 1.

In [None]:
# Max heart rate (thalach) per chest-pain type: individual points coloured
# by target, with an outline-only box drawn on the same axes.
ax = sns.swarmplot(data=dt, x='cp', y='thalach', hue='target')
sns.boxplot(
    data=dt,
    x="cp",
    y="thalach",
    ax=ax,
    # Strip the box down to an unfilled outline so the points stay visible.
    boxprops={'facecolor':'None'},
    whiskerprops={'linewidth':0},
    showcaps=False,
    showfliers=False,
)


## This plot of maximum heart rate achieved against chest pain type for healthy and ill persons shows that cp values 1, 2 and 3 are more associated with higher thalach values. These cp values and higher heart rates can be observed more commonly in persons who have some kind of illness. The median value of thalach for these cp types is even higher than the third quartile of thalach values for cp type 0.

In [None]:
# Bucket max heart rate into four labelled ranges. `pd.cut` intervals are
# right-inclusive by default, so e.g. '<150' covers (100, 150].
thalach_edges = [50,100,150,200,300]
thalach_labels = ['<100','<150','<200','<300']
dt['count_thalach'] = pd.cut(dt.thalach, bins=thalach_edges, labels=thalach_labels)

## Here, we are converting the thalach (maximum heart rate achieved) column into categorical feature with 4 unique values to plot its distribution

In [None]:
# Number of subjects in each thalach bucket, split by target.
sns.countplot(
    data=dt,
    x='count_thalach',
    hue='target',
)

## As can be seen, subjects with a thalach value between 150-200 have a higher chance of having some kind of heart disease. Some people with a thalach value less than 150 may also have some illness, but they number less than half of those in the third category (150-200).

In [None]:

# Resting blood pressure vs. max heart rate, one panel per sex, coloured by
# target.
#
# `sns.relplot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` only produced an empty extra figure
# and its size was never applied. The intended 12x8 size is expressed via
# `height`/`aspect` instead (two panels of 6x8 each).
sns.relplot(
    x='trestbps',
    y='thalach',
    data=dt,
    kind='scatter',
    hue='target',
    col='sex',
    height=8,
    aspect=0.75,
)

## From this we can state that, in the case of men, 90% have trestbps between 110-150 and those with maximum heart rate values less than 150 have a very low chance of having any illness, whereas females have lower trestbps values compared to men and there is no clear separation line at any thalach value. Compared to men, the percentage of diseased females is higher, and the count of ill females with thalach less than 150 looks nearly equal to the count of those with higher thalach values.


In [None]:
# Joint distributions of resting blood pressure and max heart rate, first
# split by sex and then by target.
#
# `sns.jointplot` is figure-level and builds its own figure each time, so
# the earlier `fig=plt.figure(figsize=(10,8))` only emitted an empty figure
# and `fig` was never used. The size is set through `height` instead (joint
# plots are square).
sns.jointplot(data=dt, x="trestbps", y="thalach", hue='sex', height=8)
sns.jointplot(data=dt, x='trestbps', y='thalach', hue='target', height=8)


## It is also clear from both these plots that males have higher resting blood pressure as well as heart rates compared to females, and also that diseased persons have higher heart rates compared to normal subjects. Most of the diseased people have trestbps in the range from 110 to 140

In [None]:
# Serum cholesterol vs. max heart rate, one panel per sex, coloured by
# target.
#
# `sns.relplot` is figure-level and creates its own figure; the preceding
# `plt.figure(figsize=(12,8))` only produced an empty extra figure. The
# intended overall 12x8 size is given via `height`/`aspect` (two 6x8 panels).
sns.relplot(
    x='chol',
    y='thalach',
    data=dt,
    kind='scatter',
    hue='target',
    col='sex',
    height=8,
    aspect=0.75,
)

In [None]:
# Resting blood pressure at each oldpeak value (strip plot), one panel per
# sex, coloured by target.
sns.catplot(
    data=dt,
    x='oldpeak',
    y='trestbps',
    hue='target',
    col='sex',
    kind='strip',
    aspect=2,
)


## More subjects are unhealthy at a depression ratio below 2.0. It can be seen clearly from these plots that above this limiting value the number of unhealthy subjects is very small (or negligible) compared to healthy subjects.

In [None]:
# Age distribution split by target: side-by-side bars with a KDE overlay.
sns.histplot(
    data=dt,
    x='age',
    hue='target',
    bins=30,
    multiple='dodge',
    kde=True,
)

## The distribution of healthy subjects appears to be left-skewed, and it is also visible that more people aged below 55 are found to be unhealthy compared to older people, whereas most of the subjects between 55 and 65 are healthy.

In [None]:
# Bucket age into decade-wide groups. `pd.cut` intervals are right-inclusive
# by default, so e.g. '<50' covers (40, 50].
age_edges = [20,30,40,50,60,70,80]
age_labels = ['<30','<40','<50','<60','<70','<80']
dt['age_cat'] = pd.cut(dt['age'], bins=age_edges, labels=age_labels)

## Here we are converting the age column from quantitative to categorical so that we can easily see its effect on other parameters.

In [None]:
# For each measurement, draw one figure per loop iteration: swarm points and
# box plots by age group, both split by target.
#
# The boxplot dodges its boxes by hue, so the swarmplot must dodge too
# (`dodge=True`); otherwise the points for both targets are drawn in the
# centre of each age group and do not line up with their own boxes.
for measure in ['trestbps','chol','thalach']:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.swarmplot(x='age_cat', y=measure, data=dt, size=5, hue='target', dodge=True, ax=ax)
    sns.boxplot(
        x="age_cat",
        y=measure,
        data=dt,
        hue='target',
        showcaps=False,
        showfliers=False,
        whiskerprops={'linewidth':0},
        ax=ax,
    )
    # A title lets each figure stand alone when the notebook is skimmed.
    ax.set_title(measure + ' by age group and target')

## Out of all these plots only the last plot i.e. thalach vs age_cat plot is showing some clear distinctions between the two categories related to age. The maximum heart rate of healthy persons is very less for younger people as compared to elder people. Elder people have the least maximum achieved heart rate.

In [None]:
# Count of subjects per chest-pain type, split by target, one panel per
# restecg value.
sns.catplot(
    data=dt,
    x='cp',
    hue='target',
    col='restecg',
    kind='count',
)

## From these plots we can say that at all restecg values, the majority of healthy subjects have chest pain of type 0, whereas ill subjects have chest pain of type 2. It can also be said that subjects in the restecg 2 category may have some kind of illness if they have any type of chest pain.

In [None]:
# Resting blood pressure per thal value (swarm plot), one panel per sex,
# coloured by target.
sns.catplot(
    data=dt,
    x='thal',
    y='trestbps',
    hue='target',
    col='sex',
    kind='swarm',
)

## After examining the relationship between thal and resting blood pressure with respect to target and sex, we can say that for both males and females, the chance of a subject having some illness is highest at a thal value of 2, whereas subjects are mostly healthy at a thal value of 3 with resting bp below 160.

In [None]:
# Max heart rate per thal value (swarm plot), one panel per sex, coloured
# by target.
sns.catplot(
    data=dt,
    x='thal',
    y='thalach',
    hue='target',
    col='sex',
    kind='swarm',
)

## Similarly, for this also the chances of a person having some illness is maximum at thal equals 2 and that's also at higher heart rates. 

In [None]:
# ST depression (oldpeak) per slope value (swarm plot), one large panel per
# sex, coloured by target.
sns.catplot(
    data=dt,
    x='slope',
    y='oldpeak',
    hue='target',
    col='sex',
    kind='swarm',
    height=10.5,
    aspect=1.5,
)

## From here we can say that the depression induced(oldpeak) in majority of candidates can be seen at a slope of 2 or 3. Moreover, the maximum percent of unhealthy subjects have oldpeak level less than 1. Thus the chances of subject being ill is maximum at slope of 2 and oldpeak level less than 1.