# ***Coronary Artery Disease -- Data Analysis***

In [None]:
from IPython.display import Image
Image("../input/heart-pics/Heart_pumping1.gif")

In [None]:
from IPython.display import Image
Image("../input/heart-pics/Heart_pumping2.gif")

###### **Pics Link:** https://en.wikipedia.org/wiki/Heart_valve

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

%matplotlib inline

In [None]:
cleveland_df = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
cleveland_df.head()

#### **Dataset dimensionality**

In [None]:
cleveland_df.shape

#### **Assigning feature names to the dataset**

In [None]:
features = ['age','gender','chest_pain','rest_bp','cholestrol','fst_bs','rest_ecg','max_hrt_rate','ex_angina','oldpeak','slope','color_vsl','thal','num']

In [None]:
cleveland_df.columns = features

In [None]:
cleveland_df.head()

## **Data Pre-processing**

#### **Step-1: Class handling in 'num'**
Handling the multi-valued target attribute 'num': records with values >= 1 are affected by CAD, and records with value 0 are non-CAD.

##### **Checking the counts before applying any operation**

In [None]:
pd.DataFrame(cleveland_df['num'].value_counts())

##### **Finding the count of records with blockage > 50%**

In [None]:
cleveland_df[cleveland_df['num'] > 0]['num'].count()

##### **Categorizing the 'num' feature into two classes 0(Non-CAD i.e. blockage < 50%) and 1(CAD i.e. blockage >50%)**

In [None]:
# Collapse 'num' into a binary target: 0 stays 0 (Non-CAD), anything else becomes 1 (CAD).
cleveland_df['result'] = cleveland_df['num'].map(lambda num_val: 1 if num_val != 0 else num_val)

In [None]:
cleveland_df.head()

In [None]:
cleveland_df['result'].value_counts()

##### **Visualize the CAD and Non-CAD records**

In [None]:
# Shared font settings for the axis labels and title of the class-count bar chart.
bar_font = dict(family='serif', color='darkred', weight='normal', size=14)

In [None]:
# value_counts() orders rows by frequency, so a positional colour list would
# attach 'palegreen' to whichever class happens to be larger. Sorting by the
# class label pins the order to 0 (Non-CAD, palegreen) then 1 (CAD, orange),
# matching the colour convention used by the other charts in this notebook.
result_counts = cleveland_df['result'].value_counts().sort_index()
result_counts.plot(kind='bar', figsize=(9,9), color=['palegreen','orange'])
plt.minorticks_on()
plt.grid(which='major',color='coral',linestyle=':')
plt.xlabel('CAD and Non-CAD states',fontdict=bar_font)
plt.ylabel('Patient Count',fontdict=bar_font)
plt.title('CAD and Non-CAD patient count',fontdict=bar_font)

##### **Finding the missing values**
Missing values are marked as '?', so such values are replaced with None.

In [None]:
# Every cell equal to the '?' placeholder becomes None; all other cells are kept as they are.
cleveland_df = cleveland_df.applymap(lambda cell: cell if cell != '?' else None)

##### **Counting the missing values**

In [None]:
cleveland_df.isnull().sum()

##### **Visualize the missing records**

In [None]:
# Font settings for the labels and title of the missing-value heatmap.
font = dict(family='serif', color='darkred', weight='bold', size=16)

In [None]:
# Heatmap of the null mask: one row per record, one column per feature.
plt.figure(figsize=(15, 10))
ax = sns.heatmap(cleveland_df.isnull(), cbar=False, cmap='inferno')
# Fixed y-range (0..318), set explicitly rather than derived from the frame length.
ax.axis(ymin=0, ymax=318)
ax.minorticks_on()
ax.set_xlabel("Dataset features", fontdict=font)
ax.set_ylabel("Record Index", fontdict=font)
ax.set_title("Missing values in the dataset", fontdict=font)

##### **Records with NULL Color_vsl or ca**

In [None]:
cleveland_df[cleveland_df['color_vsl'].isnull()]

##### **Count the records for color_vsl categories**

In [None]:
cleveland_df['color_vsl'].value_counts()

##### **Count the CAD and Non-CAD records for every color_vsl category**

In [None]:
pd.DataFrame(cleveland_df.groupby(['color_vsl','result'])['result'].count())

### **Filling the missing values in COLOR_VSL feature**
##### **Replacing NULL with MAX occurrence of respective feature class based on TARGET column**

In [None]:
# Missing 'color_vsl' entries are filled with the constant string '0.0'; the
# result goes to a new column so the raw 'color_vsl' column is left untouched.
cleveland_df['fix_color_vsl'] = cleveland_df['color_vsl'].fillna('0.0')

In [None]:
cleveland_df['fix_color_vsl'].value_counts()

##### **Records with NULL THAL or THALASSEMIA**

In [None]:
cleveland_df[cleveland_df['thal'].isnull()]

##### **Count the records for thal categories**

In [None]:
cleveland_df['thal'].value_counts()

##### **Count the CAD and Non-CAD records for every thal category**

In [None]:
thal_missing_val = pd.DataFrame(cleveland_df.groupby(['thal','result'])['result'].count())

In [None]:
thal_missing_val.index.names = ['thal', 'result1']

In [None]:
thal_missing_val.sort_values(['thal'],ascending=True)

### **Filling the missing values in THAL feature**
##### **Replacing NULL with MAX occurrence of respective feature class based on TARGET column**

In [None]:
cleveland_df[(cleveland_df['result'] == 0) & (cleveland_df['thal'].isna())]

In [None]:
cleveland_df['result'].dtype

In [None]:
# Impute missing 'thal' from the target class: CAD rows (result == 1) get
# '7.0', Non-CAD rows (result == 0) get '3.0'; known values are kept as-is.
# fillna() treats both None and NaN as missing, whereas the previous
# `== None` comparison silently skipped NaN markers.
thal_fill_by_result = cleveland_df['result'].map({1: '7.0', 0: '3.0'})
cleveland_df['fix_thal'] = cleveland_df['thal'].fillna(thal_fill_by_result)

In [None]:
cleveland_df['thal'].value_counts()

In [None]:
cleveland_df['fix_thal'].value_counts()

### **Visualizing missing values again**

In [None]:
cleveland_df.columns

In [None]:
# Font settings for the labels and title of the post-fix missing-value heatmap.
missing_val_font = dict(family='serif', weight='bold', size=14, color='red')

In [None]:
# Columns inspected after imputation: the raw 'color_vsl', 'thal' and 'num'
# columns are left out in favour of 'fix_color_vsl', 'fix_thal' and 'result'.
post_fix_cols = [
    'age', 'gender', 'chest_pain', 'rest_bp', 'cholestrol', 'fst_bs',
    'rest_ecg', 'max_hrt_rate', 'ex_angina', 'oldpeak', 'slope',
    'result', 'fix_color_vsl', 'fix_thal',
]
plt.figure(figsize=(12, 9))
ax = sns.heatmap(cleveland_df[post_fix_cols].isnull(), cbar=False, cmap='viridis')
ax.minorticks_on()
# Fixed y-range (0..315), set explicitly rather than derived from the frame length.
ax.axis(ymin=0, ymax=315)
ax.set_xlabel("Dataset Features", fontdict=missing_val_font)
ax.set_ylabel("Record index", fontdict=missing_val_font)
ax.set_title("Post fix - Missing values", fontdict=missing_val_font)

### **Datatype handling**

##### **Several columns hold discrete values but are stored with a float datatype**

In [None]:
cleveland_df.info()

##### **Created a UDF for converting the datatypes of required columns**

In [None]:
def handle_datatype(df_name,unchange_col=None):
    """
    Description: Cast the analysis features of the dataset to an integer datatype.

    Each column in the fixed feature list below is cast to float first and
    then to int, so numeric strings such as '3.0' are parsed and any
    fractional part is dropped. The DataFrame is modified in place.

    Input parameter:
    *df_name*: DataFrame object that contains every column named in the feature list.
    *unchange_col*: Column name, or a list/tuple/set of column names, whose
                    datatype must be left as it is. None converts every listed column.

    Return:
    The same DataFrame object that was passed in, with the converted columns.

    Raises:
    KeyError when a listed column is absent from the DataFrame; pandas also
    raises when a value cannot be cast to float or int.
    """
    cols = ['age', 'gender', 'chest_pain', 'rest_bp', 'cholestrol', 'fst_bs',
            'rest_ecg', 'max_hrt_rate', 'ex_angina', 'oldpeak','slope', 'result', 'fix_color_vsl', 'fix_thal']

    # Normalise the exclusion argument so a single name and a collection of
    # names are handled the same way.
    if unchange_col is None:
        skipped_cols = []
    elif isinstance(unchange_col, (list, tuple, set, frozenset)):
        skipped_cols = list(unchange_col)
    else:
        skipped_cols = [unchange_col]

    for col in cols:
        if col in skipped_cols:
            continue
        # float first: a direct int cast cannot parse strings like '3.0'.
        df_name[col] = df_name[col].astype('float').astype('int')
    return df_name

In [None]:
handle_datatype(cleveland_df,'oldpeak')

### **Drop the non-required column**

In [None]:
cleveland_df.drop(['color_vsl','thal','num'],axis=1,inplace=True)

# **Exploratory Data Analysis**

## **Question-1: How many people of age group 29-48 have blockage greater than 50%?**

In [None]:
# Patient count per CAD class for the half-open age band [29, 48).
# The deprecated `axis=0` groupby argument is dropped, and the bar colour is
# derived from the class label itself (0 -> palegreen, otherwise orange) rather
# than from positional index alignment with a separate colour frame.
age_band_mask = (cleveland_df['age'] >= 29.0) & (cleveland_df['age'] < 48.0)
age_grp_29_48 = cleveland_df.loc[age_band_mask, ['age','result']].groupby('result').count()
age_grp_29_48['color'] = ['palegreen' if cad_class == 0 else 'orange' for cad_class in age_grp_29_48.index]

In [None]:
age_grp_29_48.reset_index(inplace=True)

In [None]:
age_grp_29_48.columns = ['result','age','color']

In [None]:
age_grp_29_48

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 29 - 48 age group.
label_style = dict(family='serif', color='red', size=16)
ax = age_grp_29_48.plot.bar(x='result', y='age', figsize=(8,8),
                            color=age_grp_29_48['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('Heart disease result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('Heart Disease Result of people from age group 29 - 48', fontdict=label_style)

## **Question-2: How many people of age group 48-56 have blockage greater than 50%?**

In [None]:
# Patient count per CAD class for the half-open age band [48, 56).
# The deprecated `axis=0` groupby argument is dropped, and the bar colour is
# derived from the class label itself (0 -> palegreen, otherwise orange) rather
# than from positional index alignment with a separate colour frame.
age_band_mask = (cleveland_df['age'] >= 48.0) & (cleveland_df['age'] < 56.0)
age_grp_48_56 = cleveland_df.loc[age_band_mask, ['age','result']].groupby('result').count()
age_grp_48_56['color'] = ['palegreen' if cad_class == 0 else 'orange' for cad_class in age_grp_48_56.index]

age_grp_48_56.reset_index(inplace=True)

# After the groupby the 'age' column holds the record count, not an age.
age_grp_48_56.columns = ['result','age','color']

age_grp_48_56

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 48 - 56 age group.
label_style = dict(family='serif', color='red', size=16)
ax = age_grp_48_56.plot.bar(x='result', y='age', figsize=(8,8),
                            color=age_grp_48_56['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('Heart disease result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('Heart Disease Result of people from age group 48 - 56', fontdict=label_style)

## **Question-3: How many people of age group 56-77 have blockage greater than 50%?**

In [None]:
# Patient count per CAD class for everyone aged 56 or older.
# The deprecated `axis=0` groupby argument is dropped, and the bar colour is
# derived from the class label itself (0 -> palegreen, otherwise orange) rather
# than from positional index alignment with a separate colour frame.
age_band_mask = cleveland_df['age'] >= 56.0
age_grp_56_77 = cleveland_df.loc[age_band_mask, ['age','result']].groupby('result').count()
age_grp_56_77['color'] = ['palegreen' if cad_class == 0 else 'orange' for cad_class in age_grp_56_77.index]

age_grp_56_77.reset_index(inplace=True)

# After the groupby the 'age' column holds the record count, not an age.
age_grp_56_77.columns = ['result','age','color']

age_grp_56_77

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 56 - 77 age group.
label_style = dict(family='serif', color='red', size=16)
ax = age_grp_56_77.plot.bar(x='result', y='age', figsize=(8,8),
                            color=age_grp_56_77['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('Heart disease result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('Heart Disease Result of people from age group 56 - 77', fontdict=label_style)

## **Question-4: How many MALE and FEMALE have heart disease?**

In [None]:
# Record count per gender code next to a readable name; concat aligns on the
# index, so code 0 pairs with 'female' and code 1 with 'male'.
gender_counts = cleveland_df['gender'].value_counts().to_frame()
gender_names = pd.DataFrame({'gender_name': ['female', 'male']})
pd.concat([gender_counts, gender_names], axis=1)

In [None]:
# Patient count per (gender, CAD result) pair. 'age' only serves as the
# column being counted. The deprecated `axis=0` groupby argument is dropped.
gender_dist = pd.DataFrame(cleveland_df.groupby(by=['gender','result'])['age'].count())

gender_dist.columns = ['Count of people']
gender_dist.index.names = ['Gender(0:Female,1:Male)','Heart Disease Result']

In [None]:
gender_dist

## **Question-5: How many patients suffered from various CHEST PAINS?**

In [None]:
# Patient count per (gender, CAD result, chest pain type) combination. 'age'
# only serves as the column being counted. The deprecated `axis=0` groupby
# argument is dropped.
chest_pain_dist = pd.DataFrame(cleveland_df.groupby(by=['gender','result','chest_pain'])['age'].count())

In [None]:
chest_pain_dist.columns = ['Patient Count']
chest_pain_dist.index.names = ['Gender(0:Female,1:Male)','Heart Disease Result','Chest Pain Type']

In [None]:
chest_pain_dist

## **Question-6: Does high blood pressure at rest correspond to CAD?**

In [None]:
cleveland_df['rest_bp'].describe()

### **BP Group1: [94-120)**

In [None]:
# CAD / Non-CAD counts for resting BP in the half-open band [94, 120).
rest_bp_94_120 = (
    cleveland_df[(cleveland_df['rest_bp'] >= 94.0) & (cleveland_df['rest_bp'] < 120.0)]['result']
    .value_counts()
    .to_frame()
    .reset_index()
)
rest_bp_94_120.columns = ['CAD Result', 'Patient_Count']
# Bar colour follows the class label: 0 -> palegreen, anything else -> orange.
rest_bp_94_120['color'] = rest_bp_94_120['CAD Result'].map(lambda cad: 'orange' if cad != 0 else 'palegreen')

In [None]:
rest_bp_94_120

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 94 - 120 resting BP group.
label_style = dict(family='serif', color='red', size=16)
ax = rest_bp_94_120.plot.bar(x='CAD Result', y='Patient_Count', figsize=(8,8),
                             color=rest_bp_94_120['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('CAD result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('CAD Result of people having BP from group 94 - 120', fontdict=label_style)

### **BP Group2: [120-130)**

In [None]:
# CAD / Non-CAD counts for resting BP in the half-open band [120, 130).
rest_bp_120_130 = (
    cleveland_df[(cleveland_df['rest_bp'] >= 120.0) & (cleveland_df['rest_bp'] < 130.0)]['result']
    .value_counts()
    .to_frame()
    .reset_index()
)
rest_bp_120_130.columns = ['CAD Result', 'Patient_Count']
# Bar colour follows the class label: 0 -> palegreen, anything else -> orange.
rest_bp_120_130['color'] = rest_bp_120_130['CAD Result'].map(lambda cad: 'orange' if cad != 0 else 'palegreen')

In [None]:
rest_bp_120_130

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 120 - 130 resting BP group.
label_style = dict(family='serif', color='red', size=16)
ax = rest_bp_120_130.plot.bar(x='CAD Result', y='Patient_Count', figsize=(8,8),
                              color=rest_bp_120_130['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('CAD result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('CAD Result of people having BP from group 120 - 130', fontdict=label_style)

### **BP Group3: [130-140)**

In [None]:
# CAD / Non-CAD counts for resting BP in the half-open band [130, 140).
rest_bp_130_140 = (
    cleveland_df[(cleveland_df['rest_bp'] >= 130.0) & (cleveland_df['rest_bp'] < 140.0)]['result']
    .value_counts()
    .to_frame()
    .reset_index()
)
rest_bp_130_140.columns = ['CAD Result', 'Patient_Count']
# Bar colour follows the class label: 0 -> palegreen, anything else -> orange.
rest_bp_130_140['color'] = rest_bp_130_140['CAD Result'].map(lambda cad: 'orange' if cad != 0 else 'palegreen')

In [None]:
rest_bp_130_140

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for the 130 - 140 resting BP group.
label_style = dict(family='serif', color='red', size=16)
ax = rest_bp_130_140.plot.bar(x='CAD Result', y='Patient_Count', figsize=(8,8),
                              color=rest_bp_130_140['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('CAD result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('CAD Result of people having BP from group 130 - 140', fontdict=label_style)

### **BP Group4: 140 or more**

In [None]:
# CAD / Non-CAD counts for resting BP of 140 or more.
rest_bp_140_more = (
    cleveland_df[cleveland_df['rest_bp'] >= 140.0]['result']
    .value_counts()
    .to_frame()
    .reset_index()
)
rest_bp_140_more.columns = ['CAD Result', 'Patient_Count']
# Bar colour follows the class label: 0 -> palegreen, anything else -> orange.
rest_bp_140_more['color'] = rest_bp_140_more['CAD Result'].map(lambda cad: 'orange' if cad != 0 else 'palegreen')

In [None]:
rest_bp_140_more

In [None]:
# Bar chart of CAD vs Non-CAD patient counts for resting BP of 140 or more.
label_style = dict(family='serif', color='red', size=16)
ax = rest_bp_140_more.plot.bar(x='CAD Result', y='Patient_Count', figsize=(8,8),
                               color=rest_bp_140_more['color'], legend=False)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('CAD result', fontdict=label_style)
ax.set_ylabel('Number of people', fontdict=label_style)
ax.set_title('CAD Result of people having BP from group 140 or more', fontdict=label_style)

## **Question-7: Does high blood pressure correspond to high serum cholesterol, and does it lead to CAD?**

In [None]:
cleveland_df['cholestrol'].describe()

In [None]:
# Cholesterol per resting-BP value, split by CAD result (bars without error bars).
label_style = dict(family='serif', color='red', size=22)
plt.figure(figsize=(20, 15))
ax = sns.barplot(x=cleveland_df['rest_bp'], y=cleveland_df['cholestrol'],
                 hue=cleveland_df['result'], ci=None, dodge=True)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('BP at Rest', fontdict=label_style)
ax.set_ylabel('Cholestrol level', fontdict=label_style)
ax.set_title('CAD Result', fontdict=label_style)

## **Question-8: Does high blood pressure correspond to high serum cholesterol, and does it lead to high blood sugar?**

In [None]:
# Cholesterol per resting-BP value, split by the fasting blood sugar flag (bars without error bars).
label_style = dict(family='serif', color='red', size=22)
plt.figure(figsize=(20, 15))
ax = sns.barplot(x=cleveland_df['rest_bp'], y=cleveland_df['cholestrol'],
                 hue=cleveland_df['fst_bs'], ci=None, dodge=True)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('BP at Rest', fontdict=label_style)
ax.set_ylabel('Cholestrol level', fontdict=label_style)
ax.set_title('Blood Sugar Result', fontdict=label_style)

## **Question-9: Does high blood pressure correspond to high blood sugar, and does it lead to CAD?**

In [None]:
pd.DataFrame(cleveland_df[cleveland_df['rest_bp'] >=160]['fst_bs'].value_counts())

In [None]:
# Among records with resting BP >= 160, count patients per
# (fasting blood sugar, CAD result) pair; 'age' is only the counted column.
sugar_high_bp_relation = (
    cleveland_df[cleveland_df['rest_bp'] >= 160]
    .groupby(['fst_bs', 'result'])['age']
    .count()
    .to_frame()
)
sugar_high_bp_relation.index.names = ['Blood Sugar(0:Low,1:High)','CAD Result']
sugar_high_bp_relation.columns = ['People count']
sugar_high_bp_relation

## **Question-10: Does ST Wave Abnormality lead to CAD?**

In [None]:
pd.DataFrame(cleveland_df.groupby('rest_ecg')['result'].count())

In [None]:
# Patient count per (resting ECG category, CAD result) pair; 'age' is only the counted column.
rest_ecg_CAD_relation = cleveland_df.groupby(['rest_ecg', 'result'])['age'].count().to_frame()
rest_ecg_CAD_relation.index.names = ['Rest ECG(0:Normal, 1:ST Wave Abnormal, 2:Left Vent Hypertrophy)','CAD Result']
rest_ecg_CAD_relation.columns = ['People Count']
rest_ecg_CAD_relation

In [None]:
rest_ecg_CAD_relation.reset_index(inplace=True)

In [None]:
rest_ecg_CAD_relation.columns = ['Rest_ECG','CAD_Result','People_Count']

In [None]:
# Grouped bars: people count per resting-ECG category, split by CAD result.
label_style = dict(family='serif', color='red', size=16)
plt.figure(figsize=(8, 8))
ax = sns.barplot(x=rest_ecg_CAD_relation['Rest_ECG'],
                 y=rest_ecg_CAD_relation['People_Count'],
                 hue=rest_ecg_CAD_relation['CAD_Result'])
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('ECG at Rest', fontdict=label_style)
ax.set_ylabel('People Count', fontdict=label_style)
ax.set_title('Rest ECG & CAD relation', fontdict=label_style)

## **Question-11: Does LEFT VENTRICULAR HYPERTROPHY have a relation with Blood Pressure and Cholesterol?**

In [None]:
cleveland_df[cleveland_df['rest_ecg'] == 2][['rest_bp','cholestrol']].describe()

In [None]:
left_vent_ht_bp_chol_relation = cleveland_df[cleveland_df['rest_ecg'] == 2][['rest_bp','cholestrol','result']]

In [None]:
# Scatter of resting BP against cholesterol for the rest_ecg == 2 subset, coloured by CAD result.
label_style = dict(family='serif', color='orange', size=15)
plt.figure(figsize=(17, 10))
ax = sns.scatterplot(x=left_vent_ht_bp_chol_relation['rest_bp'],
                     y=left_vent_ht_bp_chol_relation['cholestrol'],
                     hue=left_vent_ht_bp_chol_relation['result'])
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='pink')
ax.set_xlabel('BP at Rest', fontdict=label_style)
ax.set_ylabel('Cholestrol level', fontdict=label_style)
ax.set_title('Rest BP & Cholestrol relation for Left Ventricular Hypertrophy', fontdict=label_style)

## **Question-12: Is LEFT VENTRICULAR HYPERTROPHY associated with High Blood Sugar, and does it lead to CAD?**

In [None]:
# For the rest_ecg == 2 subset, count patients per (fasting blood sugar,
# CAD result) pair; 'age' is the only remaining column and carries the count.
left_vent_hyt_bs_cad = (
    cleveland_df[cleveland_df['rest_ecg'] == 2][['age', 'fst_bs', 'result']]
    .groupby(by=['fst_bs', 'result'])
    .count()
)
left_vent_hyt_bs_cad.columns = ['People count']
left_vent_hyt_bs_cad.index.names = ['Blood Sugar','CAD Result']

In [None]:
left_vent_hyt_bs_cad

## **Question-13: Does MAX Heart Rate correspond to BP at Rest, and does it lead to CAD?**

In [None]:
cleveland_df['max_hrt_rate'].describe()

In [None]:
# Resting BP per maximum-heart-rate value, split by CAD result (bars without error bars).
label_style = dict(family='serif', color='red', size=22)
plt.figure(figsize=(30, 15))
ax = sns.barplot(x=cleveland_df['max_hrt_rate'], y=cleveland_df['rest_bp'],
                 hue=cleveland_df['result'], ci=None)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='coral')
ax.set_xlabel('Maximum Heart Rate', fontdict=label_style)
ax.set_ylabel('BP at Rest', fontdict=label_style)
ax.set_title('Rest BP & Max Heart Rate relation', fontdict=label_style)

## **Question-14: Does Exercise induced angina correspond to CAD?**

In [None]:
pd.DataFrame(cleveland_df['ex_angina'].value_counts())

In [None]:
# Patient count per (exercise-induced angina flag, CAD result) pair. 'age'
# only serves as the column being counted. The deprecated `axis=0` groupby
# argument is dropped.
exc_angina_cad = pd.DataFrame(cleveland_df.groupby(by=['ex_angina','result'])['age'].count())
exc_angina_cad.index.names = ['Exc Angina(1:Yes, 0:No)', 'CAD Result']
exc_angina_cad.columns = ['People Count']
exc_angina_cad

## **Question-15: How do exercise induced angina and oldpeak correspond to the CAD result?**

In [None]:
exangina_oldpk = cleveland_df[cleveland_df['ex_angina'] == 1][['rest_bp','oldpeak','result']]

In [None]:
# Scatter of oldpeak against resting BP for patients with exercise-induced angina, coloured by CAD result.
label_style = dict(family='serif', color='Green', size=16)
plt.figure(figsize=(15, 7))
ax = sns.scatterplot(x=exangina_oldpk['oldpeak'], y=exangina_oldpk['rest_bp'],
                     hue=exangina_oldpk['result'], linewidth=1)
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='pink')
ax.set_xlabel('Old Peak', fontdict=label_style)
ax.set_ylabel('BP at Rest', fontdict=label_style)
ax.set_title('Rest BP & Old peak where patient felt exercise induced angina', fontdict=label_style)

## **Question-16: What kind of ST slope in exercise test corresponds more to CAD?**

In [None]:
pd.DataFrame(cleveland_df['slope'].value_counts())

In [None]:
pd.DataFrame(cleveland_df.groupby(['slope','result'])['age'].count())

## **Question-17: Does ST slope have a relationship with Oldpeak and Max heart rate?**

In [None]:
# Preview the three columns this question is about; the scatter plot below
# uses max_hrt_rate (not rest_bp) against oldpeak, coloured by slope.
cleveland_df[['max_hrt_rate','oldpeak','slope']].head()

In [None]:
cleveland_df['slope'].unique()

In [None]:
# Scatter of oldpeak against maximum heart rate, one colour per ST slope category.
label_style = dict(family='serif', color='coral', size=16)
plt.figure(figsize=(15, 9))
ax = sns.scatterplot(x=cleveland_df['oldpeak'], y=cleveland_df['max_hrt_rate'],
                     hue=cleveland_df['slope'], palette=['red','lightpink','black'])
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='pink')
ax.set_xlabel('Old Peak', fontdict=label_style)
ax.set_ylabel('Maximum heart rate', fontdict=label_style)
ax.set_title('Maximum heart rate & Old peak for different ST slopes', fontdict=label_style)

## **Question-18: How do Color Vessels in Fluoroscopy and Exercise induced angina correspond to CAD?**

In [None]:
pd.DataFrame(cleveland_df['fix_color_vsl'].value_counts())

In [None]:
pd.DataFrame(cleveland_df.groupby(['fix_color_vsl','result'])['age'].count())

In [None]:
pd.DataFrame(cleveland_df.groupby(['fix_color_vsl','ex_angina','result'])['age'].count())

## **Question-19: How does Thalassemia correspond to CAD?**

In [None]:
pd.DataFrame(cleveland_df['fix_thal'].value_counts())

In [None]:
pd.DataFrame(cleveland_df.groupby(['fix_thal','result'])['age'].count())

## **Question-20: Does Thalassemia have any relationship with Age or Max Heart rate/BP/Cholesterol?**

In [None]:
# Scatter of age against maximum heart rate, coloured by the imputed thal category.
label_style = dict(family='serif', color='coral', size=16)
plt.figure(figsize=(15, 9))
ax = sns.scatterplot(x=cleveland_df['age'], y=cleveland_df['max_hrt_rate'],
                     hue=cleveland_df['fix_thal'])
ax.minorticks_on()
ax.grid(which='major', linestyle=':', color='pink')
ax.set_xlabel('Age', fontdict=label_style)
ax.set_ylabel('Maximum heart rate', fontdict=label_style)
ax.set_title('Maximum heart rate & Age for different effects of Thalassemia', fontdict=label_style)

### ***Don't forget to upvote this notebook if you like the work..***
#### ***Also, feel free to share any improvement ;)***