In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading required libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plot theme to all subsequent figures
sns.set()
import warnings
# Suppress every Python warning for the rest of the session
# (note: this also hides deprecation notices from the libraries used below)
warnings.filterwarnings('ignore')

# Loading data from given source and initial overview

In [None]:
# Location of the heart-attack dataset, relative to the notebook's working directory
HEART_CSV_PATH = "../input/heart-attack-analysis-prediction-dataset/heart.csv"

# Load the full dataset into a DataFrame
df = pd.read_csv(HEART_CSV_PATH)

# Preview 5 random rows; the fixed random_state makes the preview identical on every re-run
df.sample(5, random_state=42)

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Column labels of the loaded dataset
df.columns

# Variable Identification

In [None]:
# Data dictionary: one row per column of df, describing
#   DataType        - pandas dtype of the column
#   UniqueVal       - number of distinct values (nunique)
#   MissingVal      - number of null entries
#   Percent Missing - null entries as a percentage of all rows, rounded to 2 decimals
#   Count           - number of non-null entries
Data_dict = pd.DataFrame({
    'DataType': df.dtypes,
    'UniqueVal': df.nunique(),
    'MissingVal': df.isnull().sum(),
    'Percent Missing': df.isnull().sum().div(len(df)).mul(100).round(2),
    'Count': df.count(),
})
Data_dict

# Numeric Statistical Summary

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

**Inference**  
1. Dataset consists of 303 observations and 14 columns (13 features plus the target variable "output"), where output 1 = Healthy Heart and 0 = Abnormal Heart
2. There are no missing values in the data set   
3. Variables "Sex" , "Fasting Blood Sugar" , "Resting ECG" , "Exercise Induced Angina" , "Slope" , "Thall" and "Number of Major Vessels" are categorical variables encoded with numeric values

# Exploratory Data Analysis

## Analyzing Target (Healthy Heart)  
**output ( 0 = Abnormal heart, 1= Normal heart)**  

In [None]:
# Absolute number of records in each target class
df['output'].value_counts()

In [None]:
# Share of records in each target class (fractions that sum to 1)
df['output'].value_counts(normalize=True)



In [None]:
# Bar chart of the number of records per target class.
# A title is set so the figure can be understood on its own when the notebook is skimmed;
# the trailing ';' suppresses the text repr of the returned object.
sns.countplot(data=df, x='output').set_title('Distribution of target variable (output)');

**Inference**  
1. Approximately 55% of the recorded patients have a "Healthy Heart"  
2. The target class is fairly balanced given the distribution between "Healthy -- 55%" and "Non Healthy -- 45%"


## Analyzing truly categorical variables
    sex: sex (1 = male; 0 = female)
    fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
    restecg: resting electrocardiographic results
    -- Value 0: normal
    -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
    exng: exercise induced angina (1 = yes; 0 = no)
    slp: the slope of the peak exercise ST segment
    -- Value 1: upsloping
    -- Value 2: flat
    -- Value 3: downsloping
    thall : 
    -- Value 1 : 
    -- Value 2 :  
    -- Value 3 : 
    -- Value 4 : 
    caa: number of major vessels (0-3) colored by fluoroscopy

In [None]:
# Categorical (numerically encoded) predictors plus the target, kept in their own
# DataFrame for separate analysis. The explicit .copy() makes cat_df independent of df.
cat_df = df[['sex' , 'fbs' , 'restecg' , 'exng' , 'slp' , 'caa' , 'thall' , 'output']].copy()

# Preview 5 random rows; the fixed random_state makes the preview identical on every re-run
cat_df.sample(5, random_state=42)

In [None]:
# Relative frequency of every level, printed one categorical column at a time
for col_name, col_values in cat_df.items():
    print('----------------------------------')
    print(col_values.value_counts(normalize=True))

In [None]:
# Absolute frequency of every level, printed one categorical column at a time
for col_name, col_values in cat_df.items():
    print('----------------------------------')
    print(col_values.value_counts())

## Statistical associations between Categorical features and Target

1. Count Plot by Target to visualize distribution between categories by target
2. Chi2 test of independence: the null hypothesis is that the categorical variable and the target are independent; a small p-value rejects it and indicates that they are associated


### Analysing association between gender and healthy heart

In [None]:
# Record counts per sex, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="sex").set_title("Target (output) counts by sex");

In [None]:
# Observed counts for every (sex, output) combination
sex_freq = pd.crosstab(index=cat_df['sex'], columns=cat_df['output'])
# The same table expressed as a fraction of all rows in cat_df
sex_prop = sex_freq.div(len(cat_df))
sex_freq

In [None]:
# Sex x output cell counts divided by the total number of rows
sex_prop

In [None]:
# Chi-square test of independence on the observed sex x output counts.
# chi2_contingency returns the test statistic, the p-value, the degrees of freedom
# and the table of frequencies expected under independence.
chi2, pval, dof, expected = chi2_contingency(sex_freq)
print("Expected Frequency")
print(np.round(expected))  # rounded to whole counts for readability
print("Chi-Square Statistic")
print(chi2)
print("P-Value")
print(pval)

**Inference**  
1. Higher Proportion of Males than Females have an Abnormal Heart
2. A significant (very small) p-value coupled with a high chi2 statistic substantiates that "Gender and Heart Condition" are related



### Analysing association between fasting blood sugar and healthy heart

In [None]:
# Record counts per fasting-blood-sugar flag, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="fbs").set_title("Target (output) counts by fasting blood sugar (fbs)");

In [None]:
# Observed fbs x output counts, and the same table as a fraction of all rows
fbs_freq = pd.crosstab(index=cat_df['fbs'], columns=cat_df['output'])
fbs_prop = fbs_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(fbs_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

**Inference**  
1. Patients with and without high fasting blood sugar show fairly similar proportions of healthy and abnormal hearts
2. An insignificant (large) p-value with a very low Chi2 statistic means we fail to reject the null hypothesis of independence, so we find no evidence that "Having Fasting Blood Sugar > 120" is associated with either a normal or an abnormal heart.
3. We exclude this feature from modeling

In [None]:
# Record counts per resting ECG category, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="restecg").set_title("Target (output) counts by resting ECG (restecg)");

In [None]:
# Observed restecg x output counts, and the same table as a fraction of all rows
restecg_freq = pd.crosstab(index=cat_df['restecg'], columns=cat_df['output'])
restecg_prop = restecg_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(restecg_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

**Inference**  
1. The proportions of healthy and abnormal hearts vary across the resting ECG categories
2. A significant (small) p-value coupled with a high chi2 statistic substantiates that "resting ECG and Heart Condition" are related

In [None]:
# Record counts per exercise-induced-angina flag, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="exng").set_title("Target (output) counts by exercise induced angina (exng)");

In [None]:
# Observed exng x output counts, and the same table as a fraction of all rows
exng_freq = pd.crosstab(index=cat_df['exng'], columns=cat_df['output'])
exng_prop = exng_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(exng_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

**Inference**  
1. The proportions of healthy and abnormal hearts differ between patients with and without Exercise Induced Angina
2. A significant (small) p-value coupled with a high chi2 statistic substantiates that "Exercise Induced Angina and Heart Condition" are related

In [None]:
# Record counts per ST-segment slope category, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="slp").set_title("Target (output) counts by ST segment slope (slp)");

In [None]:
# Observed slp x output counts, and the same table as a fraction of all rows
slp_freq = pd.crosstab(index=cat_df['slp'], columns=cat_df['output'])
slp_prop = slp_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(slp_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

**Inference**
1. The proportions of healthy and abnormal hearts vary across the categories of the slope of the peak exercise ST segment
2. A significant (small) p-value coupled with a high chi2 statistic substantiates that "Slope of the peak exercise ST segment" and "Heart Condition" are related

In [None]:
# Record counts per number-of-major-vessels value, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="caa").set_title("Target (output) counts by number of major vessels (caa)");

In [None]:
# Observed caa x output counts, and the same table as a fraction of all rows
caa_freq = pd.crosstab(index=cat_df['caa'], columns=cat_df['output'])
caa_prop = caa_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(caa_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

In [None]:
# caa x output cell counts divided by the total number of rows
caa_prop

**Inference**  
1. The proportions of healthy and abnormal hearts vary with the number of major vessels colored by fluoroscopy
2. A significant (small) p-value coupled with a high chi2 statistic substantiates that "Number of Major Vessels" and "Heart Condition" are related  
3. We can also look at combining "3" and "4" to minimize sub-categories



In [None]:
# Record counts per thall category, split by target class; titled so the figure stands on its own
sns.countplot(data=cat_df, hue="output", x="thall").set_title("Target (output) counts by thall");

In [None]:
# Observed thall x output counts, and the same table as a fraction of all rows
thall_freq = pd.crosstab(index=cat_df['thall'], columns=cat_df['output'])
thall_prop = thall_freq.div(len(cat_df))
# Chi-square test of independence on the observed counts
chi2, pval, dof, expected = chi2_contingency(thall_freq)
# One item per line: label followed by its value
print(
    "Expected Frequency", np.round(expected),
    "Chi-Square Statistic", chi2,
    "P-Value", pval,
    sep="\n",
)

In [None]:
# thall x output cell counts divided by the total number of rows
thall_prop

**Inference**  
1. The proportions of healthy and abnormal hearts vary across the thall categories
2. A significant (small) p-value coupled with a high chi2 statistic substantiates that "Thall" and "Heart Condition" are related  
3. We can also look at combining "0" and "1" to minimize sub-categories