## Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot
import altair as alt
# Enable the Altair renderer registered under the name 'notebook'.
# NOTE(review): this renderer name may not be available in newer Altair
# releases - confirm against the installed version.
alt.renderers.enable('notebook')

import os
# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

## Importing Data 

In [None]:
# Read heart.csv into a DataFrame, then report its shape as (rows, columns).
df = pd.read_csv('../input/heart.csv')
shape_message = '\nShape of Dataset: {}'.format(df.shape)
print(shape_message)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

# Exploratory Data Analysis (EDA)

In [None]:
# How many samples exist for each age; show the ten most frequent ages.
age_counts = df["age"].value_counts()
age_counts.head(10)

In [None]:
# Add an 'age_group' column holding the decade each age falls in,
# e.g. ages 40 to 49 -> 40.
#
# Floor-dividing by the bucket width is correct for every age; taking the
# first character of the age string and appending "0" would map 5 -> 50 and
# 105 -> 10.
AGE_BUCKET_WIDTH = 10

# String form of the age; kept because a later cell drops this column by name.
df["age_str"] = df["age"].astype(str)
# Lower bound of the age bucket, stored as an integer.
df["age_group"] = (df["age"] // AGE_BUCKET_WIDTH * AGE_BUCKET_WIDTH).astype(int)

## Heart Diseases among different Age groups

In [None]:
# Count samples per (age_group, target) pair, then pivot 'target' into
# columns; combinations with no samples are filled with 0.
age_target_counts = df.groupby(['age_group', 'target'])['age_group'].count()
df2 = age_target_counts.unstack('target').fillna(0)
# Stacked bars: one bar per age group, split by target value.
df2.plot.bar(stacked=True, color=['green', 'red'])

In [None]:
# Remove the helper columns that were only needed for the age-group chart.
# errors='ignore' keeps this cell re-runnable: running it a second time no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['age_group', 'age_str'], errors='ignore')

## Comparison of Heart Disease between Males and Females

In [None]:
# Number of samples for each (sex, target) combination.
# sex: 1 = male, 0 = female
df.groupby(['sex', 'target'])[['age']].count()

In [None]:
# Side-by-side bars comparing the target counts of males and females.
# sex: 1 = male, 0 = female
sex_target_counts = df.groupby(['sex', 'target'])['sex'].count()
df2 = sex_target_counts.unstack('target').fillna(0)
df2.plot.bar(stacked=False, color=['limegreen', 'orangered'])
plt.show()

## Identifying features that are highly correlated to the heart disease

In [None]:
# Heatmap of the pairwise correlation between the columns of df.
corr = df.corr()

fig, ax = plt.subplots()
# Pin the colour scale to [-1, 1], the full range of a correlation coefficient.
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# Take tick positions and labels from the correlation matrix itself rather
# than from df, so each label lines up with the row/column it names even if
# corr() leaves a column of df out of the result.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate the x labels so long column names do not overlap.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)
plt.show()

The likelihood of heart disease is most strongly correlated with chest pain type (`cp`) and maximum heart rate achieved (`thalach`). Surprisingly, `age` is not strongly correlated with `target`, which suggests that heart disease is common across all age groups.

## Which chest pain types are associated with heart disease?

In [None]:
# Number of samples for each chest pain type ('cp').
df.groupby('cp')[['target']].count()

In [None]:
# Horizontal bar chart: one bar per chest pain type ('cp'), with bar length
# equal to the number of samples, ordered from most to fewest samples.
cp_sort = alt.EncodingSortField(
    field="target",      # field the sort is computed from
    op="count",          # aggregate applied to that field before sorting
    order="descending",  # largest count first
)
alt.Chart(df).mark_bar().encode(
    x=alt.X('count(target):Q'),
    y=alt.Y('cp:N', sort=cp_sort),
)

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Since the dataset is very small, machine learning models trained on it are likely to overfit and are unlikely to give reliable predictions. The dataset is still useful for basic data analysis, and for getting familiar with a few data visualisation libraries and how they can be applied to a dataset.