In [None]:
!pip install seaborn --upgrade

# Importing data analysis libraries

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

# Reading and checking the dataframe

In [None]:
# Read the heart-disease CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer='../input/heart-disease-uci/heart.csv',
)

Definitions of the columns:

age = age of the patient

sex = gender of the patient (male = 1 female = 0)

cp = chest pain type, coded 0 to 3 in this file -- Value 0: typical angina - Value 1: atypical angina - Value 2: non-anginal pain - Value 3: asymptomatic

trestbps = resting blood pressure in mmHg, measured on admission to the hospital

chol = serum cholesterol in mg/dl

fbs = fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

restecg = resting electrocardiographic results (values 0,1,2)

thalach = maximum heart rate achieved

exang = exercise induced angina (1 = yes; 0 = no)

oldpeak = ST depression induced by exercise relative to rest

slope = the slope of the peak exercise ST segment

ca = number of major vessels (0-3) colored by fluoroscopy

thal = 3 = normal; 6 = fixed defect; 7 = reversible defect

target = 0 False 1 True for disease

In [None]:
# Visual check for missing values using missingno's matrix plot.
msno.matrix(
    df,
    figsize=(10, 5),
)

In [None]:
# Number of missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

No null data on the dataset, which is a good thing. Now let's check the type of each column

In [None]:
# Show the dtype pandas inferred for every column of df.
df.dtypes

# Exploratory analysis

In [None]:
# Print every distinct age found in the data, in ascending order.
distinct_ages = df['age'].unique()
print(sorted(distinct_ages))

Our population varies from 29 to 77 years old

In [None]:
# Median age across the whole dataset.
df['age'].median()

The median age is 55: half of the patients are younger and half are older.

In [None]:
# Age distribution (histogram + KDE) split by sex.
# The 0/1 sex codes are mapped to names *before* plotting so that seaborn builds
# the legend itself. Overriding it afterwards with plt.legend(labels=[...]) ties
# each label to an artist only through draw order, which can silently swap
# 'male' and 'female'.
sex_names = {0: 'female', 1: 'male'}
fig, ax = plt.subplots(figsize = (10,5))
sns.histplot(data = df.assign(sex = df['sex'].map(sex_names)), x = 'age', kde = True,
             hue = 'sex', hue_order = ['female', 'male'], stat = 'count', ax = ax)
ax.set_xlabel('Age (29 to 77)')
ax.set_ylabel('Number of people on the dataset')
# Keep the capitalised legend title used by the original cell.
ax.get_legend().set_title('Sex')
ax.grid(which = 'both')

In [None]:
# Mean, median and mode of age for each sex (0 = female, 1 = male).
# Each subset is selected once instead of re-running the same query per statistic.
female_ages = df.loc[df['sex'] == 0, 'age']
male_ages = df.loc[df['sex'] == 1, 'age']

fem_age_mean = female_ages.mean()
fem_age_median = female_ages.median()
fem_age_mode = female_ages.mode()
male_age_mean = male_ages.mean()
male_age_median = male_ages.median()
male_age_mode = male_ages.mode()

# Series.mode() returns a Series because several values can tie for most frequent.
# Join the values so the sentence shows plain numbers rather than a Series repr
# (index column plus dtype footer). All tied modes are listed for both sexes.
fem_mode_text = ', '.join(str(value) for value in fem_age_mode)
male_mode_text = ', '.join(str(value) for value in male_age_mode)

print(f'The male mean age is {male_age_mean:.2f} and female is {fem_age_mean:.2f}.\n'
      f'The male mode age is {male_mode_text} and female is {fem_mode_text}.\n'
      f'The male median age is {male_age_median} and female is {fem_age_median}')

In [None]:
# Box plot comparing the age spread of the two sexes (0 = female, 1 = male).
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(x='sex', y='age', data=df, ax=ax)
ax.set_xlabel('Gender')
ax.set_ylabel('Age distribution')
# Replace the numeric 0/1 tick text with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['female', 'male'])
plt.show()

Looking at the data, we see that the female group is older than male.

In [None]:
# How many records carry each sex code.
df['sex'].value_counts()

We have 207 men against 96 women in the dataset

In [None]:
# Bar chart of the number of records per sex (0 = female, 1 = male).
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='sex', data=df, ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['female', 'male'])
ax.grid(which='both', alpha=0.5)

We have way more men in the dataset

In [None]:
# Bar chart of the number of patients per chest-pain code.
cp_tick_labels = ['typical\nangina', 'atypical\nangina', 'non anginal\npain', 'asymptomatic']
ax = sns.countplot(x='cp', data=df)
# Swap the numeric codes on the x axis for their descriptions.
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(cp_tick_labels)
ax.set_xlabel('Chest pain classification')
ax.set_ylabel('Number of patients')
plt.show()

In [None]:
# Share of the whole dataset falling in each chest-pain class.
# One loop over a code -> description table replaces four copy-pasted prints
# and fixes the 'aginal' typo in the printed text.
cp_descriptions = {0: 'typical angina', 1: 'atypical angina', 2: 'non anginal pain', 3: 'asymptomatic'}
for cp_code, cp_description in cp_descriptions.items():
    # rows with this code / all rows, expressed as a percentage
    cp_share = (df['cp'] == cp_code).sum() / len(df) * 100
    print(f'{cp_share:.2f} % of the dataset has {cp_description}')

In [None]:
# Per-sex subsets of df (sex code 1 = male, 0 = female).
is_male = df['sex'] == 1
is_female = df['sex'] == 0
df_male = df[is_male]
df_female = df[is_female]

Let's check the chest pain distribution between gender

In [None]:
# Chest-pain counts, one facet per sex.
cp_codes = [0, 1, 2, 3]
cp_tick_labels = ['typical\nangina', 'atypical\nangina', 'non anginal\npain', 'asymptomatic']
g = sns.FacetGrid(data = df, col = 'sex', height = 5)
# `order` is given explicitly: each facet only sees its own subset of the data,
# so without it a facet that lacks one code would shift its bars and no longer
# line up with the shared tick labels.
g = g.map(sns.countplot, 'cp', order = cp_codes)
# Grid-level setters label every facet; plt.xlabel / plt.ylabel would only
# touch the axes that was drawn last.
g.set_axis_labels('Chest pain', 'Number of patients')
g.set_xticklabels(cp_tick_labels)
plt.show()

In [None]:
# Share of the male subset falling in each chest-pain class.
# One loop over a code -> description table replaces four copy-pasted prints
# and fixes the 'aginal' typo in the printed text.
cp_descriptions = {0: 'typical angina', 1: 'atypical angina', 2: 'non anginal pain', 3: 'asymptomatic'}
for cp_code, cp_description in cp_descriptions.items():
    # male rows with this code / all male rows, expressed as a percentage
    cp_share = (df_male['cp'] == cp_code).sum() / len(df_male) * 100
    print(f'{cp_share:.2f} % of the male dataset has {cp_description}')

In [None]:
# Share of the female subset falling in each chest-pain class.
# One loop over a code -> description table replaces four copy-pasted prints
# and fixes the 'aginal' typo in the printed text.
cp_descriptions = {0: 'typical angina', 1: 'atypical angina', 2: 'non anginal pain', 3: 'asymptomatic'}
for cp_code, cp_description in cp_descriptions.items():
    # female rows with this code / all female rows, expressed as a percentage
    cp_share = (df_female['cp'] == cp_code).sum() / len(df_female) * 100
    print(f'{cp_share:.2f} % of the female dataset has {cp_description}')

Splitting chest pain into two main groups, men and women, we see consistent differences between the groups. Men tend to have more typical angina.

In [None]:
# Age density for every (chest-pain code, sex) combination:
# one row per cp value, one column per sex.
g = (
    sns.FacetGrid(data=df, row='cp', col='sex')
    .map(sns.kdeplot, 'age')
)

Women tend to have a narrower age distribution for typical angina and asymptomatic.

In [None]:
# Resting-blood-pressure density for every (chest-pain code, sex) combination.
g = (
    sns.FacetGrid(data=df, row='cp', col='sex')
    .map(sns.kdeplot, 'trestbps')
)

In [None]:
# Age vs. resting blood pressure, one scatter panel per (chest-pain code, sex).
g = (
    sns.FacetGrid(data=df, row='cp', col='sex')
    .map(sns.scatterplot, 'age', 'trestbps')
)

No clear relationship between: age, resting blood pressure, chest pain and gender. On the second row we see a tendency, but it disappears in the other rows.

In [None]:
# Age vs. cholesterol, one scatter panel per (chest-pain code, sex).
g = (
    sns.FacetGrid(data=df, row='cp', col='sex')
    .map(sns.scatterplot, 'age', 'chol')
)


No clear correlation between age, gender, cholesterol and chest pain

In [None]:
# Cholesterol vs. resting blood pressure with a fitted regression line,
# one panel per (chest-pain code, sex).
g = sns.FacetGrid(data = df, col = 'sex', row = 'cp')
g = g.map(sns.regplot,'chol','trestbps')
# Set the y-range on the grid itself so that every facet gets it explicitly.
# plt.ylim only addresses the current axes and reaches the other panels solely
# through FacetGrid's default shared y axis; it also leaves a stray tuple as
# the cell output.
g = g.set(ylim = (80, 250))

In [None]:
# Cholesterol vs. resting blood pressure, one scatter panel per (chest-pain code, sex).
g = (
    sns.FacetGrid(data=df, row='cp', col='sex')
    .map(sns.scatterplot, 'chol', 'trestbps')
)

In [None]:
# Age density for every (fasting-blood-sugar flag, sex) combination.
g = (
    sns.FacetGrid(data=df, row='fbs', col='sex')
    .map(sns.kdeplot, 'age')
)

In [None]:
# Cholesterol density for every (fasting-blood-sugar flag, sex) combination.
g = (
    sns.FacetGrid(data=df, row='fbs', col='sex')
    .map(sns.kdeplot, 'chol')
)

In [None]:
# Resting-blood-pressure density for every (fasting-blood-sugar flag, sex) combination.
g = (
    sns.FacetGrid(data=df, row='fbs', col='sex')
    .map(sns.kdeplot, 'trestbps')
)

In [None]:
# Maximum-heart-rate density for every (number of major vessels, sex) combination.
g = (
    sns.FacetGrid(data=df, row='ca', col='sex')
    .map(sns.kdeplot, 'thalach')
)

In [None]:
# Joint distribution of age and resting blood pressure, drawn with hexagonal bins.
sns.jointplot(
    x='age',
    y='trestbps',
    data=df,
    kind='hex',
)

In [None]:
# Joint distribution of age and cholesterol, drawn with hexagonal bins.
sns.jointplot(
    x='age',
    y='chol',
    data=df,
    kind='hex',
)

In [None]:
# Joint distribution of age and maximum heart rate, drawn with hexagonal bins.
sns.jointplot(
    x='age',
    y='thalach',
    data=df,
    kind='hex',
)

In [None]:
# Pairwise correlation of all columns (DataFrame.corr defaults), drawn as an
# annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize = (20,15))
sns.heatmap(data = corr, annot = True, ax = ax)
# The y-limits follow the matrix size instead of a hard-coded 14, so no row is
# clipped and no blank band appears if the number of columns changes.
# (0, n) keeps the original cell's bottom-to-top row order.
ax.set_ylim(0, len(corr))
plt.show()

# No correlation detected between the columns