In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("input/heart.csv")

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Attributes
1. Age
2. Sex
  * 0 = female
  * 1 = make
3. Chest pain type (cp)
  * 1 = typical angina
  * 2 = atypical angina
  * 3 = non-anginal pain
  * 4 = asymptomatic
4. Resting blood presure (trestbps)
5. Serum cholestoral in mg/dl (chol)
6. Fasting blood sugar > 120 mg/dl (fbs)
7. Resting electrocardiographic results (restecg)
  * Values 0-2
8. Maximum heart rate achieved (thalach)
9. Exercise induced angina (exang)
  * 1 = yes; 0 = no
10. ST depression induced by exercise relative to rest (oldpeak)
11. Slope of the peak exercise ST segment (slope)
12. Number of major vessels colored by flourosopy
  * 0-3
13. thal
  * 3 = normal
  * 6 = fixed defect
  * 7 = reversable defect
14. target (Heart disease)
  * 0 = no
  * 1 = yes

In [13]:
df.sample(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
259,38,1,3,120,231,0,1,182,1,3.8,1,0,3,0
278,58,0,1,136,319,1,0,152,0,0.0,2,2,2,0
173,58,1,2,132,224,0,0,173,0,3.2,2,2,3,0


In [14]:
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

# Looking at Categorical Attributes:
### "cp", "fbs", "restecg", "ca", "thal"

In [10]:
df['cp'].sample()

25    1
Name: cp, dtype: int64

In [18]:
df.dtypes.value_counts()

int64      13
float64     1
dtype: int64

#### Since some of the attributes have multiple categories, we will use one-hot encoding to get rid of any unwanted bias that could arise from arbitrary numbering of categories

In [21]:
df = pd.get_dummies(df)
print("Shape: ", df.shape)

Shape:  (303, 14)


#### Now we will create our training and testing sets from the dataframe

In [23]:
train, test = train_test_split(df, test_size=0.30, random_state=42)

In [24]:
print("Train Shape: ", train.shape)
print("Test Shape: ", test.shape)

Train Shape:  (212, 14)
Test Shape:  (91, 14)


In [11]:
df.groupby(['target'])['cp'].value_counts(normalize=True).mul(100).reset_index(name="percent")

Unnamed: 0,target,cp,percent
0,0,0,75.362319
1,0,2,13.043478
2,0,1,6.521739
3,0,3,5.072464
4,1,2,41.818182
5,1,1,24.848485
6,1,0,23.636364
7,1,3,9.69697


In [25]:
df.groupby(['target'])['fbs'].value_counts(normalize=True).mul(100).reset_index(name="percent")

Unnamed: 0,target,fbs,percent
0,0,0,84.057971
1,0,1,15.942029
2,1,0,86.060606
3,1,1,13.939394


In [29]:
df.groupby(['target'])['restecg'].value_counts(normalize=True).mul(100).reset_index(name="percent")

Unnamed: 0,target,restecg,percent
0,0,0,57.246377
1,0,1,40.57971
2,0,2,2.173913
3,1,1,58.181818
4,1,0,41.212121
5,1,2,0.606061


In [30]:
df.groupby(['target'])['ca'].value_counts(normalize=True).mul(100).reset_index(name="percent")

Unnamed: 0,target,ca,percent
0,0,0,32.608696
1,0,1,31.884058
2,0,2,22.463768
3,0,3,12.318841
4,0,4,0.724638
5,1,0,78.787879
6,1,1,12.727273
7,1,2,4.242424
8,1,4,2.424242
9,1,3,1.818182


In [32]:
df.groupby(['target'])['thal'].value_counts(normalize=True).mul(100).reset_index(name="percent")

Unnamed: 0,target,thal,percent
0,0,3,64.492754
1,0,2,26.086957
2,0,1,8.695652
3,0,0,0.724638
4,1,2,78.787879
5,1,3,16.969697
6,1,1,3.636364
7,1,0,0.606061
