# Download and split the Heart Disease data

## Download the data
- The dataset can be downloaded from: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [1]:
# Download the data
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data --output ../data/cleveland.data

# Download the metadata
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names --output ../data/heart-disease.names

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 18461  100 18461    0     0  16123      0  0:00:01  0:00:01 --:--:-- 16123
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10060  100 10060    0     0  12208      0 --:--:-- --:--:-- --:--:-- 12193


## Load the data into a pandas DataFrame

In [2]:
import pandas as pd

# Names for the 14 columns, in file order (the raw file has no header row).
COLUMN_NAMES = ["age", "sex", "cp", "trestbps", "chol",
                "fbs", "restecg", "thalach", "exang",
                "oldpeak", "slope", "ca", "thal", "target"]

# The processed Cleveland file marks missing values with "?". Without
# na_values, the affected columns are read as strings rather than numbers;
# with it they are parsed as floats and the gaps become NaN.
heart_df = pd.read_csv(
    "../data/cleveland.data",
    header=None,
    names=COLUMN_NAMES,
    na_values="?",
)

# Binarize the label: any value above 0 becomes 1, everything else 0.
heart_df["target"] = (heart_df["target"] > 0).astype(int)

print(heart_df.shape)
heart_df.head()

(303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## Split out training and testing datasets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
split_frames = train_test_split(heart_df, random_state=0, test_size=0.1)
train_df, test_df = split_frames

print(train_df.shape, test_df.shape)
train_df.head()

(272, 14) (31, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
255,42.0,0.0,3.0,120.0,209.0,0.0,0.0,173.0,0.0,0.0,2.0,0.0,3.0,0
134,43.0,0.0,3.0,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,1
157,58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1
241,41.0,0.0,2.0,126.0,306.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0


## Write the training and testing datasets to local CSV files (for later upload to S3)

In [4]:
# Save each split as a CSV file, leaving out the DataFrame index column.
for split_df, csv_path in ((train_df, "../data/train.csv"),
                           (test_df, "../data/test.csv")):
    split_df.to_csv(csv_path, index=False)