# Download data

## Download the data
- This notebook uses the processed Cleveland subset of the UCI Heart Disease dataset, which can be downloaded from: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [1]:
# Download the data
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data --output ../data/cleveland.data

# Download the metadata
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names --output ../data/heart-disease.names

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 18461  100 18461    0     0  40307      0 --:--:-- --:--:-- --:--:-- 40307
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10060  100 10060    0     0  32038      0 --:--:-- --:--:-- --:--:-- 32038


## Load the data into a Pandas Dataframe

In [2]:
import pandas as pd

# The raw file has no header row, so the column names are supplied here in file order.
heart_columns = ["age", "sex", "cp", "trestbps", "chol",
                 "fbs", "restecg", "thalach", "exang",
                 "oldpeak", "slope", "ca", "thal", "target"]

heart_df = (
    pd.read_csv(
        "../data/cleveland.data",
        header=None,
        names=heart_columns,
        # Missing values are encoded as "?". Parsing them as NaN keeps "ca" and
        # "thal" numeric; otherwise they load as strings and stay that way after
        # the rows are filtered, so to_json would later emit them as quoted strings.
        na_values="?",
    )
    # Drop the rows where "ca" or "thal" is missing
    .dropna(subset=["ca", "thal"])
    # Collapse the target to binary: 0 stays 0, any positive value becomes 1
    .assign(target=lambda df: (df["target"] > 0).astype(int))
)

print(heart_df.shape)
heart_df.head()

(297, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## Split out training and testing datasets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
split_options = dict(test_size=0.1, random_state=42)
train_df, test_df = train_test_split(heart_df, **split_options)

# Report both split sizes, then preview the first training rows
print(train_df.shape, test_df.shape)
train_df.head()

(267, 14) (30, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
112,52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0
222,39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
24,60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,1
82,39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0


In [4]:
# Create a big dataset to test out batch inference later.
# Rows are drawn with replacement from the test split, so the result is
# 500,000 rows with a fresh 0..N-1 index (ignore_index=True).
# random_state is fixed, matching the seed used for the train/test split,
# so that Restart & Run All regenerates the same file every time.
big_test_df = test_df.sample(
    500_000, replace=True, ignore_index=True, random_state=42
)

print(big_test_df.shape)
big_test_df.head()

(500000, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0
1,34.0,0.0,2.0,118.0,210.0,0.0,0.0,192.0,0.0,0.7,1.0,0.0,3.0,0
2,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0
3,52.0,1.0,4.0,112.0,230.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,1
4,35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0


## Write the training and testing datasets to S3

In [5]:
import sagemaker
# Open a SageMaker session; the same session object is used further down to
# upload the dataset files.
session = sagemaker.Session()
# Name of the session's default bucket, used as the upload destination below.
# NOTE(review): presumably the SDK creates this bucket if it does not exist yet —
# confirm the execution role has the required S3 permissions.
bucket = session.default_bucket()

# Print the bucket name so the destination is visible in the notebook output
print(bucket)

sagemaker-us-east-1-298138509966


In [6]:
# Write the files locally
# The train and test splits are saved as CSV without the index column
for frame, csv_path in ((train_df, "../data/train.csv"), (test_df, "../data/test.csv")):
    frame.to_csv(csv_path, index=False)

# The large batch set is saved as JSON Lines: one record (row) per line
big_test_df.to_json("../data/bigtest.json", orient="records", lines=True)

In [7]:
# Send the files to S3
# All three uploads go to the same bucket and key prefix
upload_target = {"bucket": bucket, "key_prefix": "sagemaker/heart_disease"}

train_path = session.upload_data(path="../data/train.csv", **upload_target)
test_path = session.upload_data(path="../data/test.csv", **upload_target)
bigtest_path = session.upload_data(path="../data/bigtest.json", **upload_target)

# Show the resulting S3 locations, one per line
print(
    f"Train path: {train_path}",
    f"Test path: {test_path}",
    f"Big Test path: {bigtest_path}",
    sep="\n",
)

Train path: s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/train.csv
Test path: s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test.csv
Big Test path: s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/bigtest.json
