# Download data

In [None]:
!pip install -U pandas pandas-profiling scikit-learn sagemaker

## Download the data
- The dataset can be downloaded from: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [None]:
# Download the data
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data --output ../data/cleveland.data

# Download the metadata
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names --output ../data/heart-disease.names

## Load the data into a pandas DataFrame

In [None]:
import pandas as pd

# Column names for the file, which is read without a header row.
column_names = ["age", "sex", "cp", "trestbps", "chol",
                "fbs", "restecg", "thalach", "exang",
                "oldpeak", "slope", "ca", "thal", "target"]

# "?" is the placeholder filtered out of "ca" and "thal". Parsing it as NaN at
# read time keeps both columns numeric; comparing against the string "?" after
# loading would leave them as string columns (e.g. "0.0"), which are then
# written as quoted strings in JSON output.
heart_df = pd.read_csv(
    "../data/cleveland.data",
    header=None,
    names=column_names,
    na_values="?",
)

# Collapse the target to binary: any value above 0 becomes 1, everything else 0.
heart_df["target"] = (heart_df["target"] > 0).astype(int)

# Drop the rows where "ca" or "thal" is missing.
heart_df = heart_df.dropna(subset=["ca", "thal"])

print(heart_df.shape)
heart_df.head()

## Split out training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    heart_df,
    random_state=42,
    test_size=0.1,
)

print(train_df.shape, test_df.shape)
train_df.head()

In [None]:
# Create a big dataset to test out batch inference later.
# Rows are drawn from test_df with replacement; the fixed seed (the same value
# used for the train/test split) keeps the sample identical across runs.
big_test_df = test_df.sample(
    n=500_000, replace=True, random_state=42, ignore_index=True
)

print(big_test_df.shape)
big_test_df.head()

## Write the training and testing datasets to S3

In [None]:
import sagemaker
# Open a SageMaker session; it is reused by the upload cell further down.
session = sagemaker.Session()
# Default bucket of this session — the upload destination for the data files.
bucket = session.default_bucket()

print(bucket)

In [None]:
# Write the files locally
# The train and test splits are saved as CSV without the index column.
for frame, csv_path in (
    (train_df, "../data/train.csv"),
    (test_df, "../data/test.csv"),
):
    frame.to_csv(csv_path, index=False)

# The large sample is saved as JSON Lines: one record per line.
big_test_df.to_json(
    "../data/bigtest.json",
    lines=True,
    orient="records",
)

In [None]:
# Send the files to S3
def _upload_to_s3(local_file):
    """Upload one local file under the shared key prefix and return what upload_data returns."""
    return session.upload_data(
        path=local_file, bucket=bucket, key_prefix="sagemaker/heart_disease"
    )


# Upload in the same order as the files were written: train, test, big test.
train_path = _upload_to_s3("../data/train.csv")
test_path = _upload_to_s3("../data/test.csv")
bigtest_path = _upload_to_s3("../data/bigtest.json")

print(f"Train path: {train_path}")
print(f"Test path: {test_path}")
print(f"Big Test path: {bigtest_path}")