In [5]:
import os
import requests
import zipfile
from io import BytesIO

# Source archive for the data. The zip bundles many files; only the
# Cleveland subset is extracted below.
url = "https://archive.ics.uci.edu/static/public/45/heart+disease.zip"

# Make sure the proper data folders exist
os.makedirs("../data/raw", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

# Download the zip file into memory.
# - timeout: without it, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): an HTTP error (404, 500, ...) raises right here
#   instead of the error page being handed to ZipFile and surfacing as a
#   confusing BadZipFile.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Open the zip from memory
with zipfile.ZipFile(BytesIO(response.content)) as z:
    # We only want the Cleveland data
    z.extract("processed.cleveland.data", "../data/raw")

print("Download complete! File saved to data/raw/processed.cleveland.data")

Download complete! File saved to data/raw/processed.cleveland.data


In [6]:
import pandas as pd

# The raw file has no header row, so the column names are supplied here.
cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# The UCI processed Cleveland file marks missing entries with a literal "?"
# (see the dataset documentation). Declaring it as an NA marker lets every
# feature column parse as numeric with NaN for the gaps, instead of falling
# back to object/string dtype and carrying "?" tokens into downstream output.
df = pd.read_csv(
    "../data/raw/processed.cleveland.data",
    header=None,
    names=cols,
    na_values="?",
)

In [7]:
# Show the loaded frame as the cell's output for a quick visual check
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [8]:
# Persist the frame to the processed-data folder; the row index is omitted
# so the CSV contains only the named columns.
processed_csv_path = "../data/processed/cleveland_clean.csv"
df.to_csv(processed_csv_path, index=False)

print("Write complete! File saved to data/processed/cleveland_clean.csv")

Write complete! File saved to data/processed/cleveland_clean.csv
