In [1]:
import requests
import pandas as pd

API_URL = "https://heart-disease-api.vercel.app/api/data"
API_KEY = "82f8a705-9570-4a29-8533-d807edf14716"

def fetch_data(cursor=0, limit=10000):
    headers = {"Authorization": f"Bearer {API_KEY}"}
    params = {"cursor": cursor, "limit": limit}
    response = requests.get(API_URL, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Request failed with status code: {response.status_code}")

def get_all_data():
    all_data = []
    next_page = 0

    while next_page is not None:
        data = fetch_data(cursor=next_page)
        all_data.extend(data["items"])
        next_page = data["nextPage"]

    return all_data

# Fetch all the data from the API
data = get_all_data()

In [2]:
# CLEAN THE DATA

# ISSUES:
# - random rows with all `null` values -> remove them
# - missing fields in the JSON -> replace with 0
# - null values in the JSON -> replace with 0
# - some cells have ~ in them -> remove that
# - some cells have the number in quotes -> remove quotes

def clean_data(data):
    cleaned_data = []

    all_columns = set()
    
    # Collect all unique column names
    for row in data:
        all_columns.update(row.keys())
    
    for row in data:
        cleaned_row = {}
        
        # Add missing columns with a default value of 0
        for column in all_columns:
            if column not in row:
                cleaned_row[column] = 0
            else:
                value = row[column]
                if value is None or value == "NA":
                    cleaned_row[column] = 0
                elif isinstance(value, str):
                    # Remove '~' characters from the value
                    cleaned_row[column] = value.replace("~", "")
                    
                    # Remove quotes from numeric values
                    if cleaned_row[column].startswith('"') and cleaned_row[column].endswith('"'):
                        cleaned_row[column] = cleaned_row[column][1:-1]
                else:
                    cleaned_row[column] = value
        
        # Check if the row has any non-zero values
        if any(value != 0 for value in cleaned_row.values()):
            cleaned_data.append(cleaned_row)
    
    return cleaned_data

In [3]:
# Convert the data into a pandas DataFrame
df = pd.DataFrame(clean_data(data))

# Print the first few rows of the DataFrame
print(df.head())

  alco height weight ﻿id cardio ap_lo    age smoke gender ap_hi active  \
0    0    168     62   0      0    80  18393     0      2   110      1   
1    0    156     85   1      1    90  20228     0      1   140      1   
2    0    165     64   2      1    70  18857     0      1   130      0   
3    0    169     82   3      1   100  17623     0      2   150      1   
4    0    156     56   4      0    60  17474     0      1   100      0   

  cholesterol gluc  
0           1    1  
1           3    1  
2           3    1  
3           1    1  
4           1    1  
