In [10]:
# import appropriate packages
import os
import requests
import json 
import pandas as pd
from datetime import datetime
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")  # suppress every warning category for the rest of the kernel session

In [8]:
# Download every row of the dataset from the data.cms.gov data API into `df`.
#
# The API is paginated through the `offset` (rows to skip) and `size` (rows to
# return) query parameters, so the loop advances `offset` by the number of rows
# actually received and stops at the first empty page.

# API endpoint
api_url = 'https://data.cms.gov/data-api/v1/dataset/8900b9c5-50b7-43de-9bdd-0d7113a8355e/data'

# initialize variables for pagination
all_data = []          # accumulated row dicts from every page
page = 1               # 1-based request counter (bookkeeping only)
offset = 0             # rows already retrieved; sent as the `offset` parameter
rows_per_page = 1000   # rows requested per call (the `size` parameter)
request_timeout = 60   # seconds; without it a stalled connection blocks the cell forever

# a single Session reuses the HTTPS connection across all page requests
with requests.Session() as session:
    while True:
        # request the next slice of rows
        params = {'offset': offset, 'size': rows_per_page}
        response = session.get(api_url, params=params, timeout=request_timeout)

        # stop on any non-success status; rows fetched so far are kept
        if response.status_code != 200:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            print(response.text)
            break

        data = response.json()

        # an empty page means the previous request returned the last rows
        if not data:
            break

        all_data.extend(data)

        # advance by what was actually received, so a server-side cap on
        # `size` can neither skip nor duplicate rows
        offset += len(data)
        page += 1

# convert retrieved data to a df
df = pd.DataFrame(all_data)

# check last modified date (taken from the headers of the final response)
last_modified = None
last_modified_header = response.headers.get('Last-Modified')
if last_modified_header:
    try:
        last_modified = datetime.strptime(last_modified_header, '%a, %d %b %Y %H:%M:%S GMT')
        print(f"Data was last modified: {last_modified}")
    except ValueError:
        # header is present but not in the expected HTTP-date layout; show it
        # raw rather than aborting before the row count is reported
        print(f"Data was last modified: {last_modified_header}")

print(f"Total rows fetched: {len(df)}")

KeyboardInterrupt: 

In [None]:
# Download every row of the dataset from the data.cms.gov data API into `df`,
# showing a tqdm progress bar (one tick per page) and a running row count.
#
# The API is paginated through the `offset` (rows to skip) and `size` (rows to
# return) query parameters, so the loop advances `offset` by the number of rows
# actually received and stops at the first empty page.

# API endpoint
api_url = 'https://data.cms.gov/data-api/v1/dataset/8900b9c5-50b7-43de-9bdd-0d7113a8355e/data'

# initialize variables for pagination
all_data = []                # accumulated row dicts from every page
page = 1                     # 1-based request counter, used in the failure message
offset = 0                   # rows already retrieved; sent as the `offset` parameter
rows_per_page = 1000         # rows requested per call (the `size` parameter)
progress_interval = 100000   # print a row-count line each time this many more rows arrive
next_report = progress_interval  # row count at which the next line is printed
request_timeout = 60         # seconds; without it a stalled connection blocks the cell forever

# fetch all pages with a progress bar; the context managers close the HTTP
# session and the bar even if the loop raises or the kernel is interrupted
with requests.Session() as session, tqdm(desc="Fetching data", unit="page") as pbar:
    while True:
        # request the next slice of rows
        params = {'offset': offset, 'size': rows_per_page}
        response = session.get(api_url, params=params, timeout=request_timeout)

        # stop on any non-success status; rows fetched so far are kept
        if response.status_code != 200:
            print(f"Failed to fetch data on page {page}. Status code: {response.status_code}")
            print(response.text)
            break

        data = response.json()

        # an empty page means the previous request returned the last rows
        if not data:
            break

        all_data.extend(data)

        # advance by what was actually received, so a server-side cap on
        # `size` can neither skip nor duplicate rows
        offset += len(data)
        page += 1

        # report each time the total crosses the next multiple of
        # progress_interval (does not rely on every page being exactly full)
        if len(all_data) >= next_report:
            tqdm.write(f"Fetched {len(all_data)} rows")
            next_report = (len(all_data) // progress_interval + 1) * progress_interval

        pbar.update(1)

# convert retrieved data to a dataframe
df = pd.DataFrame(all_data)

# check last modified date (taken from the headers of the final response)
last_modified = None
last_modified_header = response.headers.get('Last-Modified')
if last_modified_header:
    try:
        last_modified = datetime.strptime(last_modified_header, '%a, %d %b %Y %H:%M:%S GMT')
        print(f"Data was last modified: {last_modified}")
    except ValueError:
        # header is present but not in the expected HTTP-date layout; show it
        # raw rather than aborting before the row count is reported
        print(f"Data was last modified: {last_modified_header}")

print(f"Total rows fetched: {len(df)}")

Fetching data: 100page [00:25,  3.82page/s]

Fetched 100000 rows


Fetching data: 200page [00:50,  4.08page/s]

Fetched 200000 rows


Fetching data: 300page [01:17,  4.05page/s]

Fetched 300000 rows


Fetching data: 400page [01:55,  3.93page/s]

Fetched 400000 rows


Fetching data: 500page [02:22,  3.99page/s]

Fetched 500000 rows


Fetching data: 600page [02:50,  2.40page/s]

Fetched 600000 rows


Fetching data: 700page [03:57,  1.21s/page]

Fetched 700000 rows


Fetching data: 800page [05:48,  1.20s/page]

Fetched 800000 rows


Fetching data: 900page [07:36,  1.16s/page]

Fetched 900000 rows


Fetching data: 1000page [09:27,  1.05s/page]

Fetched 1000000 rows


Fetching data: 1100page [11:17,  1.03s/page]

Fetched 1100000 rows


Fetching data: 1200page [13:09,  1.15s/page]

Fetched 1200000 rows


Fetching data: 1300page [14:53,  1.01s/page]

Fetched 1300000 rows


Fetching data: 1400page [16:36,  1.09s/page]

Fetched 1400000 rows


Fetching data: 1422page [17:01,  1.05s/page]

In [7]:
# Summarize the fetched DataFrame: column names, non-null counts and dtypes.
# NOTE(review): the saved output reports only 1000 rows, so it appears to predate
# the full paginated download — re-run this cell after the fetch completes to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 48 columns):
 #   Column                                                                                Non-Null Count  Dtype 
---  ------                                                                                --------------  ----- 
 0   reference_period                                                                      1000 non-null   object
 1   type_of_service                                                                       1000 non-null   object
 2   aggregation_level                                                                     1000 non-null   object
 3   state                                                                                 1000 non-null   object
 4   county                                                                                1000 non-null   object
 5   state_fips                                                                            1000 