In [2]:
# import appropriate packages
import os
import requests
import json 
import pandas as pd
from datetime import datetime
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore") #supress warnings

In [None]:
import requests
from tqdm.notebook import tqdm

def fetch_total_pages(api_url, dataset_type_id, dataset_version_id, page_size=1000):
    current_page = 1
    
    try:
        with tqdm(desc="Fetching Total Records", unit="page") as pbar:
            while True:
                params = {
                    'page': current_page,
                    'size': page_size,
                    'dataset_type_id': dataset_type_id,
                    'dataset_version_id': dataset_version_id
                }
                response = requests.get(api_url, params=params)
                response.raise_for_status()
                
                data = response.json()
                
                if not data:
                    break
                
                # Update progress bar
                pbar.update(1)
                
                if len(data) < page_size:
                    break
                
                current_page += 1
                
        # Calculate total pages
        total_pages = current_page
        return total_pages
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {str(e)}")
        return None

# Example usage
if __name__ == "__main__":
    api_url = 'https://data.cms.gov/data-api/v1/dataset/8900b9c5-50b7-43de-9bdd-0d7113a8355e/data'
    dataset_type_id = '8900b9c5-50b7-43de-9bdd-0d7113a8355e'
    dataset_version_id = 'c9537569-13f5-42d1-8c0a-dd7264629d6a'
    
    # Fetch total pages
    total_pages = fetch_total_pages(api_url, dataset_type_id, dataset_version_id)
    if total_pages is not None:
        print(f"Total pages available: {total_pages}")

Fetching Total Records: 0page [00:00, ?page/s]

In [57]:
import requests
import pandas as pd
from tqdm.notebook import tqdm

def fetch_all_pages(api_url, dataset_type_id, dataset_version_id, page_size=1000, total_rows=1044711):
    all_data = []
    offset = 0  # Offset instead of page number
    total_pages = (total_rows // page_size) + 1  # Calculate the total number of pages

    try:
        with tqdm(total=total_pages, desc="Fetching Data") as pbar:
            while offset < total_rows:
                params = {
                    'start': offset,  # Use 'start' instead of 'page'
                    'size': page_size,
                    'dataset_type_id': dataset_type_id,
                    'dataset_version_id': dataset_version_id
                }
                print(f"Requesting URL: {api_url} with params: {params}")  # Debug statement
                response = requests.get(api_url, params=params)
                response.raise_for_status()
                
                data = response.json()
                
                print(f"Fetched {len(data)} records from offset {offset}")  # Debug statement
                if offset == 0:
                    print(f"First few records from offset {offset}: {data[:3]}")  # Print first few records of the first page for inspection
                if offset == page_size:
                    print(f"First few records from offset {offset}: {data[:3]}")  # Print first few records of the second page for comparison
                
                all_data.extend(data)
                
                # Update progress bar
                pbar.update(1)
                
                # Check if there are more pages based on returned data size
                if len(data) < page_size:
                    break
                
                # Increment offset for next request
                offset += page_size
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {str(e)}")
    
    return all_data

# Example usage
if __name__ == "__main__":
    api_url = 'https://data.cms.gov/data-api/v1/dataset/8900b9c5-50b7-43de-9bdd-0d7113a8355e/data'
    dataset_type_id = '8900b9c5-50b7-43de-9bdd-0d7113a8355e'
    dataset_version_id = 'c9537569-13f5-42d1-8c0a-dd7264629d6a'
    
    # Fetch all pages of data
    all_data = fetch_all_pages(api_url, dataset_type_id, dataset_version_id)
    
    # Convert all data to DataFrame
    df = pd.DataFrame(all_data)
    print(f"Total rows fetched: {len(df)}")
    print(df.head())  # Displaying the first few rows as an example

Fetching Data:   0%|          | 0/1045 [00:00<?, ?it/s]

Requesting URL: https://data.cms.gov/data-api/v1/dataset/8900b9c5-50b7-43de-9bdd-0d7113a8355e/data with params: {'start': 0, 'size': 1000, 'dataset_type_id': '8900b9c5-50b7-43de-9bdd-0d7113a8355e', 'dataset_version_id': 'c9537569-13f5-42d1-8c0a-dd7264629d6a'}
Fetched 1000 records from offset 0
First few records from offset 0: [{'reference_period': '2019-01-01 to 2019-12-31', 'type_of_service': 'Ambulance (Emergency & Non-Emergency)', 'aggregation_level': 'NATION + TERRITORIES', 'state': '--ALL--', 'county': '--ALL--', 'state_fips': ' ', 'county_fips': ' ', 'number_of_fee_for_service_beneficiaries': '36,122,263', 'number_of_providers': '8,814', 'average_number_of_users_per_provider': '495.69', 'percentage_of_users_out_of_ffs_beneficiaries': '12.09%', 'number_of_users': '4,368,976', 'average_number_of_providers_per_county': '39.85', 'number_of_dual_eligible_users': '1,348,463', 'percentage_of_dual_eligible_users_out_of_total_users': '30.86%', 'percentage_of_dual_eligible_users_out_of_dua

KeyboardInterrupt: 

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045000 entries, 0 to 1044999
Data columns (total 48 columns):
 #   Column                                                                                Non-Null Count    Dtype 
---  ------                                                                                --------------    ----- 
 0   reference_period                                                                      1045000 non-null  object
 1   type_of_service                                                                       1045000 non-null  object
 2   aggregation_level                                                                     1045000 non-null  object
 3   state                                                                                 1045000 non-null  object
 4   county                                                                                1045000 non-null  object
 5   state_fips                                                            

In [53]:
df.duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
1044995     True
1044996     True
1044997     True
1044998     True
1044999     True
Length: 1045000, dtype: bool