# HealthPartner Assessment 

In [18]:
import os
import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

# Define the metadata file and data directory
# metadata_file: JSON file read by load_metadata() and rewritten by save_metadata()
metadata_file = 'metadata.json'
# data_dir: directory the CSV output is written into (created by main() if missing)
data_dir = 'HealthPartner/'
# data_urls: API endpoints to fetch; each URL is handed to one worker task
data_urls = ['https://data.cms.gov/provider-data/api/1/metastore/schemas/dataset/items']  

# Function to load metadata from a file
def load_metadata():
    """Return the parsed JSON content of the metadata file.

    Returns an empty dict when the metadata file does not exist yet.
    """
    # Guard clause: nothing has been saved so far
    if not os.path.exists(metadata_file):
        return {}

    with open(metadata_file, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Function to save metadata to a file
def save_metadata(metadata):
    """Write *metadata* as JSON to the metadata file, replacing its content."""
    with open(metadata_file, 'w') as handle:
        handle.write(json.dumps(metadata))

# Helper: turn the decoded API payload into the filtered, column-normalised frame
def _transform_hospital_datasets(data):
    """Build a DataFrame from *data* and keep only the 'Hospitals' rows.

    Args:
        data: Decoded JSON payload; passed straight to ``pd.DataFrame``.

    Returns:
        A copy of the rows whose joined ``theme`` equals ``'Hospitals'``, with
        column names lower-cased, stripped, ``@`` removed and spaces replaced
        by underscores.

    Raises:
        KeyError: If the payload has no ``theme`` column.
    """
    df = pd.DataFrame(data)

    # 'theme' is presumably a list of strings per row. Join only real
    # sequences so a missing value (NaN) does not raise and a plain string is
    # not split into single characters.
    df['theme'] = df['theme'].apply(
        lambda x: ", ".join(x) if isinstance(x, (list, tuple)) else x
    )
    filtered_df = df[df['theme'] == 'Hospitals'].copy()

    # regex=False: these are literal characters, independent of the pandas
    # version's default for the `regex` argument.
    filtered_df.columns = (
        filtered_df.columns.str.lower()
        .str.strip()
        .str.replace('@', '', regex=False)
        .str.replace(' ', '_', regex=False)
    )
    return filtered_df


# Function to fetch data from the API and process it
def fetch_and_process_data(url, metadata):
    """Download the JSON at *url*, filter/normalise it and save it as CSV.

    The CSV is written into ``data_dir`` and *metadata* is updated in place
    with the time of this run, then persisted via ``save_metadata``. The
    stored timestamp is only written here; nothing in this function reads it
    back to skip work.

    Args:
        url: Endpoint returning a JSON document loadable by ``pd.DataFrame``.
        metadata: Dict of per-file bookkeeping; mutated and saved.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
        KeyError: If the payload has no ``theme`` column.
    """
    # NOTE(review): the output name does not depend on `url`, so every URL
    # processed writes to the same CSV file and metadata entry.
    file_name = 'CSM_provider_data' + '.csv'
    file_path = os.path.join(data_dir, file_name)

    # Make an API request to get the JSON data. A timeout keeps a stalled
    # connection from blocking the worker thread forever, and the status check
    # stops an HTTP error body from being parsed as if it were data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Perform transformations
    filtered_df = _transform_hospital_datasets(data)
    print('Displaying the Sample 5 rows of the transformed data! \n\n')
    print(filtered_df.head())

    # Save the dataframe to a CSV file
    filtered_df.to_csv(file_path, index=False)

    # Update metadata with the current timestamp
    metadata[file_name] = {'last_modified': str(datetime.now())}
    save_metadata(metadata)

    print(f"\n\nProcessed and saved {file_name}.")

# Function to process multiple files in parallel
def process_files_in_parallel():
    """Fetch and process every URL in ``data_urls`` on a thread pool.

    The metadata is loaded once and the same dict object is passed to every
    worker. Any exception raised inside a worker is re-raised here instead of
    being silently dropped.

    Raises:
        Exception: Whatever the first failing worker raised.
    """
    metadata = load_metadata()

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Download and process files in parallel
        futures = [
            executor.submit(fetch_and_process_data, url, metadata)
            for url in data_urls
        ]
        # result() re-raises a worker's exception; without collecting the
        # results a failed download would go completely unnoticed.
        for future in futures:
            future.result()

# Main function to run the task
def main():
    """Create the output directory if needed, then process all data URLs."""
    # Ensure the data directory exists. exist_ok=True makes this a single
    # call with no gap between checking for the directory and creating it.
    os.makedirs(data_dir, exist_ok=True)

    process_files_in_parallel()

# Run the script
# Entry-point guard: main() is called only when __name__ is "__main__"
if __name__ == "__main__":
    main()


Displaying the Sample 5 rows of the transformed data! 


   accesslevel                                        landingpage bureaucode  \
13      public  https://data.cms.gov/provider-data/dataset/48n...   [009:38]   
14      public  https://data.cms.gov/provider-data/dataset/4jc...   [009:38]   
15      public  https://data.cms.gov/provider-data/dataset/wue...   [009:38]   
16      public  https://data.cms.gov/provider-data/dataset/tf3...   [009:38]   
17      public  https://data.cms.gov/provider-data/dataset/axe...   [009:38]   

        issued          type    modified    released  \
13  2023-07-05  dcat:Dataset  2025-01-08  2025-02-19   
14  2024-10-10  dcat:Dataset  2025-01-15  2025-02-19   
15  2024-10-10  dcat:Dataset  2025-01-15  2025-02-19   
16  2020-12-10  dcat:Dataset  2025-01-08  2025-02-19   
17  2024-10-10  dcat:Dataset  2025-01-15  2025-02-19   

                                              keyword  \
13  [Ambulatory Surgical Center, Outpatient, Surve...   
14         