# Benchmarking Different dataframe libraries

- https://www.youtube.com/watch?v=zrY2u2_WJ0o
- Packages to compare:
    - Pandas 2.0 (pyarrow)
    - Pandas + cudf
    - Polars (python api)

1. Gather dataset (large GB+)
    - Different data types (date, int, floats, strings, etc.)
    - https://data.cms.gov/provider-summary-by-type-of-service/medicare-physician-other-practitioners/medicare-physician-other-practitioners-by-provider/data
2. install latest of each library
3. create benchmarks to run
4. test

In [1]:
%pip install polars

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import polars as pl
import requests
import sys
import os

print(pd.__version__)
print(pl.__version__)

2.1.4
0.20.2


# Data

## Download data

In [3]:
# if medicare_data.csv doesn't exist, download it 
if not os.path.exists('medicare_data.csv'):
    url = 'https://data.cms.gov/sites/default/files/2023-05/914a4463-7af3-423f-83a7-b343794e20ee/MUP_PHY_R23_P05_V10_D21_Prov_Svc.csv'
    response = requests.get(url)
    if response.status_code == 200:
        with open('medicare_data.csv', 'wb') as file:
            file.write(response.content)
        print("File downloaded successfully.")
    else:
        print("Failed to download the file.")

## pandas df (default backend)

In [4]:
pandas_df = pd.read_csv('medicare_data.csv')
pandas_df.shape

  pandas_df = pd.read_csv('medicare_data.csv')


(9886177, 29)

In [5]:
print(f"{sys.getsizeof(pandas_df) / (1024**3): .2f} GB")

 12.65 GB


In [6]:
pandas_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9886177 entries, 0 to 9886176
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   Rndrng_NPI                     int64  
 1   Rndrng_Prvdr_Last_Org_Name     object 
 2   Rndrng_Prvdr_First_Name        object 
 3   Rndrng_Prvdr_MI                object 
 4   Rndrng_Prvdr_Crdntls           object 
 5   Rndrng_Prvdr_Gndr              object 
 6   Rndrng_Prvdr_Ent_Cd            object 
 7   Rndrng_Prvdr_St1               object 
 8   Rndrng_Prvdr_St2               object 
 9   Rndrng_Prvdr_City              object 
 10  Rndrng_Prvdr_State_Abrvtn      object 
 11  Rndrng_Prvdr_State_FIPS        object 
 12  Rndrng_Prvdr_Zip5              object 
 13  Rndrng_Prvdr_RUCA              float64
 14  Rndrng_Prvdr_RUCA_Desc         object 
 15  Rndrng_Prvdr_Cntry             object 
 16  Rndrng_Prvdr_Type              object 
 17  Rndrng_Prvdr_Mdcr_Prtcptg_Ind  object 
 18  HC

In [7]:
pandas_df['Rndrng_Prvdr_Gndr'] = pandas_df['Rndrng_Prvdr_Gndr'].astype('category')
pandas_df['Rndrng_Prvdr_State_FIPS'] = pandas_df['Rndrng_Prvdr_State_FIPS'].\
    astype('str').\
    astype('category')
pandas_df['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'] = pandas_df['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'].\
    replace({'Y': 1, 'N': 0}).\
    astype('boolean')
pandas_df['Rndrng_Prvdr_Zip5'] = pandas_df['Rndrng_Prvdr_Zip5'].astype('str')

In [8]:
pandas_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9886177 entries, 0 to 9886176
Data columns (total 29 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   Rndrng_NPI                     int64   
 1   Rndrng_Prvdr_Last_Org_Name     object  
 2   Rndrng_Prvdr_First_Name        object  
 3   Rndrng_Prvdr_MI                object  
 4   Rndrng_Prvdr_Crdntls           object  
 5   Rndrng_Prvdr_Gndr              category
 6   Rndrng_Prvdr_Ent_Cd            object  
 7   Rndrng_Prvdr_St1               object  
 8   Rndrng_Prvdr_St2               object  
 9   Rndrng_Prvdr_City              object  
 10  Rndrng_Prvdr_State_Abrvtn      object  
 11  Rndrng_Prvdr_State_FIPS        category
 12  Rndrng_Prvdr_Zip5              object  
 13  Rndrng_Prvdr_RUCA              float64 
 14  Rndrng_Prvdr_RUCA_Desc         object  
 15  Rndrng_Prvdr_Cntry             object  
 16  Rndrng_Prvdr_Type              object  
 17  Rndrng_Prvdr_Mdcr_Prtcptg_I

## Build pandas dataframe with Pyarrow backend

In [9]:
pandas_df.to_parquet('medicare_data.parquet')

In [10]:
import os
print(f"{os.path.getsize('medicare_data.csv') / (1024**3): .2f} GB")
print(f"{os.path.getsize('medicare_data.parquet') / (1024**3): .2f} GB")

 2.91 GB
 0.40 GB


In [11]:
# read in data with pyarrow backend
pyarrow_df = pd.read_parquet('medicare_data.parquet', dtype_backend='pyarrow')

In [12]:
# 11.4 GB vs 3.5 GB
pyarrow_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9886177 entries, 0 to 9886176
Data columns (total 29 columns):
 #   Column                         Dtype                                                       
---  ------                         -----                                                       
 0   Rndrng_NPI                     int64[pyarrow]                                              
 1   Rndrng_Prvdr_Last_Org_Name     string[pyarrow]                                             
 2   Rndrng_Prvdr_First_Name        string[pyarrow]                                             
 3   Rndrng_Prvdr_MI                string[pyarrow]                                             
 4   Rndrng_Prvdr_Crdntls           string[pyarrow]                                             
 5   Rndrng_Prvdr_Gndr              dictionary<values=string, indices=int32, ordered=0>[pyarrow]
 6   Rndrng_Prvdr_Ent_Cd            string[pyarrow]                                             
 7   Rndrng_Pr

# Dask

In [13]:
import dask.dataframe as dd

dask_df = dd.from_pandas(pandas_df, npartitions=10)
dask_df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 29 entries, Rndrng_NPI to Avg_Mdcr_Stdzd_Amt
dtypes: boolean(1), category(1), category(1), float64(6), int64(3), string(17)

## Polars

In [14]:
%pip install polars

[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
# polars_df = pl.from_pandas(pandas_df)
polars_df_scan = pl.scan_parquet('medicare_data.parquet')
polars_df_read = pl.read_parquet('medicare_data.parquet')

## pandas cuDF

In [16]:
# %pip install cudf-cu12 rmm-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.ngc.nvidia.com/
# %pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com cudf-cu12
# %pip install \
#     --extra-index-url=https://pypi.nvidia.com \
#     cudf-cu12==23.12.* dask-cudf-cu12==23.12.* cuml-cu12==23.12.* \
#     cugraph-cu12==23.12.* cuspatial-cu12==23.12.* cuproj-cu12==23.12.* \
#     cuxfilter-cu12==23.12.* cucim-cu12==23.12.* pylibraft-cu12==23.12.* \
#     raft-dask-cu12==23.12.*

---

# Benchmarks

In [17]:
# pyarrow slower than pandas
%timeit pandas_df['Rndrng_Prvdr_State_FIPS'].value_counts()
%timeit pyarrow_df['Rndrng_Prvdr_State_FIPS'].value_counts()
%timeit polars_df_read['Rndrng_Prvdr_State_FIPS'].value_counts().sort(-pl.col('count'))
%timeit polars_df_scan.group_by('Rndrng_Prvdr_State_FIPS').agg(pl.count()).sort(-pl.col('count')).collect()

37.3 ms ± 719 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
66.2 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
148 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
51.2 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# why dask so slow?
%timeit dask_df['Rndrng_Prvdr_State_FIPS'].value_counts().compute()

14.4 s ± 227 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


---

In [19]:
# pyarrow faster than pandas
%timeit pandas_df.groupby('Rndrng_Prvdr_Crdntls')['Avg_Sbmtd_Chrg'].mean().sort_values()
%timeit pyarrow_df.groupby('Rndrng_Prvdr_Crdntls')['Avg_Sbmtd_Chrg'].mean().sort_values()

532 ms ± 8.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
388 ms ± 6.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%timeit pandas_df.query("Rndrng_Prvdr_Crdntls == '(FNP) FAMILY NURSE P'")
%timeit pyarrow_df.query("Rndrng_Prvdr_Crdntls == '(FNP) FAMILY NURSE P'")

129 ms ± 712 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.98 s ± 8.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# dask is terrible for querying
%timeit dask_df.query("Rndrng_Prvdr_Crdntls == '(FNP) FAMILY NURSE P'").compute()

14.6 s ± 331 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


---

In [22]:
# observed=False to remove warnings
%timeit pandas_df.\
    groupby(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr'], observed=False)\
    [['Tot_Benes', 'Tot_Srvcs', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt']].\
    agg(['mean', 'std', 'min', 'max'])
%timeit pyarrow_df.\
    groupby(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr'], observed=False)\
    [['Tot_Benes', 'Tot_Srvcs', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt']].\
    agg(['mean', 'std', 'min', 'max'])

2.67 s ± 7.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.27 s ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
#%timeit 
# dask why you so slow? Perhaps i installed/configured something wrong?
dask_df.\
    groupby(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr'])\
    [['Tot_Benes', 'Tot_Srvcs', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt']].\
    agg(['mean', 'std', 'min', 'max']).compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tot_Benes,Tot_Benes,Tot_Benes,Tot_Benes,Tot_Srvcs,Tot_Srvcs,Tot_Srvcs,Tot_Srvcs,Tot_Bene_Day_Srvcs,Tot_Bene_Day_Srvcs,...,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,mean,std,min,max,mean,std,...,min,max,mean,std,min,max,mean,std,min,max
Rndrng_Prvdr_Crdntls,Rndrng_Prvdr_Gndr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
(FNP) FAMILY NURSE P,F,54.714286,43.172963,26.0,149.0,59.000000,42.720019,28.0,151.0,59.000000,42.720019,...,7.680000,184.180000,85.883588,49.950831,6.140000,152.219470,77.994716,45.314871,5.570000,138.165430
(FNP) FAMILY NURSE P,M,,,,,,,,,,,...,,,,,,,,,,
(LCSW),F,11.000000,,11.0,11.0,74.000000,,74.0,74.0,74.000000,,...,74.097838,74.097838,53.881216,,53.881216,53.881216,55.553378,,55.553378,55.553378
(LCSW),M,,,,,,,,,,,...,,,,,,,,,,
", M.D., PH.D.",F,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"WHNP, CFNP",M,,,,,,,,,,,...,,,,,,,,,,
"WHNP-BC,NP-C",F,38.000000,32.365105,11.0,90.0,57.200000,60.611055,14.0,159.0,57.200000,60.611055,...,2.183824,75.853774,19.526918,18.261404,2.183824,49.517296,20.180113,18.670040,2.210000,50.690566
"WHNP-BC,NP-C",M,,,,,,,,,,,...,,,,,,,,,,
WILLIAM MANDRICK,F,,,,,,,,,,,...,,,,,,,,,,


In [26]:
%%timeit

results = polars_df_read.group_by(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr']).\
    agg([
        pl.col('Tot_Benes').mean().alias('Tot_Benes_mean'),
        pl.col('Tot_Benes').std().alias('Tot_Benes_std'),
        pl.col('Tot_Benes').min().alias('Tot_Benes_min'),
        pl.col('Tot_Benes').max().alias('Tot_Benes_max'),

        pl.col('Tot_Srvcs').mean().alias('Tot_Srvcs_mean'),
        pl.col('Tot_Srvcs').std().alias('Tot_Srvcs_std'),
        pl.col('Tot_Srvcs').min().alias('Tot_Srvcs_min'),
        pl.col('Tot_Srvcs').max().alias('Tot_Srvcs_max'),

        pl.col('Tot_Bene_Day_Srvcs').mean().alias('Tot_Bene_Day_Srvcs_mean'),
        pl.col('Tot_Bene_Day_Srvcs').std().alias('Tot_Bene_Day_Srvcs_std'),
        pl.col('Tot_Bene_Day_Srvcs').min().alias('Tot_Bene_Day_Srvcs_min'),
        pl.col('Tot_Bene_Day_Srvcs').max().alias('Tot_Bene_Day_Srvcs_max'),

        pl.col('Avg_Sbmtd_Chrg').mean().alias('Avg_Sbmtd_Chrg_mean'),
        pl.col('Avg_Sbmtd_Chrg').std().alias('Avg_Sbmtd_Chrg_std'),
        pl.col('Avg_Sbmtd_Chrg').min().alias('Avg_Sbmtd_Chrg_min'),
        pl.col('Avg_Sbmtd_Chrg').max().alias('Avg_Sbmtd_Chrg_max'),

        pl.col('Avg_Mdcr_Alowd_Amt').mean().alias('Avg_Mdcr_Alowd_Amt_mean'),
        pl.col('Avg_Mdcr_Alowd_Amt').std().alias('Avg_Mdcr_Alowd_Amt_std'),
        pl.col('Avg_Mdcr_Alowd_Amt').min().alias('Avg_Mdcr_Alowd_Amt_min'),
        pl.col('Avg_Mdcr_Alowd_Amt').max().alias('Avg_Mdcr_Alowd_Amt_max'),

        pl.col('Avg_Mdcr_Pymt_Amt').mean().alias('Avg_Mdcr_Pymt_Amt_mean'),
        pl.col('Avg_Mdcr_Pymt_Amt').std().alias('Avg_Mdcr_Pymt_Amt_std'),
        pl.col('Avg_Mdcr_Pymt_Amt').min().alias('Avg_Mdcr_Pymt_Amt_min'),
        pl.col('Avg_Mdcr_Pymt_Amt').max().alias('Avg_Mdcr_Pymt_Amt_max'),

        pl.col('Avg_Mdcr_Stdzd_Amt').mean().alias('Avg_Mdcr_Stdzd_Amt_mean'),
        pl.col('Avg_Mdcr_Stdzd_Amt').std().alias('Avg_Mdcr_Stdzd_Amt_std'),
        pl.col('Avg_Mdcr_Stdzd_Amt').min().alias('Avg_Mdcr_Stdzd_Amt_min'),
        pl.col('Avg_Mdcr_Stdzd_Amt').max().alias('Avg_Mdcr_Stdzd_Amt_max'),
    ])

418 ms ± 18.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
%%timeit

results = polars_df_scan.group_by(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr']).\
    agg([
        pl.col('Tot_Benes').mean().alias('Tot_Benes_mean'),
        pl.col('Tot_Benes').std().alias('Tot_Benes_std'),
        pl.col('Tot_Benes').min().alias('Tot_Benes_min'),
        pl.col('Tot_Benes').max().alias('Tot_Benes_max'),

        pl.col('Tot_Srvcs').mean().alias('Tot_Srvcs_mean'),
        pl.col('Tot_Srvcs').std().alias('Tot_Srvcs_std'),
        pl.col('Tot_Srvcs').min().alias('Tot_Srvcs_min'),
        pl.col('Tot_Srvcs').max().alias('Tot_Srvcs_max'),

        pl.col('Tot_Bene_Day_Srvcs').mean().alias('Tot_Bene_Day_Srvcs_mean'),
        pl.col('Tot_Bene_Day_Srvcs').std().alias('Tot_Bene_Day_Srvcs_std'),
        pl.col('Tot_Bene_Day_Srvcs').min().alias('Tot_Bene_Day_Srvcs_min'),
        pl.col('Tot_Bene_Day_Srvcs').max().alias('Tot_Bene_Day_Srvcs_max'),

        pl.col('Avg_Sbmtd_Chrg').mean().alias('Avg_Sbmtd_Chrg_mean'),
        pl.col('Avg_Sbmtd_Chrg').std().alias('Avg_Sbmtd_Chrg_std'),
        pl.col('Avg_Sbmtd_Chrg').min().alias('Avg_Sbmtd_Chrg_min'),
        pl.col('Avg_Sbmtd_Chrg').max().alias('Avg_Sbmtd_Chrg_max'),

        pl.col('Avg_Mdcr_Alowd_Amt').mean().alias('Avg_Mdcr_Alowd_Amt_mean'),
        pl.col('Avg_Mdcr_Alowd_Amt').std().alias('Avg_Mdcr_Alowd_Amt_std'),
        pl.col('Avg_Mdcr_Alowd_Amt').min().alias('Avg_Mdcr_Alowd_Amt_min'),
        pl.col('Avg_Mdcr_Alowd_Amt').max().alias('Avg_Mdcr_Alowd_Amt_max'),

        pl.col('Avg_Mdcr_Pymt_Amt').mean().alias('Avg_Mdcr_Pymt_Amt_mean'),
        pl.col('Avg_Mdcr_Pymt_Amt').std().alias('Avg_Mdcr_Pymt_Amt_std'),
        pl.col('Avg_Mdcr_Pymt_Amt').min().alias('Avg_Mdcr_Pymt_Amt_min'),
        pl.col('Avg_Mdcr_Pymt_Amt').max().alias('Avg_Mdcr_Pymt_Amt_max'),

        pl.col('Avg_Mdcr_Stdzd_Amt').mean().alias('Avg_Mdcr_Stdzd_Amt_mean'),
        pl.col('Avg_Mdcr_Stdzd_Amt').std().alias('Avg_Mdcr_Stdzd_Amt_std'),
        pl.col('Avg_Mdcr_Stdzd_Amt').min().alias('Avg_Mdcr_Stdzd_Amt_min'),
        pl.col('Avg_Mdcr_Stdzd_Amt').max().alias('Avg_Mdcr_Stdzd_Amt_max'),
    ]).collect()

791 ms ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
polars_df_scan.group_by(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr']).\
    agg([
        pl.col('Tot_Benes').mean().alias('Tot_Benes_mean'),
        pl.col('Tot_Benes').std().alias('Tot_Benes_std'),
        pl.col('Tot_Benes').min().alias('Tot_Benes_min'),
        pl.col('Tot_Benes').max().alias('Tot_Benes_max'),

        pl.col('Tot_Srvcs').mean().alias('Tot_Srvcs_mean'),
        pl.col('Tot_Srvcs').std().alias('Tot_Srvcs_std'),
        pl.col('Tot_Srvcs').min().alias('Tot_Srvcs_min'),
        pl.col('Tot_Srvcs').max().alias('Tot_Srvcs_max'),

        pl.col('Tot_Bene_Day_Srvcs').mean().alias('Tot_Bene_Day_Srvcs_mean'),
        pl.col('Tot_Bene_Day_Srvcs').std().alias('Tot_Bene_Day_Srvcs_std'),
        pl.col('Tot_Bene_Day_Srvcs').min().alias('Tot_Bene_Day_Srvcs_min'),
        pl.col('Tot_Bene_Day_Srvcs').max().alias('Tot_Bene_Day_Srvcs_max'),

        pl.col('Avg_Sbmtd_Chrg').mean().alias('Avg_Sbmtd_Chrg_mean'),
        pl.col('Avg_Sbmtd_Chrg').std().alias('Avg_Sbmtd_Chrg_std'),
        pl.col('Avg_Sbmtd_Chrg').min().alias('Avg_Sbmtd_Chrg_min'),
        pl.col('Avg_Sbmtd_Chrg').max().alias('Avg_Sbmtd_Chrg_max'),

        pl.col('Avg_Mdcr_Alowd_Amt').mean().alias('Avg_Mdcr_Alowd_Amt_mean'),
        pl.col('Avg_Mdcr_Alowd_Amt').std().alias('Avg_Mdcr_Alowd_Amt_std'),
        pl.col('Avg_Mdcr_Alowd_Amt').min().alias('Avg_Mdcr_Alowd_Amt_min'),
        pl.col('Avg_Mdcr_Alowd_Amt').max().alias('Avg_Mdcr_Alowd_Amt_max'),

        pl.col('Avg_Mdcr_Pymt_Amt').mean().alias('Avg_Mdcr_Pymt_Amt_mean'),
        pl.col('Avg_Mdcr_Pymt_Amt').std().alias('Avg_Mdcr_Pymt_Amt_std'),
        pl.col('Avg_Mdcr_Pymt_Amt').min().alias('Avg_Mdcr_Pymt_Amt_min'),
        pl.col('Avg_Mdcr_Pymt_Amt').max().alias('Avg_Mdcr_Pymt_Amt_max'),

        pl.col('Avg_Mdcr_Stdzd_Amt').mean().alias('Avg_Mdcr_Stdzd_Amt_mean'),
        pl.col('Avg_Mdcr_Stdzd_Amt').std().alias('Avg_Mdcr_Stdzd_Amt_std'),
        pl.col('Avg_Mdcr_Stdzd_Amt').min().alias('Avg_Mdcr_Stdzd_Amt_min'),
        pl.col('Avg_Mdcr_Stdzd_Amt').max().alias('Avg_Mdcr_Stdzd_Amt_max'),
    ]).\
    collect().\
    sort('Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr')

Rndrng_Prvdr_Crdntls,Rndrng_Prvdr_Gndr,Tot_Benes_mean,Tot_Benes_std,Tot_Benes_min,Tot_Benes_max,Tot_Srvcs_mean,Tot_Srvcs_std,Tot_Srvcs_min,Tot_Srvcs_max,Tot_Bene_Day_Srvcs_mean,Tot_Bene_Day_Srvcs_std,Tot_Bene_Day_Srvcs_min,Tot_Bene_Day_Srvcs_max,Avg_Sbmtd_Chrg_mean,Avg_Sbmtd_Chrg_std,Avg_Sbmtd_Chrg_min,Avg_Sbmtd_Chrg_max,Avg_Mdcr_Alowd_Amt_mean,Avg_Mdcr_Alowd_Amt_std,Avg_Mdcr_Alowd_Amt_min,Avg_Mdcr_Alowd_Amt_max,Avg_Mdcr_Pymt_Amt_mean,Avg_Mdcr_Pymt_Amt_std,Avg_Mdcr_Pymt_Amt_min,Avg_Mdcr_Pymt_Amt_max,Avg_Mdcr_Stdzd_Amt_mean,Avg_Mdcr_Stdzd_Amt_std,Avg_Mdcr_Stdzd_Amt_min,Avg_Mdcr_Stdzd_Amt_max
str,cat,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
,,407.314281,5097.188973,11,882358,977.73974,20337.984573,2.7,7.272824e6,574.02459,8323.41123,11,1773223,729.093522,3589.561112,0.006,99999.99,179.968552,911.978145,0.006,50065.52073,149.858791,729.533471,0.006,40042.589888,149.026465,728.140322,0.006,39241.738146
,"""F""",50.856641,82.978847,11,6854,151.482786,796.656166,7.1,121742.0,105.588543,212.585304,11,13547,285.517113,541.695551,0.01,50000.0,80.063043,87.577291,0.009524,6940.359474,62.642462,69.963039,0.006429,5544.107368,60.821359,66.785742,0.006429,5036.041053
,"""M""",62.69931,364.644918,11,107217,169.601677,1723.847023,11.0,620254.0,112.101056,430.107712,11,107219,403.28563,930.023178,0.01,61060.0,98.073781,175.455368,0.01,16417.0852,76.832679,140.515682,0.0,13125.1432,74.318989,131.680417,0.007047,11131.032
"""(D.C.) CHIROPR…","""F""",15.0,4.242641,12,18,74.0,43.84062,43.0,105.0,74.0,43.84062,43,105,60.0,0.0,60.0,60.0,39.265,9.66615,32.43,46.1,27.865539,10.401317,20.510698,35.220381,23.892834,9.021099,17.513953,30.271714
"""(D.C.) CHIROPR…","""M""",93.5,75.660426,40,147,1291.5,1673.721751,108.0,2475.0,1291.5,1673.721751,108,2475,62.291667,3.240906,60.0,64.583333,47.807683,8.969391,41.465366,54.15,36.933406,9.032008,30.546812,43.32,35.615018,9.028157,29.231147,41.998889
"""(FNP) FAMILY N…","""F""",54.714286,43.172963,26,149,59.0,42.720019,28.0,151.0,59.0,42.720019,28,151,420.816382,235.889928,34.0,764.549669,105.347143,60.204824,7.68,184.18,85.883588,49.950831,6.14,152.21947,77.994716,45.314871,5.57,138.16543
"""(LCSW)""","""F""",11.0,,11,11,74.0,,74.0,74.0,74.0,,74,74,100.0,,100.0,100.0,74.097838,,74.097838,74.097838,53.881216,,53.881216,53.881216,55.553378,,55.553378,55.553378
"""(PA-C)""","""M""",64.0,75.382359,11,188,196.0,279.425303,11.0,676.0,196.0,279.425303,11,676,95.692053,73.587678,33.0,215.036364,60.778031,42.509959,11.54,118.350909,43.09554,30.625309,9.23,86.403636,56.889924,37.446763,9.54,105.679091
""").D.""","""M""",18.25,5.188127,11,22,18.25,5.188127,11.0,22.0,18.25,5.188127,11,22,111.704545,53.402282,50.0,179.0,68.467386,43.902313,28.869545,122.31,48.572462,28.631294,21.566364,81.776667,51.77053,30.46633,23.132273,87.301667
""", DPT""","""M""",16.5,0.57735,16,17,385.75,306.960774,17.0,765.0,280.75,176.922535,17,384,79.6375,42.82753,56.35,143.75,54.498309,43.842671,27.174402,119.11,43.727345,35.178225,21.803615,95.57,36.45256,29.406807,18.320379,79.88


In [30]:
pandas_df.\
    groupby(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr'], observed=False)\
    [['Tot_Benes', 'Tot_Srvcs', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt']].\
    agg(['mean', 'std', 'min', 'max']).\
    sort_values(['Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Tot_Benes,Tot_Benes,Tot_Benes,Tot_Benes,Tot_Srvcs,Tot_Srvcs,Tot_Srvcs,Tot_Srvcs,Tot_Bene_Day_Srvcs,Tot_Bene_Day_Srvcs,...,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt,Avg_Mdcr_Stdzd_Amt
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,mean,std,min,max,mean,std,...,min,max,mean,std,min,max,mean,std,min,max
Rndrng_Prvdr_Crdntls,Rndrng_Prvdr_Gndr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
(D.C.) CHIROPRACTIC,F,15.000000,4.242641,12.0,18.0,74.00,43.840620,43.0,105.0,74.00,43.840620,...,32.430000,46.10,27.865539,10.401317,20.510698,35.220381,23.892834,9.021099,17.513953,30.271714
(D.C.) CHIROPRACTIC,M,,,,,,,,,,,...,,,,,,,,,,
(D.C.) CHIROPRACTOR,F,,,,,,,,,,,...,,,,,,,,,,
(D.C.) CHIROPRACTOR,M,93.500000,75.660426,40.0,147.0,1291.50,1673.721751,108.0,2475.0,1291.50,1673.721751,...,41.465366,54.15,36.933406,9.032008,30.546812,43.320000,35.615018,9.028157,29.231147,41.998889
(FNP) FAMILY NURSE P,F,54.714286,43.172963,26.0,149.0,59.00,42.720019,28.0,151.0,59.00,42.720019,...,7.680000,184.18,85.883588,49.950831,6.140000,152.219470,77.994716,45.314871,5.570000,138.165430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WILLIAM TAYLOR,M,30.250000,11.898879,14.0,42.0,43.75,22.306576,14.0,68.0,43.75,22.306576,...,35.350000,133.76,65.268778,33.286264,28.350000,107.260000,65.629444,33.481261,28.460000,107.790000
WNHP-BC,F,23.250000,9.708244,12.0,34.0,26.25,7.973916,19.0,35.0,26.25,7.973916,...,3.480000,119.10,47.453452,35.489223,3.480000,85.068000,43.516750,32.465160,3.410000,78.017000
WNHP-BC,M,,,,,,,,,,,...,,,,,,,,,,
WOMEN'S NP,F,15.000000,0.000000,15.0,15.0,15.00,0.000000,15.0,15.0,15.00,0.000000,...,33.600000,36.52,35.060000,2.064752,33.600000,36.520000,34.880000,2.460732,33.140000,36.620000


---

# Appendix

In [None]:
# url = 'https://data.cms.gov/data-api/v1/dataset/5a6f0f6f-0439-403d-bd99-2c7631003cb1/data-viewer?_format=csv'

# response = requests.get(url)
# # Check if the request was successful
# if response.status_code == 200:
#     # Use io.BytesIO to treat the response content as a file-like object for zipfile
#     zipped_file = zipfile.ZipFile(io.BytesIO(response.content))
#     # Extract all the contents into the current directory
#     zipped_file.extractall()
#     print("File downloaded and unzipped successfully.")
# else:
#     print("Failed to download the file.")
# pandas_df = pd.read_csv('Medicare_Physician_Other_Practitioners_by_Provider_2021.csv')
# pandas_df.shape