In [1]:
#import libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# for Mac errors
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
import numpy as np

In [2]:
# Read the Market Saturation and Utilization CBSA extract and preview the first rows
cbsa_csv_path = "Market_Saturation_and_Utilization_CBSA_Dataset_Release_16_20231010.csv"
cbsa = pd.read_csv(cbsa_csv_path)
cbsa.head()

Unnamed: 0,reference_period,type_of_service,aggregation_level,cbsa,cbsatitle,number_of_fee_for_service_beneficiaries,number_of_providers,average_number_of_users_per_provider,percentage_of_users_out_of_ffs_beneficiaries,number_of_users,...,average_number_of_providers_per_cbsa_dual_color,average_number_of_providers_per_cbsa_description,number_of_dual_eligible_users_dual_color,number_of_dual_eligible_users_description,percentage_of_dual_eligible_users_out_of_total_users_dual_color,percentage_of_dual_eligible_users_out_of_total_users_description,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_description,total_payment_dual_color,total_payment_description
0,2015-01-01 to 2015-12-31,Ambulance (Emergency & Non-Emergency),NATION + TERRITORIES,--ALL--,--ALL--,37359009,9078,424.34,10.31%,3852199,...,,,,,,,,,,
1,2015-01-01 to 2015-12-31,Ambulance (Emergency & Non-Emergency),CBSA,10100,"Aberdeen, SD",7526,3,231.0,9.21%,693,...,BLUE 1,Lowest 25% (Less than 5.00 Providers),BLUE 2,Second Lowest 25% (166 - 349 Dual Eligible Users),BLUE 2,Second Lowest 25% (21.67% - 26.95% of Total Us...,BLUE 3,Third Lowest 25% (17.06% - 19.32% of Total FFS...,BLUE 2,"Second Lowest 25% ($525,929.51 - $1,042,852.56 )"
2,2015-01-01 to 2015-12-31,Ambulance (Emergency & Non-Emergency),CBSA,10140,"Aberdeen, WA",17349,18,110.44,11.46%,1988,...,BLUE 4,Top 25% Excl. Extreme Values (16.00 - 32.49 Pr...,BLUE 3,Third Lowest 25% (350 - 845 Dual Eligible Users),BLUE 4,Top 25% Excl. Extreme Values (33.58% - 51.44% ...,BLUE 4,Top 25% Excl. Extreme Values (19.33% - 27.78% ...,BLUE 3,"Third Lowest 25% ($1,042,852.57 - $2,686,138.28 )"
3,2015-01-01 to 2015-12-31,Ambulance (Emergency & Non-Emergency),CBSA,10180,"Abilene, TX",25198,11,271.09,11.83%,2982,...,BLUE 3,Third Lowest 25% (9.00 - 15.99 Providers),BLUE 4,"Top 25% Excl. Extreme Values (846 - 1,864 Dual...",BLUE 3,Third Lowest 25% (26.96% - 33.57% of Total Users),BLUE 4,Top 25% Excl. Extreme Values (19.33% - 27.78% ...,BLUE 3,"Third Lowest 25% ($1,042,852.57 - $2,686,138.28 )"
4,2015-01-01 to 2015-12-31,Ambulance (Emergency & Non-Emergency),CBSA,10220,"Ada, OK",7570,5,104.8,6.92%,524,...,BLUE 2,Second Lowest 25% (5.00 - 8.99 Providers),BLUE 2,Second Lowest 25% (166 - 349 Dual Eligible Users),BLUE 4,Top 25% Excl. Extreme Values (33.58% - 51.44% ...,BLUE 1,Lowest 25% (Less than 13.69% of Total FFS Bene...,BLUE 1,"Lowest 25% (Less than $525,929.51 )"


In [3]:
# (row count, column count) of the frame as loaded
cbsa.shape

(144881, 35)

In [4]:
# Remove percent and dollar signs in a single regex pass.
# A raw string is used so the pattern contains no invalid escape sequence
# (the previous '\$' literal relied on Python tolerating an unknown escape).
# NOTE(review): the pattern is applied to every string cell in the frame, so
# the *_description columns lose their '%' and '$' characters too — confirm
# that this is intended.
cbsa = cbsa.replace(r'[%$]', '', regex=True)

# Remove the aggregate rows (those whose cbsa value is '--ALL--').
# .copy() gives later cells an independent frame to assign columns into,
# rather than a filtered selection of the original.
cbsa = cbsa[cbsa['cbsa'] != '--ALL--'].copy()

In [5]:
# Show the column labels of the frame
print(cbsa.columns)

Index(['reference_period', 'type_of_service', 'aggregation_level', 'cbsa',
       'cbsatitle', 'number_of_fee_for_service_beneficiaries',
       'number_of_providers', 'average_number_of_users_per_provider',
       'percentage_of_users_out_of_ffs_beneficiaries', 'number_of_users',
       'average_number_of_providers_per_cbsa', 'number_of_dual_eligible_users',
       'percentage_of_dual_eligible_users_out_of_total_users',
       'percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries',
       'total_payment', 'number_of_fee_for_service_beneficiaries_dual_color',
       'number_of_fee_for_service_beneficiaries_description',
       'number_of_providers_dual_color', 'number_of_providers_description',
       'average_number_of_users_per_provider_dual_color',
       'average_number_of_users_per_provider_description',
       'percentage_of_users_out_of_ffs_beneficiaries_dual_color',
       'percentage_of_users_out_of_ffs_beneficiaries_description',
       'number_of_users_du

<table style="border:1px solid black;">
  <tr>
    <th width="200px" style="background-color: lightgrey; border:1px solid black;">Quantitative (interval)</th>
    <th width="200px" style="background-color: lightgrey; border:1px solid black;">Quantitative (ratio)   </th>
  </tr>
  <tr>
    <td style="background-color: lightblue; vertical-align: top; border:1px solid black;">start_date, end_date</td>
    <td style="background-color: lightblue; vertical-align: top; border:1px solid black;">number_of_fee_for_service_beneficiaries,
number_of_providers,
average_number_of_users_per_provider,
percentage_of_users_out_of_ffs_beneficiaries,
number_of_users,
average_number_of_providers_per_cbsa,
number_of_dual_eligible_users,
percentage_of_dual_eligible_users_out_of_total_users,
percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries,
total_payment
</td>
  </tr>
</table>

<table>
  <tr>
    <th width="200px" style="background-color: lightgrey; border:1px solid black;">Qualitative (nominal)</th>
    <th width="200px" style="background-color: lightgrey; border:1px solid black;">Qualitative (ordinal)</th>
  </tr>
  <tr>
    <td style="background-color: lightblue; vertical-align: top; border:1px solid black;">type_of_service,
cbsa,
cbsatitle
    </td>
    <td style="background-color: lightblue; vertical-align: top; border:1px solid black;">
aggregation_level,
number_of_fee_for_service_beneficiaries_dual_color,
number_of_fee_for_service_beneficiaries_description,
number_of_providers_dual_color,
number_of_providers_description,
average_number_of_users_per_provider_dual_color,
average_number_of_users_per_provider_description,
percentage_of_users_out_of_ffs_beneficiaries_dual_color,
percentage_of_users_out_of_ffs_beneficiaries_description,
number_of_users_dual_color,
number_of_users_description,
average_number_of_providers_per_cbsa_dual_color,
average_number_of_providers_per_cbsa_description,
number_of_dual_eligible_users_dual_color,
number_of_dual_eligible_users_description,
percentage_of_dual_eligible_users_out_of_total_users_dual_color,
percentage_of_dual_eligible_users_out_of_total_users_description,
percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color,
percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_description,
total_payment_dual_color,
total_payment_description
    </td>
  </tr>
</table>

In [6]:
# Break 'reference_period' ("<start> to <end>") into its two halves, place them
# as the first two columns, and drop the original combined column.
period_bounds = cbsa['reference_period'].str.split(' to ', expand=True)
period_bounds.columns = ['start_date', 'end_date']
cbsa = pd.concat([period_bounds, cbsa.drop(columns='reference_period')], axis=1)

In [7]:
# Parse both period boundary columns into datetime values, one column at a time
for date_column in ('start_date', 'end_date'):
    cbsa[date_column] = pd.to_datetime(cbsa[date_column])

**Identifying and Converting Attributes**

In [8]:
# Columns to parse as numbers.
convert_numerical = [
    'number_of_fee_for_service_beneficiaries',
    'number_of_providers',
    'number_of_users',
    'number_of_dual_eligible_users',
    'average_number_of_users_per_provider',
    'percentage_of_users_out_of_ffs_beneficiaries',
    'average_number_of_providers_per_cbsa',
    'percentage_of_dual_eligible_users_out_of_total_users',
    'percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries',
    'total_payment'
]

for column in convert_numerical:
    values = cbsa[column]
    # The .str accessor only exists on text columns. Skipping the clean-up for
    # columns that are already numeric keeps this cell safe to re-run and
    # avoids an AttributeError if a column was parsed as numeric on load.
    if not pd.api.types.is_numeric_dtype(values):
        # Strip spaces and thousands separators as literal characters.
        values = values.str.replace(' ', '', regex=False).str.replace(',', '', regex=False)
    cbsa[column] = pd.to_numeric(values)

In [9]:
# Columns to store with the pandas 'category' dtype
convert_to_category = [
    'type_of_service',
    'cbsa',
    'cbsatitle',
    'aggregation_level',
    'number_of_fee_for_service_beneficiaries_dual_color',
    'number_of_fee_for_service_beneficiaries_description',
    'number_of_providers_dual_color',
    'number_of_providers_description',
    'average_number_of_users_per_provider_dual_color',
    'average_number_of_users_per_provider_description',
    'percentage_of_users_out_of_ffs_beneficiaries_dual_color',
    'percentage_of_users_out_of_ffs_beneficiaries_description',
    'number_of_users_dual_color',
    'number_of_users_description',
    'average_number_of_providers_per_cbsa_dual_color',
    'average_number_of_providers_per_cbsa_description',
    'number_of_dual_eligible_users_dual_color',
    'number_of_dual_eligible_users_description',
    'percentage_of_dual_eligible_users_out_of_total_users_dual_color',
    'percentage_of_dual_eligible_users_out_of_total_users_description',
    'percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color',
    'percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_description',
    'total_payment_dual_color',
    'total_payment_description'
]
# Cast every listed column in one call; columns not listed keep their dtype
cbsa = cbsa.astype({label: 'category' for label in convert_to_category})


In [10]:
# Check the dtype of every column after the conversions above
print(cbsa.dtypes)

start_date                                                                                         datetime64[ns]
end_date                                                                                           datetime64[ns]
type_of_service                                                                                          category
aggregation_level                                                                                        category
cbsa                                                                                                     category
cbsatitle                                                                                                category
number_of_fee_for_service_beneficiaries                                                                     int64
number_of_providers                                                                                         int64
average_number_of_users_per_provider                                                    

In [11]:
# Summary statistics for the frame's columns
cbsa.describe()

Unnamed: 0,start_date,end_date,number_of_fee_for_service_beneficiaries,number_of_providers,average_number_of_users_per_provider,percentage_of_users_out_of_ffs_beneficiaries,number_of_users,average_number_of_providers_per_cbsa,number_of_dual_eligible_users,percentage_of_dual_eligible_users_out_of_total_users,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries,total_payment
count,144715,144715,144715.0,144715.0,141350.0,144715.0,144715.0,144715.0,132026.0,132026.0,132026.0,144715.0
mean,2018-07-14 18:35:44.999481600,2019-07-14 00:33:48.129772288,38714.07,49.618056,116.399974,12.945911,5099.865,49.618056,1038.777741,23.847503,15.186126,11516310.0
min,2015-01-01 00:00:00,2015-12-31 00:00:00,407.0,0.0,2.31,0.01,11.0,0.0,11.0,0.47,0.01,604.61
25%,2017-01-01 00:00:00,2017-12-31 00:00:00,6239.5,4.0,42.1225,2.57,240.0,4.0,66.0,13.57,3.94,219442.1
50%,2019-01-01 00:00:00,2019-12-31 00:00:00,12011.0,10.0,66.75,5.92,806.0,10.0,195.0,20.83,9.65,921199.9
75%,2021-01-01 00:00:00,2021-12-31 00:00:00,26912.0,28.0,132.0975,15.54,2842.5,28.0,601.0,31.15,20.16,4471906.0
max,2022-01-01 00:00:00,2022-12-31 00:00:00,2523066.0,16732.0,2581.13,74.45,1421406.0,16732.0,231507.0,100.0,79.17,6625352000.0
std,,,117487.7,275.38952,138.871776,15.940015,25979.81,275.38952,5380.292491,13.984618,15.282265,81418150.0


## Data Preparation

Clean and prepare the data for analysis:


    - change misleading field values
    - remap categorical as numerical
    - standardize numeric variables
    - identify outliers

In [12]:
# Number of missing values in each column
cbsa.isna().sum()

start_date                                                                                             0
end_date                                                                                               0
type_of_service                                                                                        0
aggregation_level                                                                                      0
cbsa                                                                                                   0
cbsatitle                                                                                              0
number_of_fee_for_service_beneficiaries                                                                0
number_of_providers                                                                                    0
average_number_of_users_per_provider                                                                3365
percentage_of_users_out_of_ffs_beneficiaries           

The CMS Integrated Data Repository excluded values for CBSAs whose source data were incomplete, specifically the dual-eligibility metrics; these exclusions appear above as missing values.

In [13]:
# Optional export of the prepared frame to CSV; left disabled.
# cbsa.to_csv('cbsa.csv', index=False)

In [14]:
# One frame per reporting year, selected on the calendar year of start_date.
# The year series is computed once and reused for every selection.
start_year = cbsa['start_date'].dt.year
(
    cbsa_2015, cbsa_2016, cbsa_2017, cbsa_2018,
    cbsa_2019, cbsa_2020, cbsa_2021, cbsa_2022,
) = (cbsa[start_year == year] for year in range(2015, 2023))