In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
import numpy as np
import category_encoders as ce
# Load the provider dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Healthcare Providers.csv')

In [19]:
# Columns removed before feature engineering: row index, provider identity
# and address fields, and the raw HCPCS code.
drop = [
    'index',
    'National Provider Identifier',
    'Last Name/Organization Name of the Provider',
    'First Name of the Provider',
    'Middle Initial of the Provider',
    'Street Address 1 of the Provider',
    'Street Address 2 of the Provider',
    'Zip Code of the Provider',
    'HCPCS Code',
]

In [20]:
# Remove the columns listed in `drop`; every listed name must exist,
# otherwise pandas raises KeyError.
df = df.drop(labels=drop, axis='columns')

In [21]:
# Count of missing values per remaining column.
df.isna().sum()

Credentials of the Provider                                 7209
Gender of the Provider                                      4254
Entity Type of the Provider                                    0
City of the Provider                                           0
State Code of the Provider                                     0
Country Code of the Provider                                   0
Provider Type                                                  0
Medicare Participation Indicator                               0
Place of Service                                               0
HCPCS Description                                              0
HCPCS Drug Indicator                                           0
Number of Services                                             0
Number of Medicare Beneficiaries                               0
Number of Distinct Medicare Beneficiary/Per Day Services       0
Average Medicare Allowed Amount                                0
Average Submitted Charge 

In [22]:
# df['Credentials of the Provider']=df['Credentials of the Provider'].fillna('Unknown')
# gender_mode = df['Gender of the Provider'].mode()[0]
# df['Gender of the Provider'] = df['Gender of the Provider'].fillna(gender_mode)

In [23]:
# df.isnull().sum()

In [24]:
# # Ensure the column is of string type before applying string operations
# df['Average Medicare Allowed Amount'] = df['Average Medicare Allowed Amount'].astype(str).str.replace('$', '').str.replace(',', '').astype(float)
# df['Average Submitted Charge Amount'] = df['Average Submitted Charge Amount'].astype(str).str.replace('$', '').str.replace(',', '').astype(float)
# df['Average Medicare Payment Amount'] = df['Average Medicare Payment Amount'].astype(str).str.replace('$', '').str.replace(',', '').astype(float)
# df['Average Medicare Standardized Amount'] = df['Average Medicare Standardized Amount'].astype(str).str.replace('$', '').str.replace(',', '').astype(float)


# Numerical Features

In [25]:

# Columns treated as numeric: they are cleaned, converted and scaled
# by the cells that follow.
numerical_features = [
    # Count-style columns
    'Number of Services',
    'Number of Medicare Beneficiaries',
    'Number of Distinct Medicare Beneficiary/Per Day Services',
] + [
    # Average amount columns
    'Average Medicare Allowed Amount',
    'Average Submitted Charge Amount',
    'Average Medicare Payment Amount',
    'Average Medicare Standardized Amount',
]


# Clean numerical columns

In [26]:

for column_name in numerical_features:
    # Remove every comma via a regex replace, then convert to a numeric dtype.
    # errors='coerce' turns any value that still cannot be parsed into NaN.
    df[column_name] = pd.to_numeric(
        df[column_name].replace(to_replace={",": ""}, regex=True),
        errors='coerce',
    )


# Min-Max Scaling

In [27]:
# Rescale the numeric features with min-max scaling. The scaler is configured
# to return a DataFrame so the result can be assigned back by column name.
min_max_scaler = MinMaxScaler().set_output(transform='pandas')
df[numerical_features] = min_max_scaler.fit_transform(X=df[numerical_features])


# Categorical Columns

In [28]:
# Columns treated as categorical: missing values are filled, rare values
# are grouped, and the columns are one-hot encoded by the cells that follow.
categorical_features = [
    # Provider attributes
    'Credentials of the Provider',
    'Entity Type of the Provider',
    'City of the Provider',
    'State Code of the Provider',
    'Country Code of the Provider',
    'Provider Type',
] + [
    # Service / billing attributes
    'Medicare Participation Indicator',
    'Place of Service',
    'HCPCS Description',
    'HCPCS Drug Indicator',
]


# Handle missing values in categorical columns

In [29]:

# Give missing categorical entries an explicit 'Unknown' label so they
# form their own category in the later encoding step.
df[categorical_features] = (
    df[categorical_features]
    .fillna(value='Unknown')
)

In [30]:
# Collapse infrequent categories into a single 'Other' bucket so the one-hot
# encoding below does not create a column for every rarely seen value.
threshold = 100  # categories appearing fewer than this many times become 'Other'
for col in categorical_features:
    value_counts = df[col].value_counts()
    rare_categories = value_counts[value_counts < threshold].index
    # A hash-based membership test plus mask does one pass over the column;
    # Series.replace with a long list of labels builds a mask per label and
    # can be very slow on high-cardinality columns.
    df[col] = df[col].mask(df[col].isin(rare_categories), 'Other')

# One-Hot Encoder

In [31]:
# One-hot encode only the categorical columns; with use_cat_names=True the
# generated columns are named "<column>_<category>". Other columns of df
# are carried through into df_encoded.
encoder = ce.OneHotEncoder(
    cols=categorical_features,
    use_cat_names=True,
)
encoder.set_output(transform='pandas')
df_encoded = encoder.fit_transform(df)

In [32]:
# Echo the fitted encoder so the notebook renders its representation.
encoder

# Label Encoder

In [536]:
# Step 6: Label Encoding for the provider gender column.
# The gender column contains missing values and is not part of
# categorical_features, so it has not been filled earlier. Fill it with an
# explicit 'Unknown' label first: otherwise NaN is passed to LabelEncoder,
# which (depending on the scikit-learn version) either raises or silently
# turns NaN into an extra class.
label_encoder = LabelEncoder()
df_encoded['Gender of the Provider'] = label_encoder.fit_transform(
    df_encoded['Gender of the Provider'].fillna('Unknown')
)

In [537]:
# Summary statistics (count, mean, std, min, quartiles, max) of the encoded frame.
df_encoded.describe()

Unnamed: 0,Credentials of the Provider_M.D.,Credentials of the Provider_DPM,Credentials of the Provider_MD,Credentials of the Provider_DO,Credentials of the Provider_DPT,Credentials of the Provider_D.O.,Credentials of the Provider_Unknown,Credentials of the Provider_M.D,Credentials of the Provider_PA-C,Credentials of the Provider_ARNP,...,"HCPCS Description_Moderate sedation services by physician also performing a procedure, patient 5 years of age or older, first 15 minutes",HCPCS Drug Indicator_N,HCPCS Drug Indicator_Y,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.32757,0.0133,0.32874,0.02478,0.00645,0.03533,0.07209,0.00935,0.01859,0.00463,...,0.00178,0.93802,0.06198,0.000809,0.000414,0.000464,0.004949,0.005655,0.004814,0.004601
std,0.469329,0.114557,0.469758,0.155455,0.080053,0.184613,0.258638,0.096243,0.135073,0.067887,...,0.042153,0.24112,0.24112,0.008818,0.005831,0.005801,0.012552,0.016949,0.01243,0.011797
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,3.5e-05,3.2e-05,3.2e-05,0.001184,0.000919,0.001203,0.001186
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.000113,0.00011,0.000103,0.003176,0.002329,0.002926,0.002821
75%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.000378,0.000336,0.000336,0.005521,0.004768,0.005283,0.005005
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
