In [51]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
import numpy as np
import category_encoders as ce
df = pd.read_csv('Healthcare Providers.csv')

In [52]:
df.shape

(100000, 27)

In [53]:
df['Provider Type']

0                             Internal Medicine
1                       Obstetrics & Gynecology
2                                      Podiatry
3                             Internal Medicine
4                             Internal Medicine
                          ...                  
99995    Physical Therapist in Private Practice
99996                        Nurse Practitioner
99997                                Cardiology
99998                         Internal Medicine
99999    Physical Therapist in Private Practice
Name: Provider Type, Length: 100000, dtype: object

In [54]:
drop=['index', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider','Street Address 1 of the Provider',
       'Street Address 2 of the Provider','Zip Code of the Provider',"HCPCS Code"]

In [55]:
df = df.drop(columns=drop)

In [56]:
df.isnull().sum()

Credentials of the Provider                                 7209
Gender of the Provider                                      4254
Entity Type of the Provider                                    0
City of the Provider                                           0
State Code of the Provider                                     0
Country Code of the Provider                                   0
Provider Type                                                  0
Medicare Participation Indicator                               0
Place of Service                                               0
HCPCS Description                                              0
HCPCS Drug Indicator                                           0
Number of Services                                             0
Number of Medicare Beneficiaries                               0
Number of Distinct Medicare Beneficiary/Per Day Services       0
Average Medicare Allowed Amount                                0
Average Submitted Charge 

# Numerical Features

In [60]:

numerical_features = [
    'Number of Services',
    'Number of Medicare Beneficiaries',
    'Number of Distinct Medicare Beneficiary/Per Day Services',
    'Average Medicare Allowed Amount',
    'Average Submitted Charge Amount',
    'Average Medicare Payment Amount',
    'Average Medicare Standardized Amount'
]


# Clean numerical columns

In [61]:

for i in numerical_features:
    df[i] = df[i].replace({",": ""}, regex=True)  # Remove commas
    df[i] = pd.to_numeric(df[i], errors='coerce')  # Convert to numeric


# Min-Max Scaling

In [62]:
min_max_scaler = MinMaxScaler()
min_max_scaler.set_output(transform='pandas')  # Ensure output is in Pandas DataFrame
df[numerical_features] = min_max_scaler.fit_transform(df[numerical_features])


# categorical columns

In [63]:
categorical_features = [
    'Credentials of the Provider',
    'Entity Type of the Provider',
    'City of the Provider',
    'State Code of the Provider',
    'Country Code of the Provider',
    'Provider Type',
    'Medicare Participation Indicator',
    'Place of Service',
    'HCPCS Description',
    'HCPCS Drug Indicator'
]


# Handle missing values in categorical columns

In [64]:

df[categorical_features] = df[categorical_features].fillna('Unknown')

In [65]:
threshold = 100  # Example: any category appearing fewer than 100 times will be grouped as 'Other'
for col in categorical_features:
    value_counts = df[col].value_counts()
    rare_categories = value_counts[value_counts < threshold].index
    df[col] = df[col].replace(rare_categories, 'Other')

# One-Hot Encoder

In [66]:
encoder = ce.OneHotEncoder(cols=categorical_features, use_cat_names=True)#,sparse_output=True)
encoder.set_output(transform='pandas') 
df_encoded = encoder.fit_transform(df)

In [67]:
encoder

# Label Encoder

In [68]:
# Step 6: Label Encoding for Binary Categories
label_encoder = LabelEncoder()
df_encoded['Gender of the Provider'] = label_encoder.fit_transform(df_encoded['Gender of the Provider'])

In [70]:
df.to_csv('healthcare_processed.csv', index=False)

print("Transformed dataset saved to 'healthcare_processed.csv'.")

Transformed dataset saved to 'healthcare_processed.csv'.


In [71]:
df1=pd.read_csv('healthcare_processed.csv')

In [72]:
df1

Unnamed: 0,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,M.D.,F,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,0.000057,0.000068,0.000057,0.009787,0.004868,0.009787,0.009489
1,M.D.,F,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,0.000580,0.000862,0.000580,0.006037,0.008753,0.007395,0.007979
2,DPM,M,I,Other,CT,US,Podiatry,Y,O,Other,N,0.000074,0.000011,0.000074,0.004423,0.002472,0.004010,0.003573
3,MD,M,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,0.000032,0.000037,0.000032,0.000170,0.000080,0.000213,0.000202
4,DO,M,I,Other,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,0.000078,0.000068,0.000071,0.001294,0.000638,0.001216,0.001123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,PT,F,I,WILMINGTON,IL,US,Physical Therapist in Private Practice,Y,O,"Evaluation of physical therapy, typically 30 m...",N,0.000032,0.000047,0.000032,0.004165,0.003413,0.003779,0.003670
99996,ARNP,F,I,Other,OR,US,Nurse Practitioner,Y,O,Established patient office or other outpatient...,N,0.000442,0.000504,0.000442,0.002989,0.002298,0.001867,0.002184
99997,M.D.,M,I,SAINT LOUIS,MO,US,Cardiology,Y,F,Other,N,0.000000,0.000000,0.000000,0.000877,0.001747,0.000876,0.000862
99998,Unknown,F,I,Other,NY,US,Internal Medicine,Y,O,Administration of influenza virus vaccine,N,0.000004,0.000005,0.000004,0.001490,0.001037,0.001862,0.001493
