# **Preprocessing**

In [256]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('Healthcare Providers.csv')

In [257]:
df

Unnamed: 0,index,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,8774979,1891106191,UPADHYAYULA,SATYASREE,,M.D.,F,I,1402 S GRAND BLVD,FDT 14TH FLOOR,...,99223,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,3354385,1346202256,JONES,WENDY,P,M.D.,F,I,2950 VILLAGE DR,,...,G0202,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,3001884,1306820956,DUROCHER,RICHARD,W,DPM,M,I,20 WASHINGTON AVE,STE 212,...,99348,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155,64.4396875,60.5959375
3,7594822,1770523540,FULLARD,JASPER,,MD,M,I,5746 N BROADWAY ST,,...,81002,"Urinalysis, manual test",N,20,18,20,3.5,5,3.43,3.43
4,746159,1073627758,PERROTTI,ANTHONY,E,DO,M,I,875 MILITARY TRL,SUITE 200,...,96372,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40,19.539393939,19.057575758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3837311,1386938868,PAPES,JOAN,,PT,F,I,324 E BALTIMORE ST,,...,97162,"Evaluation of physical therapy, typically 30 m...",N,20,20,20,85.3725,214,60.7255,62.2485
99996,2079360,1215091327,HAYNER,MARGARET,S,ARNP,F,I,645 NW 4TH ST,,...,99213,Established patient office or other outpatient...,N,136,107,136,61.27,144.05147059,30.006176471,37.040220588
99997,8927965,1902868185,VALENCIA,DANA,,M.D.,M,I,3009 N BALLAS RD,SUITE 202B,...,93320,"Doppler ultrasound study of heart blood flow, ...",N,11,11,11,17.98,109.54545455,14.09,14.62
99998,8854571,1891941183,GONZALEZ-LAMOS,RAFAELA,,,F,I,2365 BOSTON POST RD,SUITE 201,...,G0008,Administration of influenza virus vaccine,N,12,12,12,30.54,65,29.93,25.32


In [258]:
# List of columns to be dropped from the DataFrame
DropCols = [
    'index',                          # index column
    'National Provider Identifier',   # Unique provider identifier, not required for analysis
    'Last Name/Organization Name of the Provider',  # Name or organization details, possibly irrelevant
    'First Name of the Provider',     # Provider's first name
    'Middle Initial of the Provider', # Provider's middle initial
    'Street Address 1 of the Provider', # Street address, not required for the analysis
    'Street Address 2 of the Provider', # Additional street address, not required
    'Zip Code of the Provider',       # Provider's zip code
    'HCPCS Code'                      # Healthcare Common Procedure Coding System code
]

# Drop the specified columns from the DataFrame, ignoring errors if columns are missing
cleaned_df = df.drop(columns=DropCols, errors='ignore')

# Print the shape of the cleaned DataFrame
print("Shape of the DataFrame after dropping specified columns:")
print(cleaned_df.shape)

Shape of the DataFrame after dropping specified columns:
(100000, 18)


In [259]:
# Display the first 5 rows of the DataFrame
cleaned_df.head()

Unnamed: 0,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,M.D.,F,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,M.D.,F,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,DPM,M,I,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155.0,64.4396875,60.5959375
3,MD,M,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20,18,20,3.5,5.0,3.43,3.43
4,DO,M,I,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40.0,19.539393939,19.057575758


In [260]:
# List of columns to be converted to numeric
columns_to_convert = [
    'Number of Services',  # Number of services provided
    'Number of Medicare Beneficiaries',  # Count of Medicare beneficiaries
    'Number of Distinct Medicare Beneficiary/Per Day Services',  # Distinct per-day Medicare services
    'Average Medicare Allowed Amount',  # Average amount allowed by Medicare
    'Average Submitted Charge Amount',  # Average charge amount submitted
    'Average Medicare Payment Amount',  # Average payment made by Medicare
    'Average Medicare Standardized Amount'  # Standardized average payment by Medicare
]

# Iterate through each column in the list and convert it to numeric
# Invalid parsing (e.g., non-numeric strings) will be set to NaN
for column in columns_to_convert:
    cleaned_df[column] = pd.to_numeric(cleaned_df[column], errors='coerce')

# Display information about the selected columns to verify their data types and changes
print("Data types after conversion:")
print(cleaned_df[columns_to_convert].info())

Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Number of Services                                        97347 non-null  float64
 1   Number of Medicare Beneficiaries                          99595 non-null  float64
 2   Number of Distinct Medicare Beneficiary/Per Day Services  98500 non-null  float64
 3   Average Medicare Allowed Amount                           99255 non-null  float64
 4   Average Submitted Charge Amount                           93277 non-null  float64
 5   Average Medicare Payment Amount                           99534 non-null  float64
 6   Average Medicare Standardized Amount                      99530 non-null  float64
dtypes: float64(7)
memory usage: 5.3 MB
None


In [261]:
cleaned_df

Unnamed: 0,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,M.D.,F,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27.0,24.0,27.0,200.587778,305.211111,157.262222,160.908889
1,M.D.,F,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175.0,175.0,175.0,123.730000,548.800000,118.830000,135.315257
2,DPM,M,I,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32.0,13.0,32.0,90.650000,155.000000,64.439688,60.595937
3,MD,M,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20.0,18.0,20.0,3.500000,5.000000,3.430000,3.430000
4,DO,M,I,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33.0,24.0,31.0,26.520000,40.000000,19.539394,19.057576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,PT,F,I,WILMINGTON,IL,US,Physical Therapist in Private Practice,Y,O,"Evaluation of physical therapy, typically 30 m...",N,20.0,20.0,20.0,85.372500,214.000000,60.725500,62.248500
99996,ARNP,F,I,REDMOND,OR,US,Nurse Practitioner,Y,O,Established patient office or other outpatient...,N,136.0,107.0,136.0,61.270000,144.051471,30.006176,37.040221
99997,M.D.,M,I,SAINT LOUIS,MO,US,Cardiology,Y,F,"Doppler ultrasound study of heart blood flow, ...",N,11.0,11.0,11.0,17.980000,109.545455,14.090000,14.620000
99998,,F,I,LARCHMONT,NY,US,Internal Medicine,Y,O,Administration of influenza virus vaccine,N,12.0,12.0,12.0,30.540000,65.000000,29.930000,25.320000


### **We have dropped the unnecessary columns from the dataframe.**

In [262]:
# Importing necessary modules for data transformation and preprocessing
from sklearn.compose import ColumnTransformer  # To apply different transformations to different columns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder  # For scaling and encoding
from sklearn.pipeline import make_pipeline  # To create a machine learning pipeline for transformations

In [263]:
import pandas as pd

# Assuming 'cleaned_df' is your DataFrame and 'target_column' is your target variable
nominal_columns = ['Credentials of the Provider','Gender of the Provider','Entity Type of the Provider','City of the Provider','State Code of the Provider',
           'Country Code of the Provider','Provider Type','Medicare Participation Indicator','Place of Service','HCPCS Description','HCPCS Drug Indicator']  # Replace with actual nominal columns

# Check unique values in each nominal column
for col in nominal_columns:
    unique_count = cleaned_df[col].nunique()
    print(f"Column '{col}' has {unique_count} unique values.")
    
    if unique_count > 10:
        print(f"Applying frequency encoding to column: {col}")
        # Apply Frequency Encoding
        freq_map = cleaned_df[col].value_counts().to_dict()
        cleaned_df[col] = cleaned_df[col].map(freq_map)
    else:
        print(f"Applying one-hot encoding to column: {col}")
        # Apply One-Hot Encoding for low-cardinality columns
        one_hot = pd.get_dummies(cleaned_df[col], prefix=col)
        cleaned_df = pd.concat([cleaned_df, one_hot], axis=1)
        cleaned_df.drop(columns=[col], inplace=True)

# Display the updated DataFrame
print(cleaned_df.info())

Column 'Credentials of the Provider' has 1854 unique values.
Applying frequency encoding to column: Credentials of the Provider
Column 'Gender of the Provider' has 2 unique values.
Applying one-hot encoding to column: Gender of the Provider
Column 'Entity Type of the Provider' has 2 unique values.
Applying one-hot encoding to column: Entity Type of the Provider
Column 'City of the Provider' has 5846 unique values.
Applying frequency encoding to column: City of the Provider
Column 'State Code of the Provider' has 58 unique values.
Applying frequency encoding to column: State Code of the Provider
Column 'Country Code of the Provider' has 4 unique values.
Applying one-hot encoding to column: Country Code of the Provider
Column 'Provider Type' has 90 unique values.
Applying frequency encoding to column: Provider Type
Column 'Medicare Participation Indicator' has 2 unique values.
Applying one-hot encoding to column: Medicare Participation Indicator
Column 'Place of Service' has 2 unique val

In [264]:
# Define the list of features for Min-Max scaling and Standard scaling
min_max_features = ['Number of Services', 'Number of Medicare Beneficiaries', 'Number of Distinct Medicare Beneficiary/Per Day Services']
standard_features = ['Average Medicare Allowed Amount', 'Average Submitted Charge Amount', 'Average Medicare Payment Amount', 'Average Medicare Standardized Amount']

# Create the transformation pipeline for Min-Max Scaling
min_max_scaler_pipeline = make_pipeline(MinMaxScaler())

# Create the transformation pipeline for Standard Scaling
standard_scaler_pipeline = make_pipeline(StandardScaler())

# Combine both transformations into a single ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('min max', min_max_scaler_pipeline, min_max_features),  # Apply Min-Max Scaling to specified features
    ('standard', standard_scaler_pipeline, standard_features)  # Apply Standard Scaling to specified features
], remainder='passthrough', verbose_feature_names_out=False, force_int_remainder_cols=False)

# Ensure the output of the transformer is in a pandas DataFrame format
preprocessor.set_output(transform='pandas')

In [265]:
# Apply the transformations defined in 'preprocessor' to the 'cleaned_df' DataFrame
transformed_df = preprocessor.fit_transform(cleaned_df)

# Display the transformed DataFrame
transformed_df

Unnamed: 0,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Credentials of the Provider,City of the Provider,State Code of the Provider,...,Country Code of the Provider_DE,Country Code of the Provider_JP,Country Code of the Provider_TR,Country Code of the Provider_US,Medicare Participation Indicator_N,Medicare Participation Indicator_Y,Place of Service_F,Place of Service_O,HCPCS Drug Indicator_N,HCPCS Drug Indicator_Y
0,0.016194,0.013158,0.016194,1.094128,0.599773,0.970184,1.000960,32757.0,500,1997,...,False,False,False,True,False,True,True,False,True,False
1,0.165992,0.165992,0.165992,0.350820,1.874600,0.548672,0.721089,32757.0,209,3725,...,False,False,False,True,False,True,False,True,True,False
2,0.021255,0.002024,0.021255,0.030896,-0.186359,-0.047863,-0.095982,1330.0,10,1403,...,False,False,False,True,False,True,False,True,True,False
3,0.009109,0.007085,0.009109,-0.811950,-0.971386,-0.716998,-0.721103,32874.0,317,1997,...,False,False,False,True,False,True,False,True,True,False
4,0.022267,0.013158,0.020243,-0.589319,-0.788213,-0.540315,-0.550213,2478.0,51,7263,...,False,False,False,True,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.009109,0.009109,0.009109,-0.020144,0.122418,-0.088599,-0.077911,800.0,323,4073,...,False,False,False,True,False,True,False,True,True,False
99996,0.126518,0.097166,0.126518,-0.253244,-0.243658,-0.425519,-0.353569,463.0,14,1046,...,False,False,False,True,False,True,False,True,True,False
99997,0.000000,0.000000,0.000000,-0.671911,-0.424246,-0.600082,-0.598739,32757.0,500,1997,...,False,False,False,True,False,True,True,False,True,False
99998,0.001012,0.001012,0.001012,-0.550441,-0.657375,-0.426354,-0.481732,,6,6361,...,False,False,False,True,False,True,False,True,True,False
