# Pre Processing

In [18]:
import pandas as pd 
import numpy as np
from sklearn.compose import ColumnTransformer  
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder 
from sklearn.pipeline import make_pipeline

In [2]:
dataset = pd.read_csv(r"C:\Users\PRIYANSHI\Desktop\MediAlert\Healthcare Providers.csv")

In [3]:
df_copy = dataset.copy()

In [4]:
df_copy.columns

Index(['index', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider',
       'Credentials of the Provider', 'Gender of the Provider',
       'Entity Type of the Provider', 'Street Address 1 of the Provider',
       'Street Address 2 of the Provider', 'City of the Provider',
       'Zip Code of the Provider', 'State Code of the Provider',
       'Country Code of the Provider', 'Provider Type',
       'Medicare Participation Indicator', 'Place of Service', 'HCPCS Code',
       'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services',
       'Number of Medicare Beneficiaries',
       'Number of Distinct Medicare Beneficiary/Per Day Services',
       'Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount'],
      dtype='object')

In [5]:
df_copy.head(5)

Unnamed: 0,index,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,8774979,1891106191,UPADHYAYULA,SATYASREE,,M.D.,F,I,1402 S GRAND BLVD,FDT 14TH FLOOR,...,99223,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,3354385,1346202256,JONES,WENDY,P,M.D.,F,I,2950 VILLAGE DR,,...,G0202,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,3001884,1306820956,DUROCHER,RICHARD,W,DPM,M,I,20 WASHINGTON AVE,STE 212,...,99348,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155.0,64.4396875,60.5959375
3,7594822,1770523540,FULLARD,JASPER,,MD,M,I,5746 N BROADWAY ST,,...,81002,"Urinalysis, manual test",N,20,18,20,3.5,5.0,3.43,3.43
4,746159,1073627758,PERROTTI,ANTHONY,E,DO,M,I,875 MILITARY TRL,SUITE 200,...,96372,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40.0,19.539393939,19.057575758


In [6]:
df_copy.describe()

Unnamed: 0,index,National Provider Identifier,Zip Code of the Provider
count,100000.0,100000.0,100000.0
mean,4907646.0,1498227000.0,416382000.0
std,2839633.0,287412500.0,308256600.0
min,209.0,1003001000.0,601.0
25%,2458791.0,1245669000.0,142630000.0
50%,4901266.0,1497847000.0,363302500.0
75%,7349450.0,1740374000.0,681988100.0
max,9847440.0,1993000000.0,999016600.0


In [7]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   index                                                     100000 non-null  int64  
 1   National Provider Identifier                              100000 non-null  int64  
 2   Last Name/Organization Name of the Provider               100000 non-null  object 
 3   First Name of the Provider                                95745 non-null   object 
 4   Middle Initial of the Provider                            70669 non-null   object 
 5   Credentials of the Provider                               92791 non-null   object 
 6   Gender of the Provider                                    95746 non-null   object 
 7   Entity Type of the Provider                               100000 non-null  object 
 8   Stree

In [8]:
columns_numerical = [
    'Number of Services', 
    'Number of Medicare Beneficiaries', 
    'Number of Distinct Medicare Beneficiary/Per Day Services',
    'Average Medicare Allowed Amount', 
    'Average Submitted Charge Amount',  
    'Average Medicare Payment Amount',  
    'Average Medicare Standardized Amount' 
]

for column in columns_numerical:
    df_copy[column] = pd.to_numeric(df_copy[column], errors='coerce')

print("Data types after conversion:")
print(df_copy[columns_numerical].info())

Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Number of Services                                        97347 non-null  float64
 1   Number of Medicare Beneficiaries                          99595 non-null  float64
 2   Number of Distinct Medicare Beneficiary/Per Day Services  98500 non-null  float64
 3   Average Medicare Allowed Amount                           99255 non-null  float64
 4   Average Submitted Charge Amount                           93277 non-null  float64
 5   Average Medicare Payment Amount                           99534 non-null  float64
 6   Average Medicare Standardized Amount                      99530 non-null  float64
dtypes: float64(7)
memory usage: 5.3 MB
None


In [9]:
df_copy.shape

(100000, 27)

# Null Values / Missing Values

In [10]:
df_copy.isnull().sum()

index                                                           0
National Provider Identifier                                    0
Last Name/Organization Name of the Provider                     0
First Name of the Provider                                   4255
Middle Initial of the Provider                              29331
Credentials of the Provider                                  7209
Gender of the Provider                                       4254
Entity Type of the Provider                                     0
Street Address 1 of the Provider                                0
Street Address 2 of the Provider                            59363
City of the Provider                                            0
Zip Code of the Provider                                        0
State Code of the Provider                                      0
Country Code of the Provider                                    0
Provider Type                                                   0
Medicare P

In [11]:
DropCols = ['index', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider','Street Address 1 of the Provider',
       'Street Address 2 of the Provider','Zip Code of the Provider',"HCPCS Code"]

In [12]:
df_copy.drop(columns=DropCols,inplace=True)

In [13]:
df_copy.columns

Index(['Credentials of the Provider', 'Gender of the Provider',
       'Entity Type of the Provider', 'City of the Provider',
       'State Code of the Provider', 'Country Code of the Provider',
       'Provider Type', 'Medicare Participation Indicator', 'Place of Service',
       'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services',
       'Number of Medicare Beneficiaries',
       'Number of Distinct Medicare Beneficiary/Per Day Services',
       'Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount'],
      dtype='object')

In [15]:
df_copy

Unnamed: 0,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,M.D.,F,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27.0,24.0,27.0,200.587778,305.211111,157.262222,160.908889
1,M.D.,F,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175.0,175.0,175.0,123.730000,548.800000,118.830000,135.315257
2,DPM,M,I,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32.0,13.0,32.0,90.650000,155.000000,64.439688,60.595937
3,MD,M,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20.0,18.0,20.0,3.500000,5.000000,3.430000,3.430000
4,DO,M,I,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33.0,24.0,31.0,26.520000,40.000000,19.539394,19.057576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,PT,F,I,WILMINGTON,IL,US,Physical Therapist in Private Practice,Y,O,"Evaluation of physical therapy, typically 30 m...",N,20.0,20.0,20.0,85.372500,214.000000,60.725500,62.248500
99996,ARNP,F,I,REDMOND,OR,US,Nurse Practitioner,Y,O,Established patient office or other outpatient...,N,136.0,107.0,136.0,61.270000,144.051471,30.006176,37.040221
99997,M.D.,M,I,SAINT LOUIS,MO,US,Cardiology,Y,F,"Doppler ultrasound study of heart blood flow, ...",N,11.0,11.0,11.0,17.980000,109.545455,14.090000,14.620000
99998,,F,I,LARCHMONT,NY,US,Internal Medicine,Y,O,Administration of influenza virus vaccine,N,12.0,12.0,12.0,30.540000,65.000000,29.930000,25.320000


# Duplicate Values

In [16]:
s = dataset.duplicated()
print(s)

0        False
1        False
2        False
3        False
4        False
         ...  
99995    False
99996    False
99997    False
99998    False
99999    False
Length: 100000, dtype: bool


In [17]:
dataset.duplicated().sum()

0

# Nominal Encoding 

In [15]:
df_copy.head(5)

Unnamed: 0,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,M.D.,F,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,M.D.,F,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,DPM,M,I,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155.0,64.4396875,60.5959375
3,MD,M,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20,18,20,3.5,5.0,3.43,3.43
4,DO,M,I,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40.0,19.539393939,19.057575758


In [16]:
df_copy['Gender of the Provider'].fillna('Unknown', inplace=True)

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
# GENDER COLUMN
encoder = OneHotEncoder(sparse=False)
encoded_array = encoder.fit_transform(df_copy[['Gender of the Provider']])
encoded_columns = encoder.get_feature_names_out(['Gender of the Provider'])

df_copy[encoded_columns] = encoded_array
df_copy.drop(columns=['Gender of the Provider'], inplace=True)



In [19]:
df_copy

Unnamed: 0,Credentials of the Provider,Entity Type of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Gender of the Provider_F,Gender of the Provider_M,Gender of the Provider_Unknown
0,M.D.,I,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889,1.0,0.0,0.0
1,M.D.,I,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714,1.0,0.0,0.0
2,DPM,I,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155,64.4396875,60.5959375,0.0,1.0,0.0
3,MD,I,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20,18,20,3.5,5,3.43,3.43,0.0,1.0,0.0
4,DO,I,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40,19.539393939,19.057575758,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,PT,I,WILMINGTON,IL,US,Physical Therapist in Private Practice,Y,O,"Evaluation of physical therapy, typically 30 m...",N,20,20,20,85.3725,214,60.7255,62.2485,1.0,0.0,0.0
99996,ARNP,I,REDMOND,OR,US,Nurse Practitioner,Y,O,Established patient office or other outpatient...,N,136,107,136,61.27,144.05147059,30.006176471,37.040220588,1.0,0.0,0.0
99997,M.D.,I,SAINT LOUIS,MO,US,Cardiology,Y,F,"Doppler ultrasound study of heart blood flow, ...",N,11,11,11,17.98,109.54545455,14.09,14.62,0.0,1.0,0.0
99998,,I,LARCHMONT,NY,US,Internal Medicine,Y,O,Administration of influenza virus vaccine,N,12,12,12,30.54,65,29.93,25.32,1.0,0.0,0.0


In [20]:
for column in df_copy.columns:
    print(f"Unique values in '{column}': {df_copy[column].nunique()}")

Unique values in 'Credentials of the Provider': 1854
Unique values in 'Entity Type of the Provider': 2
Unique values in 'City of the Provider': 5846
Unique values in 'State Code of the Provider': 58
Unique values in 'Country Code of the Provider': 4
Unique values in 'Provider Type': 90
Unique values in 'Medicare Participation Indicator': 2
Unique values in 'Place of Service': 2
Unique values in 'HCPCS Description': 2455
Unique values in 'HCPCS Drug Indicator': 2
Unique values in 'Number of Services': 2748
Unique values in 'Number of Medicare Beneficiaries': 1274
Unique values in 'Number of Distinct Medicare Beneficiary/Per Day Services': 1979
Unique values in 'Average Medicare Allowed Amount': 49629
Unique values in 'Average Submitted Charge Amount': 38088
Unique values in 'Average Medicare Payment Amount': 83367
Unique values in 'Average Medicare Standardized Amount': 76237
Unique values in 'Gender of the Provider_F': 2
Unique values in 'Gender of the Provider_M': 2
Unique values in '

In [21]:
df_copy['Entity Type of the Provider'].unique()

array(['I', 'O'], dtype=object)

In [22]:
# Entity Type of the Provider
encoder = OneHotEncoder(sparse=False)
encoded_array = encoder.fit_transform(df_copy[['Entity Type of the Provider']])
encoded_columns = encoder.get_feature_names_out(['Entity Type of the Provider'])

df_copy[encoded_columns] = encoded_array
df_copy.drop(columns=['Entity Type of the Provider'], inplace=True)



In [23]:
df_copy.head(5)

Unnamed: 0,Credentials of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,...,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Gender of the Provider_F,Gender of the Provider_M,Gender of the Provider_Unknown,Entity Type of the Provider_I,Entity Type of the Provider_O
0,M.D.,SAINT LOUIS,MO,US,Internal Medicine,Y,F,"Initial hospital inpatient care, typically 70 ...",N,27,...,27,200.58777778,305.21111111,157.26222222,160.90888889,1.0,0.0,0.0,1.0,0.0
1,M.D.,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,Y,O,"Screening mammography, bilateral (2-view study...",N,175,...,175,123.73,548.8,118.83,135.31525714,1.0,0.0,0.0,1.0,0.0
2,DPM,NORTH HAVEN,CT,US,Podiatry,Y,O,"Established patient home visit, typically 25 m...",N,32,...,32,90.65,155.0,64.4396875,60.5959375,0.0,1.0,0.0,1.0,0.0
3,MD,KANSAS CITY,MO,US,Internal Medicine,Y,O,"Urinalysis, manual test",N,20,...,20,3.5,5.0,3.43,3.43,0.0,1.0,0.0,1.0,0.0
4,DO,JUPITER,FL,US,Internal Medicine,Y,O,Injection beneath the skin or into muscle for ...,N,33,...,31,26.52,40.0,19.539393939,19.057575758,0.0,1.0,0.0,1.0,0.0


In [24]:
df_copy['Medicare Participation Indicator'].unique()

array(['Y', 'N'], dtype=object)

In [25]:
# HCPCS Drug Indicator
encoder = OneHotEncoder(sparse=False)
encoded_array = encoder.fit_transform(df_copy[['Medicare Participation Indicator']])
encoded_columns = encoder.get_feature_names_out(['Medicare Participation Indicator'])

df_copy[encoded_columns] = encoded_array
df_copy.drop(columns=['Medicare Participation Indicator'], inplace=True)



In [26]:
df_copy.head(5)

Unnamed: 0,Credentials of the Provider,City of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Place of Service,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,...,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Gender of the Provider_F,Gender of the Provider_M,Gender of the Provider_Unknown,Entity Type of the Provider_I,Entity Type of the Provider_O,Medicare Participation Indicator_N,Medicare Participation Indicator_Y
0,M.D.,SAINT LOUIS,MO,US,Internal Medicine,F,"Initial hospital inpatient care, typically 70 ...",N,27,24,...,305.21111111,157.26222222,160.90888889,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,M.D.,FAYETTEVILLE,NC,US,Obstetrics & Gynecology,O,"Screening mammography, bilateral (2-view study...",N,175,175,...,548.8,118.83,135.31525714,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,DPM,NORTH HAVEN,CT,US,Podiatry,O,"Established patient home visit, typically 25 m...",N,32,13,...,155.0,64.4396875,60.5959375,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,MD,KANSAS CITY,MO,US,Internal Medicine,O,"Urinalysis, manual test",N,20,18,...,5.0,3.43,3.43,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,DO,JUPITER,FL,US,Internal Medicine,O,Injection beneath the skin or into muscle for ...,N,33,24,...,40.0,19.539393939,19.057575758,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [29]:
df_copy['Credentials of the Provider'].unique()

array(['M.D.', 'DPM', 'MD', ..., 'DNP FNP-BC', 'MRCP, MD', 'PT,DPTATC'],
      dtype=object)

In [21]:
nominal_columns = ['Credentials of the Provider','City of the Provider','State Code of the Provider',
           'Country Code of the Provider','Provider Type','Place of Service','HCPCS Description','HCPCS Drug Indicator']  # Replace with actual nominal columns

for col in nominal_columns:
    unique_count = df_copy[col].nunique()
    print(f"Column '{col}' has {unique_count} unique values.")
    
    if unique_count > 10:
        print(f"Applying frequency encoding to column: {col}")
        freq_map = df_copy[col].value_counts().to_dict()
        df_copy[col] = df_copy[col].map(freq_map)
    else:
        print(f"Applying one-hot encoding to column: {col}")
        one_hot = pd.get_dummies(df_copy[col], prefix=col)
        df_copy = pd.concat([df_copy, one_hot], axis=1)
        df_copy.drop(columns=[col], inplace=True)

print(df_copy.info())

Column 'Credentials of the Provider' has 1854 unique values.
Applying frequency encoding to column: Credentials of the Provider
Column 'City of the Provider' has 5846 unique values.
Applying frequency encoding to column: City of the Provider
Column 'State Code of the Provider' has 58 unique values.
Applying frequency encoding to column: State Code of the Provider
Column 'Country Code of the Provider' has 4 unique values.
Applying one-hot encoding to column: Country Code of the Provider
Column 'Provider Type' has 90 unique values.
Applying frequency encoding to column: Provider Type
Column 'Place of Service' has 2 unique values.
Applying one-hot encoding to column: Place of Service
Column 'HCPCS Description' has 2455 unique values.
Applying frequency encoding to column: HCPCS Description
Column 'HCPCS Drug Indicator' has 2 unique values.
Applying one-hot encoding to column: HCPCS Drug Indicator
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (to

In [23]:
min_max_features = ['Number of Services', 'Number of Medicare Beneficiaries', 'Number of Distinct Medicare Beneficiary/Per Day Services']
standard_features = ['Average Medicare Allowed Amount', 'Average Submitted Charge Amount', 'Average Medicare Payment Amount', 'Average Medicare Standardized Amount']

min_max_scaler_pipeline = make_pipeline(MinMaxScaler())
standard_scaler_pipeline = make_pipeline(StandardScaler())

preprocessor = ColumnTransformer(
    transformers=[
        ('min max', min_max_scaler_pipeline, min_max_features),  
        ('standard', standard_scaler_pipeline, standard_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

preprocessor.set_output(transform='pandas')

In [24]:
df_copy = preprocessor.fit_transform(df_copy)
df_copy

Unnamed: 0,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,...,Medicare Participation Indicator,HCPCS Description,Country Code of the Provider_DE,Country Code of the Provider_JP,Country Code of the Provider_TR,Country Code of the Provider_US,Place of Service_F,Place of Service_O,HCPCS Drug Indicator_N,HCPCS Drug Indicator_Y
0,0.016194,0.013158,0.016194,1.094128,0.599773,0.970184,1.000960,32757.0,F,I,...,Y,1297,False,False,False,True,True,False,True,False
1,0.165992,0.165992,0.165992,0.350820,1.874600,0.548672,0.721089,32757.0,F,I,...,Y,243,False,False,False,True,False,True,True,False
2,0.021255,0.002024,0.021255,0.030896,-0.186359,-0.047863,-0.095982,1330.0,M,I,...,Y,44,False,False,False,True,False,True,True,False
3,0.009109,0.007085,0.009109,-0.811950,-0.971386,-0.716998,-0.721103,32874.0,M,I,...,Y,460,False,False,False,True,False,True,True,False
4,0.022267,0.013158,0.020243,-0.589319,-0.788213,-0.540315,-0.550213,2478.0,M,I,...,Y,732,False,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.009109,0.009109,0.009109,-0.020144,0.122418,-0.088599,-0.077911,800.0,F,I,...,Y,330,False,False,False,True,False,True,True,False
99996,0.126518,0.097166,0.126518,-0.253244,-0.243658,-0.425519,-0.353569,463.0,F,I,...,Y,4578,False,False,False,True,False,True,True,False
99997,0.000000,0.000000,0.000000,-0.671911,-0.424246,-0.600082,-0.598739,32757.0,M,I,...,Y,75,False,False,False,True,True,False,True,False
99998,0.001012,0.001012,0.001012,-0.550441,-0.657375,-0.426354,-0.481732,,F,I,...,Y,1444,False,False,False,True,False,True,True,False


In [34]:
print("Applying binary encoding for Medicare Participation Indicator")
binary_map = {'Y': 1, 'N': 0}
df_copy['Medicare Participation Encoded'] = df_copy['Medicare Participation Indicator'].map(binary_map)

Applying binary encoding for Medicare Participation Indicator


In [36]:
#df.to_csv('Processed_dataset.csv')