# **Importing Libraries and Loading Dataset**

In [15]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the provider CSV (path is relative to the notebook's working directory)
DATA_PATH = 'Healthcare Providers.csv'
df = pd.read_csv(DATA_PATH)

In [16]:
# Bare last expression: the notebook renders the DataFrame as a table,
# truncated to the first and last rows/columns as seen in the output below.
df

Unnamed: 0,index,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,8774979,1891106191,UPADHYAYULA,SATYASREE,,M.D.,F,I,1402 S GRAND BLVD,FDT 14TH FLOOR,...,99223,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,3354385,1346202256,JONES,WENDY,P,M.D.,F,I,2950 VILLAGE DR,,...,G0202,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,3001884,1306820956,DUROCHER,RICHARD,W,DPM,M,I,20 WASHINGTON AVE,STE 212,...,99348,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155,64.4396875,60.5959375
3,7594822,1770523540,FULLARD,JASPER,,MD,M,I,5746 N BROADWAY ST,,...,81002,"Urinalysis, manual test",N,20,18,20,3.5,5,3.43,3.43
4,746159,1073627758,PERROTTI,ANTHONY,E,DO,M,I,875 MILITARY TRL,SUITE 200,...,96372,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40,19.539393939,19.057575758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3837311,1386938868,PAPES,JOAN,,PT,F,I,324 E BALTIMORE ST,,...,97162,"Evaluation of physical therapy, typically 30 m...",N,20,20,20,85.3725,214,60.7255,62.2485
99996,2079360,1215091327,HAYNER,MARGARET,S,ARNP,F,I,645 NW 4TH ST,,...,99213,Established patient office or other outpatient...,N,136,107,136,61.27,144.05147059,30.006176471,37.040220588
99997,8927965,1902868185,VALENCIA,DANA,,M.D.,M,I,3009 N BALLAS RD,SUITE 202B,...,93320,"Doppler ultrasound study of heart blood flow, ...",N,11,11,11,17.98,109.54545455,14.09,14.62
99998,8854571,1891941183,GONZALEZ-LAMOS,RAFAELA,,,F,I,2365 BOSTON POST RD,SUITE 201,...,G0008,Administration of influenza virus vaccine,N,12,12,12,30.54,65,29.93,25.32


In [17]:
# Report the dataset dimensions: number of records and number of columns
rows = len(df)
columns = len(df.columns)
print(f"The dataset contains {rows} rows and {columns} columns.")

The dataset contains 100000 rows and 27 columns.


In [18]:
# Summarize column names, non-null counts, and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   index                                                     100000 non-null  int64  
 1   National Provider Identifier                              100000 non-null  int64  
 2   Last Name/Organization Name of the Provider               100000 non-null  object 
 3   First Name of the Provider                                95745 non-null   object 
 4   Middle Initial of the Provider                            70669 non-null   object 
 5   Credentials of the Provider                               92791 non-null   object 
 6   Gender of the Provider                                    95746 non-null   object 
 7   Entity Type of the Provider                               100000 non-null  object 
 8   Stree

# **Checking for Missing Values**

In [19]:
# Tally the missing (NaN/None) entries per column; isna() is the canonical
# spelling of isnull() and returns the same boolean mask.
missing_values = df.isna().sum()
print(missing_values)

index                                                           0
National Provider Identifier                                    0
Last Name/Organization Name of the Provider                     0
First Name of the Provider                                   4255
Middle Initial of the Provider                              29331
Credentials of the Provider                                  7209
Gender of the Provider                                       4254
Entity Type of the Provider                                     0
Street Address 1 of the Provider                                0
Street Address 2 of the Provider                            59363
City of the Provider                                            0
Zip Code of the Provider                                        0
State Code of the Provider                                      0
Country Code of the Provider                                    0
Provider Type                                                   0
Medicare P

### **'Street Address 2 of the Provider' has 59363 missing values, which is more than 50% of the 100000 rows in the dataset, so we will remove this column.**

In [20]:
# Drop the mostly-empty 'Street Address 2 of the Provider' column.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# makes the cell idempotent: re-running it once the column is already gone is a
# no-op rather than a KeyError.
df = df.drop(columns=['Street Address 2 of the Provider'], errors='ignore')

# **Exploring Data Distribution in Columns**

## **Step 1: Inspecting Unique Values in Each Column**

In [21]:
# Walk the DataFrame column by column and show the distinct values of each
separator = "-" * 50  # visual divider between columns
for column, values in df.items():
    # items() yields (column name, column Series) pairs
    print(f"Column: {column}")
    print(f"Unique Values: {values.unique()}")
    print(separator)

Column: index
Unique Values: [8774979 3354385 3001884 ... 8927965 8854571 3547535]
--------------------------------------------------
Column: National Provider Identifier
Unique Values: [1891106191 1346202256 1306820956 ... 1215091327 1891941183 1356772156]
--------------------------------------------------
Column: Last Name/Organization Name of the Provider
Unique Values: ['UPADHYAYULA' 'JONES' 'DUROCHER' ... 'PAPES' 'GONZALEZ-LAMOS' 'RAMEZANI']
--------------------------------------------------
Column: First Name of the Provider
Unique Values: ['SATYASREE' 'WENDY' 'RICHARD' ... 'ILIE' 'RAFAELA' 'ELIIAN']
--------------------------------------------------
Column: Middle Initial of the Provider
Unique Values: [nan 'P' 'W' 'E' 'R' 'B' 'G' 'M' 'S' 'J' 'K' 'X' 'A' 'H' 'V' 'D' 'N' 'L'
 'C' 'T' 'F' 'I' 'O' 'U' 'Y' 'Z' 'Q' '(' '.' '-']
--------------------------------------------------
Column: Credentials of the Provider
Unique Values: ['M.D.' 'DPM' 'MD' ... 'DNP FNP-BC' 'MRCP, MD' 'PT,DPTAT

## **Step 2: Checking Value Counts in Each Column**

In [22]:
# Show how often each distinct value occurs, one column at a time
for column, series in df.items():
    # A single print with a newline separator emits the same four lines:
    # column header, "Value Counts:" label, the counts, and a divider.
    print(
        f"Column: {column}",
        "Value Counts:",
        series.value_counts(),
        "-" * 50,
        sep="\n",
    )

Column: index
Value Counts:
index
8774979    1
6046068    1
2759205    1
275238     1
8229708    1
          ..
4484394    1
7130197    1
6969208    1
7692844    1
3547535    1
Name: count, Length: 100000, dtype: int64
--------------------------------------------------
Column: National Provider Identifier
Value Counts:
National Provider Identifier
1538144910    12
1932166386    12
1538105366     9
1609812445     9
1295726032     8
              ..
1831288992     1
1346535077     1
1962874883     1
1376557017     1
1356772156     1
Name: count, Length: 89508, dtype: int64
--------------------------------------------------
Column: Last Name/Organization Name of the Provider
Value Counts:
Last Name/Organization Name of the Provider
PATEL          557
SMITH          491
JOHNSON        372
WALGREEN CO    362
LEE            361
              ... 
SHUE             1
STANDRIDGE       1
SIMCOE           1
KENNESON         1
RAMEZANI         1
Name: count, Length: 42820, dtype: int64
-----------

# **Handling Missing Values in Categorical Columns**

In [23]:
# Text-like columns in which a gap simply means "not provided".
# A literal 'Unknown' placeholder keeps every row without inventing a value.
categorical_columns = [
    'First Name of the Provider',
    'Middle Initial of the Provider',
    'Credentials of the Provider',
]

# Fill all placeholder columns in one vectorized assignment
df[categorical_columns] = df[categorical_columns].fillna('Unknown')

# Gender has only a few distinct values, so gaps are imputed with the most frequent one.
# NOTE(review): the missing genders may belong to organization entities rather than
# individuals - confirm that mode imputation is appropriate for those rows.
gender_column = 'Gender of the Provider'
gender_mode = df[gender_column].mode()[0]  # first (most frequent) mode
df[gender_column] = df[gender_column].fillna(gender_mode)
print(f"Filled missing values in 'Gender of the Provider' with mode: {gender_mode}")

# Re-count the gaps to confirm the fills took effect
missing_values = df.isnull().sum()
print("Missing values after filling:\n", missing_values)


Filled missing values in 'Gender of the Provider' with mode: M
Missing values after filling:
 index                                                       0
National Provider Identifier                                0
Last Name/Organization Name of the Provider                 0
First Name of the Provider                                  0
Middle Initial of the Provider                              0
Credentials of the Provider                                 0
Gender of the Provider                                      0
Entity Type of the Provider                                 0
Street Address 1 of the Provider                            0
City of the Provider                                        0
Zip Code of the Provider                                    0
State Code of the Provider                                  0
Country Code of the Provider                                0
Provider Type                                               0
Medicare Participation Indicator      

# **Encoding for Categorical Columns**

In [24]:
# Label-encode the two-level categorical columns as integer codes.
# LabelEncoder (imported in the first cell) numbers the observed classes in
# sorted order, so a column with two distinct values becomes 0/1.
binary_columns = [
    'Gender of the Provider',
    'Medicare Participation Indicator',
    'Place of Service',
    'HCPCS Drug Indicator',
    'Entity Type of the Provider',
]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes_ each time, making it impossible to look up or invert
# the mapping of any column but the last.
label_encoders = {}
for column in binary_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
    print(f"Encoded '{column}' values:", df[column].unique())

# `le` is left bound to the encoder of the last column ('Entity Type of the
# Provider'), matching the state earlier versions of this cell ended in.

Encoded 'Gender of the Provider' values: [0 1]
Encoded 'Medicare Participation Indicator' values: [1 0]
Encoded 'Place of Service' values: [0 1]
Encoded 'HCPCS Drug Indicator' values: [0 1]
Encoded 'Entity Type of the Provider' values: [0 1]
