In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import FeatureHasher

In [18]:
def standardize_column_names(df):
    """Return a new DataFrame whose string column labels are normalized.

    Leading/trailing whitespace is stripped from each label and every
    remaining space is replaced with an underscore. Labels that are not
    strings (for example integer labels) are passed through unchanged
    instead of raising ``AttributeError`` on ``.strip()``.

    Parameters:
    - df: DataFrame whose columns should be renamed

    Returns:
    - DataFrame produced by ``df.rename`` with the normalized labels
    """
    print("Standardizing column names...")

    def _normalize(label):
        # Only strings have .strip()/.replace(); leave any other label type alone.
        if not isinstance(label, str):
            return label
        return label.strip().replace(" ", "_")

    return df.rename(columns=_normalize)

def encode_categorical(df, columns, strategy="frequency"):
    """Apply categorical encoding based on the specified strategy.

    The frame is modified in place and also returned.

    - ``'frequency'`` adds a new column ``<column>_Freq`` holding, for each
      row, the relative frequency of that row's category (computed with
      ``value_counts(normalize=True)``); the source column is kept.
    - ``'ordinal'`` overwrites the column with the integer codes of
      ``pd.Categorical``.

    Parameters:
    - df: DataFrame
    - columns: list of column names to encode (a single name given as a
      string is treated as a one-element list)
    - strategy: str, encoding strategy ('frequency' or 'ordinal')

    Returns:
    - DataFrame with encoded columns (the same object as ``df``)

    Raises:
    - ValueError: if ``strategy`` is not supported
    - KeyError: if any requested column is absent from ``df``
    """
    # Validate once, before touching the frame: the strategy does not change
    # per column, and an empty column list must not hide a bad strategy.
    if strategy not in ("frequency", "ordinal"):
        raise ValueError(f"Unsupported encoding strategy: {strategy}")

    # A bare string would otherwise be iterated character by character.
    columns = [columns] if isinstance(columns, str) else list(columns)

    # Fail before any mutation so the frame is never left half-encoded.
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    for column in columns:
        print(f"Applying {strategy} encoding to column: {column}")
        if strategy == "frequency":
            freq_map = df[column].value_counts(normalize=True).to_dict()
            df[f'{column}_Freq'] = df[column].map(freq_map)
        else:  # "ordinal" (guaranteed by the validation above)
            df[column] = pd.Categorical(df[column]).codes
    return df

def impute_missing_values(df, column, group_by_column, overall=True):
    """Fill gaps in ``column`` with per-group medians, then optionally a global median.

    Missing entries are first replaced by the median of ``column`` within the
    row's ``group_by_column`` group. When ``overall`` is true, anything still
    missing afterwards is replaced by the median of the (already group-filled)
    column. The frame is updated in place and returned.
    """
    print(f"Imputing missing values for {column} grouped by {group_by_column}...")
    per_group_median = df.groupby(group_by_column)[column].transform("median")
    df[column] = df[column].fillna(per_group_median)

    if not overall:
        return df

    # Computed after the group-wise fill, so it reflects the partially imputed column.
    fallback_median = df[column].median()
    print(f"Filling remaining missing values in {column} with overall median: {fallback_median}")
    df[column] = df[column].fillna(fallback_median)
    return df

In [19]:
# Load the dataset from the specified file path
file_path = 'Healthcare Providers.csv'
print(f"Loading dataset from {file_path}...")
data = pd.read_csv(file_path)
# Keep a backup of the original dataset. A plain assignment would only bind a
# second name to the same DataFrame, so any later in-place change to `data`
# would also change the "backup"; .copy() makes it independent.
data_with_all_col = data.copy()

Loading dataset from Healthcare Providers.csv...


In [20]:
# Restrict the dataset to the columns used for the fraud detection analysis:
# seven count/amount columns followed by five categorical descriptors.
print("Selecting relevant columns...")
selected_columns = [
    "Number of Services",
    "Number of Medicare Beneficiaries",
    "Number of Distinct Medicare Beneficiary/Per Day Services",
    "Average Submitted Charge Amount",
    "Average Medicare Payment Amount",
    "Average Medicare Allowed Amount",
    "Average Medicare Standardized Amount",
    "HCPCS Code",
    "Provider Type",
    "Place of Service",
    "State Code of the Provider",
    "Medicare Participation Indicator",
]

# Work on an independent copy so later edits never touch the loaded frame.
df = data.loc[:, selected_columns].copy()

Selecting relevant columns...


In [21]:
# Display dataset information to understand the data types and missing values.
# df.info() prints one row per column with its non-null count and dtype;
# columns reported as `object` are not numeric yet and are converted in a
# later cell.
print("Dataset information:")
df.info()

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column                                                    Non-Null Count   Dtype 
---  ------                                                    --------------   ----- 
 0   Number of Services                                        100000 non-null  object
 1   Number of Medicare Beneficiaries                          100000 non-null  object
 2   Number of Distinct Medicare Beneficiary/Per Day Services  100000 non-null  object
 3   Average Submitted Charge Amount                           100000 non-null  object
 4   Average Medicare Payment Amount                           100000 non-null  object
 5   Average Medicare Allowed Amount                           100000 non-null  object
 6   Average Medicare Standardized Amount                      100000 non-null  object
 7   HCPCS Code                                                100000 non-null  object

In [22]:
# Convert numerical features stored as object types to numeric
numerical_object_columns = [
    "Number of Services",
    "Number of Medicare Beneficiaries",
    "Number of Distinct Medicare Beneficiary/Per Day Services",
    "Average Submitted Charge Amount",
    "Average Medicare Payment Amount",
    "Average Medicare Allowed Amount",
    "Average Medicare Standardized Amount"
]

print("Converting numerical features to numeric...")
for column in numerical_object_columns:
    print(f"Converting column {column} to numeric...")
    raw_values = df[column]
    missing_before = raw_values.isna().sum()
    if not pd.api.types.is_numeric_dtype(raw_values):
        # Text values may carry thousands separators (e.g. "1,234"). Passing them
        # straight to to_numeric(errors='coerce') would silently turn every such
        # value into NaN, and the later median imputation would then overwrite
        # the largest observations. Strip separators and whitespace first.
        raw_values = raw_values.astype(str).str.replace(",", "", regex=False).str.strip()
    # Anything that still cannot be parsed is coerced to NaN ...
    df[column] = pd.to_numeric(raw_values, errors='coerce')
    # ... and reported, so the data loss is visible instead of silent.
    newly_missing = df[column].isna().sum() - missing_before
    if newly_missing > 0:
        print(f"Warning: {newly_missing} value(s) in {column} could not be parsed and were set to NaN")

Converting numerical features to numeric...
Converting column Number of Services to numeric...
Converting column Number of Medicare Beneficiaries to numeric...
Converting column Number of Distinct Medicare Beneficiary/Per Day Services to numeric...
Converting column Average Submitted Charge Amount to numeric...
Converting column Average Medicare Payment Amount to numeric...
Converting column Average Medicare Allowed Amount to numeric...
Converting column Average Medicare Standardized Amount to numeric...


In [23]:
# Standardize column names using the defined function.
# standardize_column_names prints its own progress message, so it is not
# repeated here (doing so logged the same line twice).
df = standardize_column_names(df)

# Impute missing values for relevant columns.
# Each entry is (column to impute, column whose groups supply the median);
# the entries are processed in the listed order.
imputation_plan = [
    ("Number_of_Services", "Provider_Type"),
    ("Number_of_Medicare_Beneficiaries", "Provider_Type"),
    ("Average_Submitted_Charge_Amount", "HCPCS_Code"),
    ("Number_of_Distinct_Medicare_Beneficiary/Per_Day_Services", "Provider_Type"),
    ("Average_Medicare_Payment_Amount", "HCPCS_Code"),
    ("Average_Medicare_Allowed_Amount", "HCPCS_Code"),
    ("Average_Medicare_Standardized_Amount", "HCPCS_Code"),
]
for target_column, grouping_column in imputation_plan:
    df = impute_missing_values(df, target_column, grouping_column)

Standardizing column names...
Standardizing column names...
Imputing missing values for Number_of_Services grouped by Provider_Type...
Filling remaining missing values in Number_of_Services with overall median: 42.0
Imputing missing values for Number_of_Medicare_Beneficiaries grouped by Provider_Type...
Filling remaining missing values in Number_of_Medicare_Beneficiaries with overall median: 32.0
Imputing missing values for Average_Submitted_Charge_Amount grouped by HCPCS_Code...
Filling remaining missing values in Average_Submitted_Charge_Amount with overall median: 144.16666667
Imputing missing values for Number_of_Distinct_Medicare_Beneficiary/Per_Day_Services grouped by Provider_Type...
Filling remaining missing values in Number_of_Distinct_Medicare_Beneficiary/Per_Day_Services with overall median: 40.0
Imputing missing values for Average_Medicare_Payment_Amount grouped by HCPCS_Code...
Filling remaining missing values in Average_Medicare_Payment_Amount with overall median: 46.9604

In [24]:
# Calculate derived features: payment and allowed amounts relative to the
# submitted charge.
print("Adding derived features...")
submitted_charge = df["Average_Submitted_Charge_Amount"]
# A zero charge makes the ratio undefined: the division yields inf (or NaN for
# 0/0), and non-finite values make the later MinMaxScaler step fail. Those rows
# get a ratio of 0.0 instead; rows with a non-zero charge are unaffected.
zero_charge = submitted_charge == 0
df["Payment_to_Charge_Ratio"] = (
    df["Average_Medicare_Payment_Amount"] / submitted_charge
).mask(zero_charge, 0.0)
df["Allowed_to_Charge_Ratio"] = (
    df["Average_Medicare_Allowed_Amount"] / submitted_charge
).mask(zero_charge, 0.0)
print("Derived features added: Payment_to_Charge_Ratio, Allowed_to_Charge_Ratio")

Adding derived features...
Derived features added: Payment_to_Charge_Ratio, Allowed_to_Charge_Ratio


In [25]:
# Rescale every numerical feature (raw counts/amounts plus the two derived
# ratios) with a min-max scaler before feeding them to the autoencoders.
print("Scaling numerical features for autoencoders...")
numerical_columns = [
    "Number_of_Services",
    "Number_of_Medicare_Beneficiaries",
    "Number_of_Distinct_Medicare_Beneficiary/Per_Day_Services",
    "Average_Submitted_Charge_Amount",
    "Average_Medicare_Payment_Amount",
    "Average_Medicare_Allowed_Amount",
    "Average_Medicare_Standardized_Amount",
    "Payment_to_Charge_Ratio",
    "Allowed_to_Charge_Ratio"
]
# Fit on the selected columns, then write the transformed values back in place.
scaler = MinMaxScaler().fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

Scaling numerical features for autoencoders...


In [26]:
# Encoding categorical variables: frequency-encode provider type and state
print("Applying categorical encoding...")
df = encode_categorical(df, ['Provider_Type', 'State_Code_of_the_Provider'], strategy="frequency")

# One-Hot Encoding for Place of Service
print("Applying one-hot encoding for Place_of_Service...")
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
place_of_service_encoded = one_hot_encoder.fit_transform(df[['Place_of_Service']])
# pd.concat(axis=1) aligns rows by index label. Without an explicit index the
# encoded frame would get a fresh 0..n-1 index, which only matches df when df
# is also indexed 0..n-1; reuse df's index so rows always line up.
place_of_service_df = pd.DataFrame.sparse.from_spmatrix(
    place_of_service_encoded,
    index=df.index,
    columns=one_hot_encoder.get_feature_names_out(['Place_of_Service']),
)
df = pd.concat([df, place_of_service_df], axis=1)

Applying categorical encoding...
Applying frequency encoding to column: Provider_Type
Applying frequency encoding to column: State_Code_of_the_Provider
Applying one-hot encoding for Place_of_Service...


In [27]:
# Turn the Y/N Medicare participation flag into a 1/0 column.
# Series.map leaves NaN for any value that is not a key of the mapping.
print("Applying binary encoding for Medicare_Participation_Indicator...")
binary_map = dict(Y=1, N=0)
df['Medicare_Participation_Encoded'] = (
    df['Medicare_Participation_Indicator'].map(binary_map)
)

Applying binary encoding for Medicare_Participation_Indicator...


In [28]:
# Feature hashing for HCPCS Code: the helper defined below maps the code
# strings into a fixed number of hashed feature columns.
print("Applying feature hashing...")
def dynamic_feature_hasher(df, column, max_features=50):
    """Hash the values of ``column`` into a fixed-width sparse feature frame.

    The number of hashed features is the smaller of ``max_features`` and the
    number of distinct values in the column. Each value is converted to a
    string and hashed as a single-token sample.

    Parameters:
    - df: DataFrame containing ``column``
    - column: name of the column to hash
    - max_features: upper bound on the number of hashed feature columns

    Returns:
    - Sparse DataFrame with columns ``<column>_Hash_0 .. <column>_Hash_{n-1}``,
      indexed like ``df`` so it can be concatenated column-wise with ``df``
      without row misalignment.

    Raises:
    - ValueError: if ``column`` is not present in ``df``
    """
    if column not in df.columns:
        raise ValueError(f"Column {column} does not exist in the DataFrame.")
    print(f"Applying feature hashing for {column} with max features: {max_features}")
    n_features = min(max_features, len(df[column].unique()))
    hasher = FeatureHasher(n_features=n_features, input_type='string')
    # One single-element token list per row, streamed lazily instead of first
    # materializing a Series of Python lists.
    tokens = ([value] for value in df[column].astype(str))
    hashed_features = hasher.transform(tokens)
    print(f"Generated {n_features} hashed features for {column}")
    return pd.DataFrame.sparse.from_spmatrix(
        hashed_features,
        # Reuse the caller's index: a default 0..n-1 index would misalign rows
        # in a later pd.concat(axis=1) whenever df is indexed differently.
        index=df.index,
        columns=[f'{column}_Hash_{i}' for i in range(n_features)],
    )

# Hash the HCPCS codes and append the resulting columns next to the existing ones.
hashed_features_df = dynamic_feature_hasher(df, column='HCPCS_Code')
df = pd.concat((df, hashed_features_df), axis="columns")

Applying feature hashing...
Applying feature hashing for HCPCS_Code with max features: 50
Generated 50 hashed features for HCPCS_Code


In [29]:
# Dimensionality reduction for high-cardinality data: compress the hashed
# HCPCS columns into a smaller set of SVD components.
print("Reducing dimensionality of hashed features...")
hashed_columns = [col for col in df.columns if col.startswith('HCPCS_Code_Hash_')]
# The hasher may emit fewer than 20 columns when the code column has few
# distinct values; never request more components than there are input columns.
n_svd_components = min(20, len(hashed_columns))
svd = TruncatedSVD(n_components=n_svd_components, random_state=42)
reduced_data = svd.fit_transform(df[hashed_columns])
reduced_df = pd.DataFrame(
    reduced_data,
    # Keep df's index so the column-wise concat below aligns row for row.
    index=df.index,
    columns=[f'SVD_Component_{i}' for i in range(reduced_data.shape[1])],
)
df = pd.concat([df, reduced_df], axis=1)
# The hashed columns are now represented by the SVD components.
df = df.drop(columns=hashed_columns)

Reducing dimensionality of hashed features...


In [30]:
# Remove the raw categorical columns. First confirm that every one of them is
# actually present, and fail with the list of absent names otherwise.
print("Verifying columns to drop...")
columns_to_drop = [
    'HCPCS_Code',
    'Provider_Type',
    'State_Code_of_the_Provider',
    'Place_of_Service',
    'Medicare_Participation_Indicator',
]
missing_columns = list(filter(lambda name: name not in df.columns, columns_to_drop))
if len(missing_columns) > 0:
    raise ValueError(f"The following columns are missing and cannot be dropped: {missing_columns}")
df = df.drop(columns_to_drop, axis="columns")

Verifying columns to drop...


In [31]:
# Preview the first five rows of the fully preprocessed dataset.
print("Preprocessing complete. Here is the updated dataset:")
print(df.head(n=5))

Preprocessing complete. Here is the updated dataset:
   Number_of_Services  Number_of_Medicare_Beneficiaries  \
0            0.016194                          0.013158   
1            0.165992                          0.165992   
2            0.021255                          0.002024   
3            0.009109                          0.007085   
4            0.022267                          0.013158   

   Number_of_Distinct_Medicare_Beneficiary/Per_Day_Services  \
0                                           0.016194          
1                                           0.165992          
2                                           0.021255          
3                                           0.009109          
4                                           0.020243          

   Average_Submitted_Charge_Amount  Average_Medicare_Payment_Amount  \
0                         0.305285                         0.157472   
1                         0.548940                         0.118987   


In [33]:
# Persist the processed features. The row index is written as the first,
# unnamed CSV column (the to_csv default, spelled out here), so readers of the
# file need to account for it, e.g. with index_col=0.
df.to_csv('Processed_dataset.csv', index=True)