In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

from tqdm import tqdm

from tabulate import tabulate

from scipy import stats
from scipy.stats import chi2_contingency

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from joblib import Memory

import joblib
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  # suppress FutureWarning messages

In [2]:
# Read in data - this may take a while because there are 1M+ rows
data = pd.read_csv('Data/df_fwa.csv')

In [3]:
data.head()

Unnamed: 0,reference_period,type_of_service,state,county,state_fips,county_fips,number_of_fee_for_service_beneficiaries,number_of_providers,average_number_of_users_per_provider,percentage_of_users_out_of_ffs_beneficiaries,...,number_of_providers_dual_color,average_number_of_users_per_provider_dual_color,percentage_of_users_out_of_ffs_beneficiaries_dual_color,number_of_users_dual_color,average_number_of_providers_per_county_dual_color,number_of_dual_eligible_users_dual_color,percentage_of_dual_eligible_users_out_of_total_users_dual_color,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color,total_payment_dual_color,potential_fwa
0,2019-01-01 to 2019-12-31,Ambulance (Emergency & Non-Emergency),AL,--ALL--,1,,547486.0,146.0,501.47,13.37,...,2,4,4,3,2,3,4,3,3,0
1,2019-01-01 to 2019-12-31,Ambulance (Emergency & Non-Emergency),AK,--ALL--,2,,91480.0,33.0,254.97,9.2,...,1,2,2,1,1,1,4,2,1,0
2,2019-01-01 to 2019-12-31,Ambulance (Emergency & Non-Emergency),AZ,--ALL--,4,,740278.0,170.0,401.34,9.22,...,3,3,2,3,4,2,1,1,2,0
3,2019-01-01 to 2019-12-31,Ambulance (Emergency & Non-Emergency),AR,--ALL--,5,,437616.0,86.0,628.07,12.34,...,2,4,3,3,2,1,1,1,3,0
4,2019-01-01 to 2019-12-31,Ambulance (Emergency & Non-Emergency),CA,--ALL--,6,,3423285.0,330.0,1169.09,11.27,...,4,5,2,5,4,5,4,2,5,1


In [4]:
# Keep a handle on the column index as loaded, before any columns are dropped or encoded
orig_columns = data.columns

In [5]:
data.dtypes

reference_period                                                                        object
type_of_service                                                                         object
state                                                                                   object
county                                                                                  object
state_fips                                                                               int64
county_fips                                                                             object
number_of_fee_for_service_beneficiaries                                                float64
number_of_providers                                                                    float64
average_number_of_users_per_provider                                                   float64
percentage_of_users_out_of_ffs_beneficiaries                                           float64
number_of_users                                   

In [6]:
# Columns excluded from modelling because of their high p-values and redundancy
columns_to_drop = ['moratorium', 'reference_period', 'state_fips', 'county_fips']
data = data.drop(columns=columns_to_drop)

In [7]:
# Treat every integer-typed column as a categorical variable
int_columns = data.select_dtypes(include='int').columns
data = data.astype({column: 'category' for column in int_columns})

In [8]:
# Categorical columns only; the target is removed from the name list so that
# it is not one-hot encoded together with the features
cat_data = data.select_dtypes(include=['category'])
cat_data_columns = list(cat_data.columns)
cat_data_columns.remove('potential_fwa')

In [9]:
cat_data

Unnamed: 0,number_of_fee_for_service_beneficiaries_dual_color,number_of_providers_dual_color,average_number_of_users_per_provider_dual_color,percentage_of_users_out_of_ffs_beneficiaries_dual_color,number_of_users_dual_color,average_number_of_providers_per_county_dual_color,number_of_dual_eligible_users_dual_color,percentage_of_dual_eligible_users_out_of_total_users_dual_color,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color,total_payment_dual_color,potential_fwa
0,3,2,4,4,3,2,3,4,3,3,0
1,1,1,2,2,1,1,1,4,2,1,0
2,3,3,3,2,3,4,2,1,1,2,0
3,2,2,4,3,3,2,1,1,1,3,0
4,5,4,5,2,5,4,5,4,2,5,1
...,...,...,...,...,...,...,...,...,...,...,...
1044349,3,3,4,3,3,3,3,1,4,3,0
1044350,3,2,1,2,3,2,1,1,1,3,0
1044351,2,3,3,3,3,3,1,1,4,3,0
1044352,1,2,1,1,1,2,1,1,1,2,0


In [10]:
# One-Hot Encoding of the categorical feature columns only; 'potential_fwa' was
# removed from cat_data_columns above, so the target is passed through unchanged
one_hot_encoded_df = pd.get_dummies(data, columns= cat_data_columns)

In [11]:
# Detach the target column from the feature frame (pop removes it in place)
potential_fwa = one_hot_encoded_df.pop('potential_fwa')
one_hot_encoded_df.shape

(1044354, 61)

In [12]:
# Function to reconstruct original column from one-hot encoded columns
def reconstruct_column(df, prefix):
    """Rebuild a categorical column from its one-hot encoded indicator columns.

    Indicator columns are those named ``f"{prefix}_<value>"`` (the naming
    produced by ``pd.get_dummies``). The separator is part of the match, so a
    column named exactly ``prefix`` is NOT mistaken for an indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding either the indicator columns or the plain column.
    prefix : str
        Name of the original (pre-encoding) column.

    Returns
    -------
    pandas.Series or None
        * If indicator columns exist: for each row, the ``<value>`` part (as a
          string) of the indicator column holding the row maximum. Rows whose
          indicators are all equal resolve to the first indicator column,
          because ``idxmax`` returns the first occurrence of the maximum.
        * Otherwise ``df[prefix]`` when that column exists, else ``None``.
    """
    marker = f'{prefix}_'
    one_hot_columns = [col for col in df.columns if col.startswith(marker)]

    if one_hot_columns:
        # Strip the known prefix instead of splitting on the last underscore,
        # so category values that themselves contain '_' survive intact.
        label_by_column = {col: col[len(marker):] for col in one_hot_columns}
        # Column name of the active indicator per row, mapped to its label
        return df[one_hot_columns].idxmax(axis=1).map(label_by_column)

    # For columns that are not one-hot encoded
    return df[prefix] if prefix in df.columns else None

# Rebuild a readable '<name>_reconstructed' label column for every encoded feature
for prefix in cat_data_columns:
    rebuilt = reconstruct_column(one_hot_encoded_df, prefix)
    if rebuilt is None:
        # Nothing to rebuild for this prefix
        continue
    one_hot_encoded_df[f'{prefix}_reconstructed'] = rebuilt

In [13]:
# Pull the '*_reconstructed' label columns out into their own frame ...
is_reconstructed = one_hot_encoded_df.columns.str.contains('_reconstructed')
reconstructed_columns = one_hot_encoded_df.columns[is_reconstructed]
reconstructed_cols_data = one_hot_encoded_df[reconstructed_columns]

# ... and remove them from the feature frame so they are not fed to the encoder
one_hot_encoded_df.drop(columns=reconstructed_columns, inplace=True)

In [14]:
# Binary-encode the remaining string columns: type_of_service, state and county
# (state_fips and county_fips were dropped earlier, so they are not encoded here)
# NOTE(review): the encoder is fitted on the full dataset, before the train/test split
encoder = ce.BinaryEncoder(cols=['type_of_service','state', 'county'])
data_for_split_encoded = encoder.fit_transform(one_hot_encoded_df)

In [15]:
# Save the fitted binary encoder in both joblib and pickle formats
binary_encoder_path_job = 'flask/binary_encoder.joblib'
binary_encoder_path_pickle = 'flask/binary_encoder.pkl'

# Neither joblib.dump nor open() creates missing folders, so make sure the
# target directory exists before writing
for artifact_path in (binary_encoder_path_job, binary_encoder_path_pickle):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(encoder, binary_encoder_path_job)
with open(binary_encoder_path_pickle, 'wb') as handle:
    pickle.dump(encoder, handle)

print(f'Fitted Binary Encoder saved at: {binary_encoder_path_job}')
print(f'Fitted Binary Encoder saved at: {binary_encoder_path_pickle}')

Fitted Binary Encoder saved at: flask/binary_encoder.joblib
Fitted Binary Encoder saved at: flask/binary_encoder.pkl


In [16]:
# Use inverse_transform to get the original (pre-binary-encoding) values back,
# so the readable type_of_service / state / county labels can be recovered
df_decoded = encoder.inverse_transform(data_for_split_encoded)

In [17]:
# Keep only the decoded identifier columns; they are attached to the modelling frame below
data_to_bind = df_decoded[['type_of_service', 'state', 'county']]

In [18]:
# Column-wise concat (rows matched on the index): encoded features, readable
# identifier columns, reconstructed category labels and the target
new_data_for_split = pd.concat([data_for_split_encoded, data_to_bind,reconstructed_cols_data, potential_fwa ], axis = 1)

In [19]:
# Human-readable label columns: carried through the split for reporting,
# but removed from the feature matrices before modelling
cols_to_drop = ['type_of_service', 'state', 'county', *reconstructed_columns]

In [20]:
cols_to_drop

['type_of_service',
 'state',
 'county',
 'number_of_fee_for_service_beneficiaries_dual_color_reconstructed',
 'number_of_providers_dual_color_reconstructed',
 'average_number_of_users_per_provider_dual_color_reconstructed',
 'percentage_of_users_out_of_ffs_beneficiaries_dual_color_reconstructed',
 'number_of_users_dual_color_reconstructed',
 'average_number_of_providers_per_county_dual_color_reconstructed',
 'number_of_dual_eligible_users_dual_color_reconstructed',
 'percentage_of_dual_eligible_users_out_of_total_users_dual_color_reconstructed',
 'percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_reconstructed',
 'total_payment_dual_color_reconstructed']

In [21]:
new_data_for_split

Unnamed: 0,type_of_service_0,type_of_service_1,type_of_service_2,type_of_service_3,type_of_service_4,state_0,state_1,state_2,state_3,state_4,...,number_of_providers_dual_color_reconstructed,average_number_of_users_per_provider_dual_color_reconstructed,percentage_of_users_out_of_ffs_beneficiaries_dual_color_reconstructed,number_of_users_dual_color_reconstructed,average_number_of_providers_per_county_dual_color_reconstructed,number_of_dual_eligible_users_dual_color_reconstructed,percentage_of_dual_eligible_users_out_of_total_users_dual_color_reconstructed,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_reconstructed,total_payment_dual_color_reconstructed,potential_fwa
0,0,0,0,0,1,0,0,0,0,0,...,2,4,4,3,2,3,4,3,3,0
1,0,0,0,0,1,0,0,0,0,1,...,1,2,2,1,1,1,4,2,1,0
2,0,0,0,0,1,0,0,0,0,1,...,3,3,2,3,4,2,1,1,2,0
3,0,0,0,0,1,0,0,0,1,0,...,2,4,3,3,2,1,1,1,3,0
4,0,0,0,0,1,0,0,0,1,0,...,4,5,2,5,4,5,4,2,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044349,1,0,1,0,1,1,1,0,0,1,...,3,4,3,3,3,3,1,4,3,0
1044350,1,0,1,0,1,1,1,0,0,1,...,2,1,2,3,2,1,1,1,3,0
1044351,1,0,1,0,1,1,1,0,0,1,...,3,3,3,3,3,1,1,4,3,0
1044352,1,0,1,0,1,1,1,0,0,1,...,2,1,1,1,2,1,1,1,2,0


In [22]:
# Separate the feature matrix from the target
X = new_data_for_split.drop(columns=['potential_fwa'])
y = new_data_for_split['potential_fwa']

# Step 1: stratified split into 70% training and a 30% temporary hold-out set
# (stratify keeps the class distribution of the target in both parts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 2: stratified split of the temporary hold-out set into equal-sized
# validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [23]:
# Keep the readable labels of the test rows for the final report, then strip
# the label columns from the train and test feature matrices
X_test_labels = X_test[cols_to_drop]
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

In [24]:
# Names of the float-typed (numeric) columns of the training features
x_train_num_cols = X_train.select_dtypes(include=['float']).columns.tolist()

In [25]:
# Robust scaling of the numeric columns: fit on the training data only,
# then apply the same fitted scaler to the test data
scaler = RobustScaler()

X_train_scaled = X_train.copy()
train_numeric_scaled = scaler.fit_transform(X_train[x_train_num_cols])
X_train_scaled[x_train_num_cols] = train_numeric_scaled

X_test_scaled = X_test.copy()
test_numeric_scaled = scaler.transform(X_test[x_train_num_cols])
X_test_scaled[x_train_num_cols] = test_numeric_scaled

In [26]:
# Save the fitted scaler in both joblib and pickle formats
scaler_path_job = 'flask/scaler.joblib'
scaler_path_pickle = 'flask/scaler.pkl'

# Neither joblib.dump nor open() creates missing folders, so make sure the
# target directory exists before writing
for artifact_path in (scaler_path_job, scaler_path_pickle):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(scaler, scaler_path_job)
with open(scaler_path_pickle, 'wb') as handle:
    pickle.dump(scaler, handle)

print(f'Fitted Scaler saved at: {scaler_path_job}')
print(f'Fitted Scaler saved at: {scaler_path_pickle}')

Fitted Scaler saved at: flask/scaler.joblib
Fitted Scaler saved at: flask/scaler.pkl


In [27]:
# function to save df into CSV files
def save_to_csv(data, file_name, directory='Data'):
    """Write ``data`` to ``<directory>/<file_name>`` as CSV without the index.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object exposing ``to_csv``.
    file_name : str
        Name of the CSV file to create (overwritten if it already exists).
    directory : str, default 'Data'
        Target folder; created (including parents) when missing. An empty
        string writes into the current working directory.
    """
    # exist_ok avoids the check-then-create race; the guard skips makedirs('')
    # which would raise for the "current directory" case
    if directory:
        os.makedirs(directory, exist_ok=True)

    # define path
    output_file = os.path.join(directory, file_name)

    # save data to CSV
    data.to_csv(output_file, index=False)
    print(f"DataFrame saved to {output_file}")

# define df and file names
data_files = [
    (X_test_scaled, 'new_x_test_scaled.csv'),
    (y_test, 'new_y_test.csv')
]

# The loop variable is deliberately not called `data`: that name is the
# notebook-level dataset and must not be overwritten by the loop
for frame, file_name in data_files:
    save_to_csv(frame, file_name)

DataFrame saved to Data\new_x_test_scaled.csv
DataFrame saved to Data\new_y_test.csv


In [28]:
# Reduce the scaled features to 3 principal components: fit on the training
# set only, then project the test set with the same fitted PCA
# NOTE(review): the model loaded below was presumably trained on components from
# an identically fitted PCA - confirm this refit matches the one used in training
pca_scaled = PCA(n_components=3)
X_train_scaled_pca = pca_scaled.fit_transform(X_train_scaled)
X_test_scaled_pca = pca_scaled.transform(X_test_scaled)

In [29]:
# Save the fitted PCA in both joblib and pickle formats
pca_path_job = 'flask/pca.joblib'
pca_path_pickle = 'flask/pca.pkl'

# Neither joblib.dump nor open() creates missing folders, so make sure the
# target directory exists before writing
for artifact_path in (pca_path_job, pca_path_pickle):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(pca_scaled, pca_path_job)
with open(pca_path_pickle, 'wb') as handle:
    pickle.dump(pca_scaled, handle)

print(f'Fitted pca saved at: {pca_path_job}')
print(f'Fitted pca saved at: {pca_path_pickle}')

Fitted pca saved at: flask/pca.joblib
Fitted pca saved at: flask/pca.pkl


In [30]:
# Load the previously trained model from disk (presumably the tuned bagging
# classifier, judging by the file name - confirm)
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files,
# ideally with the same library versions that were used to save the model
model = joblib.load('Models/bag_best_model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [31]:
# Predicted class per test row, wrapped in a single-column frame
predictions = pd.DataFrame({'predictions': model.predict(X_test_scaled_pca)})



In [32]:
predictions

Unnamed: 0,predictions
0,0
1,0
2,1
3,0
4,0
...,...
156649,0
156650,0
156651,1
156652,0


In [33]:
# Second column of predict_proba (index 1) per test row, as a single-column frame
positive_class_proba = model.predict_proba(X_test_scaled_pca)[:, 1]
probabilities = pd.DataFrame({'probabilities': positive_class_proba})



In [34]:
y_test

117529    0
663266    1
143388    1
475252    0
90080     1
         ..
677415    0
261615    0
216696    1
540490    1
385352    0
Name: potential_fwa, Length: 156654, dtype: category
Categories (2, int64): [0, 1]

In [35]:
# Put every piece on a fresh 0..n-1 index so the column-wise concat lines the
# rows up by position (predictions / probabilities already use a default index)
actual_labels = X_test_labels.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
final_results = pd.concat(
    [actual_labels, predictions, y_test, probabilities],
    axis=1,
)


In [36]:
# The true target column is reported as 'actual', next to 'predictions'
final_results = final_results.rename(columns = {'potential_fwa':'actual'})

In [37]:
final_results

Unnamed: 0,type_of_service,state,county,number_of_fee_for_service_beneficiaries_dual_color_reconstructed,number_of_providers_dual_color_reconstructed,average_number_of_users_per_provider_dual_color_reconstructed,percentage_of_users_out_of_ffs_beneficiaries_dual_color_reconstructed,number_of_users_dual_color_reconstructed,average_number_of_providers_per_county_dual_color_reconstructed,number_of_dual_eligible_users_dual_color_reconstructed,percentage_of_dual_eligible_users_out_of_total_users_dual_color_reconstructed,percentage_of_dual_eligible_users_out_of_dual_eligible_ffs_beneficiaries_dual_color_reconstructed,total_payment_dual_color_reconstructed,predictions,actual,probabilities
0,Skilled Nursing Facility,ID,SHOSHONE,2,3,1,1,2,3,2,3,1,2,0,0,0.0000
1,Preventive Health Services,NJ,ATLANTIC,5,5,4,4,5,5,5,3,3,5,0,1,0.1875
2,Federally Qualified Health Center (FQHC),NC,--ALL--,4,4,3,3,4,2,4,2,2,4,1,1,1.0000
3,Podiatry Services,MS,LEAKE,2,1,2,1,1,1,1,3,1,1,0,0,0.0000
4,Hospice,TX,BRAZOS,4,4,3,3,4,4,4,2,3,4,0,1,0.0250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156649,Ambulance (Emergency & Non-Emergency),GA,BEN HILL,2,3,1,4,2,3,2,3,3,2,0,0,0.0000
156650,Clinical Laboratory (Billing Independently),IA,WRIGHT,2,2,3,2,2,2,1,1,2,2,0,0,0.0000
156651,Independent Diagnostic Testing Facility Pt A,VA,HENRICO,5,4,5,2,5,4,4,1,3,5,1,1,1.0000
156652,Preventive Health Services,NC,PITT,4,5,3,4,5,5,5,4,4,5,0,1,0.1550


In [38]:
# define df and file names
data_files = [
    (final_results, 'final_bagging_results.csv'),
]

# The loop variable is deliberately not called `data`: that name is the
# notebook-level dataset and must not be overwritten by the loop
for frame, file_name in data_files:
    save_to_csv(frame, file_name)

DataFrame saved to Data\final_bagging_results.csv
