In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from sklearn.metrics import classification_report, accuracy_score
import random
random.seed(100)

import time
import pyodbc
print(pyodbc.drivers())

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


In [2]:
data = pd.read_csv(r'../../data/validation/testing_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,SamePhysician,OPD_Flag,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count
0,0,BENE11014,CLM67387,PRV57070,9000,1068.0,7,7,10.0,2.0,0.0,0,1938-04-01,2009-12-01,Female,White,1,Utah,780,12,12,No,Yes,Yes,No,Yes,Yes,No,Yes,No,No,No,21260,2136,120,100,1938,72.0,0,5.0
1,1,BENE11017,CLM31237,PRV54750,14000,1068.0,14,14,9.0,1.0,0.0,0,1940-06-01,2009-12-01,Female,White,0,New Jersey,270,12,12,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,22000,2136,1400,840,1940,70.0,0,9.0
2,2,BENE11026,CLM78930,PRV53758,2000,1068.0,4,4,9.0,0.0,0.0,0,1938-04-01,2009-12-01,Male,White,0,Massachusetts,20,12,12,No,No,No,No,No,No,No,Yes,No,No,No,2000,1068,0,0,1938,72.0,0,1.0
3,3,BENE11031,CLM56810,PRV55825,16000,1068.0,13,13,10.0,2.0,0.0,0,1944-12-01,2009-12-01,Female,White,0,Oregon,200,12,12,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,23650,2136,40,0,1944,65.0,0,4.0
4,4,BENE11085,CLM34625,PRV52338,19000,1068.0,11,11,8.0,0.0,0.0,0,1963-05-01,2009-12-01,Female,White,1,Georgia,470,12,12,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,19000,1068,1670,520,1963,47.0,0,9.0


In [3]:
# Encoding Categorical Variables
cat_cols = ['SamePhysician', 'OPD_Flag', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']

In [4]:
data['RenalDiseaseIndicator'] = data['RenalDiseaseIndicator'].replace({0:'No', 1:'Yes'})
data['OPD_Flag'] = data['OPD_Flag'].replace({0:'No', 1:'Yes'})
data['SamePhysician'] = data['SamePhysician'].astype(int).replace({0:'No', 1:'Yes'})

In [5]:
del data['Unnamed: 0']
data.describe()

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count
count,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0,135392.0
mean,981.307906,76.387002,1.720249,0.407498,2.992621,0.051598,373.100375,11.930469,11.94301,5271.108559,575.083978,2277.197988,645.259572,1935.708432,73.791605,0.0,4.511596
std,3788.177532,270.599536,4.907056,2.140654,2.434405,0.275035,275.836853,0.895486,0.756944,11746.248324,1211.89884,3985.942527,997.355872,12.892976,12.905014,0.0,2.336062
min,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1000.0,0.0,-60.0,0.0,1909.0,26.0,0.0,0.0
25%,40.0,0.0,0.0,0.0,1.0,0.0,140.0,12.0,12.0,0.0,0.0,470.0,120.0,1927.0,68.0,0.0,3.0
50%,80.0,0.0,0.0,0.0,2.0,0.0,330.0,12.0,12.0,0.0,0.0,1170.0,340.0,1935.0,75.0,0.0,5.0
75%,300.0,0.0,0.0,0.0,4.0,0.0,570.0,12.0,12.0,6000.0,1068.0,2560.0,790.0,1941.0,82.0,0.0,6.0
max,125000.0,1068.0,35.0,35.0,10.0,5.0,999.0,12.0,12.0,155600.0,38272.0,97510.0,13840.0,1983.0,101.0,0.0,11.0


In [6]:
data = data[data['IPAnnualReimbursementAmt'] >=0].reset_index(drop=True)
data = data[data['OPAnnualReimbursementAmt'] >= 0].reset_index(drop=True)
data = data[data['DeductibleAmtPaid'] >= 0].reset_index(drop=True)

In [None]:
# def treat_outliers(df, columns):
   
#     for column in columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         print('lower', lower_bound)

#         upper_bound = Q3 + 1.5 * IQR
#         print('upper', upper_bound)
#         # Cap outliers
#         df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
#         df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
#     return df
 
# columns = ['InscClaimAmtReimbursed', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']
# data = treat_outliers(data, columns)

In [8]:
# Total Claims per Beneficiary

data['Total_Claims_Per_Bene'] = data.groupby('BeneID')['ClaimID'].transform('count')


In [9]:
# Average Reimbursement Amount per Beneficiary

data['Avg_Reimbursement_Per_Bene'] = data.groupby('BeneID')['InscClaimAmtReimbursed'].transform('mean')


In [10]:
#Age at Claim

# Convert DOB and ClaimPeriod to datetime
data['DOB'] = pd.to_datetime(data['DOB'], format='%m/%d/%Y', errors='coerce')
#data['ClaimPeriod'] = pd.to_datetime(data['ClaimPeriod'], format='%m/%d/%Y', errors='coerce')

# Handle any conversion errors if necessary
data['DOB'].fillna(pd.to_datetime('1/1/1900'), inplace=True)  # Example placeholder

# Calculate Age at Claim
data['Age_At_Claim'] = data.apply(lambda row: 
                                  (row['ClaimPeriod'] - row['DOB']).days // 365 if row['DOB'] != pd.Timestamp('1900-01-01') else row['Age'], axis=1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DOB'].fillna(pd.to_datetime('1/1/1900'), inplace=True)  # Example placeholder


In [11]:
#Multiple Chronic Conditions

data['Multiple_Chronic_Conditions'] = (data['ChronicDisease_Count'] > 1).astype(int)


In [12]:
#Claim to Deductible Ratio

data['Claim_To_Deductible_Ratio'] = data['InscClaimAmtReimbursed'] / (data['DeductibleAmtPaid'] + 1)  # +1 to avoid division by zero


In [13]:
#Total Annual Reimbursement

data['Total_Annual_Reimbursement'] = data['IPAnnualReimbursementAmt'] + data['OPAnnualReimbursementAmt']


In [14]:
#Average Claim Amount by Provider

provider_cols = [col for col in data.columns if 'Provider_' in col]
data['Avg_Reimbursement_By_Provider'] = data[provider_cols].multiply(data['InscClaimAmtReimbursed'], axis=0).sum(axis=1) / (data[provider_cols].sum(axis=1) + 1)


In [15]:
#Provider Claim Frequency

data['Provider_Claim_Frequency'] = data.groupby('Provider')['ClaimID'].transform('count')


In [16]:
data.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'DeductibleAmtPaid', 'ClaimPeriod', 'TimeInHptal', 'Diagnosis Count',
       'Procedures Count', 'SamePhysician', 'OPD_Flag', 'DOB', 'DOD', 'Gender',
       'Race', 'RenalDiseaseIndicator', 'State', 'County',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'BirthYear', 'Age',
       'Alive', 'ChronicDisease_Count', 'Total_Claims_Per_Bene',
       'Avg_Reimbursement_Per_Bene', 'Age_At_Claim',
       'Multiple_Chronic_Conditions', 'Claim_To_Deductible_Ratio',
       

In [17]:
#Flag High-Risk Providers

# Calculate average reimbursement per provider
provider_avg_reimb = data.groupby('Provider')['InscClaimAmtReimbursed'].mean()
high_risk_providers = provider_avg_reimb[provider_avg_reimb > provider_avg_reimb.quantile(0.95)].index.tolist()

data['High_Risk_Provider'] = data['Provider'].apply(lambda x: 1 if x in high_risk_providers else 0)


In [18]:
data.describe()

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,DOB,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count,Total_Claims_Per_Bene,Avg_Reimbursement_Per_Bene,Age_At_Claim,Multiple_Chronic_Conditions,Claim_To_Deductible_Ratio,Total_Annual_Reimbursement,Avg_Reimbursement_By_Provider,Provider_Claim_Frequency,High_Risk_Provider
count,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0,135181.0
mean,964.193415,76.47608,1.710048,0.39524,2.984532,0.050436,1900-01-01 00:00:00,373.063078,11.930508,11.943032,5230.590394,572.712778,2276.831803,645.242704,1935.709293,73.790732,0.0,4.508607,3.710751,964.193415,73.790732,0.887588,258.638037,7507.422197,0.0,737.910764,0.007279
min,0.0,0.0,0.0,0.0,0.0,0.0,1900-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1909.0,26.0,0.0,0.0,1.0,0.0,26.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,40.0,0.0,0.0,0.0,1.0,0.0,1900-01-01 00:00:00,140.0,12.0,12.0,0.0,0.0,470.0,120.0,1927.0,68.0,0.0,3.0,1.0,70.0,68.0,1.0,20.0,720.0,0.0,111.0,0.0
50%,80.0,0.0,0.0,0.0,2.0,0.0,1900-01-01 00:00:00,330.0,12.0,12.0,0.0,0.0,1170.0,340.0,1935.0,75.0,0.0,4.0,3.0,179.0,75.0,1.0,60.0,2320.0,0.0,348.0,0.0
75%,300.0,0.0,0.0,0.0,4.0,0.0,1900-01-01 00:00:00,570.0,12.0,12.0,6000.0,1068.0,2560.0,790.0,1941.0,82.0,0.0,6.0,5.0,543.333333,82.0,1.0,200.0,8460.0,0.0,933.0,0.0
max,125000.0,1068.0,35.0,35.0,10.0,5.0,1900-01-01 00:00:00,999.0,12.0,12.0,155600.0,38272.0,97510.0,13840.0,1983.0,101.0,0.0,11.0,25.0,125000.0,101.0,1.0,17300.0,241510.0,0.0,3246.0,1.0
std,3731.239569,270.739988,4.89376,2.095719,2.426342,0.27185,,275.842771,0.895333,0.756792,11672.317476,1206.378131,3984.324924,997.02569,12.89185,12.903888,0.0,2.335354,3.009448,3059.705426,12.903888,0.315874,601.20439,12998.274562,0.0,871.147664,0.085007


In [19]:
data.head()

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,SamePhysician,OPD_Flag,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count,Total_Claims_Per_Bene,Avg_Reimbursement_Per_Bene,Age_At_Claim,Multiple_Chronic_Conditions,Claim_To_Deductible_Ratio,Total_Annual_Reimbursement,Avg_Reimbursement_By_Provider,Provider_Claim_Frequency,High_Risk_Provider
0,BENE11014,CLM67387,PRV57070,9000,1068.0,7,7,10.0,2.0,No,No,1900-01-01,2009-12-01,Female,White,Yes,Utah,780,12,12,No,Yes,Yes,No,Yes,Yes,No,Yes,No,No,No,21260,2136,120,100,1938,72.0,0,5.0,2,4530.0,72.0,1,8.419083,21380,0.0,12,1
1,BENE11017,CLM31237,PRV54750,14000,1068.0,14,14,9.0,1.0,No,No,1900-01-01,2009-12-01,Female,White,No,New Jersey,270,12,12,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,22000,2136,1400,840,1940,70.0,0,9.0,1,14000.0,70.0,1,13.096352,23400,0.0,38,0
2,BENE11026,CLM78930,PRV53758,2000,1068.0,4,4,9.0,0.0,No,No,1900-01-01,2009-12-01,Male,White,No,Massachusetts,20,12,12,No,No,No,No,No,No,No,Yes,No,No,No,2000,1068,0,0,1938,72.0,0,1.0,1,2000.0,72.0,0,1.870907,2000,0.0,180,0
3,BENE11031,CLM56810,PRV55825,16000,1068.0,13,13,10.0,2.0,No,No,1900-01-01,2009-12-01,Female,White,No,Oregon,200,12,12,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,23650,2136,40,0,1944,65.0,0,4.0,1,16000.0,65.0,1,14.967259,23690,0.0,118,0
4,BENE11085,CLM34625,PRV52338,19000,1068.0,11,11,8.0,0.0,No,No,1900-01-01,2009-12-01,Female,White,Yes,Georgia,470,12,12,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,19000,1068,1670,520,1963,47.0,0,9.0,1,19000.0,47.0,1,17.77362,20670,0.0,1548,0


In [20]:
data.to_csv('../../data/validation/testing_final_data.csv')