In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from sklearn.metrics import classification_report, accuracy_score
import random
random.seed(100)

import time
import pyodbc
print(pyodbc.drivers())

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


In [2]:
data = pd.read_csv(r'../data/interim/training_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,SamePhysician,OPD_Flag,PotentialFraud,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count
0,0,BENE11001,CLM46614,PRV55912,26000,1068.0,6,6,9.0,0.0,0.0,0,1,1943-01-01,2009-12-01,Male,White,0,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,36000,3204,60,70,1943,67.0,0,7.0
1,1,BENE11001,CLM66048,PRV55907,5000,1068.0,2,2,3.0,1.0,1.0,0,0,1943-01-01,2009-12-01,Male,White,0,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,36000,3204,60,70,1943,67.0,0,7.0
2,2,BENE11001,CLM68358,PRV56046,5000,1068.0,3,3,6.0,0.0,0.0,0,0,1943-01-01,2009-12-01,Male,White,0,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,36000,3204,60,70,1943,67.0,0,7.0
3,3,BENE11011,CLM38412,PRV52405,5000,1068.0,8,8,9.0,1.0,0.0,0,0,1914-03-01,2009-12-01,Female,Black or African American,0,Alabama,360,12,12,No,Yes,Yes,No,No,Yes,Yes,No,No,Yes,Yes,5000,1068,250,320,1914,96.0,0,6.0
4,4,BENE11014,CLM63689,PRV56614,10000,1068.0,17,17,9.0,1.0,0.0,0,0,1938-04-01,2009-12-01,Female,White,1,Utah,780,12,12,No,Yes,Yes,No,Yes,Yes,No,Yes,No,No,No,21260,2136,120,100,1938,72.0,0,5.0


In [3]:
# Encoding Categorical Variables
cat_cols = ['SamePhysician', 'OPD_Flag', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']

In [4]:
data['RenalDiseaseIndicator'] = data['RenalDiseaseIndicator'].replace({0:'No', 1:'Yes'})
data['OPD_Flag'] = data['OPD_Flag'].replace({0:'No', 1:'Yes'})
data['SamePhysician'] = data['SamePhysician'].astype(int).replace({0:'No', 1:'Yes'})

In [5]:
del data['Unnamed: 0']
data.describe()

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,PotentialFraud,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count
count,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0
mean,997.012133,78.293178,1.72794,0.410762,3.010897,0.053557,0.381211,378.588195,11.931472,11.93877,5227.971466,568.756807,2278.225348,649.698745,1935.72318,73.76977,0.0,4.498616
std,3821.534891,273.814592,4.904984,2.112693,2.448213,0.280534,0.485685,265.215531,0.889712,0.7859,11786.274732,1179.172616,3881.846386,1002.020811,13.011761,13.022524,0.0,2.332301
min,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8000.0,0.0,-70.0,0.0,1909.0,26.0,0.0,0.0
25%,40.0,0.0,0.0,0.0,1.0,0.0,0.0,150.0,12.0,12.0,0.0,0.0,460.0,120.0,1927.0,68.0,0.0,3.0
50%,80.0,0.0,0.0,0.0,2.0,0.0,0.0,350.0,12.0,12.0,0.0,0.0,1170.0,340.0,1935.0,75.0,0.0,5.0
75%,300.0,0.0,0.0,0.0,4.0,0.0,1.0,570.0,12.0,12.0,6000.0,1068.0,2590.0,790.0,1941.0,82.0,0.0,6.0
max,125000.0,1068.0,36.0,35.0,10.0,5.0,1.0,999.0,12.0,12.0,161470.0,38272.0,102960.0,13840.0,1983.0,101.0,0.0,11.0


In [6]:
data = data[data['IPAnnualReimbursementAmt'] >=0].reset_index(drop=True)
data = data[data['OPAnnualReimbursementAmt'] >= 0].reset_index(drop=True)
data = data[data['DeductibleAmtPaid'] >= 0].reset_index(drop=True)

In [7]:
def treat_outliers(df, columns):
   
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        print('lower', lower_bound)

        upper_bound = Q3 + 1.5 * IQR
        print('upper', upper_bound)
        # Cap outliers
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df
 
columns = ['InscClaimAmtReimbursed', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']
data = treat_outliers(data, columns)

lower -350.0
upper 690.0
lower -8550.0
upper 14250.0
lower -1602.0
upper 2670.0
lower -2735.0
upper 5785.0
lower -885.0
upper 1795.0


In [8]:
# Total Claims per Beneficiary

data['Total_Claims_Per_Bene'] = data.groupby('BeneID')['ClaimID'].transform('count')


In [9]:
# Average Reimbursement Amount per Beneficiary

data['Avg_Reimbursement_Per_Bene'] = data.groupby('BeneID')['InscClaimAmtReimbursed'].transform('mean')


In [10]:
#Age at Claim

# Convert DOB and ClaimPeriod to datetime
data['DOB'] = pd.to_datetime(data['DOB'], format='%m/%d/%Y', errors='coerce')
#data['ClaimPeriod'] = pd.to_datetime(data['ClaimPeriod'], format='%m/%d/%Y', errors='coerce')

# Handle any conversion errors if necessary
data['DOB'].fillna(pd.to_datetime('1/1/1900'), inplace=True)  # Example placeholder

# Calculate Age at Claim
data['Age_At_Claim'] = data.apply(lambda row: 
                                  (row['ClaimPeriod'] - row['DOB']).days // 365 if row['DOB'] != pd.Timestamp('1900-01-01') else row['Age'], axis=1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DOB'].fillna(pd.to_datetime('1/1/1900'), inplace=True)  # Example placeholder


In [11]:
#Multiple Chronic Conditions

data['Multiple_Chronic_Conditions'] = (data['ChronicDisease_Count'] > 1).astype(int)


In [12]:
#Claim to Deductible Ratio

data['Claim_To_Deductible_Ratio'] = data['InscClaimAmtReimbursed'] / (data['DeductibleAmtPaid'] + 1)  # +1 to avoid division by zero


In [13]:
#Total Annual Reimbursement

data['Total_Annual_Reimbursement'] = data['IPAnnualReimbursementAmt'] + data['OPAnnualReimbursementAmt']


In [14]:
#Average Claim Amount by Provider

provider_cols = [col for col in data.columns if 'Provider_' in col]
data['Avg_Reimbursement_By_Provider'] = data[provider_cols].multiply(data['InscClaimAmtReimbursed'], axis=0).sum(axis=1) / (data[provider_cols].sum(axis=1) + 1)


In [15]:
#Provider Claim Frequency

data['Provider_Claim_Frequency'] = data.groupby('Provider')['ClaimID'].transform('count')


In [16]:
data.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'DeductibleAmtPaid', 'ClaimPeriod', 'TimeInHptal', 'Diagnosis Count',
       'Procedures Count', 'SamePhysician', 'OPD_Flag', 'PotentialFraud',
       'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
       'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'BirthYear', 'Age', 'Alive',
       'ChronicDisease_Count', 'Total_Claims_Per_Bene',
       'Avg_Reimbursement_Per_Bene', 'Age_At_Claim',
       'Multiple_Chronic_Conditions', 'Claim_To_D

In [17]:
#Flag High-Risk Providers

# Calculate average reimbursement per provider
provider_avg_reimb = data.groupby('Provider')['InscClaimAmtReimbursed'].mean()
high_risk_providers = provider_avg_reimb[provider_avg_reimb > provider_avg_reimb.quantile(0.95)].index.tolist()

data['High_Risk_Provider'] = data['Provider'].apply(lambda x: 1 if x in high_risk_providers else 0)


In [18]:
data.describe()

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,PotentialFraud,DOB,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count,Total_Claims_Per_Bene,Avg_Reimbursement_Per_Bene,Age_At_Claim,Multiple_Chronic_Conditions,Claim_To_Deductible_Ratio,Total_Annual_Reimbursement,Avg_Reimbursement_By_Provider,Provider_Claim_Frequency,High_Risk_Provider
count,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0,557240.0
mean,213.544021,78.401985,1.716442,0.397233,3.002207,0.052362,0.380917,1900-01-01 00:00:00.000000256,378.577807,11.931532,11.938753,3241.515092,500.276556,1775.496357,533.798297,1935.723993,73.768934,0.0,4.495198,6.60496,213.544021,73.768934,0.886564,159.707479,5017.011449,0.0,822.717493,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1900-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1909.0,26.0,0.0,0.0,1.0,0.0,26.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,40.0,0.0,0.0,0.0,1.0,0.0,0.0,1900-01-01 00:00:00,150.0,12.0,12.0,0.0,0.0,460.0,120.0,1927.0,68.0,0.0,3.0,3.0,110.0,68.0,1.0,20.0,710.0,0.0,122.0,0.0
50%,80.0,0.0,0.0,0.0,2.0,0.0,0.0,1900-01-01 00:00:00,350.0,12.0,12.0,0.0,0.0,1170.0,340.0,1935.0,75.0,0.0,4.0,6.0,191.666667,75.0,1.0,60.0,2310.0,0.0,361.0,0.0
75%,300.0,0.0,0.0,0.0,4.0,0.0,1.0,1900-01-01 00:00:00,570.0,12.0,12.0,5700.0,1068.0,2590.0,790.0,1941.0,82.0,0.0,6.0,9.0,281.25,82.0,1.0,200.0,7410.0,0.0,1013.0,0.0
max,690.0,1068.0,36.0,35.0,10.0,5.0,1.0,1900-01-01 00:00:00,999.0,12.0,12.0,14250.0,2670.0,5785.0,1795.0,1983.0,101.0,0.0,11.0,29.0,690.0,101.0,1.0,690.0,20035.0,0.0,8240.0,0.0
std,248.056362,273.985337,4.88966,2.061639,2.439777,0.277317,0.485613,,265.223469,0.889318,0.786028,5234.177103,784.435859,1692.427577,526.200176,13.010535,13.0212,0.0,2.331492,4.081212,138.459511,13.0212,0.317125,214.859213,5705.707451,0.0,1276.159398,0.0


In [19]:
data.head()

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,SamePhysician,OPD_Flag,PotentialFraud,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,BirthYear,Age,Alive,ChronicDisease_Count,Total_Claims_Per_Bene,Avg_Reimbursement_Per_Bene,Age_At_Claim,Multiple_Chronic_Conditions,Claim_To_Deductible_Ratio,Total_Annual_Reimbursement,Avg_Reimbursement_By_Provider,Provider_Claim_Frequency,High_Risk_Provider
0,BENE11001,CLM46614,PRV55912,690.0,1068.0,6,6,9.0,0.0,No,No,1,1900-01-01,2009-12-01,Male,White,No,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,14250.0,2670.0,60.0,70.0,1943,67.0,0,7.0,3,690.0,67.0,1,0.645463,14310.0,0.0,107,0
1,BENE11001,CLM66048,PRV55907,690.0,1068.0,2,2,3.0,1.0,Yes,No,0,1900-01-01,2009-12-01,Male,White,No,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,14250.0,2670.0,60.0,70.0,1943,67.0,0,7.0,3,690.0,67.0,1,0.645463,14310.0,0.0,243,0
2,BENE11001,CLM68358,PRV56046,690.0,1068.0,3,3,6.0,0.0,No,No,0,1900-01-01,2009-12-01,Male,White,No,Pennsylvania,230,12,12,Yes,No,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,14250.0,2670.0,60.0,70.0,1943,67.0,0,7.0,3,690.0,67.0,1,0.645463,14310.0,0.0,20,0
3,BENE11011,CLM38412,PRV52405,690.0,1068.0,8,8,9.0,1.0,No,No,0,1900-01-01,2009-12-01,Female,Black or African American,No,Alabama,360,12,12,No,Yes,Yes,No,No,Yes,Yes,No,No,Yes,Yes,5000.0,1068.0,250.0,320.0,1914,96.0,0,6.0,4,217.5,96.0,1,0.645463,5250.0,0.0,89,0
4,BENE11014,CLM63689,PRV56614,690.0,1068.0,17,17,9.0,1.0,No,No,0,1900-01-01,2009-12-01,Female,White,Yes,Utah,780,12,12,No,Yes,Yes,No,Yes,Yes,No,Yes,No,No,No,14250.0,2136.0,120.0,100.0,1938,72.0,0,5.0,2,375.0,72.0,1,0.645463,14370.0,0.0,24,0
