In [None]:
!pip install -q imbalanced-learn

In [156]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report,f1_score,confusion_matrix,ConfusionMatrixDisplay
import warnings
warnings.simplefilter("ignore")

In [None]:
# Display the installed imbalanced-learn version as the cell output.
import imblearn
imblearn.__version__

Load data from GCS

In [89]:
# Read the four training CSV files directly from the GCS bucket into DataFrames.
# NOTE(review): the first file has a ".csv.xls" suffix but is read with read_csv —
# presumably it is plain CSV; confirm.
Train = pd.read_csv("gs://modical-fraud-data/Train/Train-1542865627584.csv.xls")
Train_Inpatientdata = pd.read_csv("gs://modical-fraud-data/Train/Train_Inpatientdata-1542865627584.csv")
Train_Outpatientdata = pd.read_csv("gs://modical-fraud-data/Train/Train_Outpatientdata-1542865627584.csv")
Train_Benfdata = pd.read_csv("gs://modical-fraud-data/Train/Train_Beneficiarydata-1542865627584.csv")

In [90]:
# Read the four test CSV files from the same GCS bucket; they mirror the training tables.
# NOTE(review): the first file has a ".csv.xls" suffix but is read with read_csv —
# presumably it is plain CSV; confirm.
Test = pd.read_csv("gs://modical-fraud-data/Test/Test-1542969243754.csv.xls")
Test_Inpatientdata = pd.read_csv("gs://modical-fraud-data/Test/Test_Inpatientdata-1542969243754.csv")
Test_Outpatientdata = pd.read_csv("gs://modical-fraud-data/Test/Test_Outpatientdata-1542969243754.csv")
Test_Benfdata = pd.read_csv("gs://modical-fraud-data/Test/Test_Beneficiarydata-1542969243754.csv")

Exploratory Data Analysis

In [None]:
# Report the number of rows and columns of every training table.
for _label, _frame in (
    ("Train data", Train),
    ("Train Inpatient data", Train_Inpatientdata),
    ("Train Outpatient data", Train_Outpatientdata),
    ("Train Beneficiary data", Train_Benfdata),
):
    _n_rows, _n_cols = _frame.shape
    print(f"{_label}: {_n_rows} and features {_n_cols}")

In [None]:
# Report the number of rows and columns of every test table.
for _label, _frame in (
    ("Test data", Test),
    ("Test Inpatient data", Test_Inpatientdata),
    ("Test Outpatient data", Test_Outpatientdata),
    ("Test Beneficiary data", Test_Benfdata),
):
    _n_rows, _n_cols = _frame.shape
    print(f"{_label}: {_n_rows} and features {_n_cols}")


In [None]:
# Preview the first rows of the Train table.
Train.head()

In [None]:
# Preview the first rows of the Test table.
Test.head()

Analyze Beneficiary Data

In [None]:
# Preview the first rows of the training beneficiary table.
Train_Benfdata.head()

Duplicate rows in Beneficiary data

In [None]:
# Count duplicated rows in each beneficiary table.
# NOTE(review): duplicated() is called without `subset`, so it flags rows that are
# identical across ALL columns, not repeated BeneID values as the printed label suggests.
print("Duplicate Benef id Train Benef data: ", Train_Benfdata.duplicated().sum())
print("Duplicate Benef id Test Benef data: ", Test_Benfdata.duplicated().sum())

Unique Beneficiaries

In [None]:
# Number of distinct BeneID values in each beneficiary table.
print("Unique Benef in Train Benef data: ", Train_Benfdata['BeneID'].nunique())
print("Unique Benef in Test Benef data: ", Test_Benfdata['BeneID'].nunique())

Convert DOB and DOD to datetime format

In [99]:
# Parse the beneficiary date columns into datetime64 so they can be subtracted
# from each other when Age is computed.
# DOD uses errors='coerce': missing or malformed values become NaT. The previous
# errors='ignore' is deprecated in recent pandas and, on any parse failure, returns
# the whole column unparsed, which would break the date arithmetic that follows.
for _benef in (Train_Benfdata, Test_Benfdata):
    _benef['DOB'] = pd.to_datetime(_benef['DOB'], format='%Y-%m-%d')
    _benef['DOD'] = pd.to_datetime(_benef['DOD'], format='%Y-%m-%d', errors='coerce')

In [100]:
# Age in whole years: days between DOB and DOD divided by 365, rounded.
# Rows whose DOD is missing produce a missing Age here.
_train_lifespan_days = (Train_Benfdata['DOD'] - Train_Benfdata['DOB']).dt.days
_test_lifespan_days = (Test_Benfdata['DOD'] - Test_Benfdata['DOB']).dt.days

Train_Benfdata['Age'] = (_train_lifespan_days / 365).round()
Test_Benfdata['Age'] = (_test_lifespan_days / 365).round()

In [None]:
# Latest DOD value in the training beneficiary table.
Train_Benfdata['DOD'].max()

In [None]:
# Latest DOD value in the test beneficiary table.
Test_Benfdata['DOD'].max()

In [103]:
# Fill the Age values that are still missing, using the latest DOD in the same
# table as the reference date.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained assignment, which leaves the DataFrame unchanged
# under pandas Copy-on-Write.
_train_reference_date = Train_Benfdata['DOD'].max()
_test_reference_date = Test_Benfdata['DOD'].max()

Train_Benfdata['Age'] = Train_Benfdata['Age'].fillna(
    ((_train_reference_date - Train_Benfdata['DOB']).dt.days / 365).round()
)

Test_Benfdata['Age'] = Test_Benfdata['Age'].fillna(
    ((_test_reference_date - Test_Benfdata['DOB']).dt.days / 365).round()
)

In [None]:
# Print, for train then test, whether any Age value is still missing.
for _benef in (Train_Benfdata, Test_Benfdata):
    print(_benef['Age'].isna().any())

In [105]:
# AliveorDead is 1 when a DOD is recorded for the beneficiary and 0 when it is missing.
for _benef in (Train_Benfdata, Test_Benfdata):
    _benef['AliveorDead'] = (~_benef['DOD'].isna()).astype(int)


In [None]:
# Distribution of the AliveorDead flag in the training beneficiary table.
Train_Benfdata['AliveorDead'].value_counts()

In [None]:
# Distribution of the AliveorDead flag in the test beneficiary table.
Test_Benfdata['AliveorDead'].value_counts()

Analyze Inpatient data

In [None]:
# Missing-value count per column of the training inpatient table.
Train_Inpatientdata.isna().sum()

In [None]:
# Preview the first rows of the training inpatient table.
Train_Inpatientdata.head()

In [110]:
# Parse the admission and discharge dates of both inpatient tables as datetimes.
for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    for _date_col in ('AdmissionDt', 'DischargeDt'):
        _inpatient[_date_col] = pd.to_datetime(_inpatient[_date_col], format='%Y-%m-%d')

In [111]:
# Length of stay: days between admission and discharge, plus one so that a
# same-day admission and discharge counts as one day.
for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    _stay = _inpatient['DischargeDt'] - _inpatient['AdmissionDt']
    _inpatient['NumberofDaysAdmitted'] = _stay.dt.days + 1

In [112]:
# Parse the claim start and end dates of both inpatient tables as datetimes.
for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    for _date_col in ('ClaimEndDt', 'ClaimStartDt'):
        _inpatient[_date_col] = pd.to_datetime(_inpatient[_date_col], format='%Y-%m-%d')

In [113]:
# Claim duration in days: claim end date minus claim start date.
for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    _claim_span = _inpatient['ClaimEndDt'] - _inpatient['ClaimStartDt']
    _inpatient['DurationofClaim'] = _claim_span.dt.days

In [114]:
# Mark every inpatient claim with Admitted = 1.
for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    _inpatient['Admitted'] = 1

In [None]:
# Histogram of the length of stay for training inpatient claims.
px.histogram(Train_Inpatientdata,x='NumberofDaysAdmitted',title='Number of Days Admitted')

In [None]:
# Histogram of the claim duration for training inpatient claims.
px.histogram(Train_Inpatientdata,x='DurationofClaim',title='Duration of Claim')

Outpatient Data Analysis

In [None]:
# Preview the first rows of the training outpatient table.
Train_Outpatientdata.head()

In [None]:
# Missing-value count per column of the training outpatient table.
Train_Outpatientdata.isnull().sum()

In [None]:
# Count fully duplicated rows in the outpatient tables (train first, then test).
for _split, _claims in (('train', Train_Outpatientdata), ('test', Test_Outpatientdata)):
    print(f'Duplicate rows in {_split} set: ', _claims.duplicated().sum())


In [120]:
# Mark every outpatient claim with Admitted = 0.
for _outpatient in (Train_Outpatientdata, Test_Outpatientdata):
    _outpatient['Admitted'] = 0

In [121]:
# Parse the claim start and end dates of both outpatient tables as datetimes.
for _outpatient in (Train_Outpatientdata, Test_Outpatientdata):
    for _date_col in ('ClaimEndDt', 'ClaimStartDt'):
        _outpatient[_date_col] = pd.to_datetime(_outpatient[_date_col], format='%Y-%m-%d')

In [122]:
# Claim duration in days: claim end date minus claim start date.
for _outpatient in (Train_Outpatientdata, Test_Outpatientdata):
    _claim_span = _outpatient['ClaimEndDt'] - _outpatient['ClaimStartDt']
    _outpatient['DurationofClaim'] = _claim_span.dt.days

Merge inpatient and outpatient data

In [None]:
# Columns shared by the inpatient and outpatient tables; used as merge keys below.
# The list is built in the inpatient table's column order instead of from a set
# intersection: set iteration order for strings varies between kernel sessions,
# and the key order can change the row order of the outer merge, which would make
# the later train/validation split non-reproducible.
_outpatient_cols = set(Train_Outpatientdata.columns)
common_cols = [col for col in Train_Inpatientdata.columns if col in _outpatient_cols]
print(common_cols)

In [None]:
# Outer-merge the outpatient and inpatient claims on every shared column so that
# rows from both tables are kept in a single frame, then print the resulting shapes.
Train_allpatientdata = Train_Outpatientdata.merge(Train_Inpatientdata, how='outer', on=common_cols)
Test_allpatientdata = Test_Outpatientdata.merge(Test_Inpatientdata, how='outer', on=common_cols)

for _merged in (Train_allpatientdata, Test_allpatientdata):
    print(_merged.shape)



Merge Patient data with Beneficiary data

In [125]:
# Attach the beneficiary attributes to each claim (inner join on BeneID).
df_train = pd.merge(Train_allpatientdata, Train_Benfdata, how='inner', on='BeneID')
df_test = pd.merge(Test_allpatientdata, Test_Benfdata, how='inner', on='BeneID')

In [None]:
# Print the shape of the merged claim + beneficiary frames.
for _caption, _merged in (('Training data shape: ', df_train), ('Test data shpe: ', df_test)):
    print(_caption, _merged.shape)

In [127]:
# Join the provider-level Train/Test tables onto the claims via the Provider column
# (default inner join).
df_train1 = Train.merge(df_train, on='Provider')
df_test1 = Test.merge(df_test, on='Provider')

In [128]:
# Recode RenalDiseaseIndicator ('Y' -> '1') and cast it to int.
# The recoded column is assigned back explicitly: replace(inplace=True) on a column
# selection is a chained assignment, which does not update the DataFrame under
# pandas Copy-on-Write (the following astype(int) would then fail on 'Y').
df_train1['RenalDiseaseIndicator'] = df_train1['RenalDiseaseIndicator'].replace('Y', '1').astype(int)
df_test1['RenalDiseaseIndicator'] = df_test1['RenalDiseaseIndicator'].replace('Y', '1').astype(int)

# Drop the raw date columns; Age and AliveorDead were already derived from them.
# (`columns=` alone is sufficient — the extra `axis=1` was redundant.)
df_train1 = df_train1.drop(columns=['DOB', 'DOD'])
df_test1 = df_test1.drop(columns=['DOB', 'DOD'])

In [129]:
# Per claim, count how many diagnosis-code and procedure-code columns are filled in.
for _claims in (df_train1, df_test1):
    _diagnosis_codes = _claims.filter(regex='ClmDiagnosisCode_')
    _claims['ClmDiagnosisCodeIndex'] = _diagnosis_codes.notna().sum(axis=1)

    _procedure_codes = _claims.filter(regex='ClmProcedureCode_')
    _claims['ClmProcedureCodeIndex'] = _procedure_codes.notna().sum(axis=1)

In [130]:
# Remove the individual code columns now that their counts have been stored.
# The column list is taken from the training frame and applied to both frames.
_code_pattern = 'ClmProcedureCode_|ClmDiagnosisCode_'
columns_to_drop = df_train1.filter(regex=_code_pattern).columns
df_train1 = df_train1.drop(columns=columns_to_drop)
df_test1 = df_test1.drop(columns=columns_to_drop)

In [131]:
# Claims with no NumberofDaysAdmitted value get 0 days.
for _claims in (df_train1, df_test1):
    _claims['NumberofDaysAdmitted'] = _claims['NumberofDaysAdmitted'].fillna(value=0)

In [132]:
# Discard claims without an attending physician.
_required_cols = ['AttendingPhysician']
df_train1 = df_train1.dropna(subset=_required_cols)
df_test1 = df_test1.dropna(subset=_required_cols)

# Impute missing deductible amounts with the mean of the same frame.
_train_deductible_mean = df_train1['DeductibleAmtPaid'].mean()
_test_deductible_mean = df_test1['DeductibleAmtPaid'].mean()
df_train1['DeductibleAmtPaid'] = df_train1['DeductibleAmtPaid'].fillna(_train_deductible_mean)
df_test1['DeductibleAmtPaid'] = df_test1['DeductibleAmtPaid'].fillna(_test_deductible_mean)

In [133]:
'''Average features grouped by Provider'''

# Numeric columns whose per-provider mean is broadcast back onto every claim row.
columns_to_transform = [
    "InscClaimAmtReimbursed",
    "DeductibleAmtPaid",
    "IPAnnualReimbursementAmt",
    "IPAnnualDeductibleAmt",
    "OPAnnualReimbursementAmt",
    "OPAnnualDeductibleAmt",
    "Age",
    "NoOfMonths_PartACov",
    "NoOfMonths_PartBCov",
    "DurationofClaim",
    "NumberofDaysAdmitted",
]

for column in columns_to_transform:
    _feature_name = f"PerProviderAvg_{column}"
    for _claims in (df_train1, df_test1):
        _claims[_feature_name] = _claims.groupby('Provider')[column].transform('mean')

In [134]:
# Average features grouped by BeneID and by AttendingPhysician.
# For each listed column, the group mean is broadcast back onto every claim row,
# once per beneficiary and once per attending physician.
columns_to_transform = ["InscClaimAmtReimbursed","DeductibleAmtPaid","IPAnnualReimbursementAmt","IPAnnualDeductibleAmt","OPAnnualReimbursementAmt","OPAnnualDeductibleAmt", "DurationofClaim","NumberofDaysAdmitted"]

for column in columns_to_transform:
    df_train1[f"PerBeneIDAvg_{column}"] = df_train1.groupby('BeneID')[column].transform('mean')
    df_test1[f"PerBeneIDAvg_{column}"] = df_test1.groupby('BeneID')[column].transform('mean')

    # The space in "PerAttendingPhysician Avg_" is kept so the generated column names do not change.
    df_train1[f"PerAttendingPhysician Avg_{column}"] = df_train1.groupby('AttendingPhysician')[column].transform('mean')
    df_test1[f"PerAttendingPhysician Avg_{column}"] = df_test1.groupby('AttendingPhysician')[column].transform('mean')

In [135]:
'''Drop repeated and unnecessary features'''

# Remove identifier, date and raw categorical columns from both frames in place.
# NOTE(review): 'ClaimID' is listed twice in each drop list; the second entry is redundant.
# NOTE(review): the training list drops 'Provider' and 'BeneID' but the test list does not,
# so df_test1 keeps two columns that df_train1 no longer has. Confirm this is intentional
# (e.g. to map predictions back to providers); a model fitted on the training columns
# would see a different feature set in the test frame.
df_train1.drop(columns=['ClmAdmitDiagnosisCode', 'Provider', 'State', 'Race', 'Gender', 'County', 'AdmissionDt', 'AttendingPhysician', 'OtherPhysician', 'OperatingPhysician',  
                        'DischargeDt', 'ClaimID', 'ClaimEndDt', 'DiagnosisGroupCode', 'ClaimStartDt', 'BeneID', 'ClaimID'], axis=1, inplace=True)

df_test1.drop(columns=['ClmAdmitDiagnosisCode', 'State', 'Race', 'County', 'Gender', 'AdmissionDt', 'DiagnosisGroupCode', 'OperatingPhysician', 'DischargeDt', 'AttendingPhysician', 'OtherPhysician', 
                       'ClaimID', 'ClaimEndDt', 'ClaimStartDt', 'ClaimID'], axis=1, inplace=True)


In [None]:
# Histogram of the PotentialFraud values across training claims, coloured by value.
px.histogram(df_train1,x="PotentialFraud",color='PotentialFraud',title="Number of Fraud chart", height=500,width=700)

Preprocessing

In [137]:
# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
# The result is assigned back (replace(inplace=True) on a column selection is a
# chained assignment that leaves the frame unchanged under pandas Copy-on-Write),
# and cast explicitly so the target has an integer dtype rather than object.
df_train1['PotentialFraud'] = df_train1['PotentialFraud'].replace({'No': 0, 'Yes': 1}).astype(int)

In [None]:
# Hold out 10% of the training rows for validation; random_state fixes the split.
df_train2, df_val = train_test_split(df_train1,test_size=0.10,random_state=42)

# pop() removes the target column from the frame in place and returns it, so
# X_train is the same object as df_train2, now without 'PotentialFraud'.
Y_train = df_train2.pop('PotentialFraud')
X_train = df_train2

# Same for the validation split: X_val is df_val without the target column.
Y_val = df_val.pop("PotentialFraud")
X_val = df_val

# X_test is another name for df_test1 (no copy is made here).
X_test = df_test1

print(X_train.shape,X_val.shape,X_test.shape)

In [140]:
# One-hot encode the ChronicCond_* columns.
categorical_cols = [col for col in X_train.columns if col.startswith('ChronicCond_')]

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training split ONLY, then reuse the fitted categories for
# validation and test. Re-fitting on each split (fit_transform three times) lets the
# learned categories — and therefore the number, order and meaning of the output
# columns — differ between splits, and leaks validation/test information.
encoded_data_train = encoder.fit_transform(X_train[categorical_cols])
encoded_data_val = encoder.transform(X_val[categorical_cols])
encoded_data_test = encoder.transform(X_test[categorical_cols])

# Column names come from the single train-fitted encoder, so they match all three arrays.
encoded_feature_names = encoder.get_feature_names_out()
encoded_df_train = pd.DataFrame(encoded_data_train, columns=encoded_feature_names)
encoded_df_val = pd.DataFrame(encoded_data_val, columns=encoded_feature_names)
encoded_df_test = pd.DataFrame(encoded_data_test, columns=encoded_feature_names)

# Reset to a 0..n-1 index so the positional encoded frames align row by row in concat.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Swap the raw categorical columns for their one-hot encoded counterparts.
X_train = pd.concat([X_train.drop(categorical_cols, axis=1), encoded_df_train], axis=1)
X_val = pd.concat([X_val.drop(categorical_cols, axis=1), encoded_df_val], axis=1)
X_test = pd.concat([X_test.drop(categorical_cols, axis=1), encoded_df_test], axis=1)





In [None]:
from collections import Counter

# Oversample the training split with SMOTE so both classes are equally represented.
# Only the training data is resampled; the validation split keeps its original rows.
print("before SMOTE: ", Counter(Y_train))

# A fixed random_state makes the synthetic samples (and every downstream metric)
# reproducible across kernel restarts.
smt = SMOTE(random_state=42)

X_train,Y_train = smt.fit_resample(X_train,Y_train)

print("After SMOTE: ", Counter(Y_train))


In [None]:
# Preview the first rows of the resampled training features.
X_train.head()

Model Building

In [None]:
# Random forest with 100 trees, depth capped at 15, and a fixed seed; fitted on the
# resampled training data.
rmf = RandomForestClassifier(
    max_depth=15,
    n_estimators=100,
    random_state=0,
)

rmf.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the fitted forest on the training and validation splits.
for _split_name, _features, _labels in (
    ("training", X_train, Y_train),
    ("validation", X_val, Y_val),
):
    print(f"Accuracy on {_split_name} data: ", rmf.score(_features, _labels))


In [152]:
# Predictions on the validation split (X_val), not the test set; they are compared
# against Y_val when the metrics are computed.
Y_pred = rmf.predict(X_val)



In [None]:
# Validation metrics for the random forest predictions.
acc = accuracy_score(Y_val, Y_pred)
pre = precision_score(Y_val, Y_pred)
rec = recall_score(Y_val, Y_pred)
f1_sc = f1_score(Y_val, Y_pred)

for _caption, _value in (
    ("Accuracy :: ", acc),
    ("Precision :: ", pre),
    ("Recall :: ", rec),
    ("f1_score", f1_sc),
):
    print(_caption, _value)

In [None]:
# Compute the confusion matrix from the validation labels and predictions before
# plotting it; cm_RF was previously never defined, so this cell raised NameError
# on a fresh kernel.
cm_RF = confusion_matrix(Y_val, Y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm_RF)
disp.plot(cmap=plt.cm.Blues)
plt.title("Random Forest Confusion Matrix")
plt.show()