## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score

import imblearn

import plotly.express as px

In [2]:
# Read the pickled provider table and key it by the 'Provider' column.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
providers = load(
    './data/Iteration_1/Providers_Final_Iteration_1.pkl'
).set_index('Provider')

In [3]:
# The 'PotentialFraud' column is the target; every other column is a feature.
y = providers['PotentialFraud']
X = providers.drop(columns='PotentialFraud')

## Pre-processing

In [4]:
# Stratified hold-out split with a fixed seed: 30% of the rows become the
# test set and both parts keep the class ratio of y.
# # 70/30 split gives roughly the same baseline model results
# # but saves grid_search time
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y,
    stratify=y,
    test_size=0.3,
    random_state=0,
)

In [5]:
# Fit the scaler on the training split only (avoids leaking test-set
# statistics), then apply the same transformation to both splits.
scaler = pp.MinMaxScaler()

# The scaler returns plain arrays; pass the original index back in so the
# scaled frames stay aligned by label with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X.columns, index=X_test.index)

In [6]:
# Shuffled, seeded 10-fold splitter; stratification keeps the class ratio
# the same in every fold.
skfold = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [7]:
def model_results(model,
    X_train=None, y_train=None, X_test=None, y_test=None):
    """Print a fitted model followed by its train and test recall.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train, X_test, y_test : optional
        Data to score on. Any argument left as ``None`` is taken from the
        notebook-level variable of the same name *at call time*, so
        re-running the split/scaling cells is picked up without having to
        re-run this definition.

    Returns
    -------
    None. Three lines are printed: the model repr, the recall on the
    training data and the recall on the test data.
    """
    # Late-bind the defaults: binding the globals in the signature would
    # freeze whatever objects existed when this cell was executed.
    env = globals()
    if X_train is None:
        X_train = env['X_train']
    if y_train is None:
        y_train = env['y_train']
    if X_test is None:
        X_test = env['X_test']
    if y_test is None:
        y_test = env['y_test']

    print(model)
    print(recall_score(y_train, model.predict(X_train)))
    print(recall_score(y_test, model.predict(X_test)))

## Logistic Regression

### Models

In [8]:
# # baseline model
# The fit is kept commented out below for reference; the cell actually
# loads the previously fitted estimator from disk and prints its
# train/test recall via model_results.

# # L1 penalty for feature selection, liblinear solver faster than saga
# logRegCV = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1)

# logRegCV.fit(X_train, y_train)

# # dump(logRegCV, './data/logRegCV.pkl')
# NOTE(review): the dump path above is './data/' but the load below reads
# from './data/Iteration_1/' — presumably the pickle was moved; confirm.

logRegCV = load('./data/Iteration_1/logRegCV.pkl')
model_results(logRegCV)

LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                     n_jobs=-1, penalty='l1', random_state=0, scoring='recall',
                     solver='liblinear', verbose=1)
0.9180790960451978
0.9144736842105263




In [9]:
# # grid search with accuracy scoring metric
# No `scoring` argument is passed to GridSearchCV in the commented code,
# so the search uses the estimator's default score.
# NOTE(review): `logRegModel` and `params` are only defined (commented
# out) in the next cell, so re-running this search on a fresh kernel
# requires defining them first.

# logRegGSAccuracy = ms.GridSearchCV(logRegModel, param_grid=params,
#                                    cv=skfold, n_jobs=(-1), verbose=1)

# logRegAccuracy = logRegGSAccuracy.fit(X_train, y_train)
# bestLogRegAccuracy = logRegAccuracy.best_estimator_

# # dump(bestLogRegAccuracy, './data/bestLogRegAccuracy.pkl')
# NOTE(review): dump path ('./data/') differs from the load path below
# ('./data/Iteration_1/') — confirm the file was moved.

# Load the stored best estimator and print its train/test recall.
bestLogRegAccuracy = load('./data/Iteration_1/bestLogRegAccuracy.pkl')
model_results(bestLogRegAccuracy)

LogisticRegression(C=68.66488450042998, class_weight='balanced', n_jobs=-1,
                   penalty='l1', random_state=0, solver='liblinear', verbose=1)
0.9152542372881356
0.8289473684210527




In [10]:
# # grid search with recall scoring metric

# # can't use scoring param, need to use recall_score()
# NOTE(review): the comment above contradicts the code below, which does
# pass scoring='recall' to GridSearchCV — presumably the comment is stale.
# logRegModel = \
#     LogisticRegression(penalty='l1', solver='liblinear',
#                        class_weight='balanced', random_state=0,
#                        n_jobs=(-1), verbose=1)

# Search grid: 50 log-spaced values of C between 1e-2 and 1e2, crossed
# with three max_iter settings.
# params = {'C': np.logspace(-2, 2, 50),
#           'max_iter': [100, 500, 1000]}

# logRegGS = ms.GridSearchCV(logRegModel, param_grid=params,
#                            scoring='recall', cv=skfold, verbose=1)

# logReg = logRegGS.fit(X_train, y_train)
# bestLogReg = logReg.best_estimator_

# # dump(bestLogReg, './data/bestLogReg.pkl')
# NOTE(review): dump path ('./data/') differs from the load path below
# ('./data/Iteration_1/') — confirm the file was moved.

# Load the stored best estimator and print its train/test recall.
bestLogReg = load('./data/Iteration_1/bestLogReg.pkl')
model_results(bestLogReg)

LogisticRegression(C=0.0655128556859551, class_weight='balanced', n_jobs=-1,
                   penalty='l1', random_state=0, solver='liblinear', verbose=1)
0.9265536723163842
0.9210526315789473




### Results

In [11]:
# One row per feature holding the absolute value of its logistic-regression
# coefficient, largest first. The L1 penalty zeroes some coefficients, so
# only the features with a non-zero value are displayed.
coefficients = (
    pd.DataFrame(bestLogReg.coef_.T, index=X.columns)
    .rename(columns={0: 'Coefficient'})
    .abs()
    .sort_values(by='Coefficient', ascending=False)
)
coefficients.loc[coefficients['Coefficient'] > 0]

Unnamed: 0,Coefficient
PatientsPerOthPhys,7.691565
Ratio_ClaimsPerPatient,3.215364
IP_Count_UniquePatients,3.138851
Perc_Outpatient,1.847467
DualPatientProvider,1.63698
OP_Perc_MultHosp,0.674412
IP_Mean_InsReimbursementRatio,0.379315
OP_Count_UniqueState,0.22813
IP_Perc_HeartFailure_Chronic,0.14315
Perc_MultHospAttPhys,0.057042


## Tree Models

### Random Forest

In [31]:
# Baseline random forest: the fit is kept commented out for reference and
# the stored estimator is loaded from disk instead.
# max_features default is 'auto' (sqrt(n_features))
# NOTE(review): that default name is scikit-learn-version dependent
# (newer releases call it 'sqrt') — confirm against the installed version.
# class_weight='balanced_subsample' performs better than 'balanced'
# NOTE(review): the recorded repr of the loaded pickle shows
# class_weight='balanced', not 'balanced_subsample' as in the commented
# code below — the pickle appears to come from a different configuration.
# randForestModel = \
#     RandomForestClassifier(class_weight='balanced_subsample', random_state=0)

# randForestModel.fit(X_train, y_train)

# dump(randForestModel, './data/randForestModel.pkl')

# Load the stored estimator and print its train/test recall.
randForestModel = load('./data/Iteration_1/randForestModel.pkl')
model_results(randForestModel)

RandomForestClassifier(class_weight='balanced', random_state=0)
1.0
0.4144736842105263




In [55]:
# Load a previously tuned random forest and print its train/test recall.
# NOTE(review): the grid-search cell further down dumps to
# 'bestRandForest.pkl', not 'best_randForest.pkl' — the code that produced
# this pickle is not visible in the notebook; confirm its provenance.
best_randForest = load('./data/Iteration_1/best_randForest.pkl')
model_results(best_randForest)

RandomForestClassifier(class_weight='balanced', max_depth=5.0, max_features=3,
                       random_state=0)
0.9293785310734464
0.9276315789473685




In [27]:
# Refit the tuned forest configuration (max_depth=5, max_features=3) with
# class_weight='balanced_subsample' to compare it against 'balanced'.
# NOTE(review): an earlier comment here claimed 'balanced_subsample'
# performs better than 'balanced'. The recorded outputs do not support
# that: test recall is ~0.882 for this model versus ~0.928 for
# best_randForest (class_weight='balanced').
bestRF_bal_sub = \
    RandomForestClassifier(class_weight='balanced_subsample',
                           max_depth=5, max_features=3,
                           random_state=0)

bestRF_bal_sub.fit(X_train, y_train)
model_results(bestRF_bal_sub)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=5,
                       max_features=3, random_state=0)
0.9350282485875706
0.881578947368421


#### Feature Importances

In [57]:
def feature_importances(model, feature_names=None, top_n=10):
    """Print the most important features of a fitted model and plot them all.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Names matching the order of ``model.feature_importances_``.
        Defaults to the columns of the notebook-level feature frame ``X``.
    top_n : int, default 10
        Number of feature names to print, most important first.

    Returns
    -------
    The plotly bar figure of every feature's importance (ascending order).
    """
    if feature_names is None:
        # Fall back to the notebook's feature frame, as before.
        feature_names = X.columns
    df = pd.DataFrame({'feature': np.array(feature_names),
                       'importance': model.feature_importances_}
                     ).sort_values('importance')
    top_features = df.sort_values('importance', ascending=False
                                 ).feature.head(top_n).to_list()
    print(f'Top {top_n}:', top_features)
    return px.bar(df, 'importance', 'feature', height=1000)
feature_importances(best_randForest)

Top 10: ['IP_Mean_InscClaimAmtReimbursed', 'IP_Mean_ClaimCost', 'IP_Count_UniquePatients', 'PatientsPerAttPhys', 'IP_Mean_AdmitDuration', 'IP_Mean_ClaimDuration', 'PatientsPerOthPhys', 'DualPatientProvider', 'PatientsPerOperPhys', 'IP_Mean_AgeAtClaim']


#### Grid Searches

In [35]:
# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'max_features': [2, 3, 5]}

# randForestGS = ms.GridSearchCV(randForestModel, param_grid=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=(-1), verbose=1)

# randForest = randForestGS.fit(X_train, y_train)

# bestRandForest = randForest.best_estimator_
# dump(bestRandForest, './data/Iteration_1/bestRandForest.pkl')

In [16]:
# 70/30 train/test split, MinMaxScaler(), skfold=10

# params = {'n_estimators': [100, 500, 1500],
#           'max_depth': [2, 5, 8]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, n_estimators=500,
#                        random_state=0)
# 0.8870056497175142
# 0.875
# RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
#                        n_estimators=500, random_state=0)
# 0.884180790960452
# 0.875

# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'min_samples_split': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, n_estimators=500,
#                        random_state=0)
# 0.8870056497175142
# 0.875
# RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
#                        n_estimators=500, random_state=0)
# 0.884180790960452
# 0.875

# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'max_features': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, max_features=3,
#                        random_state=0)
# 0.8615819209039548
# 0.8618421052631579
# RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
#                        max_features=3, random_state=0)
# 0.8615819209039548
# 0.8618421052631579



# 90/10 train/test split, MinMaxScaler, skfold=10
# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'max_features': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, max_features=2,
#                        n_estimators=750, random_state=0)
# 0.8637362637362638
# 0.8235294117647058
# RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
#                        max_features=2, n_estimators=750, random_state=0)
# 0.8637362637362638
# 0.8235294117647058

### Gradient Boosting

#### Upsampling

In [18]:
# Oversample the training split with SMOTE (seeded with random_state=0).
# Only X_train / y_train are resampled; the test split is not passed in.
oversample = imblearn.over_sampling.SMOTE(random_state=0)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

#### Models

In [79]:
# Baseline gradient boosting, fitted on the SMOTE-resampled training data.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# for the classifier it meant sqrt(n_features), so 'sqrt' selects the same
# behaviour and works on every version.
gradBoostModel = \
    GradientBoostingClassifier(max_features='sqrt', random_state=0)

gradBoostModel.fit(X_train_SMOTE, y_train_SMOTE)
# dump(gradBoostModel, './data/gradBoostModel.pkl')

# gradBoostModel = load('./data/Iteration_1/gradBoostModel.pkl')
# Recall is reported on the original (non-resampled) train and test splits,
# which are model_results' defaults.
model_results(gradBoostModel)

GradientBoostingClassifier(max_features='auto', random_state=0)
0.847457627118644
0.743421052631579


In [20]:
# Exhaustive grid search for gradient boosting, kept commented out for
# reference; the stored best estimator is loaded from disk instead.
# params = {'n_estimators': [100, 1000, 5000],
#           'learning_rate':[0.05, 0.1, 0.5],
#           'min_samples_leaf': [1, 3, 5],
#           'max_depth': np.arange(1, 20, 8),
#           'max_features' : np.arange(1, 20, 8)}

# gradBoostGS = ms.GridSearchCV(gradBoostModel, param_grid=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=-1, verbose=1)

# NOTE(review): the search is fitted on the SMOTE-resampled data, so the
# CV folds include synthetic samples — this may explain the gap between
# the recorded train recall (1.0) and test recall (~0.60); confirm.
# gradBoost = gradBoostGS.fit(X_train_SMOTE, y_train_SMOTE)

# bestGradBoost = gradBoost.best_estimator_
# dump(bestGradBoost, './data/bestGradBoost.pkl')
# NOTE(review): dump path ('./data/') differs from the load path below
# ('./data/Iteration_1/') — confirm the file was moved.

# Load the stored best estimator and print its train/test recall.
bestGradBoost = load('./data/Iteration_1/bestGradBoost.pkl')
model_results(bestGradBoost)

GradientBoostingClassifier(max_depth=9, max_features=1, min_samples_leaf=3,
                           n_estimators=1000, random_state=0)
1.0
0.5986842105263158


In [21]:
# Randomized search over a wider gradient-boosting grid, kept commented out
# for reference; the stored best estimator is loaded from disk instead.
# params = {'n_estimators': [100,500,1000,2500,5000,6000],
#                 'learning_rate':[0.01,0.05,0.08,0.1, 0.25],
#                 'max_depth': np.arange(2, 21, 3),
#                 'min_samples_split': np.arange(2, 13, 2),
#                 'min_samples_leaf' : np.arange(2, 13, 2)}

# NOTE(review): no random_state is passed to RandomizedSearchCV, so the
# sampled candidates would differ between runs.
# gradBoostRS = ms.RandomizedSearchCV(gradBoostModel,
#                                     param_distributions=params,
#                                     cv=skfold, scoring = 'recall',
#                                     return_train_score = True,
#                                     verbose=1, n_jobs=(-1))
# gradBoostRSfit = gradBoostRS.fit(X_train_SMOTE, y_train_SMOTE)

# bestGradBoostRS = gradBoostRSfit.best_estimator_
# dump(bestGradBoostRS, './data/bestGradBoostRS.pkl')
# NOTE(review): dump path ('./data/') differs from the load path below
# ('./data/Iteration_1/') — confirm the file was moved.

# Load the stored best estimator and print its train/test recall.
bestGradBoostRS = load('./data/Iteration_1/bestGradBoostRS.pkl')
model_results(bestGradBoostRS)

GradientBoostingClassifier(learning_rate=0.01, max_depth=11,
                           max_features='auto', min_samples_leaf=2,
                           min_samples_split=6, n_estimators=2500,
                           random_state=0)
1.0
0.625


## Support Vector Machine

In [179]:
# Baseline support vector classifier: library defaults apart from balanced
# class weights and a fixed seed.
svmModel = SVC(class_weight='balanced', random_state=0)

# Fit on the scaled training split, then print train/test recall.
svmFit = svmModel.fit(X_train, y_train)
model_results(svmModel)

SVC(class_weight='balanced', random_state=0)
0.9378531073446328
0.9210526315789473


In [178]:
# Grid search over C for the SVC (50 log-spaced values between 1e-3 and
# 1e2, recall scoring, skfold CV), kept commented out for reference.
# params = {'C': np.logspace(-3, 2, 50)}

# svmGS = ms.GridSearchCV(svmModel, param_grid=params, scoring='recall',
#                         cv=skfold, n_jobs=(-1), verbose=1).fit(X_train, y_train)

# bestSVM = svmGS.best_estimator_
# dump(bestSVM, './data/Iteration_1/bestSVM.pkl')

# Load the stored best estimator and print its train/test recall.
# NOTE(review): the recorded output still shows grid-search progress logs,
# so it appears to date from a run where the search above was active.
bestSVM = load('./data/Iteration_1/bestSVM.pkl')
model_results(bestSVM)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   55.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.7min finished


SVC(C=0.22229964825261955, class_weight='balanced', random_state=0)
0.9293785310734464
0.9210526315789473


## Gaussian Naive Bayes

In [175]:
# # baseline model
# The estimator must be bound to `gnbModel`: that is the name fitted here
# and referenced by the grid search in the next cell. The previous
# `nbModel` binding left `gnbModel` undefined on a fresh kernel.
gnbModel = GaussianNB()

# Fit on the scaled training split, then print train/test recall.
gnbFit = gnbModel.fit(X_train, y_train)
model_results(gnbFit)

GaussianNB()
0.8559322033898306
0.8289473684210527


In [177]:
# # grid search
# Search over three var_smoothing values with recall scoring and skfold
# CV, kept commented out for reference; it relies on `gnbModel` from the
# baseline cell.
# params = {'var_smoothing': [0.001, 0.01, 0.1]}

# gnbModelGS = ms.GridSearchCV(gnbModel, param_grid=params,
#                              scoring='recall', cv=skfold,
#                              n_jobs=(-1), verbose=1
#                             ).fit(X_train, y_train)

# bestGNB = gnbModelGS.best_estimator_
# dump(bestGNB, './data/Iteration_1/bestGNB.pkl')

# Load the stored best estimator and print its train/test recall.
# NOTE(review): the recorded output still shows grid-search progress logs,
# so it appears to date from a run where the search above was active.
bestGNB = load('./data/Iteration_1/bestGNB.pkl')
model_results(bestGNB)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


GaussianNB(var_smoothing=0.01)
0.9096045197740112
0.9013157894736842
