## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

from imblearn import over_sampling

In [2]:
# Load the provider-level table from its pickle and index it by the
# 'Provider' column. joblib.load unpickles the file, so only point this at
# trusted, locally produced data.
providers = load('./data/Providers_Final_Second.pkl').set_index('Provider')

In [3]:
# Feature matrix: every column except the target and the three
# patients-per-physician columns. Target: the PotentialFraud column.
X = providers.drop(
    columns=['PotentialFraud', 'PatientsPerAttPhys',
             'PatientsPerOperPhys', 'PatientsPerOthPhys'])
y = providers['PotentialFraud']

## Pre-processing

In [4]:
# A 70/30 split gives roughly the same baseline model results
# as 80/20 and 90/10, but saves grid_search time.
# The split is stratified on y and seeded so it is reproducible.

X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split, so no test-set statistics leak into training.
# MinMax performed better or about the same as StandardScaler,
# RobustScaler, and Normalize on most models.

scaler = pp.MinMaxScaler()

# The scaler returns bare arrays; rebuild the frames with the original column
# names AND the original row index so the scaled features stay label-aligned
# with y_train / y_test (without index= they would get a fresh 0..n-1 index).
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X.columns, index=X_test.index)

In [6]:
# Upsample for K-nearest Neighbors.
# SMOTE is applied to the training split only; the resampled pair
# (X_train_SMOTE, y_train_SMOTE) is what the KNN cells fit and report on,
# while the test split is left untouched.

oversample = over_sampling.SMOTE(random_state=0)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

In [7]:
# Stratify folds so that classes always have the same sample ratio.
# Shuffled 5-fold split with a fixed seed, shared by every CV search below.

skfold = ms.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [8]:
def model_results(model,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Print recall scores and confusion matrices for a fitted classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, y_train
        Features and labels to report as the "Train Set". The defaults are
        the notebook-level split as it existed when this cell was executed
        (default arguments are bound at definition time, so re-run this cell
        after changing the split).
    X_test, y_test
        Features and labels to report as the "Test Set"; same default
        behaviour as above.

    Returns
    -------
    None
        Everything is printed.
    """
    # Predict once per split and reuse the result for both metrics.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_recall = recall_score(y_train, train_pred)
    test_recall = recall_score(y_test, test_pred)

    print('Model details:', model, '\n')
    # Fixed-precision percent formatting; `round(x, 4) * 100` leaked float
    # artefacts such as "75.66000000000001%" into the report.
    print('Train Set Recall Score:', f'{train_recall:.2%}')
    print('Test Set Recall Score:', f'{test_recall:.2%}')
    print('\nTrain Set Confusion Matrix:\n',
          confusion_matrix(y_train, train_pred))
    print('Test Set Confusion Matrix:\n',
          confusion_matrix(y_test, test_pred))

## Logistic Regression

### Models

In [9]:
# Baseline model: LogisticRegressionCV with balanced class weights,
# recall scoring and the shared stratified folds.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit. Uncomment to retrain and
# overwrite the pickle.

# logRegCV = \
#     LogisticRegressionCV(class_weight='balanced',
#                          cv=skfold, scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCV, './data/Iteration_2/logRegCV.pkl')

logRegCV = load('./data/Iteration_2/logRegCV.pkl')
model_results(logRegCV)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, random_state=0, scoring='recall', verbose=1) 

Train Set Recall Score: 87.29%
Test Set Recall Score: 86.18%

Train Set Confusion Matrix:
 [[2255 1178]
 [  45  309]]
Test Set Confusion Matrix:
 [[1003  468]
 [  21  131]]


In [10]:
# Baseline model with an L1 penalty.
# L1 penalty for feature selection, liblinear solver faster than saga.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit.

# logRegCVL1 = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCVL1, './data/Iteration_2/logRegCVL1.pkl')

logRegCVL1 = load('./data/Iteration_2/logRegCVL1.pkl')
model_results(logRegCVL1)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, penalty='l1', random_state=0, scoring='recall',
                     solver='liblinear', verbose=1) 

Train Set Recall Score: 89.55%
Test Set Recall Score: 89.47%

Train Set Confusion Matrix:
 [[2919  514]
 [  37  317]]
Test Set Confusion Matrix:
 [[1271  200]
 [  16  136]]


In [11]:
# Hyper-parameter search (fine-tuned). Despite the "grid search" label used
# elsewhere, this is a RandomizedSearchCV over C and max_iter.
# NOTE(review): the search below has no random_state, so re-running it may
# sample different candidates and not reproduce the pickled model.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead.

# logRegModel = \
#     LogisticRegression(penalty='l1', solver='liblinear',
#                        class_weight='balanced', random_state=0)

# params = {'C': np.logspace(-2, 2, 50),
#           'max_iter': [150, 200]}

# logRegRS = ms.RandomizedSearchCV(logRegModel, param_distributions=params,
#                            scoring='recall', cv=skfold, verbose=1,
#                            return_train_score=True
#                           ).fit(X_train, y_train)

# bestLogReg = logRegRS.best_estimator_

# dump(bestLogReg, './data/Iteration_2/bestLogReg.pkl')

bestLogReg = load('./data/Iteration_2/bestLogReg.pkl')
model_results(bestLogReg)
print('\nAbout 2% worse than Iteration 1')

Model details: LogisticRegression(C=0.2442053094548651, class_weight='balanced', max_iter=150,
                   penalty='l1', random_state=0, solver='liblinear') 

Train Set Recall Score: 89.55%
Test Set Recall Score: 89.47%

Train Set Confusion Matrix:
 [[2885  548]
 [  37  317]]
Test Set Confusion Matrix:
 [[1253  218]
 [  16  136]]

About 2% worse than Iteration 1


### Results

In [12]:
# Absolute value of each fitted coefficient of the tuned logistic regression,
# labelled by feature name and ordered from largest to smallest.
coefficients = (
    pd.DataFrame(bestLogReg.coef_.T,
                 index=X.columns,
                 columns=['Coefficient'])
    .abs()
    .sort_values(by='Coefficient', ascending=False)
)

# Show only the features the L1 penalty did not shrink to exactly zero.
coefficients[coefficients['Coefficient'] > 0]

Unnamed: 0,Coefficient
OP_Sum_InscClaimAmtReimbursed,8.485075
Ratio_ClaimsPerPatient,6.175729
IP_Sum_InscClaimAmtReimbursed,6.120085
Perc_Outpatient,3.324114
IP_Mean_ClaimDuration,2.173876
OP_AgeRange,2.043294
IP_Mean_ClaimCostPerOperPhys,1.848461
OP_Perc_KidneyDisease_Chronic,1.455212
IP_AgeRange,1.226295
OP_Sum_DeductibleAmtPaid,1.163725


## K-nearest Neighbors

In [13]:
# Baseline model with SMOTE: default KNeighborsClassifier fitted on the
# SMOTE-resampled training data.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit.

# knn = KNeighborsClassifier().fit(X_train_SMOTE, y_train_SMOTE)

# dump(knn, './data/Iteration_2/knn.pkl')

knn = load('./data/Iteration_2/knn.pkl')
# The "Train Set" figures here are computed on the resampled training data,
# not on the original imbalanced split.
model_results(knn, X_train_SMOTE, y_train_SMOTE)

Model details: KNeighborsClassifier() 

Train Set Recall Score: 100.0%
Test Set Recall Score: 87.5%

Train Set Confusion Matrix:
 [[2750  683]
 [   0 3433]]
Test Set Confusion Matrix:
 [[1109  362]
 [  19  133]]


In [14]:
# Grid search (fine-tuned) over n_neighbors on the SMOTE-resampled data.
# NOTE(review): SMOTE was applied before cross-validation, so synthetic
# points interpolated from a validation fold's samples can sit in the
# training folds; the CV recall used for model selection is likely
# optimistic. Consider resampling inside each fold (e.g. an imblearn
# Pipeline) if this search is re-run.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead.

# params = {'n_neighbors': np.arange(30, 41)}

# knnGS = \
#     ms.GridSearchCV(knn, param_grid=params,
#                           cv=skfold, n_jobs=(-1), verbose = 1,
#                           return_train_score = True,
#                           scoring = 'recall').fit(X_train_SMOTE,
#                                                   y_train_SMOTE)

# bestKNN = knnGS.best_estimator_
# dump(bestKNN, './data/Iteration_2/bestKNN.pkl')

bestKNN = load('./data/Iteration_2/bestKNN.pkl')
# "Train Set" figures are computed on the resampled training data.
model_results(bestKNN, X_train_SMOTE, y_train_SMOTE)
print('\nAbout 2% worse than Iteration 1')

Model details: KNeighborsClassifier(n_neighbors=31) 

Train Set Recall Score: 97.76%
Test Set Recall Score: 94.08%

Train Set Confusion Matrix:
 [[2364 1069]
 [  77 3356]]
Test Set Confusion Matrix:
 [[1011  460]
 [   9  143]]

About 2% worse than Iteration 1


## Gaussian Naive Bayes

In [15]:
# Baseline model: GaussianNB with default settings.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit.

# gnb = GaussianNB().fit(X_train, y_train)

# dump(gnb, './data/Iteration_2/gnb.pkl')

gnb = load('./data/Iteration_2/gnb.pkl')
model_results(gnb)

Model details: GaussianNB() 

Train Set Recall Score: 77.68%
Test Set Recall Score: 75.66000000000001%

Train Set Confusion Matrix:
 [[3096  337]
 [  79  275]]
Test Set Confusion Matrix:
 [[1336  135]
 [  37  115]]


In [16]:
# Grid search (fine-tuned) over var_smoothing, scored on recall with the
# shared stratified folds.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead.

# params = {'var_smoothing': np.arange(0.04, 0.6, .001)}

# gnbModel = ms.GridSearchCV(gnb, param_grid=params,
#                              scoring='recall', cv=skfold,
#                              n_jobs=(-1), verbose=1
#                             ).fit(X_train, y_train)

# bestGNB = gnbModel.best_estimator_
# dump(bestGNB, './data/Iteration_2/bestGNB.pkl')

bestGNB = load('./data/Iteration_2/bestGNB.pkl')
model_results(bestGNB)
print('\nExactly the same as Iteration 1')

# ----------------------------------------------
# params = {'var_smoothing': np.arange(0.01, 0.2, .0001)}
# Model details: GaussianNB(var_smoothing=0.050999999999999754) 
# Train Set Recall Score: 88.42%
# Test Set Recall Score: 86.83999999999999%
# Train Set Confusion Matrix:
#  [[2425 1008]
#  [  41  313]]
# Test Set Confusion Matrix:
#  [[1077  394]
#  [  20  132]]

Model details: GaussianNB(var_smoothing=0.05100000000000001) 

Train Set Recall Score: 88.42%
Test Set Recall Score: 86.83999999999999%

Train Set Confusion Matrix:
 [[2425 1008]
 [  41  313]]
Test Set Confusion Matrix:
 [[1077  394]
 [  20  132]]

Exactly the same as Iteration 1


## Tree Models

### Random Forest

In [17]:
# Baseline model: RandomForestClassifier with balanced class weights.
# The fit/dump code is kept commented out, like every other model cell; the
# fitted estimator is loaded from its pickle. (The fit used to run on every
# execution even though its result was immediately overwritten by the load.)

# randForest = \
#     RandomForestClassifier(class_weight='balanced', random_state=0
#                           ).fit(X_train, y_train)

# dump(randForest, './data/Iteration_2/randForest.pkl')

randForest = load('./data/Iteration_2/randForest.pkl')
model_results(randForest)

Model details: RandomForestClassifier(class_weight='balanced', random_state=0) 

Train Set Recall Score: 99.72%
Test Set Recall Score: 40.79%

Train Set Confusion Matrix:
 [[3433    0]
 [   1  353]]
Test Set Confusion Matrix:
 [[1456   15]
 [  90   62]]


In [18]:
# Grid search (fine-tuned) over tree depth and split/leaf sizes with a
# fixed n_estimators of 1000, scored on recall with the shared folds.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead. Earlier search rounds are recorded as comments
# further down in this cell.

# params = {'max_depth': [1, 2, 3],
#           'min_samples_split': np.arange(2, 7),
#           'min_samples_leaf': np.arange(2, 7),
#           'n_estimators': [1000]}

# randForestGS = \
#     ms.GridSearchCV(randForest,
#                           param_grid=params,
#                           scoring='recall', cv=skfold,
#                           n_jobs=(-1), verbose=1,
#                           return_train_score=True
#                           ).fit(X_train, y_train)

# bestRandForest = randForestGS.best_estimator_
# dump(bestRandForest, './data/Iteration_2/bestRandForest.pkl')

bestRandForest = load('./data/Iteration_2/bestRandForest.pkl')
model_results(bestRandForest)
print('\nAbout 4% worse than Iteration 1')

# --------------------------------------------------------------
# params = {'max_depth': np.arange(2, 5),
#           'min_samples_split': np.arange(3, 7),
#           'min_samples_leaf': np.arange(3, 7)}
# Model details: RandomForestClassifier(class_weight='balanced',
#                    max_depth=3, min_samples_leaf=3,
#                    min_samples_split=3, random_state=0) 
# Train Set Recall Score: 91.81%
# Test Set Recall Score: 88.16000000000001%
# Train Set Confusion Matrix:
#  [[2905  528]
#  [  29  325]]
# Test Set Confusion Matrix:
#  [[1268  203]
#  [  18  134]]

# params = {'max_depth': np.arange(2, 5),
#           'min_samples_split': np.arange(3, 7),
#           'min_samples_leaf': np.arange(3, 7),
#           'n_estimators': [300, 335, 375]}
# Model details: RandomForestClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=3,
#                        min_samples_split=3, n_estimators=335, random_state=0) 
# Train Set Recall Score: 91.53%
# Test Set Recall Score: 88.82%
# Train Set Confusion Matrix:
#  [[2902  531]
#  [  30  324]]
# Test Set Confusion Matrix:
#  [[1263  208]
#  [  17  135]]

Model details: RandomForestClassifier(class_weight='balanced', max_depth=2, min_samples_leaf=2,
                       n_estimators=1000, random_state=0) 

Train Set Recall Score: 88.7%
Test Set Recall Score: 88.16000000000001%

Train Set Confusion Matrix:
 [[2842  591]
 [  40  314]]
Test Set Confusion Matrix:
 [[1248  223]
 [  18  134]]

About 4% worse than Iteration 1


### Gradient Boosting

In [19]:
# Baseline model with sample_weight: GradientBoostingClassifier has no
# class_weight argument here, so balanced per-sample weights are passed to
# fit() instead.
# NOTE(review): max_features='auto' is deprecated/removed in newer
# scikit-learn releases — confirm against the installed version before
# refitting.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit.

# gradBoostModel = \
#     GradientBoostingClassifier(max_features='auto', random_state=0)
# gradBoostModel.fit(X_train, y_train, sample_weight=
#     compute_sample_weight(class_weight='balanced', y=y_train))

# dump(gradBoostModel, './data/Iteration_2/gradBoostModel.pkl')

gradBoostModel = load('./data/Iteration_2/gradBoostModel.pkl')
model_results(gradBoostModel)

Model details: GradientBoostingClassifier(max_features='auto', random_state=0) 

Train Set Recall Score: 99.15%
Test Set Recall Score: 80.25999999999999%

Train Set Confusion Matrix:
 [[3248  185]
 [   3  351]]
Test Set Confusion Matrix:
 [[1355  116]
 [  30  122]]


In [20]:
# Hyper-parameter search (fine-tuned): a RandomizedSearchCV over
# n_estimators, min_samples_split and max_depth, with learning_rate and
# min_samples_leaf fixed from the earlier tuning rounds.
# NOTE(review): the standalone gradBoostModel.fit(...) before the search
# looks redundant — the search fits its own clones of the estimator.
# NOTE(review): the search has no random_state, so re-running it may sample
# different candidates and not reproduce the pickled model.
# NOTE(review): max_features='auto' is deprecated/removed in newer
# scikit-learn releases — confirm before refitting.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead.

# gradBoostModel = \
#     GradientBoostingClassifier(max_features='auto',
#                                learning_rate=0.001,
#                                min_samples_leaf=12,
#                                random_state=0)

# gradBoostModel.fit(X_train, y_train, sample_weight=
#     compute_sample_weight(class_weight='balanced', y=y_train))

# params = {'n_estimators': [950, 1050, 1200],
#           'min_samples_split': [22, 24, 26],
#           'max_depth': [1, 2, 3]}

# gradBoostRS = ms.RandomizedSearchCV(gradBoostModel,
#                                     param_distributions=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=(-1), verbose=1,
#                                     return_train_score=True)
# gradBoostRS.fit(X_train, y_train,
#     sample_weight=\
#         compute_sample_weight(class_weight='balanced', y=y_train))

# bestGradBoost = gradBoostRS.best_estimator_
# dump(bestGradBoost, './data/Iteration_2/bestGradBoost.pkl')

bestGradBoost = load('./data/Iteration_2/bestGradBoost.pkl')
model_results(bestGradBoost)
print('\nComparable score to Iteration 1 but about 20 more false positives')

Model details: GradientBoostingClassifier(learning_rate=0.001, max_depth=2,
                           max_features='auto', min_samples_leaf=12,
                           min_samples_split=26, n_estimators=1050,
                           random_state=0) 

Train Set Recall Score: 92.09%
Test Set Recall Score: 92.75999999999999%

Train Set Confusion Matrix:
 [[2883  550]
 [  28  326]]
Test Set Confusion Matrix:
 [[1262  209]
 [  11  141]]

Comparable score to Iteration 1 but about 20 more false positives


In [21]:
# # grid search tuning steps
# params = {'learning_rate':[0.003, 0.005, 0.007],
#           'min_samples_split': [22, 24, 26],
#           'min_samples_leaf': [8, 12, 16],
#           'max_depth': [1, 2],
#           'n_estimators': [250, 300, 350]}
# Model details: GradientBoostingClassifier(learning_rate=0.005, max_depth=2,
#                            max_features='auto', min_samples_leaf=8,
#                            min_samples_split=24, n_estimators=300,
#                            random_state=0) 
# Train Set Recall Score: 92.36999999999999%
# Test Set Recall Score: 90.13%
# Train Set Confusion Matrix:
#  [[2943  490]
#  [  27  327]]
# Test Set Confusion Matrix:
#  [[1279  192]
#  [  15  137]]

# params = {'learning_rate':[0.001, 0.01, 0.1],
#           'n_estimators': [500, 850, 1200],
#           'min_samples_split': [2, 8, 20],
#           'min_samples_leaf': [1, 5, 12]}
# Model details: GradientBoostingClassifier(learning_rate=0.001, max_features='auto',
#                            min_samples_leaf=12, min_samples_split=20,
#                            n_estimators=850, random_state=0) 
# Train Set Recall Score: 94.07%
# Test Set Recall Score: 91.45%
# Train Set Confusion Matrix:
#  [[2948  485]
#  [  21  333]]
# Test Set Confusion Matrix:
#  [[1275  196]
#  [  13  139]]

# params = {'learning_rate':[0.0005, 0.001, 0.005],
#           'n_estimators': [750, 850, 950],
#           'min_samples_split': [16, 20, 24],
#           'min_samples_leaf': [8, 12, 16]}
# Model details: GradientBoostingClassifier(learning_rate=0.001, max_features='auto',
#                            min_samples_leaf=12, min_samples_split=24,
#                            n_estimators=950, random_state=0) 
# Train Set Recall Score: 93.22%
# Test Set Recall Score: 91.45%
# Train Set Confusion Matrix:
#  [[2981  452]
#  [  24  330]]
# Test Set Confusion Matrix:
#  [[1285  186]
#  [  13  139]]

## Support Vector Machine

In [22]:
# Baseline model: SVC with balanced class weights and otherwise default
# settings.
# The fit/dump code is kept commented out; the fitted estimator is loaded
# from its pickle instead of being refit.

# svm = \
#     SVC(random_state=0, class_weight='balanced').fit(X_train, y_train)

# dump(svm, './data/Iteration_2/svm.pkl')

svm = load('./data/Iteration_2/svm.pkl')
model_results(svm)

Model details: SVC(class_weight='balanced', random_state=0) 

Train Set Recall Score: 93.22%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2838  595]
 [  24  330]]
Test Set Confusion Matrix:
 [[1234  237]
 [  13  139]]


In [23]:
# Hyper-parameter search: a RandomizedSearchCV (not an exhaustive grid) over
# C, kernel, degree and gamma, scored on recall with the shared folds.
# NOTE(review): per the scikit-learn SVC docs, `degree` only affects the
# 'poly' kernel and `gamma` is unused by 'linear', so the degree/gamma values
# printed for the selected linear model have no effect on it.
# NOTE(review): the search has no random_state, so re-running it may sample
# different candidates and not reproduce the pickled model.
# The search/dump code is kept commented out; the best estimator is loaded
# from its pickle instead.

# params = {'C': np.logspace(-3, 2, 100),
#           'kernel': ['linear', 'poly', 'rbf'],
#           'degree': [1, 3, 5],
#           'gamma': ['scale', 'auto']}

# svmRS = \
#     ms.RandomizedSearchCV(svm, param_distributions=params,
#                           scoring='recall', cv=skfold,
#                           n_jobs=(-1), verbose=1,
#                           return_train_score=True
#                          ).fit(X_train, y_train)

# bestSVM = svmRS.best_estimator_
# dump(bestSVM, './data/Iteration_2/bestSVM.pkl')

bestSVM = load('./data/Iteration_2/bestSVM.pkl')
model_results(bestSVM)
print('\nAbout 2% worse than Iteration 1')

Model details: SVC(C=0.08302175681319744, class_weight='balanced', degree=5, gamma='auto',
    kernel='linear', random_state=0) 

Train Set Recall Score: 90.96%
Test Set Recall Score: 90.79%

Train Set Confusion Matrix:
 [[2777  656]
 [  32  322]]
Test Set Confusion Matrix:
 [[1219  252]
 [  14  138]]

About 2% worse than Iteration 1
