## Setup and Data Import

In [56]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

import imblearn

import Functions as fxns

import matplotlib.pyplot as plt
plt.style.use('dark_background')

# some pkl models were made with a previous version of sklearn
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

In [2]:
providers = load('./data/providers_last.pkl')
providers.set_index('Provider', inplace=True)

In [3]:
X = providers.drop('PotentialFraud', axis=1)
y = providers.PotentialFraud

## Pre-processing

In [4]:
# # 70/30 split gives roughly the same baseline model results
# # as 80/20 and 90/10, but saves grid_search time

X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size = 0.3, random_state = 0, stratify=y)

In [5]:
# # Scale only the training data to avoid data leakage
# # MinMax performed better or about the same as StandardScaler,
# # RobustScaler, and Normalize on most models

scaler = pp.MinMaxScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [6]:
# Upsample for K-nearest Neighbors and Gradient Boosting

oversample = imblearn.over_sampling.SMOTE(random_state=0)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

In [7]:
# # Stratify folds so that classes always have the same sample ratio
# # n_splits=10 to maximize testing for this small dataset

skfold = ms.StratifiedKFold(n_splits=5, random_state=0, shuffle=True);

In [8]:
def model_results(model,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
        train_recall = recall_score(y_train, model.predict(X_train))
        test_recall = recall_score(y_test, model.predict(X_test))
        
        print('Model details:', model, '\n')
        print('Train Set Recall Score:',
              f'{round(train_recall, 4) * 100}%')
        print('Test Set Recall Score:',
              f'{round(test_recall, 4) * 100}%')
        print('\nTrain Set Confusion Matrix:\n',
              confusion_matrix(y_train, model.predict(X_train)))
        print('Test Set Confusion Matrix:\n',
              confusion_matrix(y_test, model.predict(X_test)))

## Logistic Regression

### Models

In [9]:
# # baseline model

# logRegCV = \
#     LogisticRegressionCV(class_weight='balanced',
#                          cv=skfold, scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCV, './data/Iteration_1/logRegCV.pkl')

logRegCV = load('./data/Iteration_1/logRegCV.pkl')
model_results(logRegCV)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, random_state=0, scoring='recall', verbose=1) 

Train Set Recall Score: 89.27000000000001%
Test Set Recall Score: 86.83999999999999%

Train Set Confusion Matrix:
 [[1818 1615]
 [  38  316]]
Test Set Confusion Matrix:
 [[820 651]
 [ 20 132]]


In [10]:
# # baseline model
# # L1 penalty for feature selection, liblinear solver faster than saga

# logRegCVL1 = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCVL1, './data/Iteration_1/logRegCVL1.pkl')

logRegCVL1 = load('./data/Iteration_1/logRegCVL1.pkl')
model_results(logRegCVL1)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, penalty='l1', random_state=0, scoring='recall',
                     solver='liblinear', verbose=1) 

Train Set Recall Score: 90.4%
Test Set Recall Score: 88.16000000000001%

Train Set Confusion Matrix:
 [[1881 1552]
 [  34  320]]
Test Set Confusion Matrix:
 [[811 660]
 [ 18 134]]


In [55]:
# # grid search (fine-tuned)

logRegModel = \
    LogisticRegression(penalty='l1', solver='liblinear',
                       class_weight='balanced', random_state=0,
                       n_jobs=(-1))

params = {'C': np.linspace(0.06, 0.2, 100)}

logRegGS = ms.GridSearchCV(logRegModel, param_grid=params,
                           scoring='recall', cv=skfold, verbose=1,
                           return_train_score=True
                          ).fit(X_train, y_train)

bestLogReg = logRegGS.best_estimator_

dump(bestLogReg, './data/Iteration_1/bestLogReg.pkl')

bestLogReg = load('./data/Iteration_1/bestLogReg.pkl')
model_results(bestLogReg)

# --------------------------------------------------------------------
# params = {'C': np.logspace(-2, 2, 50)}
# Model details:
# LogisticRegression(C=0.07906043210907697,
#                    class_weight='balanced', n_jobs=-1,
#                    penalty='l1', random_state=0, solver='liblinear') 

# Train Set Recall Score: 91.81%
# Test Set Recall Score: 91.45%

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model details: LogisticRegression(C=0.07838383838383839, class_weight='balanced', n_jobs=-1,
                   penalty='l1', random_state=0, solver='liblinear') 

Train Set Recall Score: 91.81%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2576  857]
 [  29  325]]
Test Set Confusion Matrix:
 [[1117  354]
 [  13  139]]


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   39.9s finished


### Results

In [51]:
coefficients = pd.DataFrame(bestLogReg.coef_.T, index=X.columns
                           ).rename(columns = {0:'Coefficient'}
                           ).abs().sort_values(by='Coefficient',
                                               ascending=False)
coefficients[coefficients.Coefficient > 0]

Unnamed: 0,Coefficient
OP_Count_UniquePatients,7.737743
IP_Count_UniquePatients,4.772498
Ratio_ClaimsPerPatient,4.349272
Perc_Outpatient,2.012477
DualPatientProvider,1.603062
OP_Perc_MultHosp,0.628976
OP_Count_UniqueState,0.620527
IP_Mean_AdmitDuration,0.318757
IP_Mean_InsReimbursementRatio,0.231783
IP_Perc_HeartFailure_Chronic,0.129235


## K-nearest Neighbors

In [21]:
# # baseline model

knn = KNeighborsClassifier().fit(X_train_SMOTE, y_train_SMOTE)

dump(knn, './data/Iteration_1/knn.pkl')

knn = load('./data/Iteration_1/knn.pkl')
model_results(knn)

Model details: KNeighborsClassifier() 

Train Set Recall Score: 100.0%
Test Set Recall Score: 92.75999999999999%

Train Set Confusion Matrix:
 [[2612  821]
 [   0  354]]
Test Set Confusion Matrix:
 [[1062  409]
 [  11  141]]


In [52]:
# # grid search (fine-tuned)

params = {'leaf_size': np.arange(3, 8),
          'weights': ['uniform','distance']}

knnGS = \
    ms.GridSearchCV(knn, param_grid=params,
                          cv=skfold, n_jobs=(-1), verbose = 1,
                          return_train_score = True,
                          scoring = 'recall').fit(X_train_SMOTE,
                                                  y_train_SMOTE)

bestKNN = knnGS.best_estimator_
dump(bestKNN, './data/Iteration_1/bestKNN.pkl')

bestKNN = load('./data/Iteration_1/bestKNN.pkl')
model_results(bestKNN, X_train_SMOTE, y_train_SMOTE)


# params = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#           'leaf_size': [5, 15, 30, 75, 100],
#           'weights': ['uniform','distance']}
# Model details: KNeighborsClassifier(leaf_size=5, weights='distance') 

# Train Set Recall Score: 100.0%
# Test Set Recall Score: 92.75999999999999%

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   49.2s finished


Model details: KNeighborsClassifier(leaf_size=3, weights='distance') 

Train Set Recall Score: 100.0%
Test Set Recall Score: 92.75999999999999%

Train Set Confusion Matrix:
 [[3433    0]
 [   0 3433]]
Test Set Confusion Matrix:
 [[1062  409]
 [  11  141]]


## Gaussian Naive Bayes

In [17]:
# # baseline model

gnb = GaussianNB().fit(X_train, y_train)

dump(gnb, './data/Iteration_1/gnb.pkl')

gnb = load('./data/Iteration_1/gnb.pkl')
model_results(gnb)

Model details: GaussianNB() 

Train Set Recall Score: 85.03%
Test Set Recall Score: 82.24000000000001%

Train Set Confusion Matrix:
 [[2709  724]
 [  53  301]]
Test Set Confusion Matrix:
 [[1188  283]
 [  27  125]]


In [18]:
# # grid search

params = {'var_smoothing': [0.001, 0.01, 0.1]}

gnbModel = ms.GridSearchCV(gnb, param_grid=params,
                             scoring='recall', cv=skfold,
                             n_jobs=(-1), verbose=1
                            ).fit(X_train, y_train)

bestGNB = gnbModel.best_estimator_
dump(bestGNB, './data/Iteration_1/bestGNB.pkl')

bestGNB = load('./data/Iteration_1/bestGNB.pkl')
model_results(bestGNB)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.1s finished


Model details: GaussianNB(var_smoothing=0.1) 

Train Set Recall Score: 87.29%
Test Set Recall Score: 86.18%

Train Set Confusion Matrix:
 [[2289 1144]
 [  45  309]]
Test Set Confusion Matrix:
 [[1019  452]
 [  21  131]]


## Tree Models

### Random Forest

In [19]:
# # baseline model

randForest = \
    RandomForestClassifier(class_weight='balanced', random_state=0
                          ).fit(X_train, y_train)

dump(randForest, './data/Iteration_1/randForest.pkl')

randForest = load('./data/Iteration_1/randForest.pkl')
model_results(randForest)

Model details: RandomForestClassifier(class_weight='balanced', random_state=0) 

Train Set Recall Score: 99.72%
Test Set Recall Score: 41.449999999999996%

Train Set Confusion Matrix:
 [[3433    0]
 [   1  353]]
Test Set Confusion Matrix:
 [[1455   16]
 [  89   63]]


In [25]:
params = {'n_estimators': [500, 850, 1200],
          'max_depth': [2, 5, 8],
          'max_features': ['auto', 'sqrt'],
          'min_samples_leaf': [1, 5, 12],
          'class_weight': ['balanced', 'balanced_subsample']}

randForestRS = \
    RandomForestClassifier(random_state=0).fit(X_train, y_train)

randForestRS = \
    ms.RandomizedSearchCV(randForestRS,
                          param_distributions=params,
                          scoring='recall', cv=skfold,
                          n_jobs=(-1), verbose=1,
                          return_train_score=True
                          ).fit(X_train, y_train)

bestRandForest = randForestRS.best_estimator_
dump(bestRandForest, './data/Iteration_1/bestRandForest.pkl')

bestRandForest = load('./data/Iteration_1/bestRandForest.pkl')
model_results(bestRandForest)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   52.0s finished


Model details: RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
                       max_features='sqrt', min_samples_leaf=12,
                       n_estimators=1200, random_state=0) 

Train Set Recall Score: 84.75%
Test Set Recall Score: 84.87%

Train Set Confusion Matrix:
 [[2714  719]
 [  54  300]]
Test Set Confusion Matrix:
 [[1196  275]
 [  23  129]]


In [None]:
# 70/30 train/test split, MinMaxScaler(), skfold=10

# params = {'n_estimators': [100, 500, 1500],
#           'max_depth': [2, 5, 8]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, n_estimators=500,
#                        random_state=0)
# 0.8870056497175142
# 0.875
# RandomForestClassifier(class_weight='balanced_subsample', max_depth=2,
#                        n_estimators=500, random_state=0)
# 0.884180790960452
# 0.875

# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'min_samples_split': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, n_estimators=500,
#                        random_state=0)
# 0.8870056497175142
# 0.875

# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'max_features': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, max_features=3,
#                        random_state=0)
# 0.8615819209039548
# 0.8618421052631579


# 90/10 train/test split, MinMaxScaler, skfold=10

# params = {'n_estimators': [100, 500, 750],
#           'max_depth': [2, 5, 8],
#           'max_features': [2, 3, 5]}
# RandomForestClassifier(class_weight='balanced', max_depth=2, max_features=2,
#                        n_estimators=750, random_state=0)
# 0.8637362637362638
# 0.8235294117647058

### Gradient Boosting

In [64]:
# # baseline model with sample_weight

gradBoostModel = \
    GradientBoostingClassifier(max_features='auto', random_state=0)
gradBoostModel.fit(X_train, y_train, sample_weight=
    compute_sample_weight(class_weight='balanced', y=y_train))

dump(gradBoostModel, './data/Iteration_1/gradBoostModel.pkl')

gradBoostModel = load('./data/Iteration_1/gradBoostModel.pkl')
model_results(gradBoostModel)

# ---------------------------------------------------------------
# # baseline model with SMOTE
# Model details:
# GradientBoostingClassifier(max_features='auto', random_state=0) 

# Train Set Recall Score: 98.08%
# Test Set Recall Score: 74.33999999999999%

# Train Set Confusion Matrix:
#  [[3324  109]
#  [  66 3367]]
# Test Set Confusion Matrix:
#  [[1391   80]
#  [  39  113]]

Model details: GradientBoostingClassifier(max_features='auto', random_state=0) 

Train Set Recall Score: 98.87%
Test Set Recall Score: 80.92%

Train Set Confusion Matrix:
 [[3228  205]
 [   4  350]]
Test Set Confusion Matrix:
 [[1353  118]
 [  29  123]]


In [70]:
params = {'learning_rate':[0.01],
          'n_estimators': [200, 500],
          'min_samples_split': [24, 30, 36],
          'min_samples_leaf': [12, 16, 20]}

gradBoostRS = ms.RandomizedSearchCV(gradBoostModel,
                                    param_distributions=params,
                                    scoring='recall', cv=skfold,
                                    n_jobs=(-1), verbose=1,
                                    return_train_score=True,
                                   )
gradBoostRS.fit(X_train, y_train,
    sample_weight=\
        compute_sample_weight(class_weight='balanced', y=y_train))

bestGradBoost = gradBoostRS.best_estimator_
# dump(bestGradBoost, './data/Iteration_1/bestGradBoost.pkl')

# bestGradBoost = load('./data/Iteration_1/bestGradBoost.pkl')
model_results(bestGradBoost)


# params = {'learning_rate':[0.001, 0.01, 0.1],
#           'n_estimators': [500, 850, 1200],
#           'min_samples_split': [2, 8, 20],
#           'min_samples_leaf': [1, 5, 12]}
# Model details: GradientBoostingClassifier(learning_rate=0.001, max_features='auto',
#                            min_samples_leaf=12, min_samples_split=20,
#                            n_estimators=1200, random_state=0) 

# Train Set Recall Score: 92.66%
# Test Set Recall Score: 90.79%

# Train Set Confusion Matrix:
#  [[2971  462]
#  [  26  328]]
# Test Set Confusion Matrix:
#  [[1291  180]
#  [  14  138]]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished


Model details: GradientBoostingClassifier(learning_rate=0.01, max_features='auto',
                           min_samples_leaf=12, min_samples_split=24,
                           n_estimators=200, random_state=0) 

Train Set Recall Score: 94.07%
Test Set Recall Score: 88.16000000000001%

Train Set Confusion Matrix:
 [[3009  424]
 [  21  333]]
Test Set Confusion Matrix:
 [[1296  175]
 [  18  134]]


In [None]:
# params = {'n_estimators': [100,500,1000,2500,5000,6000],
#                 'learning_rate':[0.01,0.05,0.08,0.1, 0.25],
#                 'max_depth': np.arange(2, 21, 3),
#                 'min_samples_split': np.arange(2, 13, 2),
#                 'min_samples_leaf' : np.arange(2, 13, 2)}

# gradBoostRS = ms.RandomizedSearchCV(gradBoostModel,
#                                     param_distributions=params,
#                                     cv=skfold, scoring = 'recall',
#                                     return_train_score = True,
#                                     verbose=1, n_jobs=(-1))
# gradBoostRSfit = gradBoostRS.fit(X_train_SMOTE, y_train_SMOTE)

# bestGradBoostRS = gradBoostRSfit.best_estimator_
# dump(bestGradBoostRS, './data/Iteration_1/bestGradBoostRS.pkl')

# bestGradBoostRS = load('./data/Iteration_1/bestGradBoostRS.pkl')
# model_results(bestGradBoostRS)

## Support Vector Machine

In [29]:
# # baseline model

svm = \
    SVC(random_state=0, class_weight='balanced').fit(X_train, y_train)

dump(svm, './data/Iteration_1/svm.pkl')

svm = load('./data/Iteration_1/svm.pkl')
model_results(svm)

Model details: SVC(class_weight='balanced', random_state=0) 

Train Set Recall Score: 93.78999999999999%
Test Set Recall Score: 92.11%

Train Set Confusion Matrix:
 [[2722  711]
 [  22  332]]
Test Set Confusion Matrix:
 [[1177  294]
 [  12  140]]


In [31]:
# # grid search

params = {'C': np.logspace(-3, 2, 100),
          'kernel': ['linear', 'poly', 'rbf'],
          'degree': [1, 3, 5],
          'gamma': ['scale', 'auto']}

svmRS = \
    ms.RandomizedSearchCV(svm, param_grid=params,
                          scoring='recall', cv=skfold,
                          n_jobs=(-1), verbose=1,
                          return_train_score=True
                         ).fit(X_train, y_train)

bestSVM = svmRS.best_estimator_
dump(bestSVM, './data/Iteration_1/bestSVM.pkl')

bestSVM = load('./data/Iteration_1/bestSVM.pkl')
model_results(bestSVM)

# Model details: SVC(C=0.5689866029018299, class_weight='balanced', random_state=0)
# Train Set Recall Score: 92.09%
# Test Set Recall Score: 92.11%

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   43.2s finished


Model details: SVC(C=0.5689866029018299, class_weight='balanced', random_state=0) 

Train Set Recall Score: 92.09%
Test Set Recall Score: 92.11%

Train Set Confusion Matrix:
 [[2598  835]
 [  28  326]]
Test Set Confusion Matrix:
 [[1131  340]
 [  12  140]]


In [None]:
# # more detailed grid search, worse results

# params = {'C': np.logspace(-3, 2, 50),
#           'kernel': ['rbf', 'linear'],
#           'degree': [1, 3, 5],
#           'gamma': ['scale', 'auto']}

# ms.GridSearchCV(svmModel, param_grid=params, scoring='recall',
#                 cv=skfold, n_jobs=(-1), verbose=1).fit(X_train, y_train)

# Model details: SVC(C=0.06866488450043001, class_weight='balanced', degree=1, kernel='linear',
#     random_state=0) 

# Train Set Recall Score: 93.22%
# Test Set Recall Score: 92.11%

# Train Set Confusion Matrix:
#  [[2339 1094]
#  [  24  330]]
# Test Set Confusion Matrix:
#  [[1030  441]
#  [  12  140]]