## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

from imblearn import over_sampling

In [2]:
providers = load('./data/Providers_Third.pkl')

In [3]:
X = providers.drop('PotentialFraud', axis=1)
y = providers.PotentialFraud

## Pre-processing

In [4]:
# # 70/30 split gives roughly the same baseline model results
# # as 80/20 and 90/10, but saves grid_search time

X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size = 0.3, random_state = 0, stratify=y)

In [5]:
# # Scale only the training data to avoid data leakage
# # MinMax performed better or about the same as StandardScaler,
# # RobustScaler, and Normalize on most models

scaler = pp.MinMaxScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [6]:
# Upsample for K-nearest Neighbors

oversample = over_sampling.SMOTE(random_state=0)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

In [7]:
# # Stratify folds so that classes always have the same sample ratio
# # n_splits=10 to maximize testing for this small dataset

skfold = ms.StratifiedKFold(n_splits=5, random_state=0, shuffle=True);

In [8]:
def model_results(model,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
        train_recall = recall_score(y_train, model.predict(X_train))
        test_recall = recall_score(y_test, model.predict(X_test))
        
        print('Model details:', model, '\n')
        print('Train Set Recall Score:',
              f'{round(train_recall, 4) * 100}%')
        print('Test Set Recall Score:',
              f'{round(test_recall, 4) * 100}%')
        print('\nTrain Set Confusion Matrix:\n',
              confusion_matrix(y_train, model.predict(X_train)))
        print('Test Set Confusion Matrix:\n',
              confusion_matrix(y_test, model.predict(X_test)))

## Logistic Regression

### Models

In [9]:
# # baseline model

# logRegCV = \
#     LogisticRegressionCV(class_weight='balanced',
#                          cv=skfold, scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCV, './data/Iteration_3/logRegCV.pkl')

logRegCV = load('./data/Iteration_3/logRegCV.pkl')
model_results(logRegCV)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, random_state=0, scoring='recall', verbose=1) 

Train Set Recall Score: 87.29%
Test Set Recall Score: 86.18%

Train Set Confusion Matrix:
 [[2256 1177]
 [  45  309]]
Test Set Confusion Matrix:
 [[1003  468]
 [  21  131]]


In [10]:
# # baseline model
# # L1 penalty for feature selection, liblinear solver faster than saga

# logRegCVL1 = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1
#                         ).fit(X_train, y_train)

# dump(logRegCVL1, './data/Iteration_3/logRegCVL1.pkl')

logRegCVL1 = load('./data/Iteration_3/logRegCVL1.pkl')
model_results(logRegCVL1)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, penalty='l1', random_state=0, scoring='recall',
                     solver='liblinear', verbose=1) 

Train Set Recall Score: 90.11%
Test Set Recall Score: 89.47%

Train Set Confusion Matrix:
 [[2891  542]
 [  35  319]]
Test Set Confusion Matrix:
 [[1251  220]
 [  16  136]]


In [11]:
# # grid search (fine-tuned)

# logRegModel = \
#     LogisticRegression(penalty='l1', solver='liblinear', max_iter=150,
#                        class_weight='balanced', random_state=0)

# params = {'C': np.logspace(-1, -0.25, 100)}

# logRegRS = ms.RandomizedSearchCV(logRegModel, param_distributions=params,
#                            scoring='recall', cv=skfold, verbose=1,
#                            return_train_score=True
#                           ).fit(X_train, y_train)

# bestLogReg = logRegRS.best_estimator_

# dump(bestLogReg, './data/Iteration_3/bestLogReg.pkl')

bestLogReg = load('./data/Iteration_3/bestLogReg.pkl')
model_results(bestLogReg)
print('\nAbout 1% worse than Iteration 1')

Model details: LogisticRegression(C=0.31622776601683794, class_weight='balanced', max_iter=150,
                   penalty='l1', random_state=0, solver='liblinear') 

Train Set Recall Score: 90.4%
Test Set Recall Score: 90.13%

Train Set Confusion Matrix:
 [[2879  554]
 [  34  320]]
Test Set Confusion Matrix:
 [[1248  223]
 [  15  137]]

About 1% worse than Iteration 1


In [12]:
# # grid search tuning steps

# params = {'C': np.logspace(-2, 2, 50),
#           'max_iter': [150, 200]}
# Model details: LogisticRegression(C=0.13894954943731375, class_weight='balanced', max_iter=200,
#                    penalty='l1', random_state=0, solver='liblinear') 

# Train Set Recall Score: 88.7%
# Test Set Recall Score: 91.45%

# Train Set Confusion Matrix:
#  [[2797  636]
#  [  40  314]]
# Test Set Confusion Matrix:
#  [[1229  242]
#  [  13  139]]

# params = {'C': np.logspace(-4, 4, 100),
#           'max_iter': [150, 250, 500]}
# Model details: LogisticRegression(C=1.0974987654930568, class_weight='balanced', max_iter=150,
#                    penalty='l1', random_state=0, solver='liblinear') 
# Train Set Recall Score: 91.24%
# Test Set Recall Score: 88.16000000000001%
# Train Set Confusion Matrix:
#  [[2930  503]
#  [  31  323]]
# Test Set Confusion Matrix:
#  [[1261  210]
#  [  18  134]]

# params = {'C': np.logspace(-4, 4, 100),
#           'max_iter': [75, 150, 225]}
# Model details: LogisticRegression(C=0.1176811952434999, class_weight='balanced', max_iter=150,
#                    penalty='l1', random_state=0, solver='liblinear') 

# Train Set Recall Score: 88.7%
# Test Set Recall Score: 92.11%

# Train Set Confusion Matrix:
#  [[2792  641]
#  [  40  314]]
# Test Set Confusion Matrix:
#  [[1219  252]
#  [  12  140]]

# params = {'C': np.logspace(-1.25, -0.5, 100),
#           'max_iter': [150]}
# Model details: LogisticRegression(C=0.2610157215682537, class_weight='balanced', max_iter=150,
#                    penalty='l1', random_state=0, solver='liblinear') 

# Train Set Recall Score: 90.11%
# Test Set Recall Score: 91.45%

# Train Set Confusion Matrix:
#  [[2866  567]
#  [  35  319]]
# Test Set Confusion Matrix:
#  [[1243  228]
#  [  13  139]]

### Results

In [13]:
coefficients = pd.DataFrame(bestLogReg.coef_.T, index=X.columns
                           ).rename(columns = {0:'Coefficient'}
                           ).abs().sort_values(by='Coefficient',
                                               ascending=False)
coefficients[coefficients.Coefficient > 0]

Unnamed: 0,Coefficient
OP_Count_UniquePatients,8.90492
Ratio_ClaimsPerPatient,8.490204
IP_Count_UniquePatients,4.987
Perc_Outpatient,4.029324
OP_AgeRange,2.390386
IP_Mean_ClaimDuration,2.353617
IP_Mean_DailyClaimCost,2.305638
IP_Mean_AdmitDuration,2.049706
OP_Perc_KidneyDisease_Chronic,1.675708
IP_Mean_ClaimCost,1.516525


## K-nearest Neighbors

In [14]:
# # baseline model with SMOTE

knn = KNeighborsClassifier().fit(X_train_SMOTE, y_train_SMOTE)

dump(knn, './data/Iteration_3/knn.pkl')

knn = load('./data/Iteration_3/knn.pkl')
model_results(knn, X_train_SMOTE, y_train_SMOTE)

Model details: KNeighborsClassifier() 

Train Set Recall Score: 100.0%
Test Set Recall Score: 90.13%

Train Set Confusion Matrix:
 [[2677  756]
 [   0 3433]]
Test Set Confusion Matrix:
 [[1096  375]
 [  15  137]]


In [15]:
# # grid search (fine-tuned)

# params = {'n_neighbors': np.arange(75, 150)}

# knnGS = \
#     ms.GridSearchCV(knn, param_grid=params,
#                           cv=skfold, n_jobs=(-1), verbose = 1,
#                           return_train_score = True,
#                           scoring = 'recall').fit(X_train_SMOTE,
#                                                   y_train_SMOTE)

# bestKNN = knnGS.best_estimator_
# dump(bestKNN, './data/Iteration_3/bestKNN.pkl')

bestKNN = load('./data/Iteration_3/bestKNN.pkl')
model_results(bestKNN, X_train_SMOTE, y_train_SMOTE)
print('\nAbout 2% worse than Iteration 1')

# params = {'n_neighbors': np.arange(30, 41)}
# Model details: KNeighborsClassifier(n_neighbors=31) 
# Train Set Recall Score: 98.46000000000001%
# Test Set Recall Score: 94.74000000000001%
# Train Set Confusion Matrix:
#  [[2282 1151]
#  [  53 3380]]
# Test Set Confusion Matrix:
#  [[983 488]
#  [  8 144]]

# params = {'n_neighbors': np.arange(100, 150)}
# Model details: KNeighborsClassifier(n_neighbors=103)
# Train Set Recall Score: 96.3%
# Test Set Recall Score: 94.08%
# Train Set Confusion Matrix:
#  [[2196 1237]
#  [ 127 3306]]
# Test Set Confusion Matrix:
#  [[958 513]
#  [  9 143]

Model details: KNeighborsClassifier(n_neighbors=77) 

Train Set Recall Score: 96.74000000000001%
Test Set Recall Score: 94.08%

Train Set Confusion Matrix:
 [[2203 1230]
 [ 112 3321]]
Test Set Confusion Matrix:
 [[963 508]
 [  9 143]]

About 2% worse than Iteration 1


## Gaussian Naive Bayes

In [16]:
# # baseline model

gnb = GaussianNB().fit(X_train, y_train)

dump(gnb, './data/Iteration_3/gnb.pkl')

gnb = load('./data/Iteration_3/gnb.pkl')
model_results(gnb)

Model details: GaussianNB() 

Train Set Recall Score: 81.92%
Test Set Recall Score: 80.92%

Train Set Confusion Matrix:
 [[2855  578]
 [  64  290]]
Test Set Confusion Matrix:
 [[1233  238]
 [  29  123]]


In [17]:
# # grid search (fine-tuned)

params = {'var_smoothing': np.arange(0.04, 0.6, .001)}

gnbModel = ms.GridSearchCV(gnb, param_grid=params,
                             scoring='recall', cv=skfold,
                             n_jobs=(-1), verbose=1
                            ).fit(X_train, y_train)

bestGNB = gnbModel.best_estimator_
dump(bestGNB, './data/Iteration_3/bestGNB.pkl')

bestGNB = load('./data/Iteration_3/bestGNB.pkl')
model_results(bestGNB)
print('\nExactly the same as Iteration 1')

# ----------------------------------------------
# params = {'var_smoothing': np.arange(0.01, 0.2, .0001)}
# Model details: GaussianNB(var_smoothing=0.050999999999999754) 
# Train Set Recall Score: 88.42%
# Test Set Recall Score: 86.83999999999999%
# Train Set Confusion Matrix:
#  [[2425 1008]
#  [  41  313]]
# Test Set Confusion Matrix:
#  [[1077  394]
#  [  20  132]]

Fitting 5 folds for each of 560 candidates, totalling 2800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:    7.1s


Model details: GaussianNB(var_smoothing=0.05000000000000001) 

Train Set Recall Score: 88.14%
Test Set Recall Score: 86.83999999999999%

Train Set Confusion Matrix:
 [[2371 1062]
 [  42  312]]
Test Set Confusion Matrix:
 [[1047  424]
 [  20  132]]

Exactly the same as Iteration 1


[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed:   15.4s finished


## Tree Models

### Random Forest

In [18]:
# # baseline model

randForest = \
    RandomForestClassifier(class_weight='balanced', random_state=0
                          ).fit(X_train, y_train)

dump(randForest, './data/Iteration_3/randForest.pkl')

randForest = load('./data/Iteration_3/randForest.pkl')
model_results(randForest)

Model details: RandomForestClassifier(class_weight='balanced', random_state=0) 

Train Set Recall Score: 100.0%
Test Set Recall Score: 41.449999999999996%

Train Set Confusion Matrix:
 [[3433    0]
 [   0  354]]
Test Set Confusion Matrix:
 [[1457   14]
 [  89   63]]


In [21]:
# # grid search (fine-tuned)

# randForest = \
#     RandomForestClassifier(class_weight='balanced_subsample', random_state=0
#                           ).fit(X_train, y_train)

# params = {'max_depth': np.arange(2, 5),
#           'min_samples_split': np.arange(3, 7),
#           'min_samples_leaf': np.arange(3, 7),
#           'n_estimators': [500, 800, 1000]}

# randForestGS = \
#     ms.GridSearchCV(randForest,
#                           param_grid=params,
#                           scoring='recall', cv=skfold,
#                           n_jobs=(-1), verbose=1,
#                           return_train_score=True
#                           ).fit(X_train, y_train)

# bestRandForest = randForestGS.best_estimator_
# dump(bestRandForest, './data/Iteration_3/bestRandForest.pkl')

bestRandForest = load('./data/Iteration_3/bestRandForest.pkl')
model_results(bestRandForest)
print('\nAbout 1% better than Iteration 1')

Model details: RandomForestClassifier(class_weight='balanced_subsample', max_depth=3,
                       min_samples_leaf=3, min_samples_split=3,
                       n_estimators=800, random_state=0) 

Train Set Recall Score: 91.24%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2844  589]
 [  31  323]]
Test Set Confusion Matrix:
 [[1235  236]
 [  13  139]]

About 1% better than Iteration 1


In [22]:
# # grid search tuning steps

# params = {'max_depth': np.arange(2, 5),
#           'min_samples_split': np.arange(3, 7),
#           'min_samples_leaf': np.arange(3, 7)}
# Model details: RandomForestClassifier(class_weight='balanced',
#                    max_depth=3, min_samples_leaf=3,
#                    min_samples_split=3, random_state=0) 
# Train Set Recall Score: 91.81%
# Test Set Recall Score: 88.16000000000001%
# Train Set Confusion Matrix:
#  [[2905  528]
#  [  29  325]]
# Test Set Confusion Matrix:
#  [[1268  203]
#  [  18  134]]

# params = {'max_depth': np.arange(2, 5),
#           'min_samples_split': np.arange(3, 7),
#           'min_samples_leaf': np.arange(3, 7),
#           'n_estimators': [300, 335, 375]}
# Model details: RandomForestClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=3,
#                        min_samples_split=3, n_estimators=335, random_state=0) 
# Train Set Recall Score: 91.53%
# Test Set Recall Score: 88.82%
# Train Set Confusion Matrix:
#  [[2902  531]
#  [  30  324]]
# Test Set Confusion Matrix:
#  [[1263  208]
#  [  17  135]]

### Gradient Boosting

In [23]:
# # baseline model with sample_weight

gradBoostModel = \
    GradientBoostingClassifier(max_features='auto', random_state=0)
gradBoostModel.fit(X_train, y_train, sample_weight=
    compute_sample_weight(class_weight='balanced', y=y_train))

dump(gradBoostModel, './data/Iteration_3/gradBoostModel.pkl')

gradBoostModel = load('./data/Iteration_3/gradBoostModel.pkl')
model_results(gradBoostModel)

Model details: GradientBoostingClassifier(max_features='auto', random_state=0) 

Train Set Recall Score: 99.15%
Test Set Recall Score: 80.25999999999999%

Train Set Confusion Matrix:
 [[3222  211]
 [   3  351]]
Test Set Confusion Matrix:
 [[1348  123]
 [  30  122]]


In [24]:
# # grid search (fine-tuned)

# gradBoostModel = \
#     GradientBoostingClassifier(max_features='auto',
#                                learning_rate=0.01,
#                                min_samples_leaf=12,
#                                random_state=0)

# gradBoostModel.fit(X_train, y_train, sample_weight=
#     compute_sample_weight(class_weight='balanced', y=y_train))

# params = {'n_estimators': [200, 500, 750],
#           'min_samples_split': [22, 24, 26],
#           'max_depth': [1, 2, 3]}

# gradBoostRS = ms.RandomizedSearchCV(gradBoostModel,
#                                     param_distributions=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=(-1), verbose=1,
#                                     return_train_score=True)
# gradBoostRS.fit(X_train, y_train,
#     sample_weight=\
#         compute_sample_weight(class_weight='balanced', y=y_train))

# bestGradBoost = gradBoostRS.best_estimator_
# dump(bestGradBoost, './data/Iteration_3/bestGradBoost.pkl')

bestGradBoost = load('./data/Iteration_3/bestGradBoost.pkl')
model_results(bestGradBoost)
print('\nAbout 1% worse than Iteration 1')

Model details: GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='auto',
                           min_samples_leaf=12, min_samples_split=22,
                           n_estimators=200, random_state=0) 

Train Set Recall Score: 91.81%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2931  502]
 [  29  325]]
Test Set Confusion Matrix:
 [[1278  193]
 [  13  139]]

About 1% worse than Iteration 1


In [25]:
# # grid search tuning steps

# gradBoostModel = \
#     GradientBoostingClassifier(max_features='auto',
#                                learning_rate=0.001,
#                                min_samples_leaf=12,
#                                random_state=0)
# params = {'n_estimators': [950, 1050, 1200],
#           'min_samples_split': [22, 24, 26],
#           'max_depth': [1, 2, 3]}
# Model details: GradientBoostingClassifier(learning_rate=0.001, max_depth=2,
#                            max_features='auto', min_samples_leaf=12,
#                            min_samples_split=24, n_estimators=1050,
#                            random_state=0) 
# Train Set Recall Score: 91.53%
# Test Set Recall Score: 90.79%
# Train Set Confusion Matrix:
#  [[2895  538]
#  [  30  324]]
# Test Set Confusion Matrix:
#  [[1261  210]
#  [  14  138]]

# gradBoostModel = \
#     GradientBoostingClassifier(max_features='auto',
#                                learning_rate=0.01,
#                                random_state=0)
# params = {'n_estimators': [950, 1050, 1200],
#           'min_samples_split': [22, 24, 26],
#           'max_depth': [1, 2, 3]}
# Model details: GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features='auto',
#                            min_samples_split=26, n_estimators=950,
#                            random_state=0) 
# Train Set Recall Score: 95.19999999999999%
# Test Set Recall Score: 87.5%
# Train Set Confusion Matrix:
#  [[3079  354]
#  [  17  337]]
# Test Set Confusion Matrix:
#  [[1321  150]
#  [  19  133]]

## Support Vector Machine

In [26]:
# # baseline model

svm = \
    SVC(random_state=0, class_weight='balanced').fit(X_train, y_train)

dump(svm, './data/Iteration_3/svm.pkl')

svm = load('./data/Iteration_3/svm.pkl')
model_results(svm)

Model details: SVC(class_weight='balanced', random_state=0) 

Train Set Recall Score: 93.78999999999999%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2770  663]
 [  22  332]]
Test Set Confusion Matrix:
 [[1204  267]
 [  13  139]]


In [27]:
# # grid search

# params = {'C': np.logspace(-3, 2, 100),
#           'kernel': ['linear', 'poly', 'rbf'],
#           'degree': [1, 5, 12],
#           'gamma': ['scale', 'auto']}

# svmRS = \
#     ms.RandomizedSearchCV(svm, param_distributions=params,
#                           scoring='recall', cv=skfold,
#                           n_jobs=(-1), verbose=1,
#                           return_train_score=True
#                          ).fit(X_train, y_train)

# bestSVM = svmRS.best_estimator_
# dump(bestSVM, './data/Iteration_3/bestSVM.pkl')

bestSVM = load('./data/Iteration_3/bestSVM.pkl')
model_results(bestSVM)
print('\nComparable score to Iteration 1 but fewer false classifications')

Model details: SVC(C=13.848863713938718, class_weight='balanced', degree=1, gamma='auto',
    random_state=0) 

Train Set Recall Score: 92.66%
Test Set Recall Score: 92.75999999999999%

Train Set Confusion Matrix:
 [[2791  642]
 [  26  328]]
Test Set Confusion Matrix:
 [[1218  253]
 [  11  141]]

Comparable score to Iteration 1 but fewer false classifications
