## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

from imblearn import over_sampling

In [2]:
# Read the pickled provider-level table and key it by its 'Provider' column.
# NOTE(review): joblib.load unpickles the file, so only point it at
# artifacts produced by this project.
providers = load('./data/Providers_Iteration_1.pkl').set_index('Provider')

In [3]:
# Separate the target column from the feature columns.
y = providers['PotentialFraud']
X = providers.drop(columns='PotentialFraud')

## Pre-processing

In [4]:
# 70/30 split gives roughly the same baseline model results
# as 80/20 and 90/10, but saves grid_search time.
# The split is stratified on y and seeded so it is repeatable.

X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [5]:
# Fit the scaler on the training split only (so no test-set statistics
# leak into it), then apply that same fitted transform to both splits.
# MinMax performed better or about the same as StandardScaler,
# RobustScaler, and Normalize on most models

scaler = pp.MinMaxScaler()

# The scaler returns bare arrays, so the frames are rebuilt here. Pass the
# original row index back in: without index= the rows would be renumbered
# 0..n-1 and X_train / X_test would no longer line up with the
# Provider-indexed y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X.columns, index=X_test.index)

In [6]:
# Build a SMOTE-resampled copy of the (scaled) training split.
# It is used for K-nearest Neighbors and for the GaussianNB search;
# the test split is never resampled.

oversample = over_sampling.SMOTE(random_state=0)
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(
    X_train,
    y_train,
)

In [7]:
# Stratify folds so that classes always have the same sample ratio.
# Five shuffled folds with a fixed seed, shared by every CV / search
# below so their scores are computed on identical splits.

skfold = ms.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [8]:
def model_results(model,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Print recall and confusion matrices for a fitted classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, y_train
        Features / labels reported as the "Train Set". The defaults are
        the notebook-level ``X_train`` / ``y_train`` objects as they exist
        when this cell is executed (Python binds defaults at definition
        time), so re-run this cell after re-creating the split.
    X_test, y_test
        Features / labels reported as the "Test Set"; same default rule.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Predict once per split and reuse the result for both metrics.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_recall = recall_score(y_train, train_pred)
    test_recall = recall_score(y_test, test_pred)

    print('Model details:', model, '\n')
    # Let the format spec do the rounding: round(x, 4) * 100 re-introduces
    # binary float noise and prints values like 89.27000000000001%.
    print('Train Set Recall Score:', f'{train_recall * 100:.2f}%')
    print('Test Set Recall Score:', f'{test_recall * 100:.2f}%')
    print('\nTrain Set Confusion Matrix:\n',
          confusion_matrix(y_train, train_pred))
    print('Test Set Confusion Matrix:\n',
          confusion_matrix(y_test, test_pred))

## Logistic Regression

### Models

In [9]:
# Baseline: cross-validated logistic regression with balanced class
# weights, selected on recall over the shared stratified folds.
# The fitted model is cached on disk and only re-trained when the pickle
# is missing, so the notebook runs from a fresh kernel either way.

logRegCV_path = './data/Iteration_1/logRegCV.pkl'

try:
    logRegCV = load(logRegCV_path)
except FileNotFoundError:
    logRegCV = LogisticRegressionCV(
        class_weight='balanced', cv=skfold, scoring='recall',
        random_state=0, n_jobs=-1, verbose=1,
    ).fit(X_train, y_train)
    dump(logRegCV, logRegCV_path)

model_results(logRegCV)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, random_state=0, scoring='recall', verbose=1) 

Train Set Recall Score: 89.27000000000001%
Test Set Recall Score: 86.83999999999999%

Train Set Confusion Matrix:
 [[1818 1615]
 [  38  316]]
Test Set Confusion Matrix:
 [[820 651]
 [ 20 132]]


In [10]:
# Baseline with an L1 penalty for feature selection; liblinear solver
# was faster than saga.
# Cached on disk; re-trained only when the pickle is missing.

logRegCVL1_path = './data/Iteration_1/logRegCVL1.pkl'

try:
    logRegCVL1 = load(logRegCVL1_path)
except FileNotFoundError:
    logRegCVL1 = LogisticRegressionCV(
        penalty='l1', solver='liblinear', cv=skfold,
        class_weight='balanced', scoring='recall',
        random_state=0, n_jobs=-1, verbose=1,
    ).fit(X_train, y_train)
    dump(logRegCVL1, logRegCVL1_path)

model_results(logRegCVL1)

Model details: LogisticRegressionCV(class_weight='balanced',
                     cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                     n_jobs=-1, penalty='l1', random_state=0, scoring='recall',
                     solver='liblinear', verbose=1) 

Train Set Recall Score: 90.4%
Test Set Recall Score: 88.16000000000001%

Train Set Confusion Matrix:
 [[1881 1552]
 [  34  320]]
Test Set Confusion Matrix:
 [[811 660]
 [ 18 134]]


In [11]:
# Fine-tuned grid search over C for the L1 / liblinear logistic model,
# scored on recall. Cached on disk; the search only runs when the
# pickle is missing.

bestLogReg_path = './data/Iteration_1/bestLogReg.pkl'

try:
    bestLogReg = load(bestLogReg_path)
except FileNotFoundError:
    # NOTE(review): n_jobs is kept so a refit reports the same parameters
    # as the saved model; scikit-learn documents it as ignored for the
    # liblinear solver, so it could be dropped.
    logRegModel = LogisticRegression(
        penalty='l1', solver='liblinear',
        class_weight='balanced', random_state=0, n_jobs=-1,
    )
    params = {'C': np.linspace(0.06, 0.2, 100)}
    logRegGS = ms.GridSearchCV(
        logRegModel, param_grid=params,
        scoring='recall', cv=skfold, verbose=1,
        return_train_score=True,
    ).fit(X_train, y_train)
    bestLogReg = logRegGS.best_estimator_
    dump(bestLogReg, bestLogReg_path)

model_results(bestLogReg)

# --------------------------------------------------------------------
# Earlier tuning step, kept for reference:
# params = {'C': np.logspace(-2, 2, 50)}
# Model details:
# LogisticRegression(C=0.07906043210907697,
#                    class_weight='balanced', n_jobs=-1,
#                    penalty='l1', random_state=0, solver='liblinear') 

# Train Set Recall Score: 91.81%
# Test Set Recall Score: 91.45%

Model details: LogisticRegression(C=0.07838383838383839, class_weight='balanced', n_jobs=-1,
                   penalty='l1', random_state=0, solver='liblinear') 

Train Set Recall Score: 91.81%
Test Set Recall Score: 91.45%

Train Set Confusion Matrix:
 [[2576  857]
 [  29  325]]
Test Set Confusion Matrix:
 [[1117  354]
 [  13  139]]


### Results

In [12]:
# Rank features by the absolute size of their weight in the tuned L1
# model, then show only those with a non-zero weight.
coef_abs = pd.DataFrame(bestLogReg.coef_.T, index=X.columns)
coef_abs = coef_abs.rename(columns={0: 'Coefficient'}).abs()
coefficients = coef_abs.sort_values(by='Coefficient', ascending=False)
coefficients.loc[coefficients['Coefficient'] > 0]

Unnamed: 0,Coefficient
OP_Count_UniquePatients,7.767074
IP_Count_UniquePatients,4.806858
Ratio_ClaimsPerPatient,4.38036
Perc_Outpatient,2.021702
DualPatientProvider,1.601835
OP_Perc_MultHosp,0.631752
OP_Count_UniqueState,0.629165
IP_Mean_AdmitDuration,0.349614
IP_Mean_InsReimbursementRatio,0.223584
IP_Perc_HeartFailure_Chronic,0.127918


## K-nearest Neighbors

In [13]:
# Baseline K-nearest Neighbors (default settings) fitted on the
# SMOTE-resampled training data.
# Cached on disk; re-trained only when the pickle is missing.

knn_path = './data/Iteration_1/knn.pkl'

try:
    knn = load(knn_path)
except FileNotFoundError:
    knn = KNeighborsClassifier().fit(X_train_SMOTE, y_train_SMOTE)
    dump(knn, knn_path)

# "Train" figures are reported on the resampled data the model saw;
# the test split stays un-resampled.
model_results(knn, X_train_SMOTE, y_train_SMOTE)

Model details: KNeighborsClassifier() 

Train Set Recall Score: 100.0%
Test Set Recall Score: 92.75999999999999%

Train Set Confusion Matrix:
 [[2612  821]
 [   0 3433]]
Test Set Confusion Matrix:
 [[1062  409]
 [  11  141]]


In [14]:
# Fine-tuned grid search over n_neighbors, scored on recall.
# Cached on disk; the search only runs when the pickle is missing.
# NOTE(review): the folds are drawn from data that was already
# SMOTE-resampled, so validation folds contain synthetic points built
# from rows in the training folds; CV recall here is likely optimistic.

bestKNN_path = './data/Iteration_1/bestKNN.pkl'

try:
    bestKNN = load(bestKNN_path)
except FileNotFoundError:
    params = {'n_neighbors': np.arange(20, 30)}
    knnGS = ms.GridSearchCV(
        knn, param_grid=params,
        cv=skfold, n_jobs=-1, verbose=1,
        return_train_score=True, scoring='recall',
    ).fit(X_train_SMOTE, y_train_SMOTE)
    bestKNN = knnGS.best_estimator_
    dump(bestKNN, bestKNN_path)

model_results(bestKNN, X_train_SMOTE, y_train_SMOTE)

Model details: KNeighborsClassifier(n_neighbors=21) 

Train Set Recall Score: 98.98%
Test Set Recall Score: 96.05%

Train Set Confusion Matrix:
 [[2259 1174]
 [  35 3398]]
Test Set Confusion Matrix:
 [[970 501]
 [  6 146]]


In [15]:
# # grid search tuning steps

# params = {'n_neighbors': [60, 75, 90, 200, 500, 1000]}
# Model details: KNeighborsClassifier(n_neighbors=60) 
# Train Set Recall Score: 97.44%
# Test Set Recall Score: 94.74000000000001%
# Train Set Confusion Matrix:
#  [[2160 1273]
#  [  88 3345]]
# Test Set Confusion Matrix:
#  [[942 529]
#  [  8 144]]

# params = {'n_neighbors': np.arange(50, 75, 5)}
# Model details: KNeighborsClassifier(n_neighbors=50) 
# Train Set Recall Score: 97.64%
# Test Set Recall Score: 94.74000000000001%
# Train Set Confusion Matrix:
#  [[2167 1266]
#  [  81 3352]]
# Test Set Confusion Matrix:
#  [[949 522]
#  [  8 144]]

# params = {'n_neighbors': np.arange(30, 55, 5)}
# Model details: KNeighborsClassifier(n_neighbors=30) 
# Train Set Recall Score: 98.08%
# Test Set Recall Score: 96.05%
# Train Set Confusion Matrix:
#  [[2226 1207]
#  [  66 3367]]
# Test Set Confusion Matrix:
#  [[961 510]
#  [  6 146]]

## Gaussian Naive Bayes

In [16]:
# Baseline Gaussian Naive Bayes (default settings) on the scaled,
# un-resampled training split.
# Cached on disk; re-trained only when the pickle is missing.

gnb_path = './data/Iteration_1/gnb.pkl'

try:
    gnb = load(gnb_path)
except FileNotFoundError:
    gnb = GaussianNB().fit(X_train, y_train)
    dump(gnb, gnb_path)

model_results(gnb)

Model details: GaussianNB() 

Train Set Recall Score: 85.03%
Test Set Recall Score: 82.24000000000001%

Train Set Confusion Matrix:
 [[2709  724]
 [  53  301]]
Test Set Confusion Matrix:
 [[1188  283]
 [  27  125]]


In [17]:
# Fine-tuned grid search over var_smoothing, scored on recall and fitted
# on the SMOTE-resampled training data.
# Cached on disk; the search only runs when the pickle is missing.
# NOTE(review): resampling happened before the CV split, so validation
# folds include synthetic points derived from training-fold rows.

bestGNB_path = './data/Iteration_1/bestGNB.pkl'

try:
    bestGNB = load(bestGNB_path)
except FileNotFoundError:
    params = {'var_smoothing': np.arange(0.01, 0.2, .0001)}
    gnbModel = ms.GridSearchCV(
        gnb, param_grid=params,
        scoring='recall', cv=skfold,
        n_jobs=-1, verbose=1,
    ).fit(X_train_SMOTE, y_train_SMOTE)
    bestGNB = gnbModel.best_estimator_
    dump(bestGNB, bestGNB_path)

model_results(bestGNB, X_train_SMOTE, y_train_SMOTE)

# ----------------------------------------------
# Earlier tuning steps, kept for reference:
# params = {'var_smoothing': [0.001, 0.01, 0.1]}
# Model details: GaussianNB(var_smoothing=0.1) 
# Train Set Recall Score: 87.29%
# Test Set Recall Score: 86.18%
# Train Set Confusion Matrix:
#  [[2289 1144]
#  [  45  309]]
# Test Set Confusion Matrix:
#  [[1019  452]
#  [  21  131]]

# params = {'var_smoothing': np.arange(0.005, 0.25, .001)}
# Model details: GaussianNB(var_smoothing=0.008) 

# Train Set Recall Score: 88.67%
# Test Set Recall Score: 86.83999999999999%

# Train Set Confusion Matrix:
#  [[2576  857]
#  [ 389 3044]]
# Test Set Confusion Matrix:
#  [[1128  343]
#  [  20  132]]

Model details: GaussianNB(var_smoothing=0.016199999999999964) 

Train Set Recall Score: 88.55%
Test Set Recall Score: 87.5%

Train Set Confusion Matrix:
 [[2503  930]
 [ 393 3040]]
Test Set Confusion Matrix:
 [[1098  373]
 [  19  133]]


## Tree Models

### Random Forest

In [18]:
# Baseline random forest with balanced class weights.
# Cached on disk; re-trained only when the pickle is missing.

randForest_path = './data/Iteration_1/randForest.pkl'

try:
    randForest = load(randForest_path)
except FileNotFoundError:
    randForest = RandomForestClassifier(
        class_weight='balanced', random_state=0,
    ).fit(X_train, y_train)
    dump(randForest, randForest_path)

model_results(randForest)

Model details: RandomForestClassifier(class_weight='balanced', random_state=0) 

Train Set Recall Score: 99.72%
Test Set Recall Score: 41.449999999999996%

Train Set Confusion Matrix:
 [[3433    0]
 [   1  353]]
Test Set Confusion Matrix:
 [[1455   16]
 [  89   63]]


In [19]:
# Fine-tuned grid search over tree depth and split/leaf sizes, scored on
# recall. Cached on disk; the search only runs when the pickle is missing.

bestRandForest_path = './data/Iteration_1/bestRandForest.pkl'

try:
    bestRandForest = load(bestRandForest_path)
except FileNotFoundError:
    params = {'max_depth': np.arange(2, 5),
              'min_samples_split': np.arange(3, 7),
              'min_samples_leaf': np.arange(3, 7)}
    randForestGS = ms.GridSearchCV(
        randForest, param_grid=params,
        scoring='recall', cv=skfold,
        n_jobs=-1, verbose=1,
        return_train_score=True,
    ).fit(X_train, y_train)
    bestRandForest = randForestGS.best_estimator_
    dump(bestRandForest, bestRandForest_path)

model_results(bestRandForest)

# --------------------------------------------------------------
# Earlier tuning step, kept for reference:
# RandomizedSearchCV
# params = {'n_estimators': [50, 100, 200, 500],
#           'max_depth': np.arange(2, 7),
#           'min_samples_split': np.arange(2, 7),
#           'min_samples_leaf': np.arange(2, 7),
#           'class_weight': ['balanced', 'balanced_subsample']}
# Model details: RandomForestClassifier(class_weight='balanced',
#                    max_depth=3, min_samples_leaf=5,
#                    min_samples_split=5, random_state=0) 
# Train Set Recall Score: 90.96%
# Test Set Recall Score: 90.13%
# Train Set Confusion Matrix:
#  [[2822  611]
#  [  32  322]]
# Test Set Confusion Matrix:
#  [[1233  238]
#  [  15  137]]

Model details: RandomForestClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=3,
                       min_samples_split=3, random_state=0) 

Train Set Recall Score: 90.96%
Test Set Recall Score: 90.13%

Train Set Confusion Matrix:
 [[2822  611]
 [  32  322]]
Test Set Confusion Matrix:
 [[1233  238]
 [  15  137]]


### Gradient Boosting

In [20]:
# Baseline gradient boosting. The estimator has no class_weight option
# here, so class balance is handled with per-row sample weights.
# Cached on disk; re-trained only when the pickle is missing.
# NOTE(review): max_features='auto' is kept to match the saved model;
# newer scikit-learn releases reject it for this estimator, so confirm
# the installed version before relying on the refit path.

gradBoostModel_path = './data/Iteration_1/gradBoostModel.pkl'

try:
    gradBoostModel = load(gradBoostModel_path)
except FileNotFoundError:
    gradBoostModel = \
        GradientBoostingClassifier(max_features='auto', random_state=0)
    gradBoostModel.fit(
        X_train, y_train,
        sample_weight=compute_sample_weight(class_weight='balanced',
                                            y=y_train))
    dump(gradBoostModel, gradBoostModel_path)

model_results(gradBoostModel)

# ---------------------------------------------------------------
# Earlier attempt, kept for reference:
# # baseline model with SMOTE
# Model details:
# GradientBoostingClassifier(max_features='auto', random_state=0) 

# Train Set Recall Score: 98.08%
# Test Set Recall Score: 74.33999999999999%

# Train Set Confusion Matrix:
#  [[3324  109]
#  [  66 3367]]
# Test Set Confusion Matrix:
#  [[1391   80]
#  [  39  113]]

Model details: GradientBoostingClassifier(max_features='auto', random_state=0) 

Train Set Recall Score: 98.87%
Test Set Recall Score: 80.92%

Train Set Confusion Matrix:
 [[3228  205]
 [   4  350]]
Test Set Confusion Matrix:
 [[1353  118]
 [  29  123]]


In [21]:
# Fine-tuned randomized search (not an exhaustive grid) over the boosting
# hyper-parameters, scored on recall, with balanced sample weights
# forwarded to each fit.
# Cached on disk; the search only runs when the pickle is missing.

bestGradBoost_path = './data/Iteration_1/bestGradBoost.pkl'

try:
    bestGradBoost = load(bestGradBoost_path)
except FileNotFoundError:
    params = {'learning_rate': [0.003, 0.005, 0.007],
              'min_samples_split': [22, 24, 26],
              'min_samples_leaf': [8, 12, 16],
              'max_depth': [1, 2],
              'n_estimators': [250, 300, 350]}
    # random_state makes the sampled candidates repeatable. The saved
    # model came from an unseeded search, so a refit may pick a
    # different candidate than the pickle holds.
    gradBoostRS = ms.RandomizedSearchCV(
        gradBoostModel, param_distributions=params,
        scoring='recall', cv=skfold,
        n_jobs=-1, verbose=1,
        return_train_score=True, random_state=0,
    )
    gradBoostRS.fit(
        X_train, y_train,
        sample_weight=compute_sample_weight(class_weight='balanced',
                                            y=y_train))
    bestGradBoost = gradBoostRS.best_estimator_
    dump(bestGradBoost, bestGradBoost_path)

model_results(bestGradBoost)

Model details: GradientBoostingClassifier(learning_rate=0.007, max_depth=2,
                           max_features='auto', min_samples_leaf=12,
                           min_samples_split=26, n_estimators=350,
                           random_state=0) 

Train Set Recall Score: 92.66%
Test Set Recall Score: 92.11%

Train Set Confusion Matrix:
 [[2954  479]
 [  26  328]]
Test Set Confusion Matrix:
 [[1282  189]
 [  12  140]]


In [22]:
# # grid search tuning steps

# params = {'learning_rate':[0.001, 0.01, 0.1],
#           'n_estimators': [500, 850, 1200],
#           'min_samples_split': [2, 8, 20],
#           'min_samples_leaf': [1, 5, 12]}
# Model details: GradientBoostingClassifier(learning_rate=0.001, max_features='auto',
#                            min_samples_leaf=12, min_samples_split=20,
#                            n_estimators=1200, random_state=0) 
# Train Set Recall Score: 92.66%
# Test Set Recall Score: 90.79%
# Train Set Confusion Matrix:
#  [[2971  462]
#  [  26  328]]
# Test Set Confusion Matrix:
#  [[1291  180]
#  [  14  138]]

# params = {'learning_rate':[0.0005, 0.001, 0.005],
#           'n_estimators': [50, 100, 150, 200],
#           'min_samples_split': [16, 20, 24],
#           'min_samples_leaf': [8, 12, 16]}
# Model details: GradientBoostingClassifier(learning_rate=0.005, max_features='auto',
#                            min_samples_leaf=12, min_samples_split=24,
#                            random_state=0) 
# Train Set Recall Score: 93.5%
# Test Set Recall Score: 92.11%
# Train Set Confusion Matrix:
#  [[2788  645]
#  [  23  331]]
# Test Set Confusion Matrix:
#  [[1216  255]
#  [  12  140]]

# params = {'learning_rate':[0.003, 0.005, 0.007],
#           'min_samples_split': [22, 24, 26],
#           'min_samples_leaf': [8, 12, 16],
#           'max_depth': [1, 2]}
# Model details: GradientBoostingClassifier(learning_rate=0.003, max_depth=2,
#                            max_features='auto', min_samples_leaf=8,
#                            min_samples_split=24, random_state=0) 
# Train Set Recall Score: 88.14%
# Test Set Recall Score: 91.45%
# Train Set Confusion Matrix:
#  [[2832  601]
#  [  42  312]]
# Test Set Confusion Matrix:
#  [[1231  240]
#  [  13  139]]

# params = {'learning_rate':[0.003, 0.005, 0.007],
#           'min_samples_split': [22, 24, 26],
#           'min_samples_leaf': [8, 12, 16],
#           'max_depth': [1, 2],
#           'n_estimators': [150, 300]}
# Model details: GradientBoostingClassifier(learning_rate=0.005, max_depth=2,
#                            max_features='auto', min_samples_leaf=16,
#                            min_samples_split=26, n_estimators=300,
#                            random_state=0) 

# Train Set Recall Score: 91.81%
# Test Set Recall Score: 92.11%

# Train Set Confusion Matrix:
#  [[2897  536]
#  [  29  325]]
# Test Set Confusion Matrix:
#  [[1267  204]
#  [  12  140]]

## Support Vector Machine

In [23]:
# Baseline support vector classifier with balanced class weights.
# Cached on disk; re-trained only when the pickle is missing.

svm_path = './data/Iteration_1/svm.pkl'

try:
    svm = load(svm_path)
except FileNotFoundError:
    svm = SVC(random_state=0, class_weight='balanced').fit(X_train, y_train)
    dump(svm, svm_path)

model_results(svm)

Model details: SVC(class_weight='balanced', random_state=0) 

Train Set Recall Score: 93.78999999999999%
Test Set Recall Score: 92.11%

Train Set Confusion Matrix:
 [[2722  711]
 [  22  332]]
Test Set Confusion Matrix:
 [[1177  294]
 [  12  140]]


In [24]:
# Randomized search (not an exhaustive grid) over C, kernel, degree and
# gamma, scored on recall.
# Cached on disk; the search only runs when the pickle is missing.

bestSVM_path = './data/Iteration_1/bestSVM.pkl'

try:
    bestSVM = load(bestSVM_path)
except FileNotFoundError:
    params = {'C': np.logspace(-3, 2, 100),
              'kernel': ['linear', 'poly', 'rbf'],
              'degree': [1, 3, 5],
              'gamma': ['scale', 'auto']}
    # RandomizedSearchCV takes param_distributions (param_grid belongs to
    # GridSearchCV and is rejected here). random_state makes the sampled
    # candidates repeatable; the saved model came from an unseeded
    # search, so a refit may pick a different candidate.
    svmRS = ms.RandomizedSearchCV(
        svm, param_distributions=params,
        scoring='recall', cv=skfold,
        n_jobs=-1, verbose=1,
        return_train_score=True, random_state=0,
    ).fit(X_train, y_train)
    bestSVM = svmRS.best_estimator_
    dump(bestSVM, bestSVM_path)

model_results(bestSVM)

# Result recorded from the original search, kept for reference:
# Model details: SVC(C=0.5689866029018299,
#                    class_weight='balanced', random_state=0)
# Train Set Recall Score: 92.09%
# Test Set Recall Score: 92.11%

Model details: SVC(C=0.5689866029018299, class_weight='balanced', random_state=0) 

Train Set Recall Score: 92.09%
Test Set Recall Score: 92.11%

Train Set Confusion Matrix:
 [[2598  835]
 [  28  326]]
Test Set Confusion Matrix:
 [[1131  340]
 [  12  140]]
