## Setup and Data Import

In [None]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score

import imblearn

import plotly.express as px

In [None]:
# Load the pickled provider table and key it by the 'Provider' column.
providers = load('./data/Providers_Final.pkl').set_index('Provider')

## Pre-processing

In [None]:
# Separate the target column from the predictor columns.
y = providers['PotentialFraud']
X = providers.drop(columns=['PotentialFraud'])

In [None]:
# A 70/30 split gives roughly the same baseline model results
# but saves grid-search time. Stratifying on y keeps the class ratio
# the same in both splits.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
# Fit the scaler on the training split only (so no test-set statistics leak
# into it), then apply that same fitted scaling to both splits.
scaler = pp.MinMaxScaler()

# Carry the original row index over to the scaled frames: the scaler returns a
# bare array, and without the index the features would no longer be
# label-aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       index=X_train.index, columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                      index=X_test.index, columns=X.columns)

In [None]:
# Stratified folds keep the class ratio the same in every fold.
skfold = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

## Modeling

### Logistic Regression

#### Models

In [None]:
# baseline model
# This cell is entirely commented out. The dump path on its last line is the
# same './data/logRegCV.pkl' file that the Results section loads.

# L1 penalty for feature selection, liblinear solver faster than saga
# logRegCV = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1)

# logRegCV.fit(X_train, y_train)

# dump(logRegCV, './data/logRegCV.pkl')

In [None]:
# # grid search with accuracy scoring metric
# # NOTE(review): logRegModel and params are only assigned (in commented-out
# # form) in the recall grid-search cell further down; if this cell is
# # re-enabled, those definitions have to run first.
# # No scoring argument is passed to GridSearchCV here, so the estimator's
# # default score is used.

# logRegGSAccuracy = ms.GridSearchCV(logRegModel, param_grid=params,
#                                    cv=skfold, n_jobs=(-1), verbose=1)

# logRegAccuracy = logRegGSAccuracy.fit(X_train, y_train)
# bestLogRegAccuracy = logRegAccuracy.best_estimator_

# # dump(bestLogRegAccuracy, './data/bestLogRegAccuracy.pkl')

In [None]:
# # grid search with recall scoring metric

# # can't use scoring param, need to use recall_score()
# # NOTE(review): the line above appears to conflict with scoring='recall'
# # being passed to GridSearchCV below — presumably it refers to
# # LogisticRegression itself having no scoring parameter; confirm.
# logRegModel = \
#     LogisticRegression(penalty='l1', solver='liblinear',
#                        class_weight='balanced', random_state=0,
#                        n_jobs=(-1), verbose=1)

# # Search grid: 50 log-spaced C values between 1e-2 and 1e2, crossed with
# # three max_iter settings.
# params = {'C': np.logspace(-2, 2, 50),
#           'max_iter': [100, 500, 1000]}

# logRegGS = ms.GridSearchCV(logRegModel, param_grid=params,
#                            scoring='recall', cv=skfold, verbose=1)

# logReg = logRegGS.fit(X_train, y_train)
# bestLogReg = logReg.best_estimator_

# # dump(bestLogReg, './data/bestLogReg.pkl')

#### Results

In [None]:
# Baseline model: reload the cached fit and print recall on the training
# split, then on the test split.
logRegCV = load('./data/logRegCV.pkl')

for features, target in ((X_train, y_train), (X_test, y_test)):
    print(recall_score(target, logRegCV.predict(features)))

In [None]:
# Grid search with accuracy scoring metric: reload the cached best estimator
# and print recall on the training split, then on the test split.
bestLogRegAccuracy = load('./data/bestLogRegAccuracy.pkl')

# print(bestLogRegAccuracy)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(recall_score(target, bestLogRegAccuracy.predict(features)))

In [None]:
# Grid search with recall scoring metric: reload the cached best estimator
# and print recall on the training split, then on the test split.
bestLogReg = load('./data/bestLogReg.pkl')

# print(bestLogReg)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(recall_score(target, bestLogReg.predict(features)))

In [None]:
# Build a one-column table of absolute coefficient sizes, one row per feature,
# ordered from largest to smallest.
coef_frame = pd.DataFrame(bestLogReg.coef_.T, index=X.columns)
coef_frame = coef_frame.rename(columns={0: 'Coefficient'})
coefficients = coef_frame.abs().sort_values(by='Coefficient', ascending=False)

# Show only the features with a non-zero coefficient.
coefficients.loc[coefficients['Coefficient'] > 0]

### Random Forest

In [None]:
def feature_importances(model, feature_names=None, height=1000):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances, in the same order as the columns the
        model was fitted on. When omitted, the names recorded on the model
        (``feature_names_in_``) are used if present; otherwise the columns
        of the notebook-level ``X`` are used, as before.
    height : int, default 1000
        Figure height in pixels.

    Returns
    -------
    The plotly figure produced by ``px.bar``, with bars sorted by importance.
    """
    if feature_names is None:
        # A model fitted on a reduced frame has fewer importances than X has
        # columns; prefer the names stored on the model so lengths match.
        feature_names = getattr(model, 'feature_names_in_', X.columns)
    df = pd.DataFrame({'feature': np.asarray(feature_names),
                       'importance': model.feature_importances_}
                     ).sort_values('importance')
    return px.bar(df, 'importance', 'feature', height=height)
# NOTE(review): bestRandForestReduced is not assigned until a later cell of
# this notebook, so on a fresh kernel this call raises NameError; run it only
# after that cell.
feature_importances(bestRandForestReduced)

In [None]:
# weight balanced SUBSAMPLE

In [None]:
# max_features default is 'auto' (sqrt(n_features))
# randForestModel = \
#     RandomForestClassifier(class_weight='balanced', random_state=0)

# randForestModel.fit(X_train, y_train)

# dump(randForestModel, './data/randForestModel.pkl')

# Reload the cached forest and print recall on the training split, then on
# the test split.
randForestModel = load('./data/randForestModel.pkl')
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(recall_score(target, randForestModel.predict(features)))

In [None]:
# NOTE(review): np.linspace(3, 5, 8) yields float depths; recent scikit-learn
# releases may reject a non-integer max_depth — confirm before re-enabling.
# params = {'n_estimators': [100, 500],
#           'max_depth': np.linspace(3, 5, 8)}

# randForestGS = ms.GridSearchCV(randForestModel, param_grid=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=-1, verbose=1)

# randForest = randForestGS.fit(X_train, y_train)

# bestRandForest = randForest.best_estimator_
# dump(bestRandForest, './data/bestRandForest.pkl')

# Reload the cached best estimator, show its settings, then print recall on
# the training split followed by the test split.
bestRandForest = load('./data/bestRandForest.pkl')
print(bestRandForest)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(recall_score(target, bestRandForest.predict(features)))

In [None]:
# Columns removed by hand for the manually reduced feature set. They live in
# one list so the train and test frames always drop exactly the same columns.
MANUAL_DROP_COLUMNS = [
    'IP_Perc_RaceOne',
    'IP_Perc_RaceThree',
    'IP_Perc_RaceTwo',
    'OP_Perc_RaceOne',
    'OP_Perc_RaceThree',
    'OP_Perc_RaceTwo',
    'OP_Perc_Alzheimers_Chronic',
    'OP_Perc_Cancer_Chronic',
    'OP_Perc_Depression_Chronic',
    'OP_Perc_Diabetes_Chronic',
    'OP_Perc_HeartFailure_Chronic',
    'OP_Perc_IschemicHeart_Chronic',
    'OP_Perc_KidneyDisease_Chronic',
    'OP_Perc_ObstrPulmonary_Chronic',
    'OP_Perc_Osteoporosis_Chronic',
    'OP_Perc_RheumatoidArthritis_Chronic',
    'OP_Perc_Stroke_Chronic',
    'OP_Mean_AgeAtClaim',
    'IP_Mean_AgeAtClaim',
    'PatientsPerAttPhys',
    'PatientsPerOperPhys',
    'PatientsPerOthPhys',
    'IP_Mean_ClaimCost',
    'OP_Mean_ClaimCost',
    'OP_Perc_HasNoPhys',
    'IP_Perc_HasNoPhys',
    'IP_Perc_AttPhysIsOperPhys',
]

# Re-wrap each split with X's column labels, then drop the listed columns.
X_train_reduced_manual = pd.DataFrame(
    X_train, columns=X.columns).drop(columns=MANUAL_DROP_COLUMNS)
X_test_reduced_manual = pd.DataFrame(
    X_test, columns=X.columns).drop(columns=MANUAL_DROP_COLUMNS)

In [None]:
# Minimum feature importance a column needs to stay in the reduced set.
IMPORTANCE_THRESHOLD = 0.01

# Select columns BY NAME from their importance. Slicing the first k columns
# (k = number of importances above the threshold) keeps whichever columns
# happen to come first, not the important ones.
# Importances are read from bestRandForest, which is loaded earlier in the
# notebook, so this cell does not depend on a model that is itself fitted on
# this cell's output.
# Assumes bestRandForest was fitted on every column of X, in X's column order;
# pd.Series raises ValueError if the lengths differ.
importances = pd.Series(bestRandForest.feature_importances_, index=X.columns)
selected_features = importances[importances > IMPORTANCE_THRESHOLD].index

X_train_reduced = pd.DataFrame(
    X_train, columns=X.columns).loc[:, selected_features]
X_test_reduced = pd.DataFrame(
    X_test, columns=X.columns).loc[:, selected_features]

In [None]:
# Grid-search the number of trees on the reduced feature set, scored by
# recall over the shared stratified folds.
params = {'n_estimators': [1000, 2000]}

randForestGS = ms.GridSearchCV(
    randForestModel,
    param_grid=params,
    scoring='recall',
    cv=skfold,
    n_jobs=-1,
    verbose=1,
)

# Keep the object returned by fit() and pull the refit best estimator from it.
randForestReduced = randForestGS.fit(X_train_reduced, y_train)
bestRandForestReduced = randForestReduced.best_estimator_
# dump(bestRandForestReduced, './data/bestRandForestReduced.pkl')

In [None]:
# bestRandForestReduced = load('./data/bestRandForestReduced.pkl')
# Show the selected estimator, then print recall on the reduced training
# split followed by the reduced test split.
print(bestRandForestReduced)
for features, target in ((X_train_reduced, y_train), (X_test_reduced, y_test)):
    print(recall_score(target, bestRandForestReduced.predict(features)))

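# Results log. Each entry presumably reads: importance threshold and best
# parameters, then train recall, then test recall (TODO confirm).
# 0.005, max_depth=3.0, n_estimators=500,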
# 0.8757062146892656
# 0.868421052631579

# 0.02, max_depth=3.0, n_estimators=500,
# 0.8813559322033898
# 0.9013157894736842

# 0.01, max_depth=3.0, n_estimators=500,
# 0.8700564971751412
# 0.868421052631579

# 0.015, max_depth=3.0, n_estimators=500,
# 0.8728813559322034
# 0.875

# 0.015, max_depth=3.0, n_estimators=1000
# 0.8700564971751412
# 0.881578947368421