## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn.model_selection as ms
from sklearn import preprocessing as pp
from sklearn.linear_model import \
    LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score

In [2]:
# Load the pickled providers table and key it by its 'Provider' column.
providers = load('./data/Providers_Final.pkl').set_index('Provider')

In [3]:
# Separate the target label from the predictor columns.
y = providers['PotentialFraud']
X = providers.drop(columns=['PotentialFraud'])

## Pre-processing

In [4]:
# Hold out 30% of the rows for testing. The split is seeded for
# reproducibility and stratified on the label.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# Author's note: a 70/30 split gives roughly the same baseline model results
# but saves grid-search time.

In [5]:
# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing, then apply that same fitted transform to both splits.
scaler = pp.MinMaxScaler()

# Carry the original row index through. Without `index=` the rebuilt frames
# fall back to a fresh RangeIndex and stop lining up by label with
# y_train / y_test, which still carry the 'Provider' index.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X.columns, index=X_test.index)

## Modeling

In [6]:
# Ten shuffled, seeded folds; stratifying keeps the class ratio the same in
# every fold.
skfold = ms.StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

### Logistic Regression

In [12]:
# Baseline model: cross-validated logistic regression.
#
# The fitted model is read back from disk. The commented code records how it
# was (presumably) trained: an L1 penalty for feature selection, and the
# liblinear solver because it was faster than saga.
#
# logRegCV = \
#     LogisticRegressionCV(penalty='l1', solver='liblinear', cv=skfold,
#                          class_weight='balanced', scoring='recall',
#                          random_state=0, n_jobs=(-1), verbose=1)
#
# logRegCV.fit(X_train, y_train)
# dump(logRegCV, './data/logRegCV.pkl')

logRegCV = load('./data/logRegCV.pkl')

# Train score first, then test score.
# Author's note: recall_score() gives the same results.
print(
    logRegCV.score(X_train, y_train),
    logRegCV.score(X_test, y_test),
    sep='\n',
)

0.9180790960451978
0.9144736842105263


In [10]:
# Grid search with the accuracy scoring metric.
#
# The tuned estimator is read back from disk; the commented code records how
# the search was set up. It relies on `logRegModel` and `params`, which in
# this notebook appear only in the commented code of the recall grid-search
# cell further down.
#
# logRegGSAccuracy = ms.GridSearchCV(logRegModel, param_grid=params,
#                                    cv=skfold, n_jobs=(-1), verbose=1)
#
# logRegAccuracy = logRegGSAccuracy.fit(X_train, y_train)
# bestLogRegAccuracy = logRegAccuracy.best_estimator_
# dump(bestLogRegAccuracy, './data/bestLogRegAccuracy.pkl')

bestLogRegAccuracy = load('./data/bestLogRegAccuracy.pkl')

# Printed in order: the estimator, its .score() on train and test, then
# recall on train and test. Values recorded from an earlier run:
#   0.8698177977290732   (.score, train)
#   0.8644485520640789   (.score, test)
#   0.9152542372881356   (recall, train)
#   0.8289473684210527   (recall, test)
print(
    bestLogRegAccuracy,
    bestLogRegAccuracy.score(X_train, y_train),
    bestLogRegAccuracy.score(X_test, y_test),
    recall_score(y_train, bestLogRegAccuracy.predict(X_train)),
    recall_score(y_test, bestLogRegAccuracy.predict(X_test)),
    sep='\n',
)

LogisticRegression(C=68.66488450042998, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l1',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=1,
                   warm_start=False)
0.8698177977290732
0.8644485520640789
0.9152542372881356
0.8289473684210527


In [11]:
# Grid search with the recall scoring metric.
#
# The tuned estimator is read back from disk; the commented code records how
# the search was set up.
#
# logRegModel = \
#     LogisticRegression(penalty='l1', solver='liblinear',
#                        class_weight='balanced', random_state=0,
#                        n_jobs=(-1), verbose=1)
#
# params = {'C': np.logspace(-2, 2, 50),
#           'max_iter': [100, 500, 1000]}
#
# logRegGS = ms.GridSearchCV(logRegModel, param_grid=params,
#                            scoring='recall', cv=skfold, verbose=1)
#
# logReg = logRegGS.fit(X_train, y_train)
# bestLogReg = logReg.best_estimator_
# dump(bestLogReg, './data/bestLogReg.pkl')

bestLogReg = load('./data/bestLogReg.pkl')

print(bestLogReg)

# `logReg` (the fitted search object) exists only in the commented-out code
# above, so scoring through it raises NameError on a fresh kernel. Score the
# loaded estimator instead. recall_score() is used explicitly because the
# bare estimator has no `scoring` setting of its own (author's note: "can't
# use scoring param, need to use recall_score()").
print(recall_score(y_train, bestLogReg.predict(X_train)))
print(recall_score(y_test, bestLogReg.predict(X_test)))

LogisticRegression(C=0.0655128556859551, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l1',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=1,
                   warm_start=False)
0.9265536723163842
0.9210526315789473


### Tree Models

In [None]:
# Baseline random forest with balanced class weights and a fixed seed.
# Author's note: the max_features default is 'auto' (sqrt(n_features)).
randForestModel = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
randForestModel.fit(X_train, y_train)

# Recall on the training split, then on the test split.
print(
    recall_score(y_train, randForestModel.predict(X_train)),
    recall_score(y_test, randForestModel.predict(X_test)),
    sep='\n',
)

In [None]:
# dump(randForestModel, './data/randForestModel.pkl')

#### Random Forest

In [None]:
# Random forest grid search scored on recall. Only the load() at the bottom of
# this cell is live; the search itself is commented out.
#
# NOTE(review): np.linspace(5,50,5) yields non-integer depths (16.25, 27.5,
# 38.75). Recent scikit-learn releases reject a non-integer max_depth, so
# confirm before re-enabling this search.
# params = {'n_estimators': [100,1000,5000],
#           'max_depth': np.linspace(5,50,5),
#           'max_features' : np.arange(1,11)}

# randForestGS = ms.GridSearchCV(randForestModel, param_grid=params,
#                                     scoring='recall', cv=skfold,
#                                     n_jobs=-1, verbose=1)

# randForest = randForestGS.fit(X_train, y_train)

# bestRandForest = randForest.best_estimator_
# print(bestRandForest)

# NOTE(review): the commented search names its result `bestRandForest`, but
# the live code loads `best_randForest`, and no dump() for this pickle is
# visible in this notebook. Presumably it was written elsewhere; verify.
best_randForest = load('./data/best_randForest.pkl')

In [None]:
# Optional extra diagnostics for the tuned forest (left disabled):
# print(best_randForest)
# print(best_randForest.score(X_train, y_train))
# print(best_randForest.score(X_test, y_test))

# Recall of the tuned forest on the training split, then on the test split.
print(
    recall_score(y_train, best_randForest.predict(X_train)),
    recall_score(y_test, best_randForest.predict(X_test)),
    sep='\n',
)

#### Gradient Boosting

In [None]:
# Baseline gradient boosting classifier with a fixed seed.
#
# max_features='sqrt' replaces the former 'auto' alias. For a classifier both
# mean sqrt(n_features), but 'auto' was deprecated and later removed from
# scikit-learn, so the explicit spelling keeps this cell runnable across
# versions without changing the model.
gradBoostModel = GradientBoostingClassifier(max_features='sqrt', random_state=0)
gradBoostModel.fit(X_train, y_train)

# Recall on the training split, then on the test split.
print(recall_score(y_train, gradBoostModel.predict(X_train)))
print(recall_score(y_test, gradBoostModel.predict(X_test)))

In [None]:
# Persist the fitted baseline gradient boosting model to disk.
dump(gradBoostModel, './data/gradBoostModel.pkl')

In [None]:
# Hyper-parameter grid for the gradient boosting search.
#
# max_depth must be an integer. A plain np.linspace(1, 30, 5) produces floats
# (1.0, 8.25, 15.5, 22.75, 30.0), so dtype=int is passed to truncate them to
# the whole-number depths 1, 8, 15, 22 and 30.
params = {'n_estimators': [100, 1000, 5000],
          'max_depth': np.linspace(1, 30, 5, dtype=int),
          'learning_rate': [0.01, 0.05, 0.1, 0.5],
          'max_features': np.arange(1, 11)}

# Search over `params` for the gradient boosting model, scoring each
# candidate on recall across the stratified folds and using all cores.
gradBoostGS = ms.GridSearchCV(
    gradBoostModel,
    param_grid=params,
    cv=skfold,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
)

# `gradBoost` is bound to whatever fit() returns on the search object.
gradBoost = gradBoostGS.fit(X_train, y_train)

# Keep the winning estimator and show its configuration.
bestGradBoost = gradBoost.best_estimator_
print(bestGradBoost)

In [None]:
# Persist the best estimator found by the gradient boosting grid search.
dump(bestGradBoost, './data/bestGradBoost.pkl')

In [None]:
def feature_importances(model, feature_names=None):
    """Build a bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        A fitted estimator exposing ``feature_importances_``, or a fitted
        search object (e.g. ``GridSearchCV``) whose ``best_estimator_``
        exposes it.
    feature_names : sequence of str, optional
        Labels for the features, in the order the model was trained on.
        Defaults to the columns of the notebook-level ``X``.

    Returns
    -------
    The figure object returned by ``px.bar``, with features sorted by
    ascending importance.

    Raises
    ------
    AttributeError
        If neither ``model`` nor its ``best_estimator_`` has
        ``feature_importances_``.
    """
    # A fitted search object does not expose feature_importances_ itself;
    # the refit winner under `best_estimator_` does. Plain estimators have
    # no such attribute and are used as-is.
    estimator = getattr(model, 'best_estimator_', model)
    names = X.columns if feature_names is None else feature_names

    df = pd.DataFrame({'feature': np.array(names),
                       'importance': estimator.feature_importances_}
                     ).sort_values('importance')
    # NOTE(review): `px` is not imported anywhere in the visible notebook;
    # presumably `import plotly.express as px` belongs in the import cell.
    return px.bar(df, 'importance', 'feature', height=1000)
# feature_importances(gradBoost)

In [None]:
# Disabled experiment: shrink the feature set using gradient boosting
# feature importances (threshold 0.005).
#
# NOTE(review): as written, the slice `.iloc[:, :k]` keeps the FIRST k
# columns, where k is the number of features whose importance exceeds the
# threshold. It does not select the columns that exceed it. A boolean mask
# (e.g. `.loc[:, importances > 0.005]`) is presumably what was intended.
# NOTE(review): `gradBoost` is the fitted grid-search object in this
# notebook; the importances likely need to come from
# `gradBoost.best_estimator_` (or `bestGradBoost`). Confirm before
# re-enabling.
# X_train_reduced = pd.DataFrame(X_train, columns=X.columns)
# X_train_reduced = X_train_reduced.iloc[:,:len(gradBoost.feature_importances_
#                           [gradBoost.feature_importances_ > 0.005])];

# X_test_reduced = pd.DataFrame(X_test, columns=X.columns)
# X_test_reduced = X_test_reduced.iloc[:,:len(gradBoost.feature_importances_
#                           [gradBoost.feature_importances_ > 0.005])];

In [None]:
# gradBoost_reduced = gradBoost.fit(X_train_reduced, y_train)

# print(np.mean(ms.cross_val_score(gradBoost_reduced, X_train_reduced, y_train, cv=10)))
# print(gradBoost_reduced.score(X_test_reduced, y_test))

# FI at all FI: 0.9406473665086488/0.9426987060998152
# FI at 0.001: 0.9429039808688451/0.9408502772643254
# FI at 0.005: 0.9279121352701092/0.9297597042513863