## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import MinMaxScaler
import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import \
    RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the pickled `providers` object and make its 'Provider' column the index.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load files that come from a trusted source.
providers = load('./data/Providers_Final.pkl').set_index('Provider')

In [3]:
# Target column first, then every remaining column as the feature matrix.
y = providers['PotentialFraud']
X = providers.drop(columns=['PotentialFraud'])

## Pre-processing

In [4]:
# Hold out 10% of the rows as the test split; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Scores recorded for each split ratio that was tried. Each model has two
# rows; presumably the first is the train/CV score and the second the test
# score -- TODO confirm.
#
#        train/test splits
#  70/30       80/20       90/10
#       logistic regression
# 0.94398     0.92530     0.92308
# 0.86577     0.92308     0.92157
#          random forest
# 0.93240     0.93207     0.93243
# 0.93469     0.93623     0.93161
#        gradient boosting
# 0.93874     0.94108     0.94558
# 0.94763     0.95471     0.94455

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits, so no test-set statistics leak into training.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Notes from comparing scalers (what each row refers to was not recorded
# -- TODO confirm):
# StandardScaler > RobustScaler < MinMaxScaler > StandardScaler
# StandardScaler < RobustScaler = MinMaxScaler = StandardScaler
# StandardScaler = RobustScaler = MinMaxScaler = StandardScaler

In [6]:
# pd.DataFrame(X_train, columns=X.columns)

### Functions

In [7]:
def feature_importances(model, feature_names=None):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances, in the column order the model was fitted
        on. When omitted, the names stored on the model
        (``feature_names_in_``) are used if present; otherwise the columns of
        the notebook-level ``X`` are used, as before.

    Returns
    -------
    The figure produced by ``px.bar``, with bars sorted by ascending
    importance.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of
        importances (e.g. a model fitted on a column subset is labelled with
        the full column list).
    """
    importances = np.asarray(model.feature_importances_)

    if feature_names is None:
        # A model fitted on a DataFrame keeps its own column names, which
        # keeps the labels correct for models trained on a feature subset.
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    feature_names = np.array(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            f'Got {len(feature_names)} feature names for '
            f'{len(importances)} importances; pass matching feature_names.'
        )

    df = pd.DataFrame({'feature': feature_names,
                       'importance': importances}
                     ).sort_values('importance')
    return px.bar(df, x='importance', y='feature', height=1000)

## Baseline Models

### Logistic Regression

In [8]:
# Class-balanced logistic regression, cross-validated with 10 folds and
# scored on recall.
log_reg = LogisticRegressionCV(
    cv=10,
    scoring='recall',
    class_weight='balanced',
    max_iter=2250,
    random_state=0,
)
log_reg.fit(X_train, y_train)

# NOTE: with scoring='recall', .score() reports recall here rather than
# accuracy. Train score first, then test score.
print(log_reg.score(X_train, y_train),
      log_reg.score(X_test, y_test),
      sep='\n')

# Although data is scaled, without max_iter > 2000,
# 'ConvergenceWarning: lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
# Increase the number of iterations (max_iter) or scale the data'
# Above max_iter 2000, score remains the same.

0.9142857142857143
0.9215686274509803


In [9]:
# Class-balanced random forest; max_features is left at its default
# (noted by the author as 'auto', i.e. sqrt(n_features)).
randForest = RandomForestClassifier(random_state=0, class_weight='balanced')
randForest.fit(X_train, y_train)

# Mean 10-fold CV score on the training split, then the held-out test score.
print(ms.cross_val_score(randForest, X_train, y_train, cv=10).mean())
print(randForest.score(X_test, y_test))

0.9322255177833549
0.933456561922366


In [10]:
# max_features='sqrt' considers sqrt(n_features) candidate features per split.
# It replaces max_features='auto', which meant the same thing for this
# classifier but was deprecated in scikit-learn 1.1 and removed in 1.3.
gradBoost = GradientBoostingClassifier(max_features='sqrt', random_state=0)
gradBoost.fit(X_train, y_train)

# Mean 10-fold CV score on the training split, then the held-out test score.
print(np.mean(ms.cross_val_score(gradBoost, X_train, y_train, cv=10)))
print(gradBoost.score(X_test, y_test))

0.9455750754176488
0.944547134935305


## Gradient Boosting

In [11]:
# feature_importances(gradBoost)

In [12]:
X_train_reduced = pd.DataFrame(X_train, columns=X.columns)
X_train_reduced = X_train_reduced.iloc[:,:len(gradBoost.feature_importances_
                          [gradBoost.feature_importances_ > 0.001])];

In [13]:
X_test_reduced = pd.DataFrame(X_test, columns=X.columns)
X_test_reduced = X_test_reduced.iloc[:,:len(gradBoost.feature_importances_
                          [gradBoost.feature_importances_ > 0.001])];

In [15]:
# Fit a fresh estimator that copies the baseline's hyper-parameters.
# Calling gradBoost.fit(...) again would refit the baseline object in place
# (fit returns self), making gradBoost and gradBoost_reduced the same model
# and overwriting the baseline's feature_importances_.
gradBoost_reduced = GradientBoostingClassifier(
    **gradBoost.get_params(deep=False))
gradBoost_reduced.fit(X_train_reduced, y_train)

# Mean 10-fold CV score on the reduced training split, then the test score.
print(np.mean(ms.cross_val_score(gradBoost_reduced, X_train_reduced, y_train, cv=10)))
print(gradBoost_reduced.score(X_test_reduced, y_test))

# FI at 0.001: 0.9437257586128223/0.9390018484288355

0.9437257586128223
0.9390018484288355
