## Outline

#### Data set versions:
- Original dataset
- Synthesized dataset using SMOTE

#### Approaches:
- SVM + Grid Search
- SVM + Grid Search + Feature selection with F-score
- Anomaly detection SVM
- Ensemble SVM
  - Og data
  - Synthesized data
  - F-score filtered data

#### Evaluation Metrics:
- ROC
- F-score (Precision Recall)
- Confusion matrix
- Accuracy

In [179]:
import numpy as np
import pandas as pd

from collections import defaultdict

from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC
from sklearn.svm import OneClassSVM

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

DATA = 'dataset/loan_one_hot_encoded.csv'

In [204]:
drop_cols = ['loan_created', 'application_id',
#              'firm_type_Proprietorship',
             'average_business_inflow'
            ]
df = pd.read_csv(DATA)
Y = df['loan_created']
og_X = df.drop(drop_cols, axis=1)

imp = Imputer()
imputed_X = imp.fit_transform(og_X)

# X = imputed_X
scl = StandardScaler()
X = scl.fit_transform(imputed_X)

X.shape, Y.shape, np.unique(Y, return_counts=True)

((230, 240), (230,), (array([0, 1]), array([221,   9])))

In [205]:
sm = SMOTE(random_state = 44, ratio = {0:221, 1:24})
X_os, Y_os = sm.fit_sample(X, Y)

X_os.shape, Y_os.shape, np.unique(Y_os, return_counts=True)

((245, 240), (245,), (array([0, 1]), array([221,  24])))

In [218]:
class GridSearchCV:
    def __init__(self, param_grid, clf, pos_label, cv=5):
        self.param_list_ = list(ParameterGrid(param_grid))
        self.cv = cv
        self.pos_label = pos_label
        self._clf = clf
        
        self.overfit_ = [[] for _ in range(len(self.param_list_))]
        
        # evaluation scores
        self.accuracy_scores_ = ([[] for _ in range(len(self.param_list_))])
        self.precision_ = [[] for _ in range(len(self.param_list_))]
        self.recall_ = [[] for _ in range(len(self.param_list_))]
        self.f1_scores_ = [[] for _ in range(len(self.param_list_))]
        self.fpr_ = [[] for _ in range(len(self.param_list_))]
        self.tpr_ = [[] for _ in range(len(self.param_list_))]
        self.auc_scores_ = [[] for _ in range(len(self.param_list_))]        
        
    def fit(self, X, y):
        skf = StratifiedKFold(n_splits=self.cv)
        for train_index, test_index in skf.split(X, y):
            for idx, params in enumerate(self.param_list_):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                clf.set_params(**params)
                clf.fit(X_train, y_train) # probability=True, class_weight='balanced', 
                y_pred = clf.predict(X_test)
                if getattr(clf, "predict_proba", None):
                    y_probab = clf.predict_proba(X_test)
                else:
                    y_pred[y_pred == -1] = 0

                if np.all(y_pred[0] == y_pred):
                    # all values predicted are same;
                    self.overfit_[idx].append(True)

                    self.precision_[idx].append([])
                    self.recall_[idx].append([])
                    self.f1_scores_[idx].append(np.nan)

                    if getattr(clf, "predict_proba", None):
                        self.fpr_[idx].append([])
                        self.tpr_[idx].append([])
                        self.auc_scores_[idx].append(np.nan)
                else:
                    self.overfit_[idx].append(False)
                    precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred, pos_label=self.pos_label)
                    self.precision_[idx].append(precision)
                    self.recall_[idx].append(recall)
                    self.f1_scores_[idx].append(f1_score(y_test, y_pred))
                    if getattr(clf, "predict_proba", None):
                        fpr, tpr, roc_thresholds = roc_curve(y_test, y_probab[:,1], pos_label=self.pos_label)
                        self.fpr_[idx].append(fpr)
                        self.tpr_[idx].append(tpr)
                        self.auc_scores_[idx].append(auc(fpr, tpr))


                self.accuracy_scores_[idx].append(accuracy_score(y_test, y_pred))
        
        self.average_accuracy_param_ = np.array([np.average(_) for _ in self.accuracy_scores_])
    
        self.accuracy_scores_ = np.array(self.accuracy_scores_)
        self.precision_ = np.array(self.precision_)
        self.recall_ = np.array(self.recall_)
        self.f1_scores_ = np.array(self.f1_scores_)
        self.fpr_ = np.array(self.fpr_)
        self.tpr_ = np.array(self.tpr_)
        self.auc_scores_ = np.array(self.auc_scores_)
            
    def print_results(self, show_overfit=False):
        for _idx in range(len(self.average_accuracy_param_)):
            if not show_overfit and np.any(self.overfit_[_idx]):
                continue
            print('-'*40)
            print('overfit: ', np.any(self.overfit_[_idx]), np.unique(self.overfit_[_idx], return_counts=True))
            print('params:', self.param_list_[_idx])
            print('avg acc: ', self.average_accuracy_param_[_idx])
            print('f1 score:', np.average(gscv.f1_scores_[_idx]))
            print('auc: ', np.average(self.auc_scores_[_idx]))

## grid search on synthesized data

In [207]:
param_grid = [
  {'C': [.1, .5, 1, 5, 10], 
   'gamma': ['auto'], 'kernel': ['rbf', 'sigmoid'], 
   'probability': [True], 'class_weight': ['balanced']},
 ]
clf = SVC()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=10)
gscv.fit(X_os, Y_os)
# vars(gscv)
gscv.print_results()

----------------------------------------
overfit:  False (array([False]), array([10]))
params: {'C': 0.5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
avg acc:  0.9801410256410257
f1 score: 0.8899999999999999
auc:  0.9833333333333332
----------------------------------------
overfit:  False (array([False]), array([10]))
params: {'C': 0.5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
avg acc:  0.9309615384615386
f1 score: 0.7292857142857143
auc:  0.9540513833992094
----------------------------------------
overfit:  False (array([False]), array([10]))
params: {'C': 1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
avg acc:  0.9839871794871795
f1 score: 0.9257142857142858
auc:  0.9805006587615284
----------------------------------------
overfit:  False (array([False]), array([10]))
params: {'C': 1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid', 'probability

## grid search on og data

In [208]:
param_grid = [
  {'C': [5, 10, 20, 40, 80], # .1, .5, 1, 
   'gamma': ['auto'], 'kernel': ['poly', 'sigmoid'], 
   'degree': [2, 3, 4, 5],
   'probability': [True], 'class_weight': ['balanced']},
 ]
clf = SVC()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=9)
gscv.fit(X, Y)
# vars(gscv)
gscv.print_results()

----------------------------------------
overfit:  False (array([False]), array([9]))
params: {'C': 5, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
avg acc:  0.8439316239316239
f1 score: 0.05555555555555555
auc:  0.3429629629629629
----------------------------------------
overfit:  False (array([False]), array([9]))
params: {'C': 5, 'class_weight': 'balanced', 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
avg acc:  0.8439316239316239
f1 score: 0.05555555555555555
auc:  0.3622222222222222
----------------------------------------
overfit:  False (array([False]), array([9]))
params: {'C': 5, 'class_weight': 'balanced', 'degree': 4, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
avg acc:  0.8439316239316239
f1 score: 0.05555555555555555
auc:  0.2974074074074074
----------------------------------------
overfit:  False (array([False]), array([9]))
params: {'C': 5, 'class_weight': 'balanced', 'degree'

## anomaly detection grid search on og data

In [219]:
param_grid = [
  {'nu': np.arange(.1, 1.0, 0.1), 
   'gamma': ['auto'], 'kernel': ['poly'],
   'degree': [2, 3, 4, 5, 6, 7]},
 ]
clf = OneClassSVM()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X, Y)
# vars(gscv)
gscv.print_results()

----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}
avg acc:  0.7961866584438277
f1 score: 0.06000000000000001
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}
avg acc:  0.8352286977078837
f1 score: 0.07272727272727272
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}
avg acc:  0.8568794326241134
f1 score: 0.0808080808080808
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.4}
avg acc:  0.8873183266522766
f1 score: 0.1015873015873016
auc:  nan


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


## anomaly detection grid search synthesized og data

In [210]:
param_grid = [
  {'nu': np.arange(.1, 1.0, 0.1), 
   'gamma': ['auto'], 'kernel': ['poly'],
   'degree': [2, 3, 4, 5, 6, 7]},
 ]
clf = OneClassSVM()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X_os, Y_os)
# vars(gscv)
gscv.print_results()

----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}
avg acc:  0.7923061224489796
f1 score: 0.35557354925775975
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}
avg acc:  0.8164659863945578
f1 score: 0.35058823529411764
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}
avg acc:  0.8450442176870748
f1 score: 0.3717733247145012
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.4}
avg acc:  0.8817006802721089
f1 score: 0.4522144522144522
auc:  nan
----------------------------------------
overfit:  False (array([False]), array([5]))
params: 

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


## ensemble grid search on og data

In [215]:
params = {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
clf = BaggingClassifier(SVC(**params))
clf.fit(X, Y)
# clf.predict(X)
# cross_val_score(clf, X, Y, cv=5)
param_grid = [{'n_estimators': [10]}]
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X, Y)
gscv.print_results(show_overfit=True)
print('-'*40)
vars(gscv)

----------------------------------------
overfit:  True (array([ True]), array([5]))
params: {'n_estimators': 10}
avg acc:  0.9609579607359441
f1 score: nan
auc:  nan
----------------------------------------


{'param_list_': [{'n_estimators': 10}],
 'cv': 5,
 'pos_label': 1,
 '_clf': BaggingClassifier(base_estimator=SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
   max_iter=-1, probability=True, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
          bootstrap=True, bootstrap_features=False, max_features=1.0,
          max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
          random_state=None, verbose=0, warm_start=False),
 'overfit_': [[True, True, True, True, True]],
 'accuracy_scores_': array([[0.95744681, 0.95652174, 0.95652174, 0.95652174, 0.97777778]]),
 'precision_': array([], shape=(1, 5, 0), dtype=float64),
 'recall_': array([], shape=(1, 5, 0), dtype=float64),
 'f1_scores_': array([[nan, nan, nan, nan, nan]]),
 'fpr_': array([], shape=(1, 5, 0), dtype=float64),
 'tpr_': array([], shape=(1, 5, 0), dtype=float64),
 'auc_scores_': array([[nan, nan, nan, nan, nan

## ensemble grid search on synthesized data

In [216]:
params = {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
clf = BaggingClassifier(SVC(**params))
clf.fit(X, Y)
param_grid = [{'n_estimators': [10]}]
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X_os, Y_os)
gscv.print_results(show_overfit=True)
print('-'*40)
vars(gscv)

----------------------------------------
overfit:  False (array([False]), array([5]))
params: {'n_estimators': 10}
avg acc:  0.9837517006802722
f1 score: 0.9055555555555556
auc:  0.9775303030303031
----------------------------------------


{'param_list_': [{'n_estimators': 10}],
 'cv': 5,
 'pos_label': 1,
 '_clf': BaggingClassifier(base_estimator=SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=True, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
          bootstrap=True, bootstrap_features=False, max_features=1.0,
          max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
          random_state=None, verbose=0, warm_start=False),
 'overfit_': [[False, False, False, False, False]],
 'accuracy_scores_': array([[0.96      , 0.97959184, 1.        , 1.        , 0.97916667]]),
 'precision_': array([[array([0.1, 1. , 1. ]),
         array([0.10204082, 1.        , 1.        ]), array([1., 1.]),
         array([1., 1.]), array([0.8, 1. ])]], dtype=object),
 'recall_': array([[array([1. , 0.6, 0. ]), array([1. , 0.8, 0. ]), array([1., 0.]),
         array([1., 0.]), array([1., 0.])]], dtyp