In [1]:
import os
import json
from itertools import product
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope, MinCovDet
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet, set_link_color_palette
from scipy.spatial.distance import squareform, mahalanobis, euclidean
from fastcluster import linkage, pdist

# saving models
from sklearn.externals import joblib

# incase we want to try some cleaning steps to see if it improves the model
import Clean_Function_Helpers as cfh

  from numpy.core.umath_tests import inner1d


In [2]:
plt.rcParams['figure.figsize'] = (9,6)
sns.set_style('darkgrid')

SEED = 1111

In [3]:
def get_sample(df, mix, seed = SEED):
    """
    Helper Function to get samples with different proportions of normal vs outliers.
    """
    if isinstance(mix, int):
        samp = df.sample(mix, random_state=SEED)
        return samp
    elif isinstance(mix, (tuple,list,np.ndarray)):
        n_normal, n_outlier = mix
        if n_normal is None: # use all 
            normal_samp = df[df.Class==0]
        else:
            normal_samp = df[df.Class==0].sample(n_normal, random_state=SEED)
        if n_outlier is None:
            outliers = df[df.Class==1]
        else:
            outliers = df[df.Class==1].sample(n_outlier, random_state=SEED)
        samp = pd.concat([normal_samp, outliers])
        return samp
    else:
        raise Exception('Invalid mix argument passed')


def outliers_grid(est, train, test, default_params = {}, predict_function=None, **params):
    param_keys = list(params.keys())
    
    initial_dic = dict.fromkeys(param_keys + ['f1','recall','precision'], 0)
    best_f1 = initial_dic.copy()
    best_recall = initial_dic.copy()
    best_precision = initial_dic.copy()
    
    xtrain = train[sub_cols]
    ytrain = train.Class

    xtest = test[sub_cols]
    ytest = test.Class
    
    all_params = product(*params.values())
    print(' | '.join(param_keys))
    print('-----------------------------')
    for p in all_params:
        p = dict(zip(param_keys, p))
        print(p)
        mod = est(**default_params, **p)
        mod.fit(xtrain, ytrain)
        if predict_function:
            ypred = predict_function(mod, xtest)
        else:
            pred = mod.predict(xtest)
            ypred = np.where(pred< 0, 1, 0)
        f1 = metrics.f1_score(ytest,ypred)
        recall = metrics.recall_score(ytest,ypred)
        precision = metrics.precision_score(ytest,ypred)
        if f1 > best_f1['f1']:
            best_f1 = {**p, 'f1':f1, 'recall':recall, 'precision':precision}
        if recall > best_recall['recall']:
            best_recall = {**p, 'f1':f1, 'recall':recall, 'precision':precision}
        if precision > best_precision['precision']:
            best_precision = {**p, 'f1':f1, 'recall':recall, 'precision':precision}
    print('=========== DONE ==========')
    return best_f1, best_recall, best_precision

## Overview

Taking two different approaches. 

    1. Try to model the difference between real and fraudulent charges.
        - Classifiers like Logistic Regression, NaiveBayes, Tree Ensembles etc
        - Sampling approaches over vs undersampling
    2. Try to identify core boundary of real charges and identify anything outside this boundary as fraudulent.
        - Covariance estimates, Local Outlier Factor, Clustering, One Class SVM, Hierarchical Clustering, 
        Model-based bayesian clustering.
        
This notebook focuses on the second approach: using robust statistical methods as well as un-supervised learning to identify outliers. I'm also going to use the output of these models as inputs in ensemble models in the next notebook. 

In [4]:
outlier_ftr_df = pd.DataFrame()

### Read Data

In [5]:
df = pd.read_csv('creditcard.csv')
df.Class.value_counts()/df.Class.value_counts().sum()

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [6]:
# We will test different transforms of the data

sub_cols = df.columns.drop(['Time', 'Class'])

scaled_df = cfh.scale_data(df, MinMaxScaler(), sub_cols)
deskewed = cfh.deskew_df(scaled_df, topn=10)

In [7]:
x = df[sub_cols]
y = df.Class


### Minimum Covariance Determinant

MCD is a method to compute a robust estimate for mean and covariance of a multivariate gaussian distributed dataset.
Empirically computing mean and covariance is known to be very sensitive to outliers, so MCD finds a _core subset_ of the data that best represents the underlying distribution. From these robust estimates, we can determine outliers by a points (usually Mahalanobis) distance to the robust mean.

_NOTE Sklearn spits continually spits out RuntimeWarnings on the regular dataset leading me to believe that the data is not well approximated by a normal distribution. Running the Elliptic Envelope on scaled-deskewed data seems to solve the issue._

In [115]:
train = get_sample(deskewed, 25000)
test = train.copy()

params = {
    'contamination': [0.001,0.005, 0.01 0.1],
    'support_fraction': [0.6, 0.7, 0.8, 0.9]
}

default_params = {
    'assume_centered': True,
    'random_state': SEED
}

best_f1, best_recall, best_precision = outliers_grid(EllipticEnvelope, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

contamination | support_fraction
-----------------------------
{'contamination': 0.001, 'support_fraction': 0.6}


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


{'contamination': 0.001, 'support_fraction': 0.7}
{'contamination': 0.005, 'support_fraction': 0.6}
{'contamination': 0.005, 'support_fraction': 0.7}
{'contamination': 0.001, 'support_fraction': 0.6, 'f1': 0.5625, 'recall': 0.46153846153846156, 'precision': 0.72}

{'contamination': 0.005, 'support_fraction': 0.7, 'f1': 0.402439024390244, 'recall': 0.8461538461538461, 'precision': 0.264}

{'contamination': 0.001, 'support_fraction': 0.6, 'f1': 0.5625, 'recall': 0.46153846153846156, 'precision': 0.72}


Run on whole dataset to generate features

In [119]:
x = deskewed[sub_cols]
y = df.Class
contamination = best_f1['contamination'] # 0.001
support_frac = best_f1['support_fraction'] # 0.7

ee = EllipticEnvelope(assume_centered=True, contamination=contamination, support_fraction=support_frac,random_state=SEED)
ee.fit(x,y)
ypred = np.where(ee.predict(x)>0, 0, 1)
print(metrics.f1_score(y,ypred))
print(metrics.recall_score(y,ypred))
print(metrics.precision_score(y,ypred))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


0.5070785070785072
0.40040650406504064
0.6912280701754386


Fitting the Envelope on _only_ normal charges seems to improve the model even more!

In [120]:
x = deskewed.loc[deskewed.Class==0, sub_cols].copy()

ee = EllipticEnvelope(assume_centered=True, contamination=contamination, support_fraction=support_frac,random_state=SEED)
ee.fit(x)
ypred = np.where(ee.predict(deskewed[sub_cols])>0, 0, 1)
print(metrics.f1_score(y,ypred))
print(metrics.recall_score(y,ypred))
print(metrics.precision_score(y,ypred))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


0.6037735849056605
0.6829268292682927
0.5410628019323671


We can use predictions as features, or maybe even better would be to feed in Mahalanobis distances from best mcd. Or we can create a Multivariate Normal Distribution and use it's pdf to generate features for rows. Although I believe this is the same as the Mahalanobis distance just scaled.

In [121]:
# save mahalanobis dists for ensemble models
outlier_ftr_df['robust_mahalanobis_dists'] = ee.mahalanobis(deskewed[sub_cols])

### Local Outlier Factor

Local Outlier Factor is a way of scoring data points based on their relative densities to their nearest neighbors.
The theory is that a “normal" data point is expected to have a similar density to it’s neighbors, while data points with lower relative density (as compared to their neighbors) are more likely to be outliers. 

See for example below, points O1, O2, and O3 are outliers, but point O4 is not even though it's _distance_ to it's neighbors is comparable to O1 and O2. However the density of O4s neighbors is _not_ comparable to the neighbors of O1 and O2.

![LOFExample](https://i.stack.imgur.com/EFB37.jpg![image.png](attachment:image.png)

In [6]:
def lof_grid(x, y, n_neighbors_opts):
    best_f1 = {'nn':0, 'thresh':0, 'f1':0, 'recall':0, 'precision':0}
    best_recall = {'nn':0, 'thresh':0, 'f1':0, 'recall':0, 'precision':0}
    best_precision = {'nn':0, 'thresh':0, 'f1':0, 'recall':0, 'precision':0}
    
    for nn in n_neighbors:
        lof = LocalOutlierFactor(nn)
        lof.fit(x,y)
        for thresh in np.arange(-1,-3, -0.2):
            ypred = np.where(lof.negative_outlier_factor_ < thresh, 1, 0)
            f1 = metrics.f1_score(y,ypred)
            recall = metrics.recall_score(y,ypred)
            precision = metrics.precision_score(y,ypred)
            if f1 > best_f1['f1']:
                best_f1 = {'nn':nn, 'thresh':thresh, 'f1':f1, 'recall':recall, 'precision':precision}
            if recall > best_recall['recall']:
                best_recall = {'nn':nn, 'thresh':thresh, 'f1':f1, 'recall':recall, 'precision':precision}
            if precision > best_precision['precision']:
                best_precision = {'nn':nn, 'thresh':thresh, 'f1':f1, 'recall':recall, 'precision':precision}
    return best_f1, best_recall, best_precision

Distance Based methods are very costly, so performing the rest on sub sample of data

In [104]:
# Interestingly, this algorithm performs way better on _unscaled_ data
# This is very strange and suggests that dollar amount is a far more important
# factor than the other variables
samp = get_sample(df, (24750, 250))
print(samp.Class.value_counts())

x = samp[sub_cols]
y = samp.Class

0    24750
1      250
Name: Class, dtype: int64


In [133]:
n_neighbors = np.arange(230,300, 5)
best_f1, best_recall, best_precision = lof_grid(x,y, n_neighbors)

print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


{'nn': 255, 'thresh': -1.9999999999999998, 'f1': 0.5612153708668455, 'recall': 0.6382113821138211, 'precision': 0.5007974481658692}

{'nn': 285, 'thresh': -1.0, 'f1': 0.04986206191887198, 'recall': 0.991869918699187, 'precision': 0.025573839220207527}

{'nn': 295, 'thresh': -2.8, 'f1': 0.4625158831003813, 'recall': 0.3699186991869919, 'precision': 0.6169491525423729}


In [122]:
# Test on full dataset
x = df[sub_cols]
y = df.Class

lof = LocalOutlierFactor(best_f1['nn']) # use best f1 params
lof.fit(x)
for thresh in np.arange(-1,-3, -0.2):
    print('Thresh:', thresh)
    ypred = np.where(lof.negative_outlier_factor_ < thresh, 1, 0)
    f1 = metrics.f1_score(y,ypred)
    recall = metrics.recall_score(y,ypred)
    precision = metrics.precision_score(y,ypred)
    print(f1)
    print(recall)
    print(precision)
    print()

Thresh: -1.0
0.004118191235841053
0.9817073170731707
0.002063423574293929

Thresh: -1.2
0.014377209953966243
0.8760162601626016
0.007248082873671465

Thresh: -1.4
0.04573458400399622
0.8373983739837398
0.023509272467902995

Thresh: -1.5999999999999999
0.1151016875811337
0.8109756097560976
0.061946902654867256

Thresh: -1.7999999999999998
0.20505920344456405
0.774390243902439
0.11817617866004963

Thresh: -1.9999999999999998
0.269120654396728
0.6686991869918699
0.16845878136200718

Thresh: -2.1999999999999997
0.30254957507082153
0.5426829268292683
0.20974076983503534

Thresh: -2.3999999999999995
0.31601731601731603
0.4451219512195122
0.24496644295302014

Thresh: -2.5999999999999996
0.288695652173913
0.33739837398373984
0.25227963525835867

Thresh: -2.8
0.280561122244489
0.2845528455284553
0.2766798418972332



In [123]:
# Test on full dataset pt2
x = df[sub_cols]
y = df.Class

lof = LocalOutlierFactor(best_precision['nn']) # use best precision params
lof.fit(x)
for thresh in np.arange(-1,-3, -0.2):
    print('Thresh:', thresh)
    ypred = np.where(lof.negative_outlier_factor_ < thresh, 1, 0)
    f1 = metrics.f1_score(y,ypred)
    recall = metrics.recall_score(y,ypred)
    precision = metrics.precision_score(y,ypred)
    print(f1)
    print(recall)
    print(precision)
    print()

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Thresh: -1.0
0.004139104704810854
0.983739837398374
0.0020739153722549547

Thresh: -1.2
0.014288325109239232
0.8739837398373984
0.007203042028912676

Thresh: -1.4
0.045669729880256194
0.8333333333333334
0.023478211074843956

Thresh: -1.5999999999999999
0.11221309152734485
0.8048780487804879
0.060310691440755404

Thresh: -1.7999999999999998
0.20497725448220497
0.7784552845528455
0.11802773497688752

Thresh: -1.9999999999999998
0.29537223340040236
0.7459349593495935
0.18414450577019567

Thresh: -2.1999999999999997
0.36098069900886803
0.7032520325203252
0.24280701754385964

Thresh: -2.3999999999999995
0.4065146579804561
0.6341463414634146
0.29913710450623204

Thresh: -2.5999999999999996
0.39588281868566905
0.508130081300813
0.324254215304799

Thresh: -2.8
0.3458646616541353
0.37398373983739835
0.32167832167832167



In [18]:
# So best params are
nn = 295
thresh = -2.4 
lof = LocalOutlierFactor(nn)
lof.fit(x)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


LocalOutlierFactor(algorithm='auto', contamination=0.1, leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=1,
          n_neighbors=295, p=2)

In [28]:
# Save LOF score as feature
outlier_ftr_df['lof'] = lof.negative_outlier_factor_

### Isolation Forests

In [81]:
train = get_sample(df, 25000)
test = get_sample(df[~df.index.isin(train.index)], 25000)


n_est = [250, 300, 325]
max_samples = [0.6, 0.7, 0.75, 0.8]
max_ftrs = [0.2, 0.25, 0.3, 0.4]
contam = [0.0005, 0.001, 0.005, 0.01]
best_f1, best_recall, best_precision = outliers_grid(IsolationForest, train, test, 
                                                     n_estimators=n_est, 
                                                     max_samples=max_samples, 
                                                     max_features=max_ftrs, 
                                                     contamination=contam)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

N_EST: 250 MS: 0.6 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.6 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.7 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.75 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 250 MS: 0.8 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.6 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.75 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.8 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.6 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.75 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.2 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.2 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.2 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.2 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.25 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.25 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.25 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.25 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.4 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.4 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.4 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.8 MF: 0.4 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.0005, 'f1': 0.339622641509434, 'recall': 0.24324324324324326, 'precision': 0.5625}

{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.01, 'f1': 0.15224913494809686, 'recall': 0.5945945945945946, 'precision': 0.0873015873015873}

{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.0005, 'f1': 0.339622641509434, 'recall': 0.24324324324324326, 'precision': 0.5625}


Fitting the forest on a small sub-sample of normal data only seems to improve performance

In [83]:
train = get_sample(df, (25000, 0))
test = get_sample(df[~df.index.isin(train.index)], 25000)

n_est = [300, 325]
max_samples = [0.7]
max_ftrs = [0.3]
contam = [0.0005, 0.001, 0.005, 0.01]


# Seems to do much better!
best_f1, best_recall, best_precision = outliers_grid(IsolationForest, train, test, 
                                                     n_estimators=n_est, 
                                                     max_samples=max_samples, 
                                                     max_features=max_ftrs, 
                                                     contamination=contam)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

N_EST: 300 MS: 0.7 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 300 MS: 0.7 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.0005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.001


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.005


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


N_EST: 325 MS: 0.7 MF: 0.3 C: 0.01


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.001, 'f1': 0.43859649122807015, 'recall': 0.43859649122807015, 'precision': 0.43859649122807015}

{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.01, 'f1': 0.24064171122994654, 'recall': 0.7894736842105263, 'precision': 0.14195583596214512}

{'n_estimators': 325, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.0005, 'f1': 0.4086021505376344, 'recall': 0.3333333333333333, 'precision': 0.5277777777777778}


In [84]:
# Run Through Whole DS
train = get_sample(df, (20000, 0))
test = df

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

In [85]:
n_estimators = best_f1['n_estimators'] # ~ 300
ms = best_f1['max_samples'] # ~0.7
mf = best_f1['max_features'] # ~ 0.3
c = best_f1['contamination'] # ~ 0.001


isf = IsolationForest(n_estimators, max_samples=ms, max_features=mf, contamination=c)
isf.fit(xtrain)
preds = isf.predict(xtest)
ypred = np.where(preds< 0, 1, 0)
f1 = metrics.f1_score(ytest,ypred)
recall = metrics.recall_score(ytest,ypred)
precision = metrics.precision_score(ytest,ypred)

print('F1:', f1)
print('Recall:', recall)
print('Precision:', precision)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


F1: 0.3166823751178134
Recall: 0.34146341463414637
Precision: 0.29525483304042177


In [86]:
outlier_ftr_df['iso_forest'] = ypred

### One Class SVM

In [168]:
train = get_sample(df, (50000,0))
test = get_sample(df[~df.index.isin(train.index)], 50000)

In [169]:
params = {
    'nu': [0.0001, 0.0005, 0.001, 0.005],
    'gamma': [0.00001, 0.00005, 0.0001],
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.0001, 'gamma': 1e-05}
{'nu': 0.0001, 'gamma': 5e-05}
{'nu': 0.0001, 'gamma': 0.0001}
{'nu': 0.0005, 'gamma': 1e-05}
{'nu': 0.0005, 'gamma': 5e-05}
{'nu': 0.0005, 'gamma': 0.0001}
{'nu': 0.001, 'gamma': 1e-05}
{'nu': 0.001, 'gamma': 5e-05}
{'nu': 0.001, 'gamma': 0.0001}
{'nu': 0.005, 'gamma': 1e-05}
{'nu': 0.005, 'gamma': 5e-05}
{'nu': 0.005, 'gamma': 0.0001}
{'nu': 0.001, 'gamma': 1e-05, 'f1': 0.2737642585551331, 'recall': 0.3302752293577982, 'precision': 0.23376623376623376}

{'nu': 0.005, 'gamma': 0.0001, 'f1': 0.22920517560073939, 'recall': 0.5688073394495413, 'precision': 0.14351851851851852}

{'nu': 0.001, 'gamma': 1e-05, 'f1': 0.2737642585551331, 'recall': 0.3302752293577982, 'precision': 0.23376623376623376}


Test on Scaled and Deskewed

In [186]:
# Scaled
train = get_sample(scaled_df, (50000,0))
test = get_sample(scaled_df[~scaled_df.index.isin(train.index)], 50000)

In [187]:
params = {
    'nu': [0.00075, 0.001, 0.0025, 0.005],
    'gamma': [0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8]
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.00075, 'gamma': 0.1}
{'nu': 0.00075, 'gamma': 0.25}
{'nu': 0.00075, 'gamma': 0.5}
{'nu': 0.00075, 'gamma': 0.6}
{'nu': 0.00075, 'gamma': 0.7}
{'nu': 0.00075, 'gamma': 0.75}
{'nu': 0.00075, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.1}
{'nu': 0.001, 'gamma': 0.25}
{'nu': 0.001, 'gamma': 0.5}
{'nu': 0.001, 'gamma': 0.6}
{'nu': 0.001, 'gamma': 0.7}
{'nu': 0.001, 'gamma': 0.75}
{'nu': 0.001, 'gamma': 0.8}
{'nu': 0.0025, 'gamma': 0.1}
{'nu': 0.0025, 'gamma': 0.25}
{'nu': 0.0025, 'gamma': 0.5}
{'nu': 0.0025, 'gamma': 0.6}
{'nu': 0.0025, 'gamma': 0.7}
{'nu': 0.0025, 'gamma': 0.75}
{'nu': 0.0025, 'gamma': 0.8}
{'nu': 0.005, 'gamma': 0.1}
{'nu': 0.005, 'gamma': 0.25}
{'nu': 0.005, 'gamma': 0.5}
{'nu': 0.005, 'gamma': 0.6}
{'nu': 0.005, 'gamma': 0.7}
{'nu': 0.005, 'gamma': 0.75}
{'nu': 0.005, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.75, 'f1': 0.5116279069767442, 'recall': 0.6055045871559633, 'precision': 0.4429530201342282}

{'nu': 0.005, 'gamma': 0.1,

In [188]:
# Deskewed
train = get_sample(deskewed, (50000,0))
test = get_sample(deskewed[~deskewed.index.isin(train.index)], 50000)

In [189]:
params = {
    'nu': [ 0.001, 0.0025, 0.005, 0.0075],
    'gamma': [0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.75, 0.8]
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.001, 'gamma': 0.01}
{'nu': 0.001, 'gamma': 0.05}
{'nu': 0.001, 'gamma': 0.1}
{'nu': 0.001, 'gamma': 0.5}
{'nu': 0.001, 'gamma': 0.6}
{'nu': 0.001, 'gamma': 0.7}
{'nu': 0.001, 'gamma': 0.75}
{'nu': 0.001, 'gamma': 0.8}
{'nu': 0.0025, 'gamma': 0.01}
{'nu': 0.0025, 'gamma': 0.05}
{'nu': 0.0025, 'gamma': 0.1}
{'nu': 0.0025, 'gamma': 0.5}
{'nu': 0.0025, 'gamma': 0.6}
{'nu': 0.0025, 'gamma': 0.7}
{'nu': 0.0025, 'gamma': 0.75}
{'nu': 0.0025, 'gamma': 0.8}
{'nu': 0.005, 'gamma': 0.01}
{'nu': 0.005, 'gamma': 0.05}
{'nu': 0.005, 'gamma': 0.1}
{'nu': 0.005, 'gamma': 0.5}
{'nu': 0.005, 'gamma': 0.6}
{'nu': 0.005, 'gamma': 0.7}
{'nu': 0.005, 'gamma': 0.75}
{'nu': 0.005, 'gamma': 0.8}
{'nu': 0.0075, 'gamma': 0.01}
{'nu': 0.0075, 'gamma': 0.05}
{'nu': 0.0075, 'gamma': 0.1}
{'nu': 0.0075, 'gamma': 0.5}
{'nu': 0.0075, 'gamma': 0.6}
{'nu': 0.0075, 'gamma': 0.7}
{'nu': 0.0075, 'gamma': 0.75}
{'nu': 0.0075, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.8, 'f1': 0

So it looks like using scaled_df works the best

In [196]:
# Run Through Whole DS
train = get_sample(scaled_df, (50000,0))
test = scaled_df

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

In [197]:
# {'nu': 0.001, 'gamma': 0.75, 'f1': 0.5116279069767442, 'recall': 0.6055045871559633, 'precision': 0.4429530201342282}
best_params = {'nu': 0.001, 'gamma': 0.75}

ocsvm = OneClassSVM(**best_params)
ocsvm.fit(xtrain)
preds = ocsvm.predict(xtest)
ypred = np.where(preds <0, 1, 0)
print(metrics.f1_score(ytest, ypred))
print(metrics.recall_score(ytest, ypred))
print(metrics.precision_score(ytest, ypred))

0.45319335083114604
0.5264227642276422
0.3978494623655914


In [198]:
outlier_ftr_df['ocsvm'] = ypred

### DBSCAN

DBSCAN is a density based clustering method that labels points as outliers if they don't meet a certain threshold to belong to any nearby cluster. Points that are not within at least `eps` distance to `min_samples` other points are labeled outliers.

DBSCAN is complicated as there isn't directly a way to evaluate new samples. What we end up with at the end are cluster labels for each observation. Our ideal scenario is that all the fraudulent charges get labeled the same and all normal charges get labeled something different. 

_How to measure this??_

Target function is to maximize the std of the Class distribution accross cluster labels.

Consider the following example:


_Alternative would be some sort of purity metric? Gini? Entropy?_

In [101]:
# example counts of fraudulent charges in each cluster (counting -1 as a cluster)
ex1 = pd.Series([0.45,0.45,0.1,0,0,0], index=[-1,0,1,2,3,4])
ex2 = pd.Series([0.9,0.1,0,0,0,0], index=[-1,0,1,2,3,4])
ex3 = pd.Series([0.55,0.2,0.2,0.05,0,0], index=[-1,0,1,2,3,4])

# clearly the second one is the best 
# so my initial thought was to just find the labels that
# result in the largest percent of fraud in any cluster.
# HOWEVER, this method says ex3 is better than ex1, which is not the case.
# If they can't all be in 1 cluster, we'd prefer they all be in 2 rather than
# all over.
# Ranking by standard deviation accounts for this.
# This also punishes results with many clusters, as more cluster labels will bring the std down.

# Here you can see 18 ex2 is definitely the best,
# but ex1 is slightly better than ex3 since.
ex1.std(), ex2.std(), ex3.std() 



(0.22286019533929038, 0.36147844564602566, 0.2089657069154331)

In [102]:
def eval_clusters(ytrue, labels):
    df = pd.DataFrame({'Class':ytrue, 'cluster_label':labels})
    vcounts = df.groupby('cluster_label').Class.apply(lambda cls: cls.sum()/cls.size)
    return vcounts, vcounts.std()


Regular

In [100]:
nsamps=3
mix = (5000,50)

eps_opts = [3,5,10,15,25]
ms_opts = [2,3,5,8,15,25]

print('EPS | MIN SAMPLES')
print('--------------------')
best = {'eps':0, 'ms':0, 'std':0}
for eps in eps_opts:
    for ms in ms_opts:
        print(eps, ms)
        std = 0
        for i in range(nsamps):
            samp = get_sample(df, mix, seed = None)
            x = samp[sub_cols]
            y = samp.Class
            db = DBSCAN(eps,ms)
            db.fit(x)
            vcounts, s = eval_clusters(y, db.labels_)
            std+= s
        std/= nsamps
        if std > best['std']:
            best = {'eps': eps, 'ms':ms, 'std': std}
            print(vcounts, std)

best    

EPS | MIN SAMPLES
--------------------
3 2
cluster_label
-1      0.018785
 0      0.000667
 1      0.000000
 2      0.000000
 3      0.000000
 4      0.000000
 5      0.000000
 6      0.000000
 7      0.000000
 8      0.000000
 9      0.000000
 10     0.000000
 11     0.000000
 12     0.000000
 13     0.000000
 14     0.000000
 15     0.000000
 16     0.000000
 17     0.000000
 18     0.000000
 19     0.000000
 20     0.000000
 21     0.000000
 22     0.000000
 23     0.000000
 24     0.000000
 25     0.000000
 26     0.000000
 27     0.000000
 28     0.000000
          ...   
 231    0.000000
 232    0.000000
 233    0.000000
 234    0.000000
 235    0.000000
 236    0.000000
 237    0.000000
 238    0.000000
 239    0.000000
 240    0.000000
 241    0.000000
 242    0.000000
 243    0.000000
 244    0.000000
 245    0.000000
 246    0.000000
 247    0.000000
 248    0.000000
 249    0.000000
 250    0.000000
 251    0.000000
 252    0.000000
 253    0.000000
 254    0.000000
 255    

{'eps': 15, 'ms': 2, 'std': 0.4437490429445237}

Scaled

In [103]:
nsamps=3
mix = (5000,50)

eps_opts = [0.1, 0.25, 0.5, 0.75]
ms_opts = [2, 3, 5, 8, 15, 25]

print('EPS | MIN SAMPLES')
print('--------------------')
best = {'eps':0, 'ms':0, 'std':0}
for eps in eps_opts:
    for ms in ms_opts:
        print(eps, ms)
        std = 0
        for i in range(nsamps):
            samp = get_sample(scaled_df, mix, seed = None)
            x = samp[sub_cols]
            y = samp.Class
            db = DBSCAN(eps,ms)
            db.fit(x)
            vcounts, s = eval_clusters(y, db.labels_)
            std+= s
        std/= nsamps
        if std > best['std']:
            best = {'eps': eps, 'ms':ms, 'std': std}
            print(vcounts, std)

best    

EPS | MIN SAMPLES
--------------------
0.1 2
cluster_label
-1      0.027552
 0      0.001042
 1      0.000000
 2      0.000000
 3      0.000000
 4      0.000000
 5      0.000000
 6      0.000000
 7      0.000000
 8      0.000000
 9      0.000000
 10     0.000000
 11     0.000000
 12     0.000000
 13     0.000000
 14     0.000000
 15     0.000000
 16     0.000000
 17     0.000000
 18     0.000000
 19     0.000000
 20     0.000000
 21     0.000000
 22     0.000000
 23     0.000000
 24     0.000000
 25     0.000000
 26     0.000000
 27     0.000000
 28     0.000000
          ...   
 188    0.000000
 189    0.000000
 190    0.000000
 191    0.000000
 192    0.000000
 193    0.000000
 194    0.000000
 195    0.000000
 196    0.000000
 197    0.000000
 198    0.000000
 199    0.000000
 200    0.000000
 201    0.000000
 202    0.000000
 203    0.000000
 204    0.000000
 205    0.000000
 206    0.000000
 207    0.000000
 208    0.000000
 209    0.000000
 210    0.000000
 211    0.000000
 212  

{'eps': 0.5, 'ms': 8, 'std': 0.7009385221912644}

So Scaling the data seems to work best as it takes away all the extra weight from the charge dollar amount. Since these are only on subsets of the data, we are going to have to scale the `min_samples` paramter when we test on the full dataset.

In [None]:
## THIS CRASHES THE KERNEL...

# eps = best['eps']
# ms = int(best['ms'] * (df.shape[0]/sum(mix)))

# x = scaled_df[sub_cols]
# y = scaled_df.Class

# db = DBSCAN(eps, ms, n_jobs=-1)
# db.fit(x)

# distr, std = eval_clusters(y, db.labels_)

In [None]:
# add labels as feature
outlier_ftr_df['dbscan_label'] = db.labels_

### Hierarchical Clustering

### Model Based Clustering

In [None]:
outlier_ftr_df.to_csv('Outlier_Ftrs.csv',index=False)