In [1]:
import os
import json
from itertools import product
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope, MinCovDet
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN, KMeans
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.mixture import BayesianGaussianMixture as BayesGMM


from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet, set_link_color_palette
from scipy.spatial.distance import squareform, mahalanobis, euclidean
from fastcluster import linkage, pdist

# saving models
from sklearn.externals import joblib

# incase we want to try some cleaning steps to see if it improves the model
import Clean_Function_Helpers as cfh

  from numpy.core.umath_tests import inner1d


In [2]:
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = (9,6)
sns.set_style('darkgrid')

SEED = 1111

In [3]:
def get_sample(df, mix, seed = SEED):
    """
    Helper function to get samples with different proportions of normal vs outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. The (n_normal, n_outlier) form reads a ``Class``
        column: rows with ``Class == 0`` are treated as normal and rows with
        ``Class == 1`` as outliers.
    mix : int, or tuple/list/ndarray of two entries
        * int -> plain random sample of ``mix`` rows, ignoring ``Class``.
        * (n_normal, n_outlier) -> that many rows of each class. An entry of
          ``None`` means "use every row of that class".
    seed : int, optional
        ``random_state`` forwarded to every ``DataFrame.sample`` call.
        Defaults to the notebook-wide ``SEED``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows. For the two-entry form the normal rows come first,
        followed by the outlier rows.

    Raises
    ------
    TypeError
        If ``mix`` is neither an integer nor a tuple/list/ndarray.
    """
    # Honour the caller's seed: previously the global SEED was used
    # regardless of what was passed in.
    if isinstance(mix, (int, np.integer)):
        return df.sample(mix, random_state=seed)

    if isinstance(mix, (tuple, list, np.ndarray)):
        n_normal, n_outlier = mix

        normal_rows = df[df.Class == 0]
        if n_normal is not None:  # None -> keep all normal rows
            normal_rows = normal_rows.sample(n_normal, random_state=seed)

        outlier_rows = df[df.Class == 1]
        if n_outlier is not None:  # None -> keep all outlier rows
            outlier_rows = outlier_rows.sample(n_outlier, random_state=seed)

        return pd.concat([normal_rows, outlier_rows])

    # TypeError is a subclass of Exception, so existing `except Exception`
    # handlers keep working.
    raise TypeError('Invalid mix argument passed')


def outliers_grid(est, train, test, default_params = None, predict_function=None, **params):
    """
    Exhaustive grid search for an outlier detector, scored against ``Class``.

    Every combination of the keyword lists in ``params`` is used to build
    ``est``. The model is fit on ``train`` and evaluated on ``test`` with
    F1, recall and precision.

    Parameters
    ----------
    est : callable
        Estimator class (or factory) called with keyword arguments.
    train, test : pandas.DataFrame
        Frames holding the feature columns and a ``Class`` column.
        NOTE: the feature columns are taken from the module-level ``sub_cols``,
        which must be defined before this function is called.
    default_params : dict, optional
        Keyword arguments passed unchanged to every model. ``None`` (the
        default) means no fixed arguments. If a key also appears in the grid,
        the grid value wins.
    predict_function : callable, optional
        ``predict_function(model, xtest)`` returning 0/1 labels. When omitted,
        ``model.predict`` is used and negative predictions are mapped to 1
        (outlier), everything else to 0.
    **params
        ``name=[candidate values]`` pairs that define the grid.

    Returns
    -------
    (best_f1, best_recall, best_precision) : tuple of dict
        Each dict holds the grid parameters plus 'f1', 'recall' and
        'precision' for the combination that maximised the respective metric.
        Comparisons are strict, so the first combination wins ties. If nothing
        scores above 0, the dict holds zeros.
    """
    # Copy so the caller's dict is never touched; None replaces the old
    # shared mutable `{}` default.
    fixed_params = dict(default_params) if default_params else {}
    param_keys = list(params.keys())

    initial_dic = dict.fromkeys(param_keys + ['f1','recall','precision'], 0)
    best_f1 = initial_dic.copy()
    best_recall = initial_dic.copy()
    best_precision = initial_dic.copy()

    xtrain = train[sub_cols]
    ytrain = train.Class

    xtest = test[sub_cols]
    ytest = test.Class

    print(' | '.join(param_keys))
    print('-----------------------------')
    for combo in product(*params.values()):
        p = dict(zip(param_keys, combo))
        print(p)
        # Merge instead of double-splatting so a key present in both dicts
        # does not raise "multiple values for keyword argument".
        mod = est(**{**fixed_params, **p})
        mod.fit(xtrain, ytrain)
        if predict_function:
            ypred = predict_function(mod, xtest)
        else:
            pred = mod.predict(xtest)
            ypred = np.where(pred< 0, 1, 0)

        # Build the result record once and copy it into each winner slot.
        scores = {
            **p,
            'f1': metrics.f1_score(ytest,ypred),
            'recall': metrics.recall_score(ytest,ypred),
            'precision': metrics.precision_score(ytest,ypred),
        }
        if scores['f1'] > best_f1['f1']:
            best_f1 = dict(scores)
        if scores['recall'] > best_recall['recall']:
            best_recall = dict(scores)
        if scores['precision'] > best_precision['precision']:
            best_precision = dict(scores)
    print('=========== DONE ==========')
    return best_f1, best_recall, best_precision

## Overview

Taking two different approaches. 

    1. Try to model the difference between real and fraudulent charges.
        - Classifiers like Logistic Regression, NaiveBayes, Tree Ensembles etc
        - Sampling approaches over vs undersampling
    2. Try to identify core boundary of real charges and identify anything outside this boundary as fraudulent.
        - Covariance estimates, Local Outlier Factor, Clustering, One Class SVM, K-means, 
        Model-based bayesian clustering.
        
This notebook focuses on the second approach: using robust statistical methods as well as unsupervised learning to identify outliers. I'm also going to use the output of these models as inputs to ensemble models in the next notebook. 

In [4]:
outlier_ftr_df = pd.DataFrame()

### Read Data

In [5]:
df = pd.read_csv('creditcard.csv')
df.Class.value_counts()/df.Class.value_counts().sum()

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [6]:
# We will test different transforms of the data

sub_cols = df.columns.drop(['Time', 'Class'])

scaled_df = cfh.scale_data(df, MinMaxScaler(), sub_cols)
deskewed = cfh.deskew_df(scaled_df, topn=10)

In [7]:
x = df[sub_cols]
y = df.Class


### Minimum Covariance Determinant

MCD is a method to compute a robust estimate for mean and covariance of a multivariate gaussian distributed dataset.
Empirically computing the mean and covariance is known to be very sensitive to outliers, so MCD finds a _core subset_ of the data that best represents the underlying distribution. From these robust estimates, we can determine outliers by a point's (usually Mahalanobis) distance to the robust mean.

_NOTE: sklearn continually emits RuntimeWarnings on the regular dataset, leading me to believe that the data is not well approximated by a normal distribution. Running the Elliptic Envelope on the scaled, deskewed data seems to solve the issue._

In [8]:
train = get_sample(deskewed, 25000)
test = train.copy()

params = {
    'contamination': [0.001,0.005, 0.01, 0.1],
    'support_fraction': [0.6, 0.7, 0.8, 0.9]
}

default_params = {
    'assume_centered': True,
    'random_state': SEED
}

best_f1, best_recall, best_precision = outliers_grid(EllipticEnvelope, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

contamination | support_fraction
-----------------------------
{'contamination': 0.001, 'support_fraction': 0.6}
{'contamination': 0.001, 'support_fraction': 0.7}
{'contamination': 0.001, 'support_fraction': 0.8}
{'contamination': 0.001, 'support_fraction': 0.9}
{'contamination': 0.005, 'support_fraction': 0.6}
{'contamination': 0.005, 'support_fraction': 0.7}
{'contamination': 0.005, 'support_fraction': 0.8}
{'contamination': 0.005, 'support_fraction': 0.9}
{'contamination': 0.01, 'support_fraction': 0.6}
{'contamination': 0.01, 'support_fraction': 0.7}
{'contamination': 0.01, 'support_fraction': 0.8}
{'contamination': 0.01, 'support_fraction': 0.9}
{'contamination': 0.1, 'support_fraction': 0.6}
{'contamination': 0.1, 'support_fraction': 0.7}
{'contamination': 0.1, 'support_fraction': 0.8}
{'contamination': 0.1, 'support_fraction': 0.9}
{'contamination': 0.001, 'support_fraction': 0.6, 'f1': 0.5625, 'recall': 0.46153846153846156, 'precision': 0.72}

{'contamination': 0.1, 'support_fr

Run on whole dataset to generate features

In [9]:
# Refit the envelope on the full deskewed feature set, using the
# hyper-parameters that won on F1 in the grid search above.
x = deskewed[sub_cols]
y = df.Class
contamination = best_f1['contamination'] # 0.001 in the grid output above
support_frac = best_f1['support_fraction'] # 0.6 in the grid output above

ee = EllipticEnvelope(assume_centered=True, contamination=contamination, support_fraction=support_frac,random_state=SEED)
ee.fit(x,y)
# Positive predictions are mapped to 0 (normal), everything else to 1 (outlier).
ypred = np.where(ee.predict(x)>0, 0, 1)
print(metrics.f1_score(y,ypred))
print(metrics.recall_score(y,ypred))
print(metrics.precision_score(y,ypred))

0.5070785070785072
0.40040650406504064
0.6912280701754386


Fitting the Envelope on _only_ normal charges seems to improve the model even more!

In [10]:
x = deskewed.loc[deskewed.Class==0, sub_cols].copy()

ee = EllipticEnvelope(assume_centered=True, contamination=contamination, support_fraction=support_frac,random_state=SEED)
ee.fit(x)
ypred = np.where(ee.predict(deskewed[sub_cols])>0, 0, 1)
print(metrics.f1_score(y,ypred))
print(metrics.recall_score(y,ypred))
print(metrics.precision_score(y,ypred))

0.6037735849056605
0.6829268292682927
0.5410628019323671


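We can use the predictions as features, or, maybe even better, feed in the Mahalanobis distances from the best MCD fit. Alternatively, we could build a multivariate normal distribution from the robust estimates and use its pdf to generate a feature for each row. However, the Gaussian log-density is just an affine function of the squared Mahalanobis distance ($\log p(x) = -\tfrac{1}{2} d_M(x)^2 + \text{const}$), so it carries the same ranking information.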

In [11]:
# save mahalanobis dists for ensemble models
outlier_ftr_df['robust_mahalanobis_dists'] = ee.mahalanobis(deskewed[sub_cols])

### Local Outlier Factor

Local Outlier Factor is a way of scoring data points based on their relative densities to their nearest neighbors.
The theory is that a "normal" data point is expected to have a similar density to its neighbors, while data points with lower relative density (as compared to their neighbors) are more likely to be outliers. 

In the example below, points O1, O2, and O3 are outliers, but point O4 is not, even though its _distance_ to its neighbors is comparable to that of O1 and O2. However, the density of O4's neighbors is _not_ comparable to the density of the neighbors of O1 and O2.

![LOFExample](https://i.stack.imgur.com/EFB37.jpg)
![image.png](attachment:image.png)

In [12]:
def lof_grid(x, y, n_neighbors_opts):
    """
    Grid search over Local Outlier Factor neighbourhood size and score threshold.

    For every ``n_neighbors`` candidate a LocalOutlierFactor is fit on ``x``.
    Its ``negative_outlier_factor_`` scores are then cut at each threshold in
    ``np.arange(-1, -3, -0.2)``; points scoring below the threshold are
    labelled 1 (outlier), all others 0.

    Parameters
    ----------
    x : array-like
        Feature matrix the detector is fit on.
    y : array-like
        True 0/1 labels, used for scoring and also passed to ``fit``.
    n_neighbors_opts : iterable of int
        Candidate values for ``n_neighbors``.

    Returns
    -------
    (best_f1, best_recall, best_precision) : tuple of dict
        Each dict has keys 'nn', 'thresh', 'f1', 'recall' and 'precision' for
        the combination that maximised the respective metric. Comparisons are
        strict, so the first combination wins ties. If nothing scores above 0,
        the dict holds zeros.
    """
    empty = {'nn':0, 'thresh':0, 'f1':0, 'recall':0, 'precision':0}
    best_f1 = dict(empty)
    best_recall = dict(empty)
    best_precision = dict(empty)

    # Loop-invariant: the same threshold ladder is used for every model.
    thresholds = np.arange(-1,-3, -0.2)

    # Iterate the argument. The original looped over the global `n_neighbors`
    # and silently ignored `n_neighbors_opts`.
    for nn in n_neighbors_opts:
        lof = LocalOutlierFactor(n_neighbors=nn)
        lof.fit(x,y)
        scores = lof.negative_outlier_factor_
        for thresh in thresholds:
            ypred = np.where(scores < thresh, 1, 0)
            result = {
                'nn': nn,
                'thresh': thresh,
                'f1': metrics.f1_score(y,ypred),
                'recall': metrics.recall_score(y,ypred),
                'precision': metrics.precision_score(y,ypred),
            }
            if result['f1'] > best_f1['f1']:
                best_f1 = dict(result)
            if result['recall'] > best_recall['recall']:
                best_recall = dict(result)
            if result['precision'] > best_precision['precision']:
                best_precision = dict(result)
    return best_f1, best_recall, best_precision

Distance-based methods are very costly, so the rest of this section is run on a sub-sample of the data.

In [13]:
# Interestingly, this algorithm performs way better on _unscaled_ data
# This is very strange and suggests that dollar amount is a far more important
# factor than the other variables
samp = get_sample(df, (24750, 250))
print(samp.Class.value_counts())

x = samp[sub_cols]
y = samp.Class

0    24750
1      250
Name: Class, dtype: int64


In [14]:
n_neighbors = np.arange(230,300, 5)
best_f1, best_recall, best_precision = lof_grid(x,y, n_neighbors)

print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

{'nn': 230, 'thresh': -2.3999999999999995, 'f1': 0.5206463195691203, 'recall': 0.58, 'precision': 0.4723127035830619}

{'nn': 275, 'thresh': -1.0, 'f1': 0.02606997730750963, 'recall': 0.988, 'precision': 0.013209262527407882}

{'nn': 230, 'thresh': -2.8, 'f1': 0.5147679324894514, 'recall': 0.488, 'precision': 0.5446428571428571}


In [15]:
# Test on full dataset
x = df[sub_cols]
y = df.Class

lof = LocalOutlierFactor(best_f1['nn']) # use best f1 params
lof.fit(x)
for thresh in np.arange(-1,-3, -0.2):
    print('Thresh:', thresh)
    ypred = np.where(lof.negative_outlier_factor_ < thresh, 1, 0)
    f1 = metrics.f1_score(y,ypred)
    recall = metrics.recall_score(y,ypred)
    precision = metrics.precision_score(y,ypred)
    print(f1)
    print(recall)
    print(precision)
    print()

Thresh: -1.0
0.004134803106643818
0.9857723577235772
0.0020717465036607973

Thresh: -1.2
0.014387288446773708
0.8760162601626016
0.00725320588334287

Thresh: -1.4
0.04571935859734783
0.8373983739837398
0.023501226398950432

Thresh: -1.5999999999999999
0.11535125758889853
0.8109756097560976
0.06209150326797386

Thresh: -1.7999999999999998
0.19609967497291442
0.7357723577235772
0.113125

Thresh: -1.9999999999999998
0.2601828761429759
0.6361788617886179
0.16353187042842215

Thresh: -2.1999999999999997
0.2847222222222222
0.5
0.19902912621359223

Thresh: -2.3999999999999995
0.3037417461482025
0.42073170731707316
0.23765786452353616

Thresh: -2.5999999999999996
0.2857142857142857
0.32926829268292684
0.2523364485981308

Thresh: -2.8
0.27281845536609833
0.2764227642276423
0.2693069306930693



In [16]:
# Test on full dataset pt2
# NOTE(review): in the grid output above best_precision['nn'] and best_f1['nn']
# are both 230, so this cell refits the same model as the previous cell and
# prints identical numbers. Confirm whether a different nn was intended.
x = df[sub_cols]
y = df.Class

lof = LocalOutlierFactor(best_precision['nn']) # use best precision params
lof.fit(x)
# Sweep the same threshold ladder used in lof_grid. Scores below the
# threshold are labelled 1 (outlier), everything else 0.
for thresh in np.arange(-1,-3, -0.2):
    print('Thresh:', thresh)
    ypred = np.where(lof.negative_outlier_factor_ < thresh, 1, 0)
    f1 = metrics.f1_score(y,ypred)
    recall = metrics.recall_score(y,ypred)
    precision = metrics.precision_score(y,ypred)
    print(f1)
    print(recall)
    print(precision)
    print()

Thresh: -1.0
0.004134803106643818
0.9857723577235772
0.0020717465036607973

Thresh: -1.2
0.014387288446773708
0.8760162601626016
0.00725320588334287

Thresh: -1.4
0.04571935859734783
0.8373983739837398
0.023501226398950432

Thresh: -1.5999999999999999
0.11535125758889853
0.8109756097560976
0.06209150326797386

Thresh: -1.7999999999999998
0.19609967497291442
0.7357723577235772
0.113125

Thresh: -1.9999999999999998
0.2601828761429759
0.6361788617886179
0.16353187042842215

Thresh: -2.1999999999999997
0.2847222222222222
0.5
0.19902912621359223

Thresh: -2.3999999999999995
0.3037417461482025
0.42073170731707316
0.23765786452353616

Thresh: -2.5999999999999996
0.2857142857142857
0.32926829268292684
0.2523364485981308

Thresh: -2.8
0.27281845536609833
0.2764227642276423
0.2693069306930693



In [17]:
# So best params are
# NOTE(review): the grid-search output above reports nn=230 for both best F1 and
# best precision; 295 does not appear among the reported winners. Confirm the
# hard-coded value is intentional (e.g. taken from an earlier run).
nn = 295
# NOTE(review): thresh is not used in this cell. Only the raw LOF scores of the
# fitted model are read afterwards.
thresh = -2.4 
lof = LocalOutlierFactor(nn)
lof.fit(x)

LocalOutlierFactor(algorithm='auto', contamination=0.1, leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=1,
          n_neighbors=295, p=2)

In [18]:
# Save LOF score as feature
outlier_ftr_df['lof'] = lof.negative_outlier_factor_

### Isolation Forests

In [19]:
# Two disjoint 25,000-row random samples of the unscaled data:
# one to fit on, one to score on.
train = get_sample(df, 25000)
test = get_sample(df[~df.index.isin(train.index)], 25000)


n_est = [250, 300, 325]
max_samples = [0.6, 0.7, 0.75, 0.8]
max_ftrs = [0.2, 0.25, 0.3, 0.4]
contam = [0.0005, 0.001, 0.005, 0.01]
# IsolationForest is randomised. Pin random_state so that the grid search
# (and which parameter set "wins") is reproducible on Restart & Run All.
best_f1, best_recall, best_precision = outliers_grid(IsolationForest, train, test, 
                                                     {'random_state': SEED},
                                                     n_estimators=n_est, 
                                                     max_samples=max_samples, 
                                                     max_features=max_ftrs, 
                                                     contamination=contam)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

n_estimators | max_samples | max_features | contamination
-----------------------------
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.0005}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.001}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.005}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.01}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.25, 'contamination': 0.0005}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.25, 'contamination': 0.001}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.25, 'contamination': 0.005}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.25, 'contamination': 0.01}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.3, 'contamination': 0.0005}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.3, 'contamination': 0.001}
{'n_estimators': 250, 'max_samples': 

{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.4, 'contamination': 0.001}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.4, 'contamination': 0.005}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.4, 'contamination': 0.01}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.2, 'contamination': 0.0005}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.2, 'contamination': 0.001}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.2, 'contamination': 0.005}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.2, 'contamination': 0.01}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.25, 'contamination': 0.0005}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.25, 'contamination': 0.001}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.25, 'contamination': 0.005}
{'n_estimators': 300, 'max_samples': 0.75, 'max_features': 0.25, 'contamination': 0.01}
{'n_estimators': 300, 'max_sampl

{'n_estimators': 325, 'max_samples': 0.8, 'max_features': 0.3, 'contamination': 0.01}
{'n_estimators': 325, 'max_samples': 0.8, 'max_features': 0.4, 'contamination': 0.0005}
{'n_estimators': 325, 'max_samples': 0.8, 'max_features': 0.4, 'contamination': 0.001}
{'n_estimators': 325, 'max_samples': 0.8, 'max_features': 0.4, 'contamination': 0.005}
{'n_estimators': 325, 'max_samples': 0.8, 'max_features': 0.4, 'contamination': 0.01}
{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.4, 'contamination': 0.0005, 'f1': 0.33333333333333337, 'recall': 0.24324324324324326, 'precision': 0.5294117647058824}

{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.2, 'contamination': 0.01, 'f1': 0.1506849315068493, 'recall': 0.5945945945945946, 'precision': 0.08627450980392157}

{'n_estimators': 250, 'max_samples': 0.6, 'max_features': 0.4, 'contamination': 0.0005, 'f1': 0.33333333333333337, 'recall': 0.24324324324324326, 'precision': 0.5294117647058824}


Fitting the forest on a small sub-sample of only normal data seems to improve performance

In [20]:
# Fit on 25,000 normal rows only (0 outliers). Score on a disjoint random
# sample that still contains both classes.
train = get_sample(df, (25000, 0))
test = get_sample(df[~df.index.isin(train.index)], 25000)

n_est = [300, 325]
max_samples = [0.7]
max_ftrs = [0.3]
contam = [0.0005, 0.001, 0.005, 0.01]


# Seems to do much better!
# random_state is pinned so the comparison with the previous search does not
# depend on the forest's random draws.
best_f1, best_recall, best_precision = outliers_grid(IsolationForest, train, test, 
                                                     {'random_state': SEED},
                                                     n_estimators=n_est, 
                                                     max_samples=max_samples, 
                                                     max_features=max_ftrs, 
                                                     contamination=contam)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

n_estimators | max_samples | max_features | contamination
-----------------------------
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.0005}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.001}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.005}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.01}
{'n_estimators': 325, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.0005}
{'n_estimators': 325, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.001}
{'n_estimators': 325, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.005}
{'n_estimators': 325, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.01}
{'n_estimators': 300, 'max_samples': 0.7, 'max_features': 0.3, 'contamination': 0.001, 'f1': 0.4444444444444444, 'recall': 0.45614035087719296, 'precision': 0.43333333333333335}

{'n_estimators': 300, 'max_samples': 

In [21]:
# Run Through Whole DS
train = get_sample(df, (20000, 0))
test = df

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

In [22]:
# Hyper-parameters of the best-F1 model from the normal-only grid search above.
n_estimators = best_f1['n_estimators'] # ~ 300
ms = best_f1['max_samples'] # ~0.7
mf = best_f1['max_features'] # ~ 0.3
c = best_f1['contamination'] # ~ 0.001


# n_estimators is passed by keyword (positional use is rejected by newer
# scikit-learn releases). random_state is pinned so the saved feature is
# reproducible.
isf = IsolationForest(n_estimators=n_estimators, max_samples=ms, max_features=mf,
                      contamination=c, random_state=SEED)
isf.fit(xtrain)
preds = isf.predict(xtest)
# Negative predictions are mapped to 1 (outlier), everything else to 0.
ypred = np.where(preds< 0, 1, 0)
f1 = metrics.f1_score(ytest,ypred)
recall = metrics.recall_score(ytest,ypred)
precision = metrics.precision_score(ytest,ypred)

print('F1:', f1)
print('Recall:', recall)
print('Precision:', precision)

F1: 0.3311258278145695
Recall: 0.3556910569105691
Precision: 0.30973451327433627


In [23]:
outlier_ftr_df['iso_forest'] = ypred

### One Class SVM

In [24]:
train = get_sample(df, (50000,0))
test = get_sample(df[~df.index.isin(train.index)], 50000)

In [25]:
params = {
    'nu': [0.0001, 0.0005, 0.001, 0.005],
    'gamma': [0.00001, 0.00005, 0.0001],
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.0001, 'gamma': 1e-05}
{'nu': 0.0001, 'gamma': 5e-05}
{'nu': 0.0001, 'gamma': 0.0001}
{'nu': 0.0005, 'gamma': 1e-05}
{'nu': 0.0005, 'gamma': 5e-05}
{'nu': 0.0005, 'gamma': 0.0001}
{'nu': 0.001, 'gamma': 1e-05}
{'nu': 0.001, 'gamma': 5e-05}
{'nu': 0.001, 'gamma': 0.0001}
{'nu': 0.005, 'gamma': 1e-05}
{'nu': 0.005, 'gamma': 5e-05}
{'nu': 0.005, 'gamma': 0.0001}
{'nu': 0.001, 'gamma': 1e-05, 'f1': 0.2737642585551331, 'recall': 0.3302752293577982, 'precision': 0.23376623376623376}

{'nu': 0.005, 'gamma': 0.0001, 'f1': 0.22920517560073939, 'recall': 0.5688073394495413, 'precision': 0.14351851851851852}

{'nu': 0.001, 'gamma': 1e-05, 'f1': 0.2737642585551331, 'recall': 0.3302752293577982, 'precision': 0.23376623376623376}


Test on Scaled and Deskewed

In [26]:
# Scaled
train = get_sample(scaled_df, (50000,0))
test = get_sample(scaled_df[~scaled_df.index.isin(train.index)], 50000)

In [27]:
params = {
    'nu': [0.00075, 0.001, 0.0025, 0.005],
    'gamma': [0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8]
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.00075, 'gamma': 0.1}
{'nu': 0.00075, 'gamma': 0.25}
{'nu': 0.00075, 'gamma': 0.5}
{'nu': 0.00075, 'gamma': 0.6}
{'nu': 0.00075, 'gamma': 0.7}
{'nu': 0.00075, 'gamma': 0.75}
{'nu': 0.00075, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.1}
{'nu': 0.001, 'gamma': 0.25}
{'nu': 0.001, 'gamma': 0.5}
{'nu': 0.001, 'gamma': 0.6}
{'nu': 0.001, 'gamma': 0.7}
{'nu': 0.001, 'gamma': 0.75}
{'nu': 0.001, 'gamma': 0.8}
{'nu': 0.0025, 'gamma': 0.1}
{'nu': 0.0025, 'gamma': 0.25}
{'nu': 0.0025, 'gamma': 0.5}
{'nu': 0.0025, 'gamma': 0.6}
{'nu': 0.0025, 'gamma': 0.7}
{'nu': 0.0025, 'gamma': 0.75}
{'nu': 0.0025, 'gamma': 0.8}
{'nu': 0.005, 'gamma': 0.1}
{'nu': 0.005, 'gamma': 0.25}
{'nu': 0.005, 'gamma': 0.5}
{'nu': 0.005, 'gamma': 0.6}
{'nu': 0.005, 'gamma': 0.7}
{'nu': 0.005, 'gamma': 0.75}
{'nu': 0.005, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.75, 'f1': 0.5116279069767442, 'recall': 0.6055045871559633, 'precision': 0.4429530201342282}

{'nu': 0.005, 'gamma': 0.1,

In [28]:
# Deskewed
train = get_sample(deskewed, (50000,0))
test = get_sample(deskewed[~deskewed.index.isin(train.index)], 50000)

In [29]:
params = {
    'nu': [ 0.001, 0.0025, 0.005, 0.0075],
    'gamma': [0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.75, 0.8]
}

default_params = {
    'random_state':SEED
}




best_f1, best_recall, best_precision = outliers_grid(OneClassSVM, train, test, default_params, **params)
print(best_f1)
print()
print(best_recall)
print()
print(best_precision)

nu | gamma
-----------------------------
{'nu': 0.001, 'gamma': 0.01}
{'nu': 0.001, 'gamma': 0.05}
{'nu': 0.001, 'gamma': 0.1}
{'nu': 0.001, 'gamma': 0.5}
{'nu': 0.001, 'gamma': 0.6}
{'nu': 0.001, 'gamma': 0.7}
{'nu': 0.001, 'gamma': 0.75}
{'nu': 0.001, 'gamma': 0.8}
{'nu': 0.0025, 'gamma': 0.01}
{'nu': 0.0025, 'gamma': 0.05}
{'nu': 0.0025, 'gamma': 0.1}
{'nu': 0.0025, 'gamma': 0.5}
{'nu': 0.0025, 'gamma': 0.6}
{'nu': 0.0025, 'gamma': 0.7}
{'nu': 0.0025, 'gamma': 0.75}
{'nu': 0.0025, 'gamma': 0.8}
{'nu': 0.005, 'gamma': 0.01}
{'nu': 0.005, 'gamma': 0.05}
{'nu': 0.005, 'gamma': 0.1}
{'nu': 0.005, 'gamma': 0.5}
{'nu': 0.005, 'gamma': 0.6}
{'nu': 0.005, 'gamma': 0.7}
{'nu': 0.005, 'gamma': 0.75}
{'nu': 0.005, 'gamma': 0.8}
{'nu': 0.0075, 'gamma': 0.01}
{'nu': 0.0075, 'gamma': 0.05}
{'nu': 0.0075, 'gamma': 0.1}
{'nu': 0.0075, 'gamma': 0.5}
{'nu': 0.0075, 'gamma': 0.6}
{'nu': 0.0075, 'gamma': 0.7}
{'nu': 0.0075, 'gamma': 0.75}
{'nu': 0.0075, 'gamma': 0.8}
{'nu': 0.001, 'gamma': 0.8, 'f1': 0

So it looks like using scaled_df works the best

In [30]:
# Run Through Whole DS
train = get_sample(scaled_df, (50000,0))
test = scaled_df

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

In [31]:
# {'nu': 0.001, 'gamma': 0.75, 'f1': 0.5116279069767442, 'recall': 0.6055045871559633, 'precision': 0.4429530201342282}
best_params = {'nu': 0.001, 'gamma': 0.75}

ocsvm = OneClassSVM(**best_params)
ocsvm.fit(xtrain)
preds = ocsvm.predict(xtest)
ypred = np.where(preds <0, 1, 0)
print(metrics.f1_score(ytest, ypred))
print(metrics.recall_score(ytest, ypred))
print(metrics.precision_score(ytest, ypred))

0.45319335083114604
0.5264227642276422
0.3978494623655914


In [32]:
outlier_ftr_df['ocsvm'] = ypred

### KMeans

In [33]:
def eval_clusters(ytrue, labels, agg='sum'):
    """
    Score a clustering by how strongly the positive class concentrates in clusters.

    For each cluster the number of positives (sum of ``ytrue``) is multiplied
    by the cluster's positive rate (positives / cluster size). The per-cluster
    values are then combined into one score.

    Parameters
    ----------
    ytrue : array-like
        0/1 class label for each row.
    labels : array-like
        Cluster label for each row.
    agg : str
        'sum' adds the per-cluster values. Any other value averages them,
        which penalizes a larger number of clusters.

    Returns
    -------
    (vcounts, score)
        ``vcounts`` is a DataFrame indexed by cluster label with columns
        'sum' (positives in the cluster) and 'size' (rows in the cluster).
        ``score`` is the aggregated value.
    """
    assignments = pd.DataFrame({'Class':ytrue, 'cluster_label':labels})
    vcounts = assignments.groupby('cluster_label').Class.agg(['sum','size'])

    positives = vcounts['sum']
    positive_rate = positives / vcounts['size']
    weighted = positives * positive_rate

    if agg == 'sum':
        score = weighted.sum()
    else:
        score = weighted.mean()
    return vcounts, score


In [34]:
def km_grid(Ks, xtrain, xtest, ytest, agg_func='sum', verbose=False):
    """
    Try several K-means cluster counts and keep the one with the best eval_clusters score.

    Parameters
    ----------
    Ks : iterable of int
        Candidate numbers of clusters.
    xtrain : array-like
        Features the K-means model is fit on.
    xtest : array-like
        Features that are assigned to clusters for scoring.
    ytest : array-like
        0/1 class labels aligned with ``xtest``.
    agg_func : str
        Passed to ``eval_clusters`` as ``agg``.
    verbose : bool
        When true, print the score and per-cluster counts for every K.

    Returns
    -------
    dict
        Keys 'k', 'score' and 'vcounts' for the winning K. If no candidate
        scores above 0, the values are ``0, 0, None``.
    """
    top_k, top_score, top_counts = 0, 0, None
    for n_clusters in Ks:
        print('K=',n_clusters)
        # NOTE(review): n_jobs is accepted by the scikit-learn version this
        # notebook targets. Newer releases dropped it from KMeans — confirm
        # before upgrading.
        model = KMeans(n_clusters=n_clusters, n_init=5, n_jobs=-1, random_state=SEED)
        model.fit(xtrain)
        assigned = model.predict(xtest)
        counts, score = eval_clusters(ytest, assigned, agg=agg_func)
        if verbose:
            print('score=',score)
            print(counts)
            print()
        # Strict comparison: the first K to reach a given score is kept.
        if score > top_score:
            top_k, top_score, top_counts = n_clusters, score, counts

    return {'k': top_k, 'score': top_score, 'vcounts': top_counts}

In [35]:
# small training set
train = get_sample(scaled_df, 15000, seed = SEED)
test = scaled_df.copy()

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

best = km_grid([15,25,50,75,99,100], xtrain,xtest,ytest)
best

K= 15
K= 25
K= 50
K= 75
K= 99
K= 100


{'k': 99, 'score': 298.48608499090767, 'vcounts':                sum  size
 cluster_label           
 0                2  2877
 1                0  4194
 2                2  3166
 3                2  2915
 4                4  3896
 5                2  2638
 6                0  3506
 7                3  2676
 8                0  3320
 9                2   558
 10               0  2976
 11               0  3545
 12               1  2044
 13               0  1353
 14               0  3204
 15               2  2506
 16               2  3758
 17              38  2572
 18               2  4310
 19               0  1877
 20               0  3887
 21             237   270
 22               0  2583
 23               1  2567
 24               1   849
 25               0  3163
 26               7  1651
 27               0  2233
 28               0  3125
 29               0  2751
 ...            ...   ...
 69               0  3589
 70               0   532
 71               0  4425
 72            

In [36]:
# larger training set
train = get_sample(scaled_df, 50000, seed = SEED)
test = scaled_df.copy()

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

best = km_grid([10,15,25,50, 75, 99, 100], xtrain,xtest,ytest)
best

K= 10
K= 15
K= 25
K= 50
K= 75
K= 99
K= 100


{'k': 99, 'score': 326.75882809772486, 'vcounts':                sum  size
 cluster_label           
 0                2  1557
 1                0  4692
 2                0  2953
 3                1  2224
 4                1  3075
 5                0  4102
 6                1  2788
 7                3  3900
 8                1  2244
 9                1  4960
 10               1  2519
 11               2  3953
 12               1  6121
 13               0  2232
 14               0  2536
 15               0  1828
 16               1  6981
 17               2  3484
 18               0  3203
 19               9  2769
 20               0  6078
 21             107   134
 22               2  3692
 23               0  3610
 24               1  3591
 25               1  3437
 26              29  2439
 27               0  3270
 28               6  1101
 29               4  3211
 ...            ...   ...
 69               0  3018
 70               0   898
 71               0  3493
 72            

In [37]:
km = KMeans(n_clusters=best['k'], n_init=5, n_jobs=-1, random_state=SEED)
km.fit(xtrain)
labels = km.predict(xtest)
outlier_ftr_df['km_labels'] = labels

### Model Based Clustering

In [38]:
train = get_sample(scaled_df, 15000, seed = SEED)
test = scaled_df.copy()

xtrain = train[sub_cols]
xtest = test[sub_cols]
ytest = test.Class

In [39]:
def gmm_grid(Ks, xtrain, xtest, ytest, verbose=False, agg_func='sum'):
    """
    Try several Bayesian Gaussian mixture sizes and keep the one with the best eval_clusters score.

    Parameters
    ----------
    Ks : iterable of int
        Candidate numbers of mixture components.
    xtrain : array-like
        Features the mixture is fit on.
    xtest : array-like
        Features that are assigned to components for scoring.
    ytest : array-like
        0/1 class labels aligned with ``xtest``.
    verbose : bool
        When true, print the score and per-component counts for every K.
    agg_func : str
        Passed to ``eval_clusters`` as ``agg`` (mirrors ``km_grid``).
        The default 'sum' reproduces the previous behaviour.

    Returns
    -------
    dict
        Keys 'k', 'score', 'probs' and 'vcounts' for the winning K.
        'vcounts' is ``None`` when no candidate scores above 0.
        'probs' is never populated; it is kept only so the shape of the
        returned dict is unchanged for existing callers.
    """
    # 'vcounts' is initialised up front so the key always exists, matching km_grid.
    best = {'k': 0, 'score':0, 'probs':None, 'vcounts':None}
    for k in Ks:
        print('K=',k)
        mix = BayesGMM(n_components=k, n_init=5, covariance_type='full', random_state=SEED)
        mix.fit(xtrain)
        labels = mix.predict(xtest)
        vcounts, score = eval_clusters(ytest, labels, agg=agg_func)
        if verbose:
            print('score=',score)
            print(vcounts)
            print()
        # Strict comparison: the first K to reach a given score is kept.
        if score > best['score']:
            best['k'] = k
            best['score'] = score
            best['vcounts'] = vcounts
        
    return best

In [40]:
best = gmm_grid([5,10,15,25,50], xtrain,xtest, ytest)

K= 5
K= 10
K= 15
K= 25
K= 50


In [41]:
best

{'k': 15,
 'score': 326.91138561750154,
 'probs': None,
 'vcounts':                sum   size
 cluster_label            
 0                2   2271
 1                9   8739
 2                4  34418
 3                6   5747
 4                5  34534
 5                2   6934
 6                5  23881
 7                0  13377
 8              394    475
 9               14  12802
 10              37  23122
 11               2  23857
 12               5  76511
 13               6  10262
 14               1   7877}

In [42]:
# add to feature df
mix = BayesGMM(n_components=best['k'], n_init=5, covariance_type='full', random_state = SEED)
mix.fit(xtrain)
labels = mix.predict(xtest)
outlier_ftr_df['gmm_labels'] = labels

Write New Features To CSV

In [43]:
outlier_ftr_df.to_csv('Outlier_Ftrs.csv',index=False)