In [1]:
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.models import abod,hbos,knn,mcd
from pyod.models.knn import KNN
from sklearn.utils import shuffle

In [3]:
train = pd.read_csv('./application_train.csv/application_train.csv')
train = train.drop(columns=['SK_ID_CURR'],axis=1)
train.shape

(307511, 121)

In [4]:
numerical_for_od = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3',
 'DAYS_EMPLOYED','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','FLOORSMAX_AVG','FLOORSMIN_AVG',
 'DAYS_REGISTRATION','DAYS_ID_PUBLISH','OWN_CAR_AGE', 'TARGET']

In [5]:
len(numerical_for_od)

13

In [6]:
for col in numerical_for_od:
    train[col].fillna(train[col].mean(),inplace=True)

In [12]:
example_df = pd.DataFrame({'a':[4,5,6,7], 'b':[6,0,-1,2]})

In [13]:
example_df

Unnamed: 0,a,b
0,4,6
1,5,0
2,6,-1
3,7,2


In [14]:
tres = pd.DataFrame(np.array([-1,1,1,-1]))

In [17]:
tres

Unnamed: 0,0
0,-1
1,1
2,1
3,-1


In [15]:
def dominus1(digit):
    if digit == -1:
        return 1
    else:
        return 0

In [18]:
tres = tres[0].apply(lambda digit:dominus1(digit) )

In [19]:
tres

0    1
1    0
2    0
3    1
Name: 0, dtype: int64

In [39]:
def main_function(df):
    def f1(df):
        df['outlier_for0'] = df.iloc[:,0]*10
        return df

    def f2(df):
        df['outlier_for1'] = df.iloc[:,1]-20
        return df
    f1(df)
    f2(df)

In [40]:
main_function(example_df)

In [41]:
example_df

Unnamed: 0,a,b,outlier_for0,outlier_for1
0,4,6,40,-14
1,5,0,50,-20
2,6,-1,60,-21
3,7,2,70,-18


In [42]:
example_df.shape

(4, 4)

In [None]:
example_df

In [22]:
def main_OD(df):
    def calc_percent(num_outliers,total):
        print("% outliers: {0}".format( (num_outliers*100)/total))
    
    def dominus1(digit):
        if digit == -1:
            return 1
        else:
            return 0
    def isolationforest_simple(df,num_samples2):
        num_samples = num_samples2
        df2 = df[numerical_for_od]    
        df2 = df2.iloc[:num_samples,:]
        df2 = df2.drop(columns = ['TARGET'])

        clf = IsolationForest(behaviour = 'new', max_features=12,max_samples=100, random_state = 1, 
                              contamination= 'auto', n_jobs=6,bootstrap=True)
        preds = clf.fit_predict(df2)
        preds = pd.DataFrame(preds)
        outliers_num = preds[0].value_counts()
        #preds = preds[0]
        #print(outliers_num)
        calc_percent(int(outliers_num[-1]), int(df2.shape[0]))
        col_name = 'isolation_forest' + str(num_samples2)
        df[col_name] = preds
        return df, preds
    


# isolation forest - simple sklearn

In [41]:
%%time
def dominus1(digit):
    if digit == -1:
        return 1
    else:
        return 0
    
def calc_percent(num_outliers,total):
    print("% outliers: {0}".format( (num_outliers*100)/total) )
    
def isolationforest_simple(df,num_samples2):
    num_samples = num_samples2
    df2 = df[numerical_for_od]    
    df2 = df2.iloc[:num_samples,:]
    df2 = df2.drop(columns = ['TARGET'])

    clf = IsolationForest(behaviour = 'new', max_features=9,max_samples=100, random_state = 1, 
                          contamination= 'auto', n_jobs=6)
    
    
    preds = clf.fit_predict(df2)
    preds = pd.DataFrame(preds)
    preds[0] = preds[0].apply(lambda digit: dominus1(digit))
    outliers_num = preds[0].value_counts()    

    calc_percent(int(outliers_num[1]), int(df2.shape[0]))
    col_name = 'isolation_forest_' + str(num_samples2)
    df[col_name] = preds
    return df
    
train = isolationforest_simple(train,300000)

% outliers: 12.396333333333333
Wall time: 32.9 s


In [42]:
train.sample(3)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,isolation_forest_300000
55208,0,Cash loans,F,N,Y,0,157500.0,284256.0,28111.5,270000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,7.0,0.0
299366,0,Cash loans,F,Y,Y,0,360000.0,1305000.0,38155.5,1305000.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0,0.0
262289,1,Cash loans,F,N,N,2,112500.0,450000.0,22018.5,450000.0,...,0,0,0,0.0,0.0,0.0,1.0,4.0,2.0,0.0


In [43]:
train['isolation_forest_300000'].value_counts()

0.0    262811
1.0     37189
Name: isolation_forest_300000, dtype: int64

# dbscan 

In [73]:
%%time
num_samples = 300000
df = train[numerical_for_od]    
df = df.iloc[:num_samples,:]
#df = df.drop(columns = ['TARGET'])
outlier_detection = DBSCAN(min_samples = 2000,n_jobs=6,algorithm='ball_tree')
#algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}
clusters = outlier_detection.fit_predict(df)
print(list(clusters).count(-1))

print(clusters)

300000
[-1 -1 -1 ... -1 -1 -1]
Wall time: 21.9 s


## dbscan works pretty badly

# abod simple predict

In [55]:
def abod_simple(df, num_samples2, neibors):
    print('abod...')
    num_samples = num_samples2
    df2 = df[numerical_for_od]
    df2 = df2.iloc[:num_samples,:]
    X = df2.drop(['TARGET'],axis=1)
    y = df2['TARGET']

    X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.3)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    clf_name = 'abod'
    abod_erjan = abod.ABOD(n_neighbors=neibors)
    abod_erjan.fit(X_train_norm)
    
    num_samples = 30000
    x2 = train[numerical_for_od]
    #x2 = x2.iloc[:num_samples, :]
    X = x2.drop(['TARGET'],axis=1)
    X_norm, _ = standardizer(X, X)
    print('x norm shape: {0}'.format(X_norm.shape))
#     res2 = pd.DataFrame(X_norm)
#     res2['outliers'] = abod_erjan.predict(X_norm)
#     ee = res2['outliers'].value_counts()
    
    
    col_name = 'abod_neighbors_' + str(neibors) + "_" + str(num_samples2)
    preds = pd.DataFrame(abod_erjan.predict(X_norm))
    ee = preds[0].value_counts()
    df[col_name] = preds
    calc_percent( int(ee[1]), int(num_samples))

    return df


In [56]:
%%time
res2 = abod_simple(train, 1000,5)

abod...


MemoryError: 

In [53]:
res2.shape

(307511, 123)

In [52]:
res2['abod_neighbors_5_10000'].value_counts()

0.0    28092
1.0     1908
Name: abod_neighbors_5_10000, dtype: int64

In [54]:
res2

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,isolation_forest_300000,abod_neighbors_5_10000
0,1,Cash loans,M,N,Y,0,202500.000,406597.5,24700.5,351000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0,Cash loans,F,N,N,0,270000.000,1293502.5,35698.5,1129500.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.000,135000.0,6750.0,135000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.000,312682.5,29686.5,297000.0,...,0,0,,,,,,,0.0,0.0
4,0,Cash loans,M,N,Y,0,121500.000,513000.0,21865.5,513000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,Cash loans,M,N,Y,0,99000.000,490495.5,27517.5,454500.0,...,0,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6,0,Cash loans,F,Y,Y,1,171000.000,1560726.0,41301.0,1395000.0,...,0,0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0
7,0,Cash loans,M,Y,Y,0,360000.000,1530000.0,42075.0,1530000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,Cash loans,F,N,Y,0,112500.000,1019610.0,33826.5,913500.0,...,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0,Revolving loans,M,N,Y,0,135000.000,405000.0,20250.0,405000.0,...,0,0,,,,,,,0.0,0.0


In [70]:
%%time
res2 = abod_simple(train, 10000,10)

% outliers: 6.216666666666667
Wall time: 1min 45s


In [71]:
%%time
abod_simple(train,10000,20)

% outliers: 6.8133333333333335
Wall time: 7min 11s


In [None]:
# %%time
# abod_simple(train,10000,30)

In [None]:
# %%time
# abod_simple(train,10000,40)

# HBOS Simple predict

In [74]:
def hbos_simple(df, num_samples2):
    print('hbos...')
    num_samples = num_samples2
    df2 = df[numerical_for_od]
    df2 = df2.iloc[:num_samples,:]
    X = df2.drop(['TARGET'],axis=1)
    y = df2['TARGET']

    X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    clf_name = 'HBOS'
    hbos_erjan = hbos.HBOS()
    hbos_erjan.fit(X_train_norm)
    
    num_samples = 3000
    x2 = df[numerical_for_od]
    x2 = x2.iloc[:num_samples, :]
    X = x2.drop(['TARGET'],axis=1)
    X_norm, _ = standardizer(X, X)

    res2 = pd.DataFrame(X)
    res2['outliers'] = hbos_erjan.predict(X_norm)
    ee = res2['outliers'].value_counts()
    
    calc_percent( int(ee[1]), int(num_samples))
    #return res2['outliers'].value_counts()

In [75]:
%%time
res2 = hbos_simple(train, 1000)

hbos...
% outliers: 10.966666666666667
Wall time: 6.8 s


In [78]:
%%time
hbos_simple(train,10000)

hbos...
% outliers: 10.433333333333334
Wall time: 414 ms


In [79]:
%%time
hbos_simple(train,100000)

hbos...
% outliers: 11.433333333333334
Wall time: 816 ms


In [14]:
%%time

num_samples = 100000
df = train[numerical_for_od]
df = df.iloc[:num_samples,:]
X = df.drop(['TARGET'],axis=1)
y = df['TARGET']

X_all = train[numerical_for_od]
X_all = X_all.iloc[:num_samples,:]
X_all = X_all.drop(['TARGET'],axis=1)


X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)
X_train_norm, X_test_norm = standardizer(X_train, X_test)
print('x all shape')
print(X_all.shape)
abod_erjan = abod.ABOD(n_neighbors=5)
abod_erjan.fit(X_train_norm)

res = pd.DataFrame(X_all)
res['all_outliers_300k'] = abod_erjan.predict(X_all)
res['all_outliers_300k'].value_counts()

x all shape
(100000, 12)
Wall time: 10min 43s


In [15]:
res['all_outliers_300k'].value_counts()

1    100000
Name: all_outliers_300k, dtype: int64

# knn predict

In [80]:
def knn_simple(df, num_samples2):
    print('knn...')
    num_samples = num_samples2
    df2 = df[numerical_for_od]
    df2 = df2.iloc[:num_samples,:]
    X = df2.drop(['TARGET'],axis=1)
    y = df2['TARGET']

    X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    clf_name = 'knn'
    knn_erjan = knn.KNN()
    knn_erjan.fit(X_train_norm)
    
    num_samples = 3000
    x2 = df[numerical_for_od]
    x2 = x2.iloc[:num_samples, :]
    X = x2.drop(['TARGET'],axis=1)
    X_norm, _ = standardizer(X, X)

    res2 = pd.DataFrame(X)
    res2['outliers'] = knn_erjan.predict(X_norm)
    ee = res2['outliers'].value_counts()
    
    calc_percent( int(ee[1]), int(num_samples))
    #return res2['outliers'].value_counts()
    

In [84]:
%%time
res2 = knn_simple(train, 1000)

knn...
% outliers: 9.166666666666666
Wall time: 1.58 s


In [85]:
%%time
res2 = knn_simple(train, 10000)

knn...
% outliers: 9.266666666666667
Wall time: 7.46 s


In [86]:
%%time
res2 = knn_simple(train, 100000)

knn...
% outliers: 17.233333333333334
Wall time: 7min 14s


In [None]:
%%time
res2 = knn_simple(train, 300000)

knn...


In [None]:
40000- 2min 15s
50000 - 3min 26s, 3min 11s
60000 - 4min13s
100000 - 12min 19s

In [22]:
y_by_average = average(test_scores_norm)
df_test = pd.DataFrame(X_test)
df_test['y_by_average_score'] = y_by_average
df_test['y_by_average_cluster'] = np.where(df_test['y_by_average_score']<0, 0, 1)
df_test['y_by_average_cluster'].value_counts()

0    11713
1     8287
Name: y_by_average_cluster, dtype: int64

In [23]:
y_by_maximization = maximization(test_scores_norm)
results = pd.DataFrame(X_test)
results['y_by_maximization_score'] = y_by_maximization
results['y_by_maximization_cluster'] = np.where(results['y_by_maximization_score']<0, 0, 1)
results['y_by_maximization_cluster'].value_counts()

0    11713
1     8287
Name: y_by_maximization_cluster, dtype: int64

In [24]:
tt = train[numerical_for_od] 
tt = df.drop(['TARGET'],axis=1)
res = clf.decision_function(tt)
res

KeyboardInterrupt: 

In [None]:
clf.decision_function()

In [32]:
%%time
num_samples = 10000

df2 = train[numerical_for_od]    
df2 = df2.sample(10000)
df2 = df2.iloc[:num_samples,:]
X_all = df2.drop(['TARGET'],axis=1)
X_test = X_all
X_train_norm, X_test_norm = standardizer(X_all, X_all)
#X_train_norm, X_test_norm = X_train, X_test
train_scores = clf.decision_scores_
test_scores = clf.decision_function(X_test_norm)
train_scores=np.reshape(train_scores, (-1,1))
test_scores=np.reshape(test_scores, (-1,1))

train_scores_norm , test_scores_norm = standardizer(train_scores, test_scores)
y_by_average = average(test_scores_norm)

print(test_scores.shape)
print('x test shape')
print(X_test.shape)

print('------')
print('x all shape')
print(X_all.shape)
res = pd.DataFrame(X_test)
print("y by avg")
print(y_by_average.shape)
res['y_by_average_score2'] = y_by_average
res['y_by_average_cluster2'] = np.where(res['y_by_average_score2']<0, 0, 1)
t_res = res['y_by_average_cluster2'].value_counts()

(10000, 1)
x test shape
(10000, 12)
------
x all shape
(10000, 12)
y by avg
(10000,)
Wall time: 2min 1s


In [33]:
t_res

1    5103
0    4897
Name: y_by_average_cluster2, dtype: int64

In [34]:
res

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_EMPLOYED,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,FLOORSMAX_AVG,FLOORSMIN_AVG,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,y_by_average_score2,y_by_average_cluster2
137991,0.502130,0.602336,0.510853,-5548,135000.0,240660.0,11835.0,0.166700,0.208300,-5561.0,-3604,12.061091,-0.548679,0
261088,0.502130,0.541384,0.510853,-811,90000.0,808650.0,26217.0,0.226282,0.231894,-7610.0,-366,12.061091,-0.288411,0
8828,0.502130,0.691294,0.646330,-2417,54000.0,71955.0,8667.0,0.166700,0.231894,-502.0,-3118,12.061091,-0.019433,0
160435,0.507894,0.738231,0.513694,-4560,112500.0,450000.0,16965.0,0.166700,0.208300,-5587.0,-4601,12.061091,-0.417628,0
14680,0.186060,0.062395,0.820383,-742,202500.0,888840.0,35815.5,0.000000,0.231894,-1805.0,-117,12.061091,1.300358,1
18732,0.502130,0.723462,0.617826,-1874,112500.0,508495.5,21541.5,0.226282,0.231894,-6080.0,-4120,4.000000,-0.242146,0
261546,0.502130,0.662489,0.358951,365243,112500.0,348264.0,17820.0,0.333300,0.231894,-8668.0,-4084,6.000000,0.127427,1
185864,0.216678,0.391696,0.641368,-2595,90000.0,180000.0,9000.0,0.226282,0.231894,-1137.0,-4641,12.061091,-0.084727,0
108580,0.801897,0.661681,0.598926,-1465,157500.0,450000.0,44640.0,0.000000,0.231894,-1880.0,-1140,12.061091,0.627285,1
268531,0.502130,0.437300,0.363945,-1159,135000.0,299250.0,8226.0,0.226282,0.231894,-496.0,-3185,12.000000,-0.441779,0


In [10]:
y_by_aom = aom(test_scores_norm)
df_test = pd.DataFrame(X_test)
df_test['y_by_aom_score'] = y_by_aom
df_test['y_by_aom_cluster'] = np.where(df_test['y_by_aom_score']<0, 0, 1)
df_test['y_by_aom_cluster'].value_counts()

ValueError: Lower bound > Higher bound

In [None]:
y_by_moa = moa(test_scores_norm,method='dynamic')
df_test = pd.DataFrame(X_test)
df_test['y_by_moa_score'] = y_by_moa
df_test['y_by_moa_cluster'] = np.where(df_test['y_by_moa_score']<0, 0, 1)
df_test['y_by_moa_cluster'].value_counts()

In [113]:
df_test['y_by_average_cluster'].value_counts()

1    2000
Name: y_by_average_cluster, dtype: int64

In [None]:
def std_knn_and_index(df, numerical_for_od, num_samples):
    df = df[numerical_for_od]    
    df = df.iloc[:num_samples,:]
    X = df.drop(['TARGET'],axis=1)
    y = df['TARGET']
    X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    clf_name = 'KNN'
    clf = KNN()    
    
    clf.fit(X_train

In [59]:
def std_knn_and_index(df, numerical_for_od, num_samples):
    df = df[numerical_for_od]    
    df = df.iloc[:num_samples,:]
    X = df.drop(['TARGET'],axis=1)
    y = df['TARGET']
    X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    clf_name = 'KNN'
    clf = KNN()    
    
    clf.fit(X_train_norm)
    test_scores = clf.decision_function(X_test_norm)
    #print(type(test_scores))
    #print(test_scores.shape)
    
    
    y_by_aom = aom(test_scores)

    results = pd.DataFrame(X_test)
    results['y_by_aom_score'] = y_by_aom
    results['y_by_aom_cluster'] = np.where(results['y_by_aom_score']<0, 0, 1)
    #results['y_by_aom_cluster'].value_counts()
    
    y_by_moa = moa(test_scores)

    results = pd.DataFrame(X_test)
    results['y_by_moa_score'] = y_by_moa
    results['y_by_moa_cluster'] = np.where(results['y_by_moa_score']<0, 0, 1)
    #results['y_by_moa_cluster'].value_counts()
    
    
    
    
    
    
#     y_by_aom = aom(test_scores, n_buckets=5)

#     results = pd.DataFrame(X_test)
#     results['y_by_aom_score'] = y_by_aom
#     results['y_by_aom_cluster'] = np.where(results['y_by_aom_score']<0, 0, 1)

    #X_train_norm, X_test_norm = standardizer(X_train, X_test)
    
#     print('x train type and shape')
#     print('{0} , {1}'.format(type(X_train), X_train.shape))
    
    
#     print('x train norm type and shape')
#     print('{0}, {1}'.format(type(X_train_norm), X_train_norm.shape))

    
#     print("x train shape")
#     print(X_train.shape)
#     print('x test shape')
#     print(X_test.shape)
    
#     print("x train_norm shape")
#     print(X_train_norm.shape)
#     print('x test_norm shape')
#     print(X_test_norm.shape)
    
    
    #return X_train, X_train_norm
#     train_scores = np.zeros(X_train.shape)
#     test_scores = np.zeros(X_test.shape)
    
#     print('*****')
#     print('train scores shape')
#     print(train_scores.shape)
    
#     clf.fit(X_train_norm)
#     #y_test_pred = clf.predict(X_test) # outlier labels (0 or 1)
    
#     train_scores = clf.decision_scores_
#     test_scores = clf.decision_function(X_test_norm)
#     #train_scores
#     train_scores_norm, test_scores_norm = standardizer(train_scores,test_scores)
    
#     print('train_scores_norm')
#     print(train_scores_norm.shape)
    
#     print('test scores norm')
#     print(test_scores_norm.shape)
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
#     y_by_average = average(test_scores_norm)
         
#     results = pd.DataFrame(X_test)
#     results['y_by_average_score'] = y_by_average
#     results['y_by_average_cluster'] = np.where(results['y_by_average_score']<0, 0, 1)
#     #results['y_by_average_cluster'].value_counts()
    
#     y_by_maximization = maximization(test_scores_norm)
    
#     results = pd.DataFrame(X_test)
#     results['y_by_maximization_score'] = y_by_maximization
#     results['y_by_maximization_cluster'] = np.where(results['y_by_maximization_score']<0, 0, 1)
#     #results['y_by_maximization_cluster'].value_counts()
    
#     y_by_aom = aom(test_scores_norm, n_buckets=5)

#     results = pd.DataFrame(X_test)
#     results['y_by_aom_score'] = y_by_aom
#     results['y_by_aom_cluster'] = np.where(results['y_by_aom_score']<0, 0, 1)
#     #results['y_by_aom_cluster'].value_counts()
    
#     y_by_moa = moa(test_scores_norm, n_buckets=5)

#     results = pd.DataFrame(X_test)
#     results['y_by_moa_score'] = y_by_moa
#     results['y_by_moa_cluster'] = np.where(results['y_by_moa_score']<0, 0, 1)
#     #results['y_by_moa_cluster'].value_counts()
#     return results

In [60]:
%%time
clusters = std_knn_and_index(train, numerical_for_od,100)

<class 'numpy.ndarray'>
(20,)


ValueError: Expected 2D array, got 1D array instead:
array=[162125.39460568  68149.83769088  64174.63578502  73741.79031642
  44420.25493914  74163.939393   105572.32604401  42247.84925308
 380472.50844384  47777.96738326  64318.4835804  115481.79162795
 227115.96754871 129878.45812955 376151.71255134  64693.04569462
  40417.08023963 139226.19616502 109822.6416233   45433.30941167].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [45]:
pd.DataFrame(clusters[0].head(10))

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_EMPLOYED,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,FLOORSMAX_AVG,FLOORSMIN_AVG,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE
65,0.286783,0.647348,0.56206,-579,180000.0,1256400.0,40657.5,0.3333,0.375,-6105.0,-1,12.061091
61,0.50213,0.026541,0.434733,-1324,180000.0,1080000.0,44118.0,0.1667,0.2083,-4557.0,-586,12.061091
63,0.299606,0.664544,0.49206,-3720,112500.0,95940.0,10462.5,0.226282,0.231894,-5246.0,-4541,12.061091
90,0.765154,0.25476,0.411849,-8862,193500.0,225000.0,23755.5,0.226282,0.231894,-4493.0,-3043,12.061091
62,0.50213,0.733051,0.234015,365243,324000.0,1130760.0,40189.5,0.6667,0.231894,-1042.0,-3967,10.0
76,0.50213,0.585174,0.408359,-4066,112500.0,135000.0,6750.0,0.226282,0.231894,-4623.0,-2975,12.061091
41,0.50213,0.037315,0.510853,-475,202500.0,604152.0,29196.0,0.226282,0.231894,-3148.0,-513,12.061091
43,0.50213,0.766138,0.684828,365243,108000.0,746280.0,42970.5,0.3333,0.375,-5745.0,-4576,12.061091
17,0.50213,0.683513,0.510853,-191,81000.0,270000.0,13500.0,0.226282,0.231894,-4143.0,-2427,12.061091
64,0.50213,0.584887,0.09507,-2546,180000.0,315000.0,9679.5,0.0417,0.0417,-5448.0,-4097,1.0


In [46]:
pd.DataFrame(clusters[1]).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.587567,0.628781,0.405242,-0.339999,0.115907,1.749841,1.232518,1.021078,1.57241,-0.661958,1.961149,-0.0687
1,0.066569,-2.465803,-0.274675,-0.34641,0.115907,1.279111,1.516036,-0.541644,-0.236623,-0.073073,1.544455,-0.0687
2,-1.48907,0.7145,0.031446,-0.367031,-0.557766,-1.346885,-1.241348,0.017239,0.019414,-0.33518,-1.272678,-0.0687
3,2.086928,-1.328182,-0.396878,-0.411283,0.250641,-1.002484,-0.152257,0.017239,0.019414,-0.048727,-0.205658,-0.0687
4,0.066569,1.055993,-1.346495,2.808296,1.553074,1.414566,1.194175,4.148399,0.019414,1.264091,-0.86382,-0.355676
5,0.066569,0.318859,-0.415513,-0.370008,-0.557766,-1.242652,-1.545512,0.017239,0.019414,-0.098181,-0.157222,-0.0687
6,0.066569,-2.412095,0.131798,-0.339104,0.340464,0.009296,0.293481,0.017239,0.019414,0.462934,1.596453,-0.0687
7,0.066569,1.220923,1.06081,2.808296,-0.602677,0.388569,1.422022,1.021078,1.57241,-0.525008,-1.297609,-0.0687
8,0.066569,0.809058,0.131798,-0.33666,-0.872146,-0.8824,-0.992487,0.017239,0.019414,0.084419,0.233117,-0.0687
9,0.066569,0.31743,-2.088449,-0.356927,0.115907,-0.762316,-1.305499,-1.714155,-2.044571,-0.412024,-0.956419,-1.608791


In [18]:
clusters

array([200], dtype=int64)

In [None]:
outliers_by_algos = clusters[ 
          (clusters['y_by_average_cluster'] == 1) 
          
          ]

In [None]:
outliers_by_algos

In [None]:
pd.DataFrame(clusters)

In [None]:
y_by_average = average(test_scores_norm)
plt.hist(y_by_average, bins='auto') # arguments are passed to np.histogram
plt.title("Combination by average")
plt.show()

# std ABOD with no params

In [None]:
def std_abod(df,num_samples):
    df = df.loc[:num_samples,: ]
    print("ABOD with num samples {}".format(num_samples))
    clf_name = 'ABOD'        
    clf = abod.ABOD()
    clf.fit(train)
    y_test_pred = clf.predict(df) # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(df)
    aa = pd.DataFrame(y_test_pred,columns=['outlier_yes_no'])
    results = aa['outlier_yes_no'].value_counts().values
    print('% outliers from total is: {0:.2f} '.format( (results[1]*100)/results[0]))



In [None]:
def abod_neighbors(df,num_samples, do_neighbors=False):
    df = df.loc[:num_samples,: ]
    print("ABOD with num samples {}".format(num_samples))
    clf_name = 'ABOD'
    if do_neighbors:
        for neighbor in [10,20,30,40,50]:
            print("num neighbors: {}".format(neighbor))
            clf = abod.ABOD(n_neighbors=neighbor)
            clf.fit(train)
            y_test_pred = clf.predict(df)
            y_test_scores = clf.decision_function(df)
            aa = pd.DataFrame(y_test_pred,columns=['outlier_yes_no'])
            results = aa['outlier_yes_no'].value_counts().values
            print('% outliers from total is: {0:.2f} '.format( (results[1]*100)/results[0]))
    else:
        print('running 1 time....')
        clf = abod.ABOD()
        clf.fit(train)
        y_test_pred = clf.predict(df) # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(df)
        aa = pd.DataFrame(y_test_pred,columns=['outlier_yes_no'])
        results = aa['outlier_yes_no'].value_counts().values
        print('% outliers from total is: {0:.2f} '.format( (results[1]*100)/results[0]))
        

In [None]:
%%time
std_abod(train, 1000)

In [None]:
%%time
std_abod(train, 10000)

In [None]:
%%time
std_abod(train, 100000,False)

In [None]:
%%time
std_abod(train, 300000)