In [1]:
%load_ext autoreload
%autoreload 2

* [M31](#M31)
* [DEEP](#DEEP)
* [DISK](#DISK)

In [2]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import lru_cache

from zwad.ad.postprocess import *

from IPython.display import display, HTML
pd.set_option('display.max_rows', 2000)

In [3]:
data_dir = '../data/'


def fakes_report(table, fake_names):
    names = set(fake_names.values())
    count = sum(table['oid'].map(lambda s: isinstance(s, str)))
    found = [s for s in table['oid'].values if isinstance(s, str)]
    not_found = names - set(found)
    report = """
    {found_count} / {total} fakes are found

    Found
    -----
    {found}
    
    Not found
    ---------
    {not_found}
    """.format(
        found_count=len(found),
        total=len(names),
        found=', '.join(found),
        not_found=', '.join(not_found),
    )
    return report
    

def table_path(field_name, alco):
    filename = '{}_{}_fake.csv'.format(field_name, alco)
    return os.path.join(data_dir, filename)


@lru_cache()
def load_fake_names(field_name):
    fake_filename = 'fakes_{}_fake.csv'.format(field_name)
    df = pd.read_csv(os.path.join(data_dir, 'fakes', fake_filename))
    fake_names = {}
    for _, (oid, name) in df.iterrows():
        fake_names[oid] = name
    return fake_names


def get_tables(field_name, alcos):
    fake_names = load_fake_names(field_name)
    
    def apply_fake_names(table):
        table['oid'] = table['oid'].map(lambda oid: fake_names.get(oid, oid))
    
    tables = {}
    for alco in alcos:
        tables[alco] = load_ad_tables_by_patterns([table_path(field_name, alco)])
        apply_fake_names(tables[alco])

    combined_table = load_ad_tables_by_patterns([table_path(field_name, alco)
                                             for alco in alcos])
    apply_fake_names(combined_table)
    
    return tables, combined_table

# M31

In [4]:
FIELD = 'm31'
tables, combined_table = get_tables(FIELD, alcos=['iso', 'gmm', 'lof', 'svm'])

## Isolation Forest fakes

In [5]:
ALCO = 'iso'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    10 / 16 fakes are found

    Found
    -----
    step, ZTF18abhjrcf_format_r, Gaia16aye_3_format_r, ZTF18abaqxrt_format_r, MACHO-6.6696.60_format_B, ZTF18aaztjyd_format_r, Gaia16aye_format_r, Gaia16aye_2_format_r, MACHO-6.6696.60_format_R, ZTF18acskgwu_format_r
    
    Not found
    ---------
    OGLE-LMC-CEP-0227_format_V, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18aasszwr_format_r, flat, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,m31_iso_fake
0,step,-0.75309
1,ZTF18abhjrcf_format_r,-0.740498
2,Gaia16aye_3_format_r,-0.737781
3,695211400034403,-0.707322
4,695211400124577,-0.702643
5,695211400053697,-0.696436
6,695211400102351,-0.693752
7,695211400132963,-0.692504
8,ZTF18abaqxrt_format_r,-0.691396
9,695211400088968,-0.691137


## Gaussian Mixture Models fakes

In [6]:
ALCO = 'gmm'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    9 / 16 fakes are found

    Found
    -----
    step, Gaia16aye_3_format_r, MACHO-6.6696.60_format_R, ZTF18abhjrcf_format_r, ZTF18abaqxrt_format_r, Gaia16aye_format_r, MACHO-6.6696.60_format_B, Gaia16aye_2_format_r, flat
    
    Not found
    ---------
    OGLE-LMC-CEP-0227_format_V, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18aasszwr_format_r, ZTF18aaztjyd_format_r, ZTF18acskgwu_format_r, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,m31_gmm_fake
0,step,-1509.605536
1,695211400034403,-393.006754
2,695211200009221,-324.942218
3,695211400124577,-314.184142
4,Gaia16aye_3_format_r,-300.175128
5,695211400000352,-299.543117
6,695211400102351,-274.96699
7,695211200020939,-260.667099
8,695211400053697,-260.108951
9,695211200008801,-252.255062


## Local Outlier Factor fakes

In [7]:
ALCO = 'lof'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    12 / 16 fakes are found

    Found
    -----
    step, flat, MACHO-6.6696.60_format_R, flat_noise, Gaia16aye_2_format_r, MACHO-6.6696.60_format_B, Gaia16aye_3_format_r, ZTF18abaqxrt_format_r, ZTF18acskgwu_format_r, ZTF18abhjrcf_format_r, ZTF18aaztjyd_format_r, Gaia16aye_format_r
    
    Not found
    ---------
    ZTF18aasszwr_format_r, OGLE-LMC-CEP-0227_format_V, OGLE-LMC-CEP-0227_format_I, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,m31_lof_fake
0,step,-8.912194
1,flat,-7.551189
2,MACHO-6.6696.60_format_R,-4.858569
3,flat_noise,-4.466071
4,Gaia16aye_2_format_r,-4.022277
5,MACHO-6.6696.60_format_B,-3.876315
6,695211200009221,-3.545021
7,695211400034403,-3.505094
8,695211100002984,-3.195999
9,Gaia16aye_3_format_r,-3.186308


## One class Support Vector Machines fakes

In [8]:
ALCO = 'svm'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    12 / 16 fakes are found

    Found
    -----
    step, MACHO-6.6696.60_format_R, Gaia16aye_3_format_r, ZTF18abhjrcf_format_r, ZTF18abaqxrt_format_r, MACHO-6.6696.60_format_B, flat, Gaia16aye_2_format_r, Gaia16aye_format_r, ZTF18acskgwu_format_r, ZTF18aaztjyd_format_r, flat_noise
    
    Not found
    ---------
    ZTF18aasszwr_format_r, OGLE-LMC-CEP-0227_format_V, OGLE-LMC-CEP-0227_format_I, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,m31_svm_fake
0,step,1.0
1,MACHO-6.6696.60_format_R,1.0
2,Gaia16aye_3_format_r,1.0
3,ZTF18abhjrcf_format_r,1.0
4,ZTF18abaqxrt_format_r,1.0
5,MACHO-6.6696.60_format_B,1.0
6,flat,1.000038
7,Gaia16aye_2_format_r,1.000055
8,Gaia16aye_format_r,1.000098
9,695211200075348,1.000543


## Combined table

In [9]:
print(fakes_report(combined_table, load_fake_names(FIELD)))
display(combined_table)


    12 / 16 fakes are found

    Found
    -----
    step, ZTF18abhjrcf_format_r, Gaia16aye_3_format_r, ZTF18abaqxrt_format_r, MACHO-6.6696.60_format_B, Gaia16aye_format_r, Gaia16aye_2_format_r, MACHO-6.6696.60_format_R, ZTF18aaztjyd_format_r, ZTF18acskgwu_format_r, flat, flat_noise
    
    Not found
    ---------
    ZTF18aasszwr_format_r, OGLE-LMC-CEP-0227_format_V, OGLE-LMC-CEP-0227_format_I, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,m31_iso_fake,m31_gmm_fake,m31_lof_fake,m31_svm_fake
0,step,-0.75309,-1509.605536,-8.912194,1.0
1,ZTF18abhjrcf_format_r,-0.740498,-199.798629,-2.539371,1.0
2,Gaia16aye_3_format_r,-0.737781,-300.175128,-3.186308,1.0
3,695211400034403,-0.707322,-393.006754,-3.505094,1.00494
4,695211400124577,-0.702643,-314.184142,-2.995712,1.010084
5,695211400053697,-0.696436,-260.108951,-2.624966,1.016399
6,695211400102351,-0.693752,-274.96699,-2.749282,1.004267
7,ZTF18abaqxrt_format_r,-0.691396,-194.866011,-2.847367,1.0
8,695211400088968,-0.691137,-197.432146,-2.130518,1.357826
9,695211400028274,-0.689623,-223.583553,-2.262296,1.015475


# DEEP

In [10]:
FIELD = 'deep'
tables, combined_table = get_tables(FIELD, alcos=['iso', 'gmm', 'svm'])

## Isolation Forest fakes

In [11]:
ALCO = 'iso'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    7 / 16 fakes are found

    Found
    -----
    Gaia16aye_3_format_r, ZTF18abhjrcf_format_r, step, Gaia16aye_2_format_r, Gaia16aye_format_r, OGLE-LMC-CEP-0227_format_V, ZTF18acskgwu_format_r
    
    Not found
    ---------
    MACHO-6.6696.60_format_B, MACHO-6.6696.60_format_R, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18abaqxrt_format_r, ZTF18aasszwr_format_r, ZTF18aaztjyd_format_r, flat, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,deep_iso_fake
0,795203200009604,-0.793968
1,795205400022890,-0.786204
2,Gaia16aye_3_format_r,-0.785896
3,ZTF18abhjrcf_format_r,-0.784745
4,step,-0.783777
5,Gaia16aye_2_format_r,-0.778045
6,795211200035931,-0.776418
7,795205400027537,-0.772259
8,Gaia16aye_format_r,-0.769659
9,795209200003484,-0.764336


## Gaussian Mixture Models fakes

In [12]:
ALCO = 'gmm'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    8 / 16 fakes are found

    Found
    -----
    step, Gaia16aye_2_format_r, Gaia16aye_format_r, Gaia16aye_3_format_r, OGLE-LMC-CEP-0227_format_V, ZTF18abaqxrt_format_r, ZTF18abhjrcf_format_r, MACHO-6.6696.60_format_R
    
    Not found
    ---------
    MACHO-6.6696.60_format_B, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18aasszwr_format_r, ZTF18aaztjyd_format_r, ZTF18acskgwu_format_r, flat, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,deep_gmm_fake
0,795205400022890,-4064.041954
1,step,-1277.544351
2,795206400012603,-757.730547
3,Gaia16aye_2_format_r,-753.111612
4,Gaia16aye_format_r,-736.149904
5,795206400001319,-731.415392
6,795206400033829,-728.557332
7,795215300016556,-617.750242
8,Gaia16aye_3_format_r,-587.962372
9,795206400000500,-579.472321


## One class Support Vector Machines fakes

In [13]:
ALCO = 'svm'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    15 / 16 fakes are found

    Found
    -----
    step, Gaia16aye_2_format_r, Gaia16aye_3_format_r, Gaia16aye_format_r, MACHO-6.6696.60_format_R, OGLE-LMC-CEP-0227_format_V, ZTF18abaqxrt_format_r, MACHO-6.6696.60_format_B, flat, ZTF18abhjrcf_format_r, ZTF18acskgwu_format_r, ZTF18ablruzq_format_r, ZTF18aaztjyd_format_r, OGLE-LMC-CEP-0227_format_V, OGLE-LMC-CEP-0227_format_I
    
    Not found
    ---------
    flat_noise, ZTF18aasszwr_format_r
    


Unnamed: 0,oid,deep_svm_fake
0,step,1.0
1,795205400022890,1.0
2,Gaia16aye_2_format_r,1.0
3,Gaia16aye_3_format_r,1.0
4,Gaia16aye_format_r,1.0
5,MACHO-6.6696.60_format_R,1.0
6,795215300016556,1.0
7,795203200009604,1.0
8,OGLE-LMC-CEP-0227_format_V,1.0
9,ZTF18abaqxrt_format_r,1.0


## Combined table

In [14]:
print(fakes_report(combined_table, load_fake_names(FIELD)))
display(combined_table)


    15 / 16 fakes are found

    Found
    -----
    Gaia16aye_3_format_r, ZTF18abhjrcf_format_r, step, Gaia16aye_2_format_r, Gaia16aye_format_r, OGLE-LMC-CEP-0227_format_V, ZTF18acskgwu_format_r, ZTF18abaqxrt_format_r, MACHO-6.6696.60_format_R, MACHO-6.6696.60_format_B, flat, ZTF18ablruzq_format_r, ZTF18aaztjyd_format_r, OGLE-LMC-CEP-0227_format_V, OGLE-LMC-CEP-0227_format_I
    
    Not found
    ---------
    flat_noise, ZTF18aasszwr_format_r
    


Unnamed: 0,oid,deep_iso_fake,deep_gmm_fake,deep_svm_fake
0,795203200009604,-0.793968,-485.153878,1.0
1,795205400022890,-0.786204,-4064.041954,1.0
2,Gaia16aye_3_format_r,-0.785896,-587.962372,1.0
3,ZTF18abhjrcf_format_r,-0.784745,-310.590566,1.000005
4,step,-0.783777,-1277.544351,1.0
5,Gaia16aye_2_format_r,-0.778045,-753.111612,1.0
6,795205400027537,-0.772259,-253.215538,1.000001
7,Gaia16aye_format_r,-0.769659,-736.149904,1.0
8,795204200026512,-0.762649,-300.155435,1.017498
9,795211400021366,-0.761189,-243.214327,1.010791


# DISK

In [15]:
FIELD = 'disk'
tables, combined_table = get_tables(FIELD, alcos=['iso', 'gmm'])

## Isolation Forest fakes

In [16]:
ALCO = 'iso'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    3 / 16 fakes are found

    Found
    -----
    step, ZTF18abhjrcf_format_r, MACHO-6.6696.60_format_B
    
    Not found
    ---------
    Gaia16aye_2_format_r, Gaia16aye_format_r, OGLE-LMC-CEP-0227_format_I, Gaia16aye_3_format_r, ZTF18abaqxrt_format_r, ZTF18aaztjyd_format_r, flat, ZTF18ablruzq_format_r, OGLE-LMC-CEP-0227_format_V, MACHO-6.6696.60_format_R, flat_noise, ZTF18aasszwr_format_r, ZTF18acskgwu_format_r
    


Unnamed: 0,oid,disk_iso_fake
0,807208200059506,-0.8197
1,807206200014645,-0.818362
2,807209400037670,-0.817158
3,807206400014916,-0.814398
4,807206200004116,-0.813919
5,step,-0.813721
6,807210100028861,-0.813222
7,807202400045768,-0.812757
8,807211300006190,-0.812333
9,807202400056014,-0.811738


## Gaussian Mixture Models fakes

In [17]:
ALCO = 'gmm'
print(fakes_report(tables[ALCO], load_fake_names(FIELD)))
display(tables[ALCO])


    7 / 16 fakes are found

    Found
    -----
    step, Gaia16aye_3_format_r, Gaia16aye_format_r, ZTF18abhjrcf_format_r, OGLE-LMC-CEP-0227_format_V, ZTF18aaztjyd_format_r, MACHO-6.6696.60_format_B
    
    Not found
    ---------
    MACHO-6.6696.60_format_R, Gaia16aye_2_format_r, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18abaqxrt_format_r, ZTF18aasszwr_format_r, ZTF18acskgwu_format_r, flat, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,disk_gmm_fake
0,807205200017536,-23258.64811
1,807206400014916,-9770.701092
2,807209300037143,-8671.06845
3,807214100007080,-8286.324066
4,807209400037670,-5045.072637
5,807208200059506,-4576.861571
6,807210100028861,-4474.921391
7,step,-3974.107728
8,807203300044912,-3698.349759
9,807209300012026,-3675.598897


## Combined table

In [19]:
print(fakes_report(combined_table, load_fake_names(FIELD)))
display(combined_table)


    7 / 16 fakes are found

    Found
    -----
    step, ZTF18abhjrcf_format_r, MACHO-6.6696.60_format_B, Gaia16aye_3_format_r, Gaia16aye_format_r, OGLE-LMC-CEP-0227_format_V, ZTF18aaztjyd_format_r
    
    Not found
    ---------
    MACHO-6.6696.60_format_R, Gaia16aye_2_format_r, flat_noise, OGLE-LMC-CEP-0227_format_I, ZTF18abaqxrt_format_r, ZTF18aasszwr_format_r, ZTF18acskgwu_format_r, flat, ZTF18ablruzq_format_r
    


Unnamed: 0,oid,disk_iso_fake,disk_gmm_fake
0,807208200059506,-0.8197,-4576.861571
1,807206200014645,-0.818362,-2348.876639
2,807209400037670,-0.817158,-5045.072637
3,807206400014916,-0.814398,-9770.701092
4,step,-0.813721,-3974.107728
5,807210100028861,-0.813222,-4474.921391
6,807211300006190,-0.812333,-3259.270838
7,807216100038423,-0.811519,-1435.655117
8,807202300038681,-0.811007,-2720.749385
9,807208300016714,-0.810458,-3363.639584
