# Evaluation

In [1]:
from functools import partial

import h5py
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from torch.utils.data import DataLoader, TensorDataset

import convnets

In [2]:
# Size of the last axis that every spectrum array in this notebook is
# reshaped to (see the `.reshape(-1, 1, N_WAVES)` calls below).
N_WAVES = 2048

## Different Architectures of VGG Net on SDSS

- metrics on validation set of SDSS

In [3]:
# Load the validation split ("X_va" / "y_va") of the '2048_zwarning==0' group.
with h5py.File("sdss_dr14.hdf5", "r") as datafile:
    grp = datafile['2048_zwarning==0']
    # Reshaped to (-1, 1, N_WAVES) before being wrapped as a tensor;
    # presumably (samples, channel, wavelength) for a 1-D convnet -- TODO confirm.
    X_sdss_va = torch.from_numpy(grp["X_va"][:].reshape(-1, 1, N_WAVES))
    # Labels stay a NumPy array: they are passed straight to scikit-learn metrics.
    y_sdss_va = grp["y_va"][:]

In [4]:
def forward(model, X, dev=None, batch_size=2 ** 15):
    """Run ``model`` over ``X`` in batches and return sigmoid probabilities.

    Parameters
    ----------
    model : torch.nn.Module
        Network expected to emit one logit per sample. It is moved to
        ``dev`` and switched to evaluation mode.
    X : torch.Tensor
        Input samples, batched along the first dimension.
    dev : torch.device, optional
        Device used for inference. When omitted, CUDA is used if it is
        available and the CPU otherwise, so the notebook also runs on
        machines without a GPU.
    batch_size : int, optional
        Number of samples per forward pass (default ``2 ** 15``).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``X.size(0)`` holding
        ``sigmoid(model(x))`` for every sample, in input order.
    """
    if dev is None:
        dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(dev)
    model.eval()
    dl = DataLoader(TensorDataset(X), batch_size=batch_size)
    # Logits are gathered on the CPU so only one batch lives on `dev` at a time.
    outputs = torch.zeros(X.size(0))
    start = 0
    with torch.no_grad():
        for [xb] in dl:
            # Advance by the real batch length; the final batch may be shorter.
            end = start + xb.size(0)
            # Flatten (batch, 1) logits to (batch,) so they fit the slice.
            outputs[start:end] = model(xb.to(dev)).cpu().reshape(-1)
            start = end
    probas_pred = torch.sigmoid(outputs).numpy()
    return probas_pred

In [5]:
def evaluate(y_true, y_prob):
    """Summarise binary classification quality as a dict of metrics.

    Probabilities strictly above 0.5 count as positive predictions for
    accuracy, F1, precision and recall; ROC AUC is computed from the raw
    probabilities.
    """
    hard_labels = y_prob > 0.5
    thresholded_scorers = (
        ("accuracy", metrics.accuracy_score),
        ("f1_score", metrics.f1_score),
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
    )
    scores = {}
    for key, scorer in thresholded_scorers:
        scores[key] = scorer(y_true, hard_labels)
    # ROC AUC is threshold-free, so it receives the probabilities themselves.
    scores["roc_auc"] = metrics.roc_auc_score(y_true, y_prob)
    return scores

In [6]:
# Score every architecture listed in convnets.CONVNETS on the SDSS validation
# split. Rows are collected first and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole frame on every iteration.
metric_names = []
metric_rows = []
for convnet in convnets.CONVNETS:
    model = convnets.get_convnet(convnet)
    model.load_state_dict(torch.load("models/{}.pt".format(convnet)))
    metric_names.append(convnet)
    metric_rows.append(evaluate(y_sdss_va, forward(model, X_sdss_va)))
# One row per architecture, indexed by its name; columns are the metric keys.
sdss_metrics = pd.DataFrame(metric_rows, index=metric_names)
sdss_metrics.sort_values("f1_score", ascending=False)

Unnamed: 0,accuracy,f1_score,precision,recall,roc_auc
VGG-Net-A,0.9871,0.958071,0.927035,0.991256,0.995796
VGG-Net-B,0.98708,0.958036,0.926382,0.991929,0.996127
VGG-Net-E,0.987,0.957682,0.927959,0.989373,0.995838
VGG-Net-D,0.9869,0.957371,0.927374,0.989373,0.995776
VGG-Net-5,0.9869,0.957371,0.927374,0.989373,0.996394
VGG-Net-4,0.98638,0.955719,0.924984,0.988566,0.996411
VGG-Net-3,0.98488,0.950614,0.924054,0.978746,0.99603
VGG-Net-1,0.98464,0.949817,0.923507,0.97767,0.995432
VGG-Net-0,0.98332,0.945802,0.914886,0.978881,0.994245
VGG-Net-2,0.98308,0.944612,0.920153,0.970406,0.995417


## VGG Net-A on SDSS

- evaluation on test set of SDSS
- correct precision
- candidates from all filtered data of SDSS

In [7]:
# Load the held-out test split ("X_te" / "y_te") of the same SDSS group.
with h5py.File("sdss_dr14.hdf5", "r") as datafile:
    grp = datafile['2048_zwarning==0']
    X_sdss_te = torch.from_numpy(grp["X_te"][:].reshape(-1, 1, N_WAVES))
    y_sdss_te = grp["y_te"][:]

# Restore the VGG-Net-A weights and score them on the test split.
model_sdss = convnets.get_convnet("VGG-Net-A")
model_sdss.load_state_dict(torch.load("models/VGG-Net-A.pt"))
y_sdss_prob_te = forward(model_sdss, X_sdss_te)
evaluate(y_sdss_te, y_sdss_prob_te)

{'accuracy': 0.98651,
 'f1_score': 0.9549748005740796,
 'precision': 0.9221348459455975,
 'recall': 0.9902401882743822,
 'roc_auc': 0.9959838932114625}

In [8]:
# Confusion matrix at the same 0.5 threshold used in evaluate();
# scikit-learn puts true classes on rows and predicted classes on columns.
metrics.confusion_matrix(y_sdss_te, y_sdss_prob_te > 0.5)

array([[84345,  1208],
       [  141, 14306]])

In [9]:
# TODO correct precision: discuss with Petr first

In [10]:
# Read the train, validation and test splits (spectra, identifiers, labels)
# so that the model can be applied to the whole group at once.
with h5py.File("sdss_dr14.hdf5", "r") as datafile:
    grp = datafile["2048_zwarning==0"]
    X_tr = grp["X_tr"][:].reshape(-1, 1, N_WAVES)
    X_va = grp["X_va"][:].reshape(-1, 1, N_WAVES)
    X_te = grp["X_te"][:].reshape(-1, 1, N_WAVES)
    id_tr, id_va, id_te = grp["id_tr"][:], grp["id_va"][:], grp["id_te"][:]
    y_tr, y_va, y_te = grp["y_tr"][:], grp["y_va"][:], grp["y_te"][:]

# Concatenate in the same (train, validation, test) order for all three
# arrays so that row i of X_sdss, id_sdss and y_sdss refer to the same spectrum.
X_sdss = torch.from_numpy(np.concatenate((X_tr, X_va, X_te)))
id_sdss = np.concatenate((id_tr, id_va, id_te))
y_sdss = np.concatenate((y_tr, y_va, y_te))

In [11]:
# Apply the SDSS-trained model to all three splits combined.
y_sdss_prob = forward(model_sdss, X_sdss)
# Boolean mask of spectra whose predicted probability exceeds 0.5.
y_sdss_pred = y_sdss_prob > 0.5
# Number of spectra predicted positive.
y_sdss_pred.sum()

643356

In [12]:
# Confusion matrix over train + validation + test combined
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_sdss, y_sdss_pred)

array([[3491811,   49111],
       [   5348,  594245]])

In [13]:
# Identifiers of every spectrum predicted positive by the SDSS model.
id_sdss[y_sdss_pred]

array([(8189, 57448, 341), (8747, 57401, 644), (8290, 57364, 989), ...,
       ( 440, 51885, 511), (7300, 56707,  89), (6060, 56074, 569)],
      dtype=[('plate', '<i4'), ('mjd', '<i4'), ('fiberid', '<i4')])

In [14]:
# CSV writer shared by the export cells below: integer formatting, comma
# delimiter, and a header row; comments="" removes NumPy's default "# "
# prefix from that header line.
savecsv = partial(np.savetxt, fmt="%i", delimiter=',', header="plate,mjd,fiberid", comments="")
# Export the identifiers of all spectra predicted positive by the SDSS model.
savecsv("pred/sdss_pred.csv", id_sdss[y_sdss_pred])

In [15]:
# Candidates are the false positives: labelled negative but predicted
# positive by the SDSS model.
sdss_candidates = id_sdss[(y_sdss == False) & (y_sdss_pred == True)]
savecsv("pred/sdss_false_positive.csv", sdss_candidates)

## Training on LAMOST

- evaluation on test set of LAMOST

In [16]:
# Load the LAMOST test split ("X_te" / "y_te") from the "2048_nofilter" group.
with h5py.File("lamost_dr5.hdf5", "r") as datafile:
    grp = datafile["2048_nofilter"]
    X_lamost_te = torch.from_numpy(grp["X_te"][:].reshape(-1, 1, N_WAVES))
    y_lamost_te = grp["y_te"][:]

In [17]:
# Same VGG-Net-A architecture, restored from the "models/lamost.pt" weights
# and scored on the LAMOST test split.
model_lamost = convnets.get_convnet("VGG-Net-A")
model_lamost.load_state_dict(torch.load("models/lamost.pt"))
y_lamost_prob_te = forward(model_lamost, X_lamost_te)
evaluate(y_lamost_te, y_lamost_prob_te)

{'accuracy': 0.99692,
 'f1_score': 0.5925925925925927,
 'precision': 0.5586034912718204,
 'recall': 0.6309859154929578,
 'roc_auc': 0.9960557726407621}

In [18]:
# Confusion matrix on the LAMOST test split at the 0.5 threshold
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_lamost_te, y_lamost_prob_te > 0.5)

array([[99468,   177],
       [  131,   224]])

## Transfer Learning from SDSS to LAMOST

- evaluation on test set of SDSS
- correct precision
- candidates from all filtered data of SDSS

In [19]:
# VGG-Net-A restored from the "models/sdss_transfer.pt" weights and scored on
# the SDSS test split.
model_tl = convnets.get_convnet("VGG-Net-A")
model_tl.load_state_dict(torch.load("models/sdss_transfer.pt"))
# NOTE(review): this rebinds y_sdss_prob_te, which until now held the
# predictions of model_sdss; cells re-run out of order will pick up whichever
# model wrote the name last. Consider a dedicated name for the transfer model.
y_sdss_prob_te = forward(model_tl, X_sdss_te)
evaluate(y_sdss_te, y_sdss_prob_te)

{'accuracy': 0.98607,
 'f1_score': 0.9534844892643671,
 'precision': 0.9210967741935484,
 'recall': 0.9882328511109573,
 'roc_auc': 0.9955160373082799}

In [20]:
# Confusion matrix for the probabilities currently bound to y_sdss_prob_te
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_sdss_te, y_sdss_prob_te > 0.5)

array([[84330,  1223],
       [  170, 14277]])

In [21]:
# Apply the transfer-learning model to all SDSS splits combined.
y_tl_prob = forward(model_tl, X_sdss)
# Boolean mask of spectra whose predicted probability exceeds 0.5.
y_tl_pred = y_tl_prob > 0.5
# Number of spectra predicted positive.
y_tl_pred.sum()

642550

In [22]:
# Confusion matrix of the transfer-learning model over all SDSS splits
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_sdss, y_tl_pred)

array([[3491193,   49729],
       [   6772,  592821]])

In [23]:
# TODO correct precision: discuss with Petr first

In [24]:
# Export the identifiers of all spectra predicted positive by the
# transfer-learning model.
savecsv("pred/tl_pred.csv", id_sdss[y_tl_pred])

In [25]:
# Transfer-learning candidates: labelled negative but predicted positive.
tl_candidates = id_sdss[(y_sdss == False) & (y_tl_pred == True)]
savecsv("pred/tl_false_positive.csv", tl_candidates)

## Benefits of Transfer Learning

- comparison of SDSS and transfer learning candidates

In [26]:
# np.isin flags transfer-learning candidates that also appear among the SDSS
# model's candidates; negating it keeps those found only by transfer learning.
only_tl_idx = ~np.isin(tl_candidates, sdss_candidates)
# Count of candidates unique to the transfer-learning model.
np.sum(only_tl_idx)

3019

In [27]:
# Export the candidates that only the transfer-learning model proposes.
only_tl_candidates = tl_candidates[only_tl_idx]
savecsv("pred/only_tl_false_positive.csv", only_tl_candidates)