## This notebook finds data points that are frequently misclassified across several trained models.

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

from astropy.io import fits
from glob import glob

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer, confusion_matrix
from sklearn import warnings

from IPython.display import SVG
from keras.utils.vis_utils import plot_model

from keras import backend as K
from keras.models import Sequential
from keras.layers import Input,Dense, Activation, Flatten, Convolution1D, Dropout, MaxPooling1D, Conv2D, Conv1D, InputLayer, Dropout, MaxPooling2D, BatchNormalization
from keras.optimizers import SGD, Adam
from keras.callbacks import TensorBoard
from keras.constraints import Constraint
from keras.initializers import Initializer, glorot_uniform
from keras.utils import np_utils
from keras.layers import Concatenate
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model

import tensorflow as tf

from Modeling import CrossValidationModeler

In [4]:
def create_conv_flux_model(input_length=4639):
    """Build and compile a 1-D convolutional binary classifier.

    Architecture: two ``Conv1D`` layers (64 then 24 filters, kernel size 2,
    ReLU), default max pooling, flatten, two dense ReLU layers (30 and 20
    units) each followed by 50% dropout, and a single sigmoid output unit.

    Parameters
    ----------
    input_length : int, optional
        Number of steps in each single-channel input sequence. Defaults to
        4639, the value that used to be hard-coded, so existing zero-argument
        calls build exactly the same network.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        accuracy as the reported metric.
    """
    model = Sequential()
    # Batch dimension is left open (None); each sample is (input_length, 1).
    model.add(InputLayer(batch_input_shape=(None, input_length, 1)))
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(Conv1D(filters=24, kernel_size=2, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(30, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(.5))
    # Single sigmoid unit: the output is a probability for the positive class.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [18]:
# Load every array from the eBOSS HDF5 file.
# The file is opened explicitly read-only: without a mode, older h5py releases
# could open it writable (or create it if missing). The context manager closes
# the handle even if one of the dataset reads raises.
with h5py.File('../../../AstroProj/Data/eboss_flux_full+.hdf5', 'r') as eboss:
    # np.array(...) copies each dataset into memory, so the arrays remain
    # usable after the file is closed.
    X, Y = np.array(eboss['flux_values']), np.array(eboss['flux_labels'])
    res_flux = np.array(eboss['res_flux_values'])
    ivar = np.array(eboss['ivar_flux_values'])
    ids = np.array(eboss['flux_ids'])

# Append a trailing length-1 axis to each feature array (the Conv1D model's
# input layer expects a final channel dimension of size 1).
res_flux = np.expand_dims(res_flux, -1)
ivar = np.expand_dims(ivar, -1)
X = np.expand_dims(X, -1)

In [7]:
# Train five separately built models, each on its own random train/validation
# split of `res_flux`.
mods = [create_conv_flux_model() for _ in range(5)]
for split_seed, mod in enumerate(mods):
    # A fixed per-model random_state makes every split reproducible on re-run
    # while still giving each model a different partition of the data.
    # NOTE: Keras weight initialisation is not seeded here, so the trained
    # weights (and the metrics below) can still vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        res_flux, Y, random_state=split_seed
    )
    mod.fit(X_train, y_train, epochs=10, batch_size=8, validation_data=(X_test, y_test), verbose=2)

Train on 210 samples, validate on 71 samples
Epoch 1/10
 - 7s - loss: 0.6745 - acc: 0.7714 - val_loss: 0.2316 - val_acc: 0.9014
Epoch 2/10
 - 4s - loss: 0.3448 - acc: 0.8000 - val_loss: 0.2049 - val_acc: 0.9718
Epoch 3/10
 - 4s - loss: 0.3116 - acc: 0.8524 - val_loss: 0.2129 - val_acc: 0.9014
Epoch 4/10
 - 4s - loss: 0.2852 - acc: 0.8429 - val_loss: 0.1427 - val_acc: 0.9718
Epoch 5/10
 - 4s - loss: 0.2444 - acc: 0.8667 - val_loss: 0.0982 - val_acc: 0.9859
Epoch 6/10
 - 4s - loss: 0.2082 - acc: 0.8905 - val_loss: 0.0729 - val_acc: 1.0000
Epoch 7/10
 - 4s - loss: 0.2405 - acc: 0.9095 - val_loss: 0.0688 - val_acc: 0.9718
Epoch 8/10
 - 4s - loss: 0.1414 - acc: 0.9333 - val_loss: 0.0483 - val_acc: 0.9859
Epoch 9/10
 - 4s - loss: 0.1581 - acc: 0.9286 - val_loss: 0.0477 - val_acc: 0.9859
Epoch 10/10
 - 4s - loss: 0.1656 - acc: 0.9000 - val_loss: 0.0438 - val_acc: 0.9859
Train on 210 samples, validate on 71 samples
Epoch 1/10
 - 5s - loss: 0.9580 - acc: 0.5762 - val_loss: 0.5039 - val_acc: 0.9

In [8]:
# Sanity check: number of models held in `mods`.
len(mods)

5

In [19]:
# Predict with the first model over the full `res_flux` array.
# One forward pass only: the class labels are derived from the probabilities
# with the same 0.5 threshold that `predict_classes` applies to a single
# sigmoid output, which also avoids the deprecated `predict_classes` API.
preds_probas = mods[0].predict(res_flux)
preds_classes = (preds_probas > 0.5).astype('int32')

In [33]:
# For every trained model, collect the IDs of the samples it misclassifies
# (evaluated over the full `res_flux` array) and print its confusion matrix.
# bad_sets[i] holds the misclassified IDs of mods[i].
bad_sets = [set() for _ in range(len(mods))]
for ix, (mod, bad_set) in enumerate(zip(mods, bad_sets)):
    print('Model Number: {}'.format(ix+1))
    # Run the network once per model and reuse the result everywhere below.
    # Thresholding at 0.5 reproduces what `predict_classes` returns for a
    # single sigmoid output, without the extra forward passes.
    preds_probas = mod.predict(res_flux)
    preds_classes = (preds_probas > 0.5).astype('int32')
    # `sample_id` rather than `id`, so the builtin `id` is not shadowed.
    for pred_class, pred_proba, label, sample_id in zip(preds_classes, preds_probas, Y, ids):
        if pred_class != label:
            # Each entry of `ids` is a length-1 array; store the bare value so
            # the set elements are hashable and comparable across models.
            bad_set.add(sample_id[0])
            print("Predicted {} when it was really {}. ID: {}, PROBA: {}".format(pred_class, label, sample_id, pred_proba))

    print(confusion_matrix(Y, preds_classes))
    print('-----------------------------------------------')

Model Number: 1
Predicted [1] when it was really [0.]. ID: [b'4029_55618_17_oneline'], PROBA: [0.7180245]
[[ 67   1]
 [  0 213]]
-----------------------------------------------
Model Number: 2
[[ 68   0]
 [  0 213]]
-----------------------------------------------
Model Number: 3
Predicted [1] when it was really [0.]. ID: [b'3586_55181_980_oneline'], PROBA: [0.6003894]
Predicted [1] when it was really [0.]. ID: [b'3589_55186_104_oneline'], PROBA: [0.54114014]
Predicted [1] when it was really [0.]. ID: [b'3590_55201_531_multiline'], PROBA: [0.5300186]
Predicted [1] when it was really [0.]. ID: [b'4276_55505_32_oneline'], PROBA: [0.5171944]
Predicted [1] when it was really [0.]. ID: [b'4767_55946_72_oneline'], PROBA: [0.59787476]
Predicted [1] when it was really [0.]. ID: [b'5131_55835_97_multiline'], PROBA: [0.6140185]
Predicted [1] when it was really [0.]. ID: [b'5131_55835_97_oneline'], PROBA: [0.6140185]
Predicted [1] when it was really [0.]. ID: [b'5292_55926_107_oneline'], PROBA: [0

In [43]:
# IDs misclassified by all three of the 3rd, 5th and 4th models.
bad_sets[2] & bad_sets[4] & bad_sets[3]

{b'3932_55337_331_oneline'}

In [39]:
# Union of every model's misclassified IDs: anything at least one model got
# wrong. A single variadic union avoids rebuilding the set on each iteration
# and does not leave a stray loop variable in the notebook namespace.
master_set = set().union(*bad_sets)

In [44]:
# Display every ID that was misclassified by at least one model.
master_set

{b'3586_55181_980_oneline',
 b'3588_55184_204_multiline',
 b'3589_55186_104_oneline',
 b'3589_55186_428_multiline',
 b'3590_55201_531_multiline',
 b'3677_55205_551_multiline',
 b'3677_55205_551_oneline',
 b'3932_55337_331_oneline',
 b'4029_55618_16_multiline',
 b'4029_55618_16_oneline',
 b'4029_55618_17_oneline',
 b'4276_55505_32_oneline',
 b'4767_55946_72_oneline',
 b'5131_55835_97_multiline',
 b'5131_55835_97_oneline',
 b'5292_55926_107_oneline',
 b'7295_57067_227_oneline'}