In [1]:
import os
import numpy as np
np.random.seed(69)
import pandas as pd
import random
import pickle as pkl
import matplotlib.pyplot as plt
import matplotlib.image as img
import seaborn as sns
sns.set()
import tensorflow as tf
from tqdm.notebook import tqdm
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,concatenate, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, ZeroPadding2D, LeakyReLU, ReLU, AveragePooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# import kerastuner as kt
# from kerastuner import HyperModel
import time

## Make X stuff

In [2]:
loc = "fits_files/"
df = pd.read_csv("SDSS_Query_allconds_DF.csv",index_col=0)

In [3]:
os.chdir(loc)

In [4]:
all_files = os.listdir()
random.shuffle(all_files)

In [5]:
X = []
y = []
objlist = []

for fn in tqdm(all_files):
    objid = int(fn.split(".")[0])
    curclass = df.loc[objid]["class"]
    y.append(curclass)
    objlist.append(objid)
    arr = np.load(fn)
    X.append(arr)

X = np.array(X)
y = np.array(y)
objlist = np.array(objlist)

  0%|          | 0/398 [00:00<?, ?it/s]

In [6]:
print(f"Shape of X is {X.shape}")

Shape of X is (398, 32, 32, 5)


## DL stuff

In [7]:
photodf = df.loc[:,['dered_u', 'deVRad_u', 'psffwhm_u', 'extinction_u',
       'dered_g', 'deVRad_g', 'psffwhm_g', 'extinction_g', 'dered_r',
       'deVRad_r', 'psffwhm_r', 'extinction_r', 'dered_i', 'deVRad_i',
       'psffwhm_i', 'extinction_i', 'dered_z', 'deVRad_z', 'psffwhm_z',
       'extinction_z', 'u_g', 'g_r', 'r_i', 'i_z']]

dnnx=[]
dnny=[]
for i,objnum in tqdm(enumerate(objlist),total=len(objlist)):
    dnny.append(df.loc[objnum,"class"])
    dnnx.append(photodf.loc[objnum].values)
dnny=np.array(dnny)
dnnx=np.array(dnnx)

  0%|          | 0/398 [00:00<?, ?it/s]

In [8]:
newinpsdf = pd.DataFrame(df.loc[:,["extinction_u","extinction_g","extinction_r","extinction_i","extinction_z"]].mean(axis=1),columns = ["mean_extinction"])
newinps = []
for i,objnum in tqdm(enumerate(objlist),total=len(objlist)):
    newinps.append(newinpsdf.loc[objnum].values)
newinps=np.array(newinps)

  0%|          | 0/398 [00:00<?, ?it/s]

In [9]:
del(df,newinpsdf,photodf)

In [10]:
# # DROP QSO

# idx_drop = np.where(dnny=="QSO")[0]
# X = np.delete(X,idx_drop,axis=0)
# dnnx = np.delete(dnnx,idx_drop,axis=0)
# newinps = np.delete(newinps,idx_drop,axis=0)
# y = np.delete(y,idx_drop,axis=0)
# objlist = np.delete(objlist,idx_drop,axis=0)

# del(dnny)

In [11]:
# # DROP repeats

# rep_idx = []

# for i,test2obj in enumerate(objlist):
#     if test2obj in objlist_train or test2obj in objlist_val:
#         rep_idx.append(i)

# X = np.delete(X,rep_idx,axis=0)
# dnnx = np.delete(dnnx,rep_idx,axis=0)
# newinps = np.delete(newinps,rep_idx,axis=0)
# y = np.delete(y,rep_idx,axis=0)
# objlist = np.delete(objlist,rep_idx,axis=0)


In [12]:
pd.Series(y).value_counts()

STAR      199
GALAXY    199
dtype: int64

In [13]:
y, label_strings = pd.factorize(y,sort=True)
y = to_categorical(y)
print(label_strings)

['GALAXY' 'STAR']


In [14]:
def get_metrics(y_pred, y_test, labels, to_print=True):
    correct_labels = np.where(y_pred==y_test)[0]
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred,average='macro')
    recall = metrics.recall_score(y_test, y_pred,average='macro')
    f1score = metrics.f1_score(y_test, y_pred,average='macro')
    # rocscore = metrics.roc_auc_score(y_test, y_pred,average='micro',multi_class="ovo")
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred)  
    classification_report = metrics.classification_report(y_test, y_pred)

    if to_print:
        print("Identified {} correct labels out of {} labels".format(len(correct_labels), y_test.shape[0]))
        print("Accuracy:",accuracy)
        print("Precision:",precision)
        print("Recall:",recall)
        print("F1 Score:",f1score)
        # print("ROC AUC Score:",rocscore)
        print(f"Labels are: {labels}")
        print("Confusion Matrix:\n", confusion_matrix)
        print("Classification_Report:\n", classification_report)

    return (correct_labels, accuracy, precision, recall, confusion_matrix, classification_report)

def plot_model_change(history,fname="time.pdf"):
    # summarize history for accuracy
    plt.plot(history.history['accuracy'],label="Training Acc")
    plt.plot(history.history['val_accuracy'],label="Val Acc")
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'],label="Training Loss")
    plt.plot(history.history['val_loss'],label="Val Loss")
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.savefig(fname)
    plt.show()


In [15]:
# model = load_model("../../EnsembleClassifier.h5")
model = load_model("../../star-galaxy EnsembleClassifier.h5")

In [16]:
model.evaluate([X,newinps, dnnx],y)



[0.3283045291900635, 0.9020100235939026]

In [17]:
preds = model.predict([X,newinps, dnnx],batch_size=512, verbose = 0)
print(get_metrics(preds.argmax(axis=1), y.argmax(axis=1),label_strings))

Identified 359 correct labels out of 398 labels
Accuracy: 0.9020100502512562
Precision: 0.913377648525135
Recall: 0.9020100502512562
F1 Score: 0.9013317229761942
Labels are: ['GALAXY' 'STAR']
Confusion Matrix:
 [[196   3]
 [ 36 163]]
Classification_Report:
               precision    recall  f1-score   support

           0       0.84      0.98      0.91       199
           1       0.98      0.82      0.89       199

    accuracy                           0.90       398
   macro avg       0.91      0.90      0.90       398
weighted avg       0.91      0.90      0.90       398

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,
        14,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  31,  32,  33,  35,  36,  37,  38,  39,  40,  41,  42,
        43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
        56,  58,  59,  60,  61,  62,  64,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  79,  8

In [18]:
# paperobjlist = np.load("../../paperobjlist.npy")
# objlist_test = np.load("../../objlist_test.npy")

# # paperobjlist -> All TRAIN+VAL+TEST1 OBJS
# # objlist_test -> TEST 1 OBJS
# # objlist -> TEST 2 OBJS


# # TEST2 OBJS NOT in TEST1
# print("Already pretrained/valed are:")
# len(np.intersect1d(objlist,paperobjlist)) - len(np.intersect1d(objlist,objlist_test))

In [19]:
dnnmodel = load_model("../../star-galaxy DNNClassifier.h5")

In [20]:
dnnmodel.evaluate(dnnx,y)



[0.276912659406662, 0.8894472122192383]

In [21]:
dnnx.shape

(398, 24)