In [3]:
import os
import numpy as np
import pandas as pd
import pickle as pkl
import imageio
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from PIL import Image
import PIL
from astropy.nddata.utils import Cutout2D
from scipy import ndimage
import time
import matplotlib.pyplot as plt

In [2]:
# Define helper functions to normalize an array (e.g. an image or a feature column)
def normalize(arr):
    """Min-max scale ``arr`` so that its values span the interval [0, 1].

    Parameters
    ----------
    arr : array-like
        Numeric input; it is copied, the caller's data is never modified.

    Returns
    -------
    numpy.ndarray
        ``(arr - min) / (max - min)``. When every element is identical the
        range is zero, and an array of zeros is returned instead of the NaNs
        that a 0/0 division would produce.

    Raises
    ------
    ValueError
        If ``arr`` is empty (raised by ``np.min``).
    """
    arr = np.array(arr)
    shifted = arr - np.min(arr)
    span = np.max(shifted)
    # A constant input has span == 0; divide by 1 so the result is all zeros
    # rather than NaN (and no RuntimeWarning is emitted).
    scale = span if span != 0 else 1
    return shifted / scale

def gaussian_normalize(arr):
    """Standardise ``arr`` to zero mean and unit standard deviation.

    Parameters
    ----------
    arr : array-like
        Numeric input; it is copied, the caller's data is never modified.

    Returns
    -------
    numpy.ndarray
        ``(arr - mean) / std`` using the population standard deviation
        (``np.std`` default). When the standard deviation is exactly zero the
        centred values are returned unscaled, avoiding a division by zero
        that would fill the result with NaN.
    """
    arr = np.array(arr)
    centered = arr - np.mean(arr)
    spread = np.std(centered)
    # Zero spread means a constant input: skip the scaling step instead of
    # dividing by zero.
    scale = spread if spread != 0 else 1
    return centered / scale

# Bubble or no bubble

In [27]:
# Load the pickled feature table and build the binary "bubble / no bubble" data set.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as feature_file:  # context manager closes the handle
    df = pkl.load(feature_file)
# Discard events whose bubblecount is -1 and that have more than one blob peak.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Label is 1 when bubblecount > 0, otherwise 0 (counts of 0 and -1 are grouped together).
y = 1-(np.array(df['bubblecount'])<=0)
# Request a float array explicitly: if every selected column were integer-typed,
# writing the standardised values back in place would silently truncate them.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): each column is standardised with statistics of the full data set
# before the split, so the test rows influence the scaling.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# A fixed seed makes the 80/20 split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)


In [28]:
# Two-layer dense network: 3 input features -> 8 ReLU units -> 2 outputs (no activation).
# NOTE(review): the following cell builds this same architecture again before
# training, so this cell appears to be redundant.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
model.add(tf.keras.layers.Dense(2))

In [45]:
# Build, train and evaluate the binary (bubble / no bubble) classifier.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)),
    tf.keras.layers.Dense(2)  # no activation: the loss below consumes raw logits
])
# Every class is weighted 1; the commented alternative weights each class by
# len(y) / (number of samples in that class).
weights = {0:1, 1:1}
# weights = {0:len(y)/np.sum(y==0), 1:len(y)/np.sum(y==1)}
model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ["accuracy"])
model.fit(x_train, y_train, class_weight=weights, epochs = 20, verbose=2)
# evaluate() returns [loss, accuracy]; verbose=0 is the documented silent mode.
train_metrics = model.evaluate(x_train, y_train, verbose = 0)
print("\nTrain set:\tLoss: %.4f, Accuracy: %.4f"%tuple(train_metrics))
test_accuracy = model.evaluate(x_test, y_test, verbose = 0)
print("Test set:\tLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes was removed from recent TensorFlow releases; taking the
# argmax over the logits gives the same class indices and works on old and new versions.
y_pred = np.argmax(model.predict(x_test), axis=-1)
print(confusion_matrix(y_test,y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.2538 - acc: 0.9455
Epoch 2/20
18667/18667 - 0s - loss: 0.1652 - acc: 0.9540
Epoch 3/20
18667/18667 - 0s - loss: 0.1587 - acc: 0.9562
Epoch 4/20
18667/18667 - 0s - loss: 0.1547 - acc: 0.9577
Epoch 5/20
18667/18667 - 0s - loss: 0.1515 - acc: 0.9585
Epoch 6/20
18667/18667 - 0s - loss: 0.1489 - acc: 0.9592
Epoch 7/20
18667/18667 - 0s - loss: 0.1466 - acc: 0.9599
Epoch 8/20
18667/18667 - 0s - loss: 0.1450 - acc: 0.9602
Epoch 9/20
18667/18667 - 0s - loss: 0.1436 - acc: 0.9607
Epoch 10/20
18667/18667 - 0s - loss: 0.1427 - acc: 0.9609
Epoch 11/20
18667/18667 - 0s - loss: 0.1421 - acc: 0.9607
Epoch 12/20
18667/18667 - 0s - loss: 0.1415 - acc: 0.9608
Epoch 13/20
18667/18667 - 0s - loss: 0.1411 - acc: 0.9606
Epoch 14/20
18667/18667 - 0s - loss: 0.1408 - acc: 0.9613
Epoch 15/20
18667/18667 - 0s - loss: 0.1406 - acc: 0.9606
Epoch 16/20
18667/18667 - 0s - loss: 0.1404 - acc: 0.9608
Epoch 17/20
18667/18667 - 0s - loss: 0.1404 - acc: 0.9611
Epoch 18/20
18667/18667

# Number of bubbles

In [128]:
# Load the pickled feature table and build the "number of bubbles" data set.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as feature_file:  # context manager closes the handle
    df = pkl.load(feature_file)
# Discard events whose bubblecount is -1 and that have more than one blob peak.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
y = np.array(df['bubblecount'])
# Group the remaining -1 events with the 0-bubble class.
y = np.where(y >= 0, y, 0)
# Request a float array explicitly: if every selected column were integer-typed,
# writing the standardised values back in place would silently truncate them.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): each column is standardised with statistics of the full data set
# before the split, so the test rows influence the scaling.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# A fixed seed makes the 80/20 split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
# Class balance for bubble counts 0..4.
for i in range(5):
    print("Number of events with %d bubbles: %d"%(i,np.sum(y==i)))

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)
Number of events with 0 bubbles: 16055
Number of events with 1 bubbles: 7003
Number of events with 2 bubbles: 260
Number of events with 3 bubbles: 15
Number of events with 4 bubbles: 1


In [129]:
# Two-layer dense network: 3 input features -> 8 ReLU units -> 5 outputs (no activation),
# one output per bubble-count class 0..4.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
model.add(tf.keras.layers.Dense(5))

In [130]:
# Train the 5-class bubble-count classifier and report test-set metrics.
model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ["accuracy"])
model.fit(x_train, y_train, epochs = 20, verbose=2)
# evaluate() returns [loss, accuracy]; verbose=0 is the documented silent mode.
test_accuracy = model.evaluate(x_test, y_test, verbose = 0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes was removed from recent TensorFlow releases; taking the
# argmax over the logits gives the same class indices and works on old and new versions.
y_pred = np.argmax(model.predict(x_test), axis=-1)
# Pin the label set so the matrix is always 5x5, even when a rare class is
# missing from both the test labels and the predictions.
print(confusion_matrix(y_test, y_pred, labels=np.arange(5)))

Epoch 1/20
18667/18667 - 0s - loss: 0.7457 - acc: 0.8860
Epoch 2/20
18667/18667 - 0s - loss: 0.2624 - acc: 0.9373
Epoch 3/20
18667/18667 - 0s - loss: 0.2190 - acc: 0.9428
Epoch 4/20
18667/18667 - 0s - loss: 0.2046 - acc: 0.9455
Epoch 5/20
18667/18667 - 0s - loss: 0.1976 - acc: 0.9459
Epoch 6/20
18667/18667 - 0s - loss: 0.1936 - acc: 0.9465
Epoch 7/20
18667/18667 - 0s - loss: 0.1912 - acc: 0.9474
Epoch 8/20
18667/18667 - 0s - loss: 0.1896 - acc: 0.9479
Epoch 9/20
18667/18667 - 0s - loss: 0.1885 - acc: 0.9481
Epoch 10/20
18667/18667 - 0s - loss: 0.1874 - acc: 0.9480
Epoch 11/20
18667/18667 - 0s - loss: 0.1869 - acc: 0.9484
Epoch 12/20
18667/18667 - 0s - loss: 0.1862 - acc: 0.9484
Epoch 13/20
18667/18667 - 0s - loss: 0.1855 - acc: 0.9486
Epoch 14/20
18667/18667 - 0s - loss: 0.1853 - acc: 0.9488
Epoch 15/20
18667/18667 - 0s - loss: 0.1850 - acc: 0.9490
Epoch 16/20
18667/18667 - 0s - loss: 0.1843 - acc: 0.9489
Epoch 17/20
18667/18667 - 0s - loss: 0.1843 - acc: 0.9489
Epoch 18/20
18667/18667

# 0, 1, or 2+ bubbles

In [19]:
# Load the pickled feature table and build the "0, 1, or 2+ bubbles" data set.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as feature_file:  # context manager closes the handle
    df = pkl.load(feature_file)
# Discard events whose bubblecount is -1 and that have more than one blob peak.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
y = np.array(df['bubblecount'])
# Group the remaining -1 events with the 0-bubble class ...
y = np.where(y >= 0, y, 0)
# ... and merge every count of 2 or more into a single "2+" class.
y = np.where(y < 2, y, 2)
# Request a float array explicitly: if every selected column were integer-typed,
# writing the standardised values back in place would silently truncate them.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): each column is standardised with statistics of the full data set
# before the split, so the test rows influence the scaling.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# A fixed seed makes the 80/20 split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
# Class balance for the three grouped classes.
for i in range(3):
    print("Number of events with %d bubbles: %d"%(i,np.sum(y==i)))

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)
Number of events with 0 bubbles: 16055
Number of events with 1 bubbles: 7003
Number of events with 2 bubbles: 276


In [20]:
# Two-layer dense network: 3 input features -> 8 ReLU units -> 3 outputs (no activation).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
model.add(tf.keras.layers.Dense(3))
# Per-class weight = total number of samples / number of samples in that class.
weights = {label: len(y)/np.sum(y==label) for label in range(3)}

In [21]:
# Train the 3-class (0, 1, 2+ bubbles) classifier and report test-set metrics.
model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ["accuracy"])
# NOTE(review): the `weights` dictionary computed in the previous cell is not
# passed as class_weight here - confirm whether unweighted training is intended.
model.fit(x_train, y_train, epochs = 20, verbose=2)
# evaluate() returns [loss, accuracy]; verbose=0 is the documented silent mode.
test_accuracy = model.evaluate(x_test, y_test, verbose = 0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes was removed from recent TensorFlow releases; taking the
# argmax over the logits gives the same class indices and works on old and new versions.
y_pred = np.argmax(model.predict(x_test), axis=-1)
print(confusion_matrix(y_test,y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.5306 - acc: 0.8664
Epoch 2/20
18667/18667 - 0s - loss: 0.2517 - acc: 0.9333
Epoch 3/20
18667/18667 - 0s - loss: 0.2205 - acc: 0.9402
Epoch 4/20
18667/18667 - 0s - loss: 0.2055 - acc: 0.9442
Epoch 5/20
18667/18667 - 0s - loss: 0.1958 - acc: 0.9457
Epoch 6/20
18667/18667 - 0s - loss: 0.1893 - acc: 0.9477
Epoch 7/20
18667/18667 - 0s - loss: 0.1851 - acc: 0.9485
Epoch 8/20
18667/18667 - 0s - loss: 0.1824 - acc: 0.9493
Epoch 9/20
18667/18667 - 0s - loss: 0.1806 - acc: 0.9502
Epoch 10/20
18667/18667 - 0s - loss: 0.1796 - acc: 0.9503
Epoch 11/20
18667/18667 - 0s - loss: 0.1789 - acc: 0.9503
Epoch 12/20
18667/18667 - 0s - loss: 0.1784 - acc: 0.9506
Epoch 13/20
18667/18667 - 0s - loss: 0.1781 - acc: 0.9509
Epoch 14/20
18667/18667 - 0s - loss: 0.1776 - acc: 0.9510
Epoch 15/20
18667/18667 - 0s - loss: 0.1773 - acc: 0.9511
Epoch 16/20
18667/18667 - 0s - loss: 0.1771 - acc: 0.9509
Epoch 17/20
18667/18667 - 0s - loss: 0.1769 - acc: 0.9511
Epoch 18/20
18667/18667

# Cross-validation (probably not needed)

In [9]:
# 10-fold cross-validation of the binary (bubble / no bubble) classifier, 3 input features.
# import kfold package
from sklearn.model_selection import KFold
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as feature_file:  # context manager closes the handle
    df = pkl.load(feature_file)
# Discard events whose bubblecount is -1 and that have more than one blob peak.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Label is 1 when bubblecount > 0, otherwise 0.
y = 1-(np.array(df['bubblecount'])<=0)
# Request a float array explicitly: if every selected column were integer-typed,
# writing the standardised values back in place would silently truncate them.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): standardisation uses statistics of the full data set, so every
# held-out fold influences the scaling of its own training data.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)

kfold = KFold(n_splits=10, random_state = 10, shuffle = True)
cvscores = []
for train, test in kfold.split(x, y):
    # Drop the Keras state left by the previous fold's model so that memory use
    # does not grow with every model built in this loop.
    tf.keras.backend.clear_session()
    # A fresh, untrained model for every fold.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)),
        tf.keras.layers.Dense(2)
    ])
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                  metrics = ['accuracy'])
    model.fit(x[train], y[train], epochs = 20, verbose=0)
    # evaluate the model on the held-out fold; scores is [loss, accuracy]
    scores = model.evaluate(x[test], y[test], verbose = 0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracies.
print("Total accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

(23334, 3) (23334,)
acc: 96.83%
acc: 96.40%
acc: 96.19%
acc: 97.30%
acc: 95.16%
acc: 95.89%
acc: 95.71%
acc: 95.37%
acc: 97.00%
acc: 95.80%
Total accuracy: 96.16% (+/- 0.67%)


# 5 inputs

In [10]:
# Load the pickled feature table and build the binary data set with 5 input features.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as feature_file:  # context manager closes the handle
    df = pkl.load(feature_file)
# Discard events whose bubblecount is -1 and that have more than one blob peak.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Label is 1 when bubblecount > 0, otherwise 0 (counts of 0 and -1 are grouped together).
y = 1-(np.array(df['bubblecount'])<=0)
# Request a float array explicitly: if every selected column were integer-typed,
# writing the standardised values back in place would silently truncate them.
x = np.array(df[['edgefeature0', 'edgefeature1','edgefeature2','blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): each column is standardised with statistics of the full data set
# before the split, so the test rows influence the scaling.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# A fixed seed makes the 80/20 split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 5) (23334,)
(18667, 5) (4667, 5) (18667,) (4667,)


In [11]:
# Two-layer dense network: 5 input features -> 8 ReLU units -> 2 outputs (no activation).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(5,)))
model.add(tf.keras.layers.Dense(2))

In [12]:
# Train the 5-input binary classifier and report test-set metrics.
model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ["accuracy"])
model.fit(x_train, y_train, epochs = 20, verbose=2)
# evaluate() returns [loss, accuracy]; verbose=0 is the documented silent mode.
test_accuracy = model.evaluate(x_test, y_test, verbose = 0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes was removed from recent TensorFlow releases; taking the
# argmax over the logits gives the same class indices and works on old and new versions.
y_pred = np.argmax(model.predict(x_test), axis=-1)
print(confusion_matrix(y_test,y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.2617 - acc: 0.9214
Epoch 2/20
18667/18667 - 0s - loss: 0.1594 - acc: 0.9547
Epoch 3/20
18667/18667 - 0s - loss: 0.1507 - acc: 0.9581
Epoch 4/20
18667/18667 - 0s - loss: 0.1466 - acc: 0.9596
Epoch 5/20
18667/18667 - 0s - loss: 0.1431 - acc: 0.9606
Epoch 6/20
18667/18667 - 0s - loss: 0.1405 - acc: 0.9611
Epoch 7/20
18667/18667 - 0s - loss: 0.1384 - acc: 0.9613
Epoch 8/20
18667/18667 - 0s - loss: 0.1371 - acc: 0.9613
Epoch 9/20
18667/18667 - 0s - loss: 0.1359 - acc: 0.9618
Epoch 10/20
18667/18667 - 0s - loss: 0.1353 - acc: 0.9609
Epoch 11/20
18667/18667 - 0s - loss: 0.1347 - acc: 0.9614
Epoch 12/20
18667/18667 - 0s - loss: 0.1345 - acc: 0.9614
Epoch 13/20
18667/18667 - 0s - loss: 0.1341 - acc: 0.9615
Epoch 14/20
18667/18667 - 0s - loss: 0.1339 - acc: 0.9614
Epoch 15/20
18667/18667 - 0s - loss: 0.1337 - acc: 0.9618
Epoch 16/20
18667/18667 - 0s - loss: 0.1335 - acc: 0.9618
Epoch 17/20
18667/18667 - 0s - loss: 0.1333 - acc: 0.9618
Epoch 18/20
18667/18667

In [13]:
# 10-fold cross-validation of the binary classifier using the 5 input features
# held in `x` / `y` at this point in the notebook.
# import kfold package (repeated here so this cell does not depend on an earlier cell having run)
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, random_state = 10, shuffle = True)
cvscores = []
for train, test in kfold.split(x, y):
    # Drop the Keras state left by the previous fold's model so that memory use
    # does not grow with every model built in this loop.
    tf.keras.backend.clear_session()
    # A fresh, untrained model for every fold.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(5,)),
        tf.keras.layers.Dense(2)
    ])
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                  metrics = ['accuracy'])
    model.fit(x[train], y[train], epochs = 20, verbose=0)
    # evaluate the model on the held-out fold; scores is [loss, accuracy]
    scores = model.evaluate(x[test], y[test], verbose = 0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracies.
print("Total accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

acc: 96.83%
acc: 96.27%
acc: 96.32%
acc: 97.17%
acc: 95.29%
acc: 96.10%
acc: 95.76%
acc: 95.37%
acc: 96.96%
acc: 95.59%
Total accuracy: 96.16% (+/- 0.63%)
