In [2]:
import os
import numpy as np
import pandas as pd
import pickle as pkl
import imageio
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from PIL import Image
import PIL
from astropy.nddata.utils import Cutout2D
from scipy import ndimage
import time
import matplotlib.pyplot as plt

In [3]:
# define functions to normalize image
def normalize(arr):
    """Linearly rescale ``arr`` so that its values span the interval [0, 1].

    Parameters
    ----------
    arr : array_like
        Numeric data of any shape.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``arr``; its minimum maps to 0 and
        its maximum to 1.  When every element of ``arr`` is equal there is
        no range to stretch, so the result is all zeros instead of NaN.

    Raises
    ------
    ValueError
        If ``arr`` is empty (raised by ``np.min``).
    """
    shifted = np.array(arr)
    shifted = shifted - np.min(shifted)
    span = np.max(shifted)
    if span == 0:
        # Constant input: dividing by 1 keeps the zeros and avoids 0/0 -> NaN.
        span = 1
    return shifted / span

def gaussian_normalize(arr):
    """Standardise ``arr`` to zero mean and unit standard deviation.

    Parameters
    ----------
    arr : array_like
        Numeric data of any shape; the statistics are taken over all
        elements, not per axis.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``arr``, equal to
        ``(arr - mean) / std``.  When the standard deviation is exactly
        zero (constant input) the centred values are returned unscaled,
        i.e. all zeros, instead of NaN/inf.
    """
    centered = np.array(arr)
    centered = centered - np.mean(centered)
    sigma = np.std(centered)
    if sigma == 0:
        # Zero variance: dividing by 1 avoids a division by zero.
        sigma = 1
    return centered / sigma

# Bubble or no bubble (binary classification)

In [4]:
# Build the binary data set: the label is 0 where bubblecount <= 0 and 1 otherwise.
# NOTE: pickle.load can execute arbitrary code; only open feature files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as features_file:
    df = pkl.load(features_file)  # context manager closes the file handle
# Drop the rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').reset_index(drop=True)
y = 1-(np.array(df['bubblecount'])<=0)
# Force a float array so the normalised columns written back below cannot be
# truncated by an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): the scaling statistics are computed on all rows, i.e. before
# the train/test split, so test rows influence the normalisation.
for col in range(x.shape[1]):
    x[:,col] = gaussian_normalize(x[:,col])
print(x.shape, y.shape)
# Fixed seed so the split (and the reported metrics) are reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)


In [6]:
# Small fully connected network: 3 input features -> 8 ReLU units -> 2 outputs.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
# The output layer has no activation, so it emits raw scores (logits).
model.add(tf.keras.layers.Dense(2))

In [7]:
# Train on the training split, then report loss, accuracy and the confusion
# matrix on the held-out split.
model.compile(
    optimizer='adam',
    # from_logits=True: the loss applies the softmax itself, so the model is
    # expected to output raw scores.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(x_train, y_train, epochs=20, verbose=2)

# evaluate() returns [loss, accuracy]; verbose=0 keeps it silent because the
# numbers are printed explicitly on the next line.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))

# Sequential.predict_classes() has been removed from TensorFlow; taking the
# argmax over the predicted scores yields the same class indices.
y_pred = np.argmax(model.predict(x_test), axis=-1)
print(confusion_matrix(y_test,y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.2934 - acc: 0.8912
Epoch 2/20
18667/18667 - 0s - loss: 0.1727 - acc: 0.9471
Epoch 3/20
18667/18667 - 0s - loss: 0.1634 - acc: 0.9516
Epoch 4/20
18667/18667 - 0s - loss: 0.1581 - acc: 0.9547
Epoch 5/20
18667/18667 - 0s - loss: 0.1543 - acc: 0.9560
Epoch 6/20
18667/18667 - 0s - loss: 0.1510 - acc: 0.9578
Epoch 7/20
18667/18667 - 0s - loss: 0.1482 - acc: 0.9585
Epoch 8/20
18667/18667 - 0s - loss: 0.1463 - acc: 0.9591
Epoch 9/20
18667/18667 - 0s - loss: 0.1448 - acc: 0.9596
Epoch 10/20
18667/18667 - 0s - loss: 0.1438 - acc: 0.9597
Epoch 11/20
18667/18667 - 0s - loss: 0.1429 - acc: 0.9604
Epoch 12/20
18667/18667 - 0s - loss: 0.1424 - acc: 0.9604
Epoch 13/20
18667/18667 - 0s - loss: 0.1421 - acc: 0.9605
Epoch 14/20
18667/18667 - 0s - loss: 0.1417 - acc: 0.9603
Epoch 15/20
18667/18667 - 0s - loss: 0.1415 - acc: 0.9608
Epoch 16/20
18667/18667 - 0s - loss: 0.1413 - acc: 0.9605
Epoch 17/20
18667/18667 - 0s - loss: 0.1411 - acc: 0.9608
Epoch 18/20
18667/18667

# Number of bubbles (multi-class classification)

In [8]:
# Build the multi-class data set: the label is bubblecount shifted up by one.
# NOTE: pickle.load can execute arbitrary code; only open feature files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as features_file:
    df = pkl.load(features_file)  # context manager closes the file handle
# Drop the rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').reset_index(drop=True)
# +1 turns a bubblecount of -1 into class index 0 (sparse labels must be non-negative).
y = np.array(df['bubblecount'])+1
# Force a float array so the normalised columns written back below cannot be
# truncated by an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): the scaling statistics are computed on all rows, i.e. before
# the train/test split, so test rows influence the normalisation.
for col in range(x.shape[1]):
    x[:,col] = gaussian_normalize(x[:,col])
print(x.shape, y.shape)
# Fixed seed so the split (and the reported metrics) are reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)


In [9]:
# Small fully connected network: 3 input features -> 8 ReLU units -> 6 outputs.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
# The output layer has no activation, so it emits raw scores (logits).
model.add(tf.keras.layers.Dense(6))

In [10]:
# Train on the training split, then report loss, accuracy and the confusion
# matrix on the held-out split.
model.compile(
    optimizer='adam',
    # from_logits=True: the loss applies the softmax itself, so the model is
    # expected to output raw scores.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(x_train, y_train, epochs=20, verbose=2)

# evaluate() returns [loss, accuracy]; verbose=0 keeps it silent because the
# numbers are printed explicitly on the next line.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))

# Sequential.predict_classes() has been removed from TensorFlow; taking the
# argmax over the predicted scores yields the same class indices.
y_pred = np.argmax(model.predict(x_test), axis=-1)
print(confusion_matrix(y_test,y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.6437 - acc: 0.8724
Epoch 2/20
18667/18667 - 0s - loss: 0.3176 - acc: 0.9330
Epoch 3/20
18667/18667 - 0s - loss: 0.2822 - acc: 0.9359
Epoch 4/20
18667/18667 - 0s - loss: 0.2596 - acc: 0.9363
Epoch 5/20
18667/18667 - 0s - loss: 0.2408 - acc: 0.9379
Epoch 6/20
18667/18667 - 0s - loss: 0.2269 - acc: 0.9394
Epoch 7/20
18667/18667 - 0s - loss: 0.2187 - acc: 0.9438
Epoch 8/20
18667/18667 - 0s - loss: 0.2145 - acc: 0.9452
Epoch 9/20
18667/18667 - 0s - loss: 0.2117 - acc: 0.9458
Epoch 10/20
18667/18667 - 0s - loss: 0.2098 - acc: 0.9459
Epoch 11/20
18667/18667 - 0s - loss: 0.2084 - acc: 0.9464
Epoch 12/20
18667/18667 - 0s - loss: 0.2072 - acc: 0.9469
Epoch 13/20
18667/18667 - 0s - loss: 0.2064 - acc: 0.9468
Epoch 14/20
18667/18667 - 0s - loss: 0.2061 - acc: 0.9469
Epoch 15/20
18667/18667 - 0s - loss: 0.2055 - acc: 0.9470
Epoch 16/20
18667/18667 - 0s - loss: 0.2050 - acc: 0.9471
Epoch 17/20
18667/18667 - 0s - loss: 0.2047 - acc: 0.9471
Epoch 18/20
18667/18667

# Cross-validation (optional; probably not needed)

In [11]:
# 10-fold cross-validation of the binary classifier (label is 0 where
# bubblecount <= 0 and 1 otherwise).
from sklearn.model_selection import KFold

# NOTE: pickle.load can execute arbitrary code; only open feature files from a trusted source.
with open("FeaturesDataFrame.p", "rb") as features_file:
    df = pkl.load(features_file)  # context manager closes the file handle
# Drop the rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').reset_index(drop=True)
y = 1-(np.array(df['bubblecount'])<=0)
# Force a float array so the normalised columns written back below cannot be
# truncated by an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# NOTE(review): the scaling statistics are computed on all rows, so every
# fold's held-out rows influence the normalisation.
for col in range(x.shape[1]):
    x[:,col] = gaussian_normalize(x[:,col])
print(x.shape, y.shape)

kfold = KFold(n_splits=10, random_state = 10, shuffle = True)
cvscores = []
for train, test in kfold.split(x, y):
    # A new model is built for every fold; release the Keras global state of
    # the previous one so memory does not grow across folds.
    tf.keras.backend.clear_session()
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)),
        tf.keras.layers.Dense(2)
    ])
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                  metrics = ['accuracy'])
    model.fit(x[train], y[train], epochs = 20, verbose=0)
    # Evaluate on the held-out fold; scores is [loss, accuracy].
    scores = model.evaluate(x[test], y[test], verbose = 0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracies, in percent.
print("Total accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

(23334, 3) (23334,)
acc: 95.72%
acc: 95.76%
acc: 95.89%
acc: 96.06%
acc: 96.49%
acc: 95.97%
acc: 96.19%
acc: 96.06%
acc: 96.70%
acc: 96.27%
Total accuracy: 96.11% (+/- 0.30%)
