In [14]:
import os
import pickle as pkl
import time

import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import tensorflow as tf
from astropy.nddata.utils import Cutout2D
from PIL import Image
from scipy import ndimage
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [2]:
# define functions to normalize image
def normalize(arr):
    """Min-max scale ``arr`` so its values span the range [0, 1].

    Parameters
    ----------
    arr : array_like
        Numeric data of any shape. The minimum and maximum are taken over
        all elements, not per axis.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``arr``. The smallest input value
        maps to 0 and the largest to 1. An empty input yields an empty float
        array, and an input whose values are all equal yields all zeros.
    """
    values = np.array(arr)
    if values.size == 0:
        # np.min/np.max raise on empty input; there is nothing to scale.
        return values.astype(float)
    shifted = values - np.min(values)
    peak = np.max(shifted)
    if peak == 0:
        # Every element is identical: dividing would give 0/0 = NaN everywhere.
        return np.zeros(values.shape, dtype=float)
    return shifted / peak

def gaussian_normalize(arr):
    """Standardize ``arr`` to zero mean and unit standard deviation.

    Parameters
    ----------
    arr : array_like
        Numeric data of any shape. The mean and (population) standard
        deviation are taken over all elements, not per axis.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``arr`` holding
        ``(arr - mean) / std``. An empty input yields an empty float array.
        When the standard deviation is exactly zero (all values equal) the
        result is all zeros instead of NaN.
    """
    values = np.array(arr)
    if values.size == 0:
        # Avoid the mean-of-empty-slice warning; nothing to standardize.
        return values.astype(float)
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # Constant input: dividing would give 0/0 = NaN for every element.
        return np.zeros(values.shape, dtype=float)
    return centered / spread

# Bubble or no bubble

In [3]:
# Load the pickled feature table. The context manager closes the file handle,
# which a bare open() inside pkl.load() never does.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("FeaturesDataFrame.p", "rb") as feature_file:
    df = pkl.load(feature_file)
# Drop rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Binary target: 1 where bubblecount > 0, otherwise 0.
y = 1-(np.array(df['bubblecount'])<=0)
# Force a float array: the standardized values are written back in place and
# would be truncated if any feature column were stored as an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# Standardize each feature column independently.
# NOTE(review): the statistics are computed on the full data set before the
# split, so the test rows influence the scaling -- confirm this is acceptable.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# Fixed seed (same value the notebook uses for KFold) so the 80/20 split is
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)


In [15]:
# Two-layer dense network: 3 input features -> 8 ReLU units -> 2 raw class
# scores (logits; no softmax is applied inside the model).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
model.add(tf.keras.layers.Dense(2))

In [5]:
# The model outputs raw logits, hence from_logits=True on the loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.fit(x_train, y_train, epochs=20, verbose=2)
# evaluate() returns [loss, accuracy] for this compile configuration.
# verbose=0 is the documented "silent" setting.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6;
# taking the argmax over the predicted logits is the documented replacement.
y_pred = np.argmax(model.predict(x_test, verbose=0), axis=-1)
print(confusion_matrix(y_test, y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.3258 - acc: 0.8658
Epoch 2/20
18667/18667 - 0s - loss: 0.1643 - acc: 0.9523
Epoch 3/20
18667/18667 - 0s - loss: 0.1560 - acc: 0.9563
Epoch 4/20
18667/18667 - 0s - loss: 0.1516 - acc: 0.9579
Epoch 5/20
18667/18667 - 0s - loss: 0.1485 - acc: 0.9586
Epoch 6/20
18667/18667 - 0s - loss: 0.1460 - acc: 0.9589
Epoch 7/20
18667/18667 - 0s - loss: 0.1440 - acc: 0.9598
Epoch 8/20
18667/18667 - 0s - loss: 0.1426 - acc: 0.9598
Epoch 9/20
18667/18667 - 0s - loss: 0.1414 - acc: 0.9599
Epoch 10/20
18667/18667 - 0s - loss: 0.1406 - acc: 0.9607
Epoch 11/20
18667/18667 - 0s - loss: 0.1401 - acc: 0.9605
Epoch 12/20
18667/18667 - 0s - loss: 0.1396 - acc: 0.9607
Epoch 13/20
18667/18667 - 0s - loss: 0.1394 - acc: 0.9606
Epoch 14/20
18667/18667 - 0s - loss: 0.1391 - acc: 0.9609
Epoch 15/20
18667/18667 - 0s - loss: 0.1388 - acc: 0.9612
Epoch 16/20
18667/18667 - 0s - loss: 0.1388 - acc: 0.9606
Epoch 17/20
18667/18667 - 0s - loss: 0.1386 - acc: 0.9611
Epoch 18/20
18667/18667

# Number of bubbles

In [6]:
# Load the pickled feature table. The context manager closes the file handle,
# which a bare open() inside pkl.load() never does.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("FeaturesDataFrame.p", "rb") as feature_file:
    df = pkl.load(feature_file)
# Drop rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Multi-class target: shift the counts up by one so a bubblecount of -1
# becomes class 0 (sparse cross-entropy needs non-negative integer labels).
y = np.array(df['bubblecount'])+1
# Force a float array: the standardized values are written back in place and
# would be truncated if any feature column were stored as an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# Standardize each feature column independently.
# NOTE(review): the statistics are computed on the full data set before the
# split, so the test rows influence the scaling -- confirm this is acceptable.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# Fixed seed (same value the notebook uses for KFold) so the 80/20 split is
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 3) (23334,)
(18667, 3) (4667, 3) (18667,) (4667,)


In [7]:
# Two-layer dense network: 3 input features -> 8 ReLU units -> 6 raw class
# scores (logits; no softmax is applied inside the model).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(3,)))
model.add(tf.keras.layers.Dense(6))

In [8]:
# The model outputs raw logits, hence from_logits=True on the loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.fit(x_train, y_train, epochs=20, verbose=2)
# evaluate() returns [loss, accuracy] for this compile configuration.
# verbose=0 is the documented "silent" setting.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6;
# taking the argmax over the predicted logits is the documented replacement.
y_pred = np.argmax(model.predict(x_test, verbose=0), axis=-1)
print(confusion_matrix(y_test, y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.7840 - acc: 0.8183
Epoch 2/20
18667/18667 - 0s - loss: 0.3612 - acc: 0.9271
Epoch 3/20
18667/18667 - 0s - loss: 0.2965 - acc: 0.9307
Epoch 4/20
18667/18667 - 0s - loss: 0.2632 - acc: 0.9360
Epoch 5/20
18667/18667 - 0s - loss: 0.2489 - acc: 0.9392
Epoch 6/20
18667/18667 - 0s - loss: 0.2400 - acc: 0.9404
Epoch 7/20
18667/18667 - 0s - loss: 0.2338 - acc: 0.9417
Epoch 8/20
18667/18667 - 0s - loss: 0.2289 - acc: 0.9429
Epoch 9/20
18667/18667 - 0s - loss: 0.2247 - acc: 0.9438
Epoch 10/20
18667/18667 - 0s - loss: 0.2212 - acc: 0.9443
Epoch 11/20
18667/18667 - 0s - loss: 0.2189 - acc: 0.9441
Epoch 12/20
18667/18667 - 0s - loss: 0.2167 - acc: 0.9445
Epoch 13/20
18667/18667 - 0s - loss: 0.2150 - acc: 0.9456
Epoch 14/20
18667/18667 - 0s - loss: 0.2136 - acc: 0.9456
Epoch 15/20
18667/18667 - 0s - loss: 0.2124 - acc: 0.9457
Epoch 16/20
18667/18667 - 0s - loss: 0.2116 - acc: 0.9456
Epoch 17/20
18667/18667 - 0s - loss: 0.2110 - acc: 0.9458
Epoch 18/20
18667/18667

# Cross-validation (probably not needed)

In [9]:
# 10-fold cross-validation of the binary (bubble / no bubble) classifier
# on the three-feature input.
# Load the pickled feature table; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("FeaturesDataFrame.p", "rb") as feature_file:
    df = pkl.load(feature_file)
# Drop rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Binary target: 1 where bubblecount > 0, otherwise 0.
y = 1-(np.array(df['bubblecount'])<=0)
# Force a float array: the standardized values are written back in place and
# would be truncated if any feature column were stored as an integer dtype.
x = np.array(df[['edgefeature', 'blobfeature', 'blobpeakfeature']], dtype=float)
# Standardize each feature column independently.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)

kfold = KFold(n_splits=10, random_state = 10, shuffle = True)
cvscores = []
for train, test in kfold.split(x, y):
    # A fresh, untrained model for every fold so no weights carry over.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(x.shape[1],)),
        tf.keras.layers.Dense(2)
    ])
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                  metrics = ['accuracy'])
    model.fit(x[train], y[train], epochs = 20, verbose=0)
    # Evaluate on the held-out fold; scores is [loss, accuracy].
    # verbose=0 is the documented "silent" setting.
    scores = model.evaluate(x[test], y[test], verbose = 0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracies (in percent).
print("Total accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

(23334, 3) (23334,)
acc: 96.83%
acc: 96.40%
acc: 96.19%
acc: 97.30%
acc: 95.16%
acc: 95.89%
acc: 95.71%
acc: 95.37%
acc: 97.00%
acc: 95.80%
Total accuracy: 96.16% (+/- 0.67%)


# Five input features

In [10]:
# Load the pickled feature table. The context manager closes the file handle,
# which a bare open() inside pkl.load() never does.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("FeaturesDataFrame.p", "rb") as feature_file:
    df = pkl.load(feature_file)
# Drop rows that have bubblecount == -1 together with blobpeakfeature > 1.
df = df.query('not (bubblecount == -1 & blobpeakfeature > 1)').copy()
df.reset_index(drop=True, inplace=True)
# Binary target: 1 where bubblecount > 0, otherwise 0.
y = 1-(np.array(df['bubblecount'])<=0)
# Five-feature input: three separate edge features plus the two blob features.
# Force a float array: the standardized values are written back in place and
# would be truncated if any feature column were stored as an integer dtype.
x = np.array(df[['edgefeature0', 'edgefeature1','edgefeature2','blobfeature', 'blobpeakfeature']], dtype=float)
# Standardize each feature column independently.
# NOTE(review): the statistics are computed on the full data set before the
# split, so the test rows influence the scaling -- confirm this is acceptable.
for i in range(x.shape[1]):
    x[:,i] = gaussian_normalize(x[:,i])
print(x.shape, y.shape)
# Fixed seed (same value the notebook uses for KFold) so the 80/20 split is
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(23334, 5) (23334,)
(18667, 5) (4667, 5) (18667,) (4667,)


In [11]:
# Two-layer dense network: 5 input features -> 8 ReLU units -> 2 raw class
# scores (logits; no softmax is applied inside the model).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(5,)))
model.add(tf.keras.layers.Dense(2))

In [12]:
# The model outputs raw logits, hence from_logits=True on the loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
model.fit(x_train, y_train, epochs=20, verbose=2)
# evaluate() returns [loss, accuracy] for this compile configuration.
# verbose=0 is the documented "silent" setting.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("\nLoss: %.4f, Accuracy: %.4f"%tuple(test_accuracy))
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6;
# taking the argmax over the predicted logits is the documented replacement.
y_pred = np.argmax(model.predict(x_test, verbose=0), axis=-1)
print(confusion_matrix(y_test, y_pred))

Epoch 1/20
18667/18667 - 0s - loss: 0.2617 - acc: 0.9214
Epoch 2/20
18667/18667 - 0s - loss: 0.1594 - acc: 0.9547
Epoch 3/20
18667/18667 - 0s - loss: 0.1507 - acc: 0.9581
Epoch 4/20
18667/18667 - 0s - loss: 0.1466 - acc: 0.9596
Epoch 5/20
18667/18667 - 0s - loss: 0.1431 - acc: 0.9606
Epoch 6/20
18667/18667 - 0s - loss: 0.1405 - acc: 0.9611
Epoch 7/20
18667/18667 - 0s - loss: 0.1384 - acc: 0.9613
Epoch 8/20
18667/18667 - 0s - loss: 0.1371 - acc: 0.9613
Epoch 9/20
18667/18667 - 0s - loss: 0.1359 - acc: 0.9618
Epoch 10/20
18667/18667 - 0s - loss: 0.1353 - acc: 0.9609
Epoch 11/20
18667/18667 - 0s - loss: 0.1347 - acc: 0.9614
Epoch 12/20
18667/18667 - 0s - loss: 0.1345 - acc: 0.9614
Epoch 13/20
18667/18667 - 0s - loss: 0.1341 - acc: 0.9615
Epoch 14/20
18667/18667 - 0s - loss: 0.1339 - acc: 0.9614
Epoch 15/20
18667/18667 - 0s - loss: 0.1337 - acc: 0.9618
Epoch 16/20
18667/18667 - 0s - loss: 0.1335 - acc: 0.9618
Epoch 17/20
18667/18667 - 0s - loss: 0.1333 - acc: 0.9618
Epoch 18/20
18667/18667

In [13]:
# 10-fold cross-validation of the binary classifier on the five-feature input.
# Uses the standardized x and y already prepared for the five-feature model.
kfold = KFold(n_splits=10, random_state = 10, shuffle = True)
cvscores = []
for train, test in kfold.split(x, y):
    # A fresh, untrained model for every fold so no weights carry over.
    # The input width follows x, so it always matches the feature matrix.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation=tf.nn.relu, input_shape=(x.shape[1],)),
        tf.keras.layers.Dense(2)
    ])
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                  metrics = ['accuracy'])
    model.fit(x[train], y[train], epochs = 20, verbose=0)
    # Evaluate on the held-out fold; scores is [loss, accuracy].
    # verbose=0 is the documented "silent" setting.
    scores = model.evaluate(x[test], y[test], verbose = 0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracies (in percent).
print("Total accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

acc: 96.83%
acc: 96.27%
acc: 96.32%
acc: 97.17%
acc: 95.29%
acc: 96.10%
acc: 95.76%
acc: 95.37%
acc: 96.96%
acc: 95.59%
Total accuracy: 96.16% (+/- 0.63%)
