In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# load the required libraries

# Basic algebra and matrix mathematics libraries
import numpy as np
import pandas as pd

# Basic visualization libraries
import seaborn as sns
from matplotlib import pyplot as plt

# Basic image manipulation libraries
import imageio
import cv2
from skimage.transform import rescale, resize, rotate
from skimage.color import rgb2gray
from sklearn.metrics import confusion_matrix, auc, accuracy_score
from sklearn.model_selection import train_test_split

# Libraries to build and test the neural network
import tensorflow as tf
from keras import models
from keras import callbacks
import pickle
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.optimizers import Adam,SGD,RMSprop
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator

import warnings
# Silence all warnings for the rest of the notebook (this also hides deprecation notices).
warnings.filterwarnings('ignore')

random_state = 42

# Setting a random state for reproducibility.
# Each library keeps its own random number generator, so every one that the
# notebook relies on has to be seeded: Python's `random`, NumPy and TensorFlow.
import random
random.seed(random_state)
np.random.seed(random_state)

# TensorFlow 2.x exposes tf.random.set_seed; TensorFlow 1.x exposes tf.set_random_seed.
# NOTE(review): seeding alone may not make GPU training bit-for-bit reproducible.
if hasattr(getattr(tf, "random", None), "set_seed"):
    tf.random.set_seed(random_state)
else:
    tf.set_random_seed(random_state)

In [None]:
from tensorflow.python.client import device_lib

# Print the devices TensorFlow reports as locally available in this session.
local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
# Read the annotation table into a pandas dataframe and take a first look at it
raw_data = pd.read_csv('/kaggle/input/honey-bee-annotated-images/bee_data.csv')
raw_data.head()

# Relative frequency of every health condition
raw_data.health.value_counts(normalize = True)

# The unhealthy bees are spread over many categories; treating them all as one
# class - 'unhealthy' - is simpler and also eases the obvious imbalance in the data
health_counts = raw_data.health.value_counts()
health_counts

# Collapse the labels into two classes:
# every row whose health is anything other than "healthy" becomes 0, "healthy" becomes 1
is_healthy = raw_data.health == 'healthy'
raw_data.loc[~is_healthy, 'health'] = 0 #unhealthy
raw_data.loc[is_healthy, 'health'] = 1 #healthy
raw_data.head()

# Check the distribution of the two classes
# (the original author reports a 65/35 split, which is not badly skewed)
raw_data.health.value_counts(normalize = True)

# Only the filename and the health condition are needed, so the other columns are dropped
unused_columns = ['date','time','location','zip code','subspecies','pollen_carrying','caste']
raw_data = raw_data.drop(columns=unused_columns)

# Look at the reduced dataframe
raw_data.head()

In [None]:
# Read in all the images and store them as np arrays
# Make sure all images are read as greyscale which makes computations easy for the neural network
# It also reduces errors which arise from it learning background color
# Define a function which fetches an image from the specified path based on
# an argument which is the name of the file in str format
def FetchImage(filename):
    """Read one image from the bee image folder in grayscale mode.

    Parameters
    ----------
    filename : str
        Name of the image file; it is appended to the image folder path.

    Returns
    -------
    The value returned by ``cv2.imread`` for that path, read with the
    ``cv2.IMREAD_GRAYSCALE`` flag.
    """
    # Folder that holds the bee images
    image_dir = '/kaggle/input/honey-bee-annotated-images/bee_imgs/bee_imgs/'
    image_path = image_dir + filename
    return cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)


# Use the function to read every image listed in the dataframe
loaded_images = [FetchImage(image_name) for image_name in raw_data.file.values]

# Store the images in a 1-D object array, one image per slot.
# The images are not guaranteed to share a shape, and np.asarray on a list of
# differently sized arrays is unreliable (it raises on recent NumPy versions),
# so the container is built explicitly and filled element by element.
image_data_array = np.empty(len(loaded_images), dtype=object)
for image_index, image in enumerate(loaded_images):
    image_data_array[image_index] = image

# Check out one image; the data is single-channel, so draw it with a gray colormap
plt.imshow(image_data_array[42], cmap='gray')

In [None]:
# We now need to resize the images so that they can be fed to the neural net
# in a consistent shape. The size (54, 50), i.e. width x height, was chosen by experimentation.
# We set the image width and height and resize the image with OpenCV's resize function.
# We also divide the pixel values by 255 so that 8-bit intensities fall between 0 and 1.
def normalize(image):
    """Resize an image to 54 x 50 (width x height) and divide its values by 255.

    Parameters
    ----------
    image : array accepted by ``cv2.resize``.

    Returns
    -------
    The resized image divided by 255.0.
    """
    target_width, target_height = 54, 50
    # cv2.resize takes the target size as (width, height)
    shrunk = cv2.resize(image, (target_width, target_height))
    return shrunk / 255.0


def create_datagens(data, datagen_params, target_shape, batch_size, random_state = 42, preprocessing_function = None, x_col="file", y_col="health", IMAGE_FILE_ROOT = '/kaggle/input/honey-bee-annotated-images/bee_imgs/bee_imgs/'):
    """Split the dataframe and build a training and a testing image iterator.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding at least the columns ``x_col`` and ``y_col``.
        NOTE: the ``y_col`` column of this frame is converted to ``str`` in place.
    datagen_params : dict
        Augmentation settings for the training generator. The keys read are
        "horizontal_flip", "vertical_flip", "rotation_range" and "brightness_range".
    target_shape : tuple
        Passed as ``target_size`` to ``flow_from_dataframe``.
    batch_size : int
        Batch size of the training iterator (the test iterator uses 1).
    random_state : int
        Seed for the train/test split and for shuffling the training iterator.
    preprocessing_function : callable or None
        Applied by both generators.
    x_col, y_col : str
        Names of the filename column and of the target column.
    IMAGE_FILE_ROOT : str
        Directory that contains the image files.

    Returns
    -------
    tuple
        ``(datagen_iter_train, datagen_iter_test)``.
    """
    data[y_col] = data[y_col].astype(str) # coercion needed for datagen

    # train/test split: one third of the rows is held out.
    # Stratify on the target column itself rather than on "whatever column is
    # last", so the split stays correct when y_col is not the final column.
    train, test = train_test_split(
        data,
        test_size    = 1/3,
        stratify     = data[y_col],
        random_state = random_state
    )

    # training ImageDataGenerator (with augmentation)
    datagen = ImageDataGenerator(
        horizontal_flip  = datagen_params.get("horizontal_flip") or False,
        vertical_flip    = datagen_params.get("vertical_flip") or False,
        rotation_range   = datagen_params.get("rotation_range") or 0, # degrees, so the neutral value is 0
        brightness_range = datagen_params.get("brightness_range"),
        preprocessing_function = preprocessing_function
    )

    datagen_iter_train = datagen.flow_from_dataframe(
        train,
        directory   = IMAGE_FILE_ROOT,
        x_col       = x_col,
        y_col       = y_col,
        target_size = target_shape,
        class_mode  = 'binary',
        batch_size  = batch_size,
        shuffle     = True,
        seed        = random_state
    )

    # testing ImageDataGenerator: no augmentation, fixed order, one image per batch
    datagen_test = ImageDataGenerator(preprocessing_function = preprocessing_function)
    datagen_iter_test = datagen_test.flow_from_dataframe(
        test,
        directory   = IMAGE_FILE_ROOT,
        x_col       = x_col,
        y_col       = y_col,
        target_size = target_shape,
        class_mode  = 'binary',
        batch_size  = 1,
        shuffle     = False
    )

    return datagen_iter_train, datagen_iter_test


def permutate_params(grid_params):
    '''Expand a dict of candidate lists into a list of dicts, one per combination.

    Each key of ``grid_params`` maps to a sequence of candidate values. The
    result contains one dict per combination of candidates; keys listed later
    in ``grid_params`` vary slowest in the returned order.
    '''
    combos = [{}]

    for key, candidates in grid_params.items():
        if len(candidates) != 1:
            # branch every partial combination once per candidate value
            combos = [
                {**partial, key: value}
                for value in candidates
                for partial in combos
            ]
            continue

        # a single candidate never multiplies the combinations, just stamp it in
        only_value = candidates[0]
        for partial in combos:
            partial[key] = only_value

    return combos


def build_model_from_datagen(params = dict(),input_shape = (),datagen_iter_train = None,datagen_iter_val = None,file_name = None, optimizer = "adam"):
    """Build, compile and train the CNN on the given image iterators.

    Parameters
    ----------
    params : dict
        Hyperparameters. The keys read are 'conv__filters_1', 'conv__filters_2',
        'conv__filters_3', 'density_units_1', 'density_units_2' and 'epochs';
        a missing (or falsy) entry falls back to the default used below.
    input_shape : tuple
        Shape of one input image, given to the first convolution layer.
    datagen_iter_train, datagen_iter_val : iterators
        Training and validation iterators; their ``n`` and ``batch_size``
        attributes are used to compute the number of steps per epoch.
    file_name : str or None
        Where the best model seen during training is checkpointed.
        When None, no checkpoint is written.
    optimizer : str or optimizer object
        Passed unchanged to ``model.compile``.

    Returns
    -------
    tuple
        ``(model, history)`` as produced by training.

    Raises
    ------
    ValueError
        If either iterator is missing.
    """
    if datagen_iter_train is None or datagen_iter_val is None:
        raise ValueError("datagen_iter_train and datagen_iter_val are both required")

    kernel_size = 3
    dropout = .5
    activation_func = "relu"

    conv__filters_1 = params.get('conv__filters_1') or 32
    conv__filters_2 = params.get('conv__filters_2') or 16
    conv__filters_3 = params.get('conv__filters_3') or 32
    density_units_1 = params.get('density_units_1') or 32
    density_units_2 = params.get('density_units_2') or 32
    epochs          = params.get('epochs') or 10

    # instantiating model
    model = Sequential([
        # Conv layer #1 (7x7 kernels)
        Conv2D(
            filters = conv__filters_1,
            kernel_size = kernel_size + 4,
            activation  = activation_func,
            input_shape = input_shape, #input layer
            padding     = "same"
        ),
        Conv2D(filters = conv__filters_1, kernel_size = kernel_size + 4, activation = activation_func, padding = "same"),
        MaxPooling2D(pool_size=(2,2)),
        Dropout(dropout/2),

        # Conv layer #2 (5x5 kernels)
        Conv2D(filters = conv__filters_2, kernel_size = kernel_size + 2, activation=activation_func, padding = "same"),
        Conv2D(filters = conv__filters_2, kernel_size = kernel_size + 2, activation = activation_func, padding = "same"),
        MaxPooling2D(pool_size=(2,2)),
        Dropout(dropout/2),

        # Conv layer #3 (3x3 kernels)
        Conv2D(filters = conv__filters_3, kernel_size = kernel_size, activation=activation_func, padding = "same"),
        Conv2D(filters = conv__filters_3, kernel_size = kernel_size, activation = activation_func, padding = "same"),
        MaxPooling2D(pool_size=(2,2)),
        Dropout(dropout/2),

        # Dense layer #1
        Flatten(),
        Dense(density_units_1, activation=activation_func),
        Dropout(dropout),

        # Dense layer #2
        Dense(density_units_2, activation=activation_func),
        Dropout(dropout),

        # Output layer: a single sigmoid unit for the binary target
        Dense(1, activation='sigmoid')
    ])

    # compiling model
    model.compile(
        loss      = 'binary_crossentropy',
        optimizer = optimizer,
        metrics   = ['binary_accuracy']
    )

    # Number of batches needed to see every sample once.
    # np.ceil returns a float, while Keras expects integer step counts.
    STEP_SIZE_TRAIN = int(np.ceil(datagen_iter_train.n/datagen_iter_train.batch_size))
    STEP_SIZE_VALID = int(np.ceil(datagen_iter_val.n/datagen_iter_val.batch_size))

    # NOTE: the best model is saved to disk via callbacks, and is a retrievable file.
    # The checkpoint runs every epoch by default, so the deprecated `period`
    # argument is not needed.
    fit_callbacks = []
    if file_name is not None:
        fit_callbacks.append(callbacks.ModelCheckpoint(file_name, save_best_only=True, mode='auto'))

    # fitting model w/ImageDataGenerator
    # NOTE(review): fit_generator is deprecated in newer Keras releases in favour
    # of fit; kept here because the Keras version of this notebook is not visible.
    history = model.fit_generator(
        generator           = datagen_iter_train,
        steps_per_epoch     = STEP_SIZE_TRAIN,
        validation_data     = datagen_iter_val,
        validation_steps    = STEP_SIZE_VALID,
        epochs              = epochs,
        callbacks           = fit_callbacks
    )

    return (model, history)



def gridSearchCNN(datagens,grid_params,file_name,random_state = None,optimizer = "adam",):
    '''Train one CNN per parameter combination and report the best one.

    Parameters
    ----------
    datagens : tuple
        ``(datagen_iter_train, datagen_iter_test)``; the second iterator is
        used as validation data during training.
    grid_params : dict
        Maps each hyperparameter name to the list of values to try.
    file_name : str
        Path used for model checkpoints. On return it holds the best model.
    random_state : unused
        Accepted for interface compatibility; not read by this function.
    optimizer : str or optimizer object
        Forwarded to every candidate model.
        NOTE(review): an optimizer *instance* would be shared by all
        candidates - confirm this is intended when the grid has several.

    Returns
    -------
    dict
        Keys "best_model", "best_score", "best_params", "best_history",
        "test_scores" and "train_scores". The score is the highest
        validation binary accuracy reached by the winning candidate.

    Raises
    ------
    ValueError
        If ``grid_params`` expands to no parameter combination at all.
    '''
    # list of all parameter combinations
    all_params = permutate_params(grid_params)
    if not all_params:
        raise ValueError("grid_params produced no parameter combinations to evaluate")

    # establishing variables
    best_model   = None
    best_score   = 0.0 # no accuracy to start
    best_params  = None
    best_history = None
    test_scores  = None
    train_scores = None
    best_on_disk = False # does file_name currently hold the best candidate?

    datagen_iter_train, datagen_iter_test = datagens

    # for each permuted parameter, try fitting a model
    for params in all_params:
        _, history = build_model_from_datagen(
            params,
            input_shape        = datagen_iter_train.image_shape,
            datagen_iter_train = datagen_iter_train,
            datagen_iter_val   = datagen_iter_test,
            optimizer          = optimizer,
            file_name          = file_name
        )
        # Every candidate checkpoints to the same file, so whatever was stored
        # there before has just been overwritten by this candidate.
        best_on_disk = False

        acc = max(history.history["val_binary_accuracy"])

        # only keeping best (the first candidate is always accepted)
        if best_model is None or acc > best_score:
            print("***Good Accurary found: {:.2%}***".format(acc))
            best_score   = acc
            test_scores  = history.history["val_binary_accuracy"]
            train_scores = history.history["binary_accuracy"]
            best_params  = params
            best_history = history
            # Retrieve this candidate's checkpoint now, before a later
            # candidate replaces the file.
            best_model   = load_model(file_name)
            best_on_disk = True

    # A later, worse candidate may have overwritten the checkpoint file:
    # write the winner back so the file and the returned model agree.
    if not best_on_disk:
        best_model.save(file_name)

    # returns metadata of results
    return {
        "best_model"   : best_model,
        "best_score"   : best_score,
        "best_params"  : best_params,
        "best_history" : best_history,
        "test_scores"  : test_scores,
        "train_scores" : train_scores
    }



def conf_matrix_stats(y_test, preds):
    '''Compute summary rates from the 2x2 confusion matrix of a binary classifier.

    The matrix is read with row/column 0 as the negative class and 1 as the
    positive class. Returns a dict with the keys "accuracy", "miss_rate"
    (1 - accuracy), "sensitivity", "specification" and "precision".
    '''
    matrix = confusion_matrix(y_test, preds)

    # unpack the four cells: first index is the true class, second the predicted one
    TN, FP = matrix[0,0], matrix[0,1]
    FN, TP = matrix[1,0], matrix[1,1]

    total = (TP + FP + FN + TN)
    accuracy = (TP + TN ) / total

    stats = {}
    stats["accuracy"] = accuracy
    stats["miss_rate"] = 1 - accuracy
    stats["sensitivity"] = TP / (TP + FN)
    stats["specification"] = TN / (TN + FP)
    stats["precision"] = TP / (TP + FP)
    return stats

In [None]:
# Where the trained model (.h5) and the pickled search results (.p) are written
MODEL_PATH = "/kaggle/working/"
model_name = "beeImage"
stored_model_path = f"{MODEL_PATH}/{model_name}.p"

# Augmentation settings applied to the training images only
datagen_params = {
    "horizontal_flip"  : True,
    "vertical_flip"    : True,
    "rotation_range"   : 360,
    "brightness_range" : [.7, 1.]
}

datagens = create_datagens(
    raw_data,
    datagen_params         = datagen_params,
    batch_size             = 64, # hyperparameter
    target_shape           = (50, 54),
    preprocessing_function = normalize,
    random_state           = 42
)

# One value per hyperparameter, so the search trains a single configuration.
# NOTE: "batch_size" is not read by build_model_from_datagen; the effective
# batch size is the one given to create_datagens above.
grid_params = {
     "conv__filters_1" : [32],
     "conv__filters_2" : [48],
     "conv__filters_3" : [64],
     "density_units_1" : [256],
     "density_units_2" : [64],
     "batch_size"      : [64],
     "epochs"          : [50]
 }

best_original_model = gridSearchCNN(
     datagens     = datagens,
     grid_params  = grid_params,
     random_state = 42,
     optimizer    = RMSprop(lr = 0.0001, decay = 1e-6),
     file_name    = f"{MODEL_PATH}/{model_name}.h5"
     )

# saving metadata; the context manager guarantees the file is flushed and closed
with open(stored_model_path, 'wb') as metadata_file:
    pickle.dump(best_original_model, metadata_file)

# loading metadata (only unpickle files this notebook wrote itself)
with open(stored_model_path, 'rb') as metadata_file:
    best_original_model = pickle.load(metadata_file)