In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Upvotes are appreciated!

## I'd be happy to answer any questions in the comments, or to talk off Kaggle if you prefer, so feel free to comment or reach out via the contact option on my Kaggle profile.
### I am also open to any tips or corrections on my work! Constructive criticism is always appreciated.


# Notebook Overview
* this notebook is step one in this competition
* the goal is to predict whether an image is a whale or a dolphin
    * this will allow for training separate feature extraction networks later on
* the notebook uses the cropped dataset as its images; these are focused on the animal of interest
* it also uses the splits data to reproducibly provide cross-validation splits when comparing networks
* we will use Google's pretrained EfficientNets with fine-tuning to see if we can learn better

# Imports

In [None]:
# DOWNLOADS
# !pip install -q efficientnet
# !pip install tensorflow_addons
# !pip install wandb

# NORMAL IMPORTS
import os
import gc
import random
import pickle
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

# PREPROCESSING IMPORTS
import cv2
from sklearn.model_selection import StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# MODEL BUILDING AND TRAINING IMPORTS
import tensorflow_hub as tfhub
import tensorflow as tf
from tensorflow.keras import backend as K
# import tensorflow_addons as tfa
# import efficientnet.tfkeras as efn

# Root of the Kaggle input directory; every dataset path below is built from it.
BASE_PATH = '/kaggle/input/'
print(os.listdir(BASE_PATH))

# Dataset directories: competition data, CV splits, and the cropped images.
MAIN_DATASET, CV_SPLITS_DATASET, CROPPED_DATASET = (
    os.path.join(BASE_PATH, dataset_name)
    for dataset_name in (
        'happy-whale-and-dolphin',
        'happywhale-splits',
        'happywhale-cropped-dataset-yolov5-ds',
    )
)

# The Setup
* we want to set up the training strategy in case we use a TPU
* we want to define the model config class that holds parameters
* we want to seed the environment for reproducibility
* in the future I want to learn to use wandb and connect it in the setup

In [None]:
# Pick the tf.distribute strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError; treat that as "no TPU available".
    tpu = None
    
if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Shorthand for tf.data's autotune constant (not referenced again in the cells shown).
AUTO = tf.data.experimental.AUTOTUNE
# The replica count is what the config below multiplies the batch size by.
print("REPLICAS: ", strategy.num_replicas_in_sync)

## Config

#### efficientnet is a python module we can use to download the pretrained efficient net model.

#### The models were trained on a lot of image data, and according to the internet (and the fact a lot of other people are using them) they seem like a good option to extract information from our images

#### We want to use a pretrained model that learned to extract meaningful features from a large number of images, because it has likely learned some good general strategies. By starting with a model that generally performs well, we can leverage this feature extraction ability on other images and just fine-tune the weights to learn to extract what we want from our images.

In [None]:
# this just translates the name of the model we're using
# to the function we need to import it
# again this uses the efficientnet module that
# creates an EfficientNet model architecture and
# preloads it with the corresponding weights

# EFF_NET_MODEL_DICT_V1 = {'EfficientNetB0':efn.EfficientNetB0, 
#                          'EfficientNetB1':efn.EfficientNetB1,
#                          'EfficientNetB2':efn.EfficientNetB2,
#                          'EfficientNetB3':efn.EfficientNetB3,
#                          'EfficientNetB4':efn.EfficientNetB4,
#                          'EfficientNetB5':efn.EfficientNetB5,
#                          'EfficientNetB6':efn.EfficientNetB6,
#                          'EfficientNetB7':efn.EfficientNetB7}

In [None]:
### In the future see if we can load the efficient net v2 as it looks like it does better on the 
# sets it was trained on over v1 (plus I mean 2 > 1 thus must be better)

### These are for Efficient net version 1
# python package makes it a lot easier to use
# ill integrate the capability to test these models later
# EFF_NET_WEIGHTS_OPTIONS = ['noisy-student', 'imagenet']
# EFF_NET_MODEL_OPTIONS = ['EfficientNetB0', 
#                          'EfficientNetB1', 
#                          'EfficientNetB2', 
#                          'EfficientNetB3', 
#                          'EfficientNetB4',
#                          'EfficientNetB5',
#                          'EfficientNetB6',
#                          'EfficientNetB7']

### These are for Efficient net version 2
# Task names that can be substituted into the hub URL below
# (presumably the variants published on TF Hub -- confirm).
EFF_NET_V2_TASK_OPTIONS = ['classification', 'feature_vector']

# Model variant and task selected for this notebook.
MODEL_TYPE = "imagenet21k_b1"
MODEL_TASK = 'feature_vector'

# Hub handle assembled from the two selections above.
EFF_NET_PATH = f"https://tfhub.dev/google/imagenet/efficientnet_v2_{MODEL_TYPE}/{MODEL_TASK}/2"
print(EFF_NET_PATH)


class Config:
    """Container for the run's tunable settings; attributes are read as `config.NAME`."""

    # --- Training hyper-parameters ---
    SEED = 123
    EPOCHS = 20
    # 64 multiplied by the number of replicas in the active distribution strategy.
    BATCH_SIZE = strategy.num_replicas_in_sync * 64

    # --- Image / augmentation settings ---
    IMAGE_CHANNELS = 3
    IMAGE_SIZE = 240        # images are resized to IMAGE_SIZE x IMAGE_SIZE
    ZOOM_FACTOR = 0.2       # passed to RandomZoom
    ROTATION_FACTOR = 0.2   # passed to RandomRotation
    FOLDS = 5               # the splitter makes FOLDS + 1 splits; fold 0 is the test fold

    # --- EfficientNet selection ---
    EFF_NET_MODEL_TASK = MODEL_TASK
    EFF_NET_MODEL = f"efficientnet_v2_{MODEL_TYPE}"


config = Config()
print(config.EFF_NET_MODEL)
print(f"Batch size {config.BATCH_SIZE}")

In [None]:
# Directory this notebook writes to (the fold CSV and the saved per-fold models).
OUTPUT_FOLDER = '/kaggle/working'

In [None]:
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Sets PYTHONHASHSEED in the process environment and seeds Python's `random`,
    NumPy's global generator and TensorFlow's global seed with the same value,
    so that runs are repeatable and easier to compare.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything(config.SEED)

# Model Setup
* here we want to define the model helper functions
* we want to create the model itself

In [None]:
### COPIED FROM THIS NOTEBOOK
# https://www.kaggle.com/manojprabhaakr/effnet-b6-whale-comp

"""
I'm not well versed in why this is helpful, but apparently ramping up the learning rate at the start of training (a warm-up) can perform very well.
"""

def get_lr_callback(plot=False):
    """Build a LearningRateScheduler: linear warm-up, optional hold, then exponential decay.

    Args:
        plot: when truthy, scatter-plot the schedule over `config.EPOCHS` epochs.

    Returns:
        tf.keras.callbacks.LearningRateScheduler wrapping the schedule function.
    """
    start_lr = 0.000001
    peak_lr = 0.000005 * config.BATCH_SIZE  # the peak grows with the batch size
    floor_lr = 0.000001
    ramp_epochs = 4
    hold_epochs = 0
    decay_rate = 0.9

    def lrfn(epoch):
        # Phase 1: straight line from start_lr up towards peak_lr.
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        # Phase 2: stay at the peak (zero-length while hold_epochs == 0).
        if epoch < ramp_epochs + hold_epochs:
            return peak_lr
        # Phase 3: decay geometrically towards floor_lr.
        return (peak_lr - floor_lr) * decay_rate**(epoch - ramp_epochs - hold_epochs) + floor_lr

    if plot:
        epoch_axis = list(range(config.EPOCHS))
        plt.scatter(epoch_axis, [lrfn(e) for e in epoch_axis])
        # Show plain decimals on the y axis instead of scientific notation.
        plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

get_lr_callback(plot=True)

In [None]:
"""
Again I learned this on kaggle but apparently freezing the batch normalization layers from a pretrained model
can help prevent our model from overfitting. I learned this from other people in the competition

TLDR from stack overflow
the new data we train on will likely have very different batch normalization parameters (on average the images are different from the ones the model was trained on)
batch norm is calculated over the train set after training is complete (during training it can be the mean and std of that batch or a rolling average).
keeping this frozen means that the ranges and values from the data will stay the same, requiring the model to keep the general pretrained learnings, and 
not require it to relearn a ton of new information (in our case the parameter to data ratio is high so it could just memorize the training set)
Frozen batch norm keeps the gradients smaller and less drastic, so we can fine-tune more without worrying about overfitting and losing the generalization capabilities
that we wanted this pretrained model for in the first place
(please correct me in comments if my understanding is bad, I'll update this section)


https://www.kaggle.com/manojprabhaakr/effnet-b6-whale-comp
https://stackoverflow.com/questions/63016740/why-its-necessary-to-frozen-all-inner-state-of-a-batch-normalization-layer-when#:~:text=This%20can%20help%20to%20minimize,in%20batch%20statistics%2C%20decreasing%20regularization.
https://towardsdatascience.com/batch-norm-explained-visually-why-does-it-work-90b98bcc58a0
https://towardsdatascience.com/batch-norm-explained-visually-how-it-works-and-why-neural-networks-need-it-b18919692739
"""

def freeze_BN(model):
    """Make every layer trainable except BatchNormalization layers, which are frozen.

    The walk descends into nested containers (any layer exposing a `.layers`
    attribute, e.g. a Sequential or a functional sub-model). A single pass over
    `model.layers` would mark such a container trainable, which also marks the
    BatchNormalization layers inside it trainable, and would never freeze them.

    Args:
        model: a Keras model; its layers' `trainable` flags are modified in place.

    NOTE(review): a `tfhub.KerasLayer` has no `.layers` attribute to descend
    into, so BatchNormalization inside a hub backbone is presumably not
    reachable from here -- confirm whether this has any effect on the
    EfficientNet backbone.
    """
    def _set_trainable(layer):
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False
            return
        # Keras propagates `trainable` from a container to its sub-layers, so
        # set the parent first and only then override BatchNormalization children.
        layer.trainable = True
        for child in getattr(layer, 'layers', None) or ():
            _set_trainable(child)

    for layer in model.layers:
        _set_trainable(layer)

In [None]:
def get_resizing_and_rescaling():
    """Build the preprocessing stack: resize to a square input and scale pixels to [0, 1].

    TF Hub image models follow the common image input convention of float
    colour values in [0, 1], so pixels are divided by 255 here. The previous
    `scale=1/127.5, offset=-1` mapping produced [-1, 1], which does not match
    the range the pretrained backbone expects.

    Assumes incoming pixel values are in 0..255 (the ImageDataGenerators in
    this notebook are created without a `rescale` argument).

    Returns:
        tf.keras.Sequential of a Resizing layer followed by a Rescaling layer.
    """
    with strategy.scope():
        resize_and_rescale = tf.keras.Sequential([
            tf.keras.layers.Resizing(config.IMAGE_SIZE, config.IMAGE_SIZE),
            # 0..255 -> 0..1
            tf.keras.layers.Rescaling(scale=1./255)
        ])
    return resize_and_rescale

resizer = get_resizing_and_rescaling()

def get_augmentation():
    """Build the random augmentation stack: horizontal flip, zoom, then rotation.

    The flip layer is seeded with `config.SEED`; the zoom and rotation ranges
    come from `config.ZOOM_FACTOR` and `config.ROTATION_FACTOR`.

    Returns:
        tf.keras.Sequential containing the three augmentation layers.
    """
    keras_layers = tf.keras.layers
    with strategy.scope():
        flip = keras_layers.RandomFlip(mode="horizontal", seed=config.SEED)
        zoom = keras_layers.RandomZoom(config.ZOOM_FACTOR)
        rotate = keras_layers.RandomRotation(config.ROTATION_FACTOR)
        pipeline = tf.keras.Sequential([flip, zoom, rotate])
    return pipeline

augmenter = get_augmentation()

In [None]:
def get_model():
    """Build and compile the whale-vs-dolphin binary classifier.

    Pipeline: images of any height/width -> shared `resizer` -> shared
    `augmenter` -> trainable TF Hub EfficientNet V2 layer -> one sigmoid unit.

    Returns:
        A compiled tf.keras Model (Adam, binary cross-entropy, binary accuracy).
    """
    # Everything is created inside the strategy scope, since a TPU strategy may be active.
    with strategy.scope():
        # Height and width are left open; the resizer fixes them to config.IMAGE_SIZE.
        images = tf.keras.layers.Input(shape=(None, None, config.IMAGE_CHANNELS), name="images")

        # Preprocess, augment, then extract features with the hub backbone.
        # NOTE(review): the feature-vector width depends on the chosen hub model -- check model.summary().
        features = augmenter(resizer(images))
        features = tfhub.KerasLayer(EFF_NET_PATH, trainable=True)(features)

        # Single sigmoid unit for the binary decision. The L2 kernel regulariser
        # adds a penalty on the squared weights of this layer, which nudges them
        # to stay small and is meant to discourage overfitting.
        head = tf.keras.layers.Dense(
            1,
            activation='sigmoid',
            kernel_regularizer=tf.keras.regularizers.l2(0.0001),
        )
        model = tf.keras.models.Model(inputs=images, outputs=head(features))

        # The head already applies a sigmoid, hence from_logits=False.
        model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=[tf.keras.metrics.BinaryAccuracy()],
        )
    return model

model = get_model()
model.summary()

In [None]:
# The model built above was only used to print its summary; drop the reference
# and run the garbage collector (presumably to free memory before training).
del model
gc.collect()

# Data
* here we want to load the data

In [None]:
# Load the main competition metadata file.
train_df = pd.read_csv(os.path.join(MAIN_DATASET, 'train.csv'))
# Fix a misspelled species label. The result is assigned back to the column:
# calling `.replace(..., inplace=True)` on `train_df.species` is a chained
# in-place update, which pandas warns about and which does not modify
# `train_df` once Copy-on-Write is enabled.
train_df['species'] = train_df['species'].replace("bottlenose_dolpin", "bottlenose_dolphin")
train_df.head()

In [None]:
# Binary target encoding: each *_REPRESENTATION constant is that class's index in CLASSES.
DOLPHIN_REPRESENTATION = 0
WHALE_REPRESENTATION = 1
CLASSES = ['dolphin', 'whale']

# List the species whose name contains neither 'whale' nor 'dolphin';
# these need explicit handling when mapping species to a class.
species = train_df.species.unique()
print(
    "Species not with dolphin or whale in name: ",
    [s for s in species if not ('whale' in s or 'dolphin' in s)],
)

def get_sup_species(species):
    """Map a species label to its super-species class name from CLASSES.

    Args:
        species: species label string.

    Returns:
        The whale or dolphin entry of CLASSES.

    Raises:
        Exception: if the label matches none of the known patterns.
    """
    # Substring checks; 'whale' is tested first, so a name containing both words maps to whale.
    if 'whale' in species:
        return CLASSES[WHALE_REPRESENTATION]
    if 'dolphin' in species:
        return CLASSES[DOLPHIN_REPRESENTATION]
    # Labels that contain neither keyword but are classed as whales.
    if species in ('beluga', 'globis'):
        return CLASSES[WHALE_REPRESENTATION]
    raise Exception(f'Unk species: {species} was found in dataframe')
        
# Derive the class-name label column from each row's species.
train_df['target'] = train_df.species.apply(get_sup_species)
train_df.head()

In [None]:
# Count plot of the two classes, to check for a possible imbalance between
# dolphins and whales. If there is one, options include sampling dolphins
# more often or adding class weights.
plot = sns.catplot(
    data=train_df,
    x="target",
    kind="count",
    order=CLASSES,
);
# plot.set_xticklabels(["Dolphin", "Whale"]);

In [None]:
#     image_count = len(train_df.image.unique())
#     test_size = int(0.15 * image_count)
#     val_size = int(0.15 * image_count)
#     test_spliter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=config.SEED)
#     for train_idx, test_idx in test_spliter.split(train_df.image.values, train_df.target.values):
#         test_images = train_df.iloc[test_idx, :].image
#         for im in test_images:
#             image_to_fold[im] = TEST_FOLD
            
#     fold_spliter = StratifiedShuffleSplit(n_splits=5, test_size=val_size, random_state=config.SEED)

TEST_FOLD = 0
def generate_splits(train_df):
    """Assign every image to one of `config.FOLDS + 1` stratified folds.

    Args:
        train_df: dataframe with an `image` column and a `target` column;
            the folds are stratified on `target`.

    Returns:
        dict mapping each image to the index of the split whose held-out part contains it.
    """
    splitter = StratifiedKFold(n_splits=config.FOLDS+1, shuffle=True, random_state=config.SEED)
    image_to_fold = {}
    split_iter = splitter.split(train_df.image.values, train_df.target.values)
    for fold_idx, (_, held_out_idx) in enumerate(split_iter):
        held_out_images = train_df.iloc[held_out_idx, :].image
        image_to_fold.update((im, fold_idx) for im in held_out_images)
    return image_to_fold

def return_folds(fold_df_path=None):
    """Return the image -> fold mapping, either freshly generated or loaded from a CSV.

    Args:
        fold_df_path: optional path to a CSV with `image` and `fold` columns
            (the layout this notebook writes to folds.csv). When None, new
            stratified splits are generated from the global `train_df`.

    Returns:
        dict mapping image file name to integer fold index.

    Raises:
        ValueError: if the CSV lacks an `image` or `fold` column.
    """
    if fold_df_path is None:
        return generate_splits(train_df)

    # Load a saved assignment rather than returning None, which would break
    # the later `train_df.image.map(image_to_fold)` call.
    fold_df = pd.read_csv(fold_df_path)
    missing = {'image', 'fold'} - set(fold_df.columns)
    if missing:
        raise ValueError(f"{fold_df_path} is missing required column(s): {sorted(missing)}")
    return {image: int(fold) for image, fold in zip(fold_df['image'], fold_df['fold'])}
    
        

In [None]:
# Build the image -> fold lookup. Called without a path, return_folds()
# generates fresh stratified splits from train_df.
image_to_fold = return_folds()

In [None]:
# Attach each image's fold index; images without an assignment get -1.
# Built as a single expression assigned to the column, because
# `train_df.fold.fillna(-1, inplace=True)` is a chained in-place update that
# pandas warns about and that does not write through under Copy-on-Write.
train_df["fold"] = train_df.image.map(image_to_fold).fillna(-1).astype(np.int16)
train_df.head()

In [None]:
# Write the full frame, including the new fold column, to the output folder as folds.csv.
FOLD_DF_PATH = os.path.join(OUTPUT_FOLDER, 'folds.csv')
train_df.to_csv(FOLD_DF_PATH, index=False)

In [None]:
# Sanity check: draw the class counts for every fold, in ascending fold order,
# to confirm the folds have similar distributions.
for fold, fold_rows in train_df.groupby('fold'):
    print(f'Fold {fold}')
    plot = sns.catplot(data=fold_rows, x="target", kind="count", order=CLASSES);
#     plot.set_xticklabels(["Dolphin", "Whale"]);
    plt.show()

In [None]:
# Hold out the rows of TEST_FOLD as the test set.
test_df = train_df.loc[train_df.fold == TEST_FOLD]
# train_df is rebound here: from this point on it no longer contains the test rows.
train_df = train_df.loc[train_df.fold != TEST_FOLD]

In [None]:
# One ImageDataGenerator per split, all with default settings. No
# preprocessing_function is passed; the resizer is applied inside the model.
train_datagen, val_datagen, test_datagen = (
    tf.keras.preprocessing.image.ImageDataGenerator() for _ in range(3)
)

# Train the model

In [None]:
def get_generator(generator, wells):
    """Create a shuffled, batched, binary-label image iterator from a dataframe.

    Args:
        generator: object providing `flow_from_dataframe` (this notebook passes
            Keras ImageDataGenerator instances).
        wells: dataframe whose `image` column holds file names and whose
            `target` column holds the class name.

    Returns:
        The iterator returned by `generator.flow_from_dataframe`.
    """
    flow_options = {
        # Images are read from the cropped dataset's nested train_images folder.
        'directory': os.path.join(CROPPED_DATASET,'train_images', 'train_images'),
        'x_col': 'image',
        'y_col': 'target',
        'color_mode': 'rgb',
        'subset': 'training',
        'batch_size': config.BATCH_SIZE,
        'shuffle': True,
        'classes': CLASSES,
        'class_mode': 'binary',
        'seed': config.SEED,
    }
    return generator.flow_from_dataframe(wells, **flow_options)

def train_model(fold, use_lr_schedule=False):
    """Train a fresh model using `fold` for validation and the other non-test folds for training.

    Args:
        fold: validation fold index; must be greater than TEST_FOLD and present
            in `train_df.fold`.
        use_lr_schedule: when True, the warm-up/decay schedule from
            `get_lr_callback()` is also passed to `fit()`. Defaults to False,
            which keeps the optimizer's own learning rate.

    Returns:
        (model, hist): the trained model and the History object returned by `fit()`.

    NOTE(review): freeze_BN() runs after get_model() has already compiled the
    model. Keras documents that changes to `trainable` should be followed by
    compile() -- confirm whether a recompile is needed here.
    """
    assert TEST_FOLD < fold and fold in train_df.fold.unique(), (
        f"fold must be a training fold greater than TEST_FOLD ({TEST_FOLD}); got {fold}"
    )
    fold_train_data = train_df.loc[train_df.fold!=fold]
    fold_val_data = train_df.loc[train_df.fold==fold]
    print(fold_train_data.shape[0], fold_val_data.shape[0], test_df.shape[0])

    # Stop after 3 epochs without a val_loss improvement.
    stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)
    callbacks = [stopper]
    if use_lr_schedule:
        # Opt in to the learning-rate schedule defined earlier in the notebook.
        callbacks.append(get_lr_callback())

    model = get_model()
    freeze_BN(model)
    train_ds = get_generator(train_datagen, fold_train_data)
    val_ds = get_generator(val_datagen, fold_val_data)
    hist = model.fit(train_ds,
                     validation_data=val_ds,
                     epochs=config.EPOCHS,
                     callbacks=callbacks)
    return model, hist

def test_model(model, test_ds):
    ev = model.evaluate(test_ds)
    return ev

def graph_training(hist):
    """Plot training vs. validation loss and binary accuracy per epoch.

    Args:
        hist: object with a `.history` dict containing 'loss', 'val_loss',
            'binary_accuracy' and 'val_binary_accuracy' (what `model.fit`
            returns for the model compiled in this notebook).
    """
    history = hist.history
    fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(8, 8), tight_layout=True)

    # The model is compiled with BinaryCrossentropy, so the curve is labelled as such.
    loss_ax.plot(history['loss'])
    loss_ax.plot(history['val_loss'])
    loss_ax.set_title('Binary Cross-Entropy Loss')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.legend(['train', 'val'], loc='upper right')

    # The second panel shows accuracy, so its y axis is labelled accuracy.
    acc_ax.plot(history['binary_accuracy'])
    acc_ax.plot(history['val_binary_accuracy'])
    acc_ax.set_title('Binary Accuracy Metric')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.legend(['train', 'val'], loc='upper left')
    plt.show()

def conf_matrix(y_true, preds, class_names=None):
    """Plot a confusion-matrix heatmap for the given labels and predictions.

    Args:
        y_true: ground-truth labels, in a form accepted by sklearn's `confusion_matrix`.
        preds: predicted labels, in the same form as `y_true`.
        class_names: tick labels for both axes. Defaults to the notebook's
            CLASSES list (the function used to read an undefined global
            `class_names`, which raised NameError).
    """
    if class_names is None:
        class_names = CLASSES
    cf = confusion_matrix(y_true, preds)
    plt.figure(figsize=(12,12))
    # fmt='d' prints the integer counts in full instead of a rounded shorthand.
    sns.heatmap(cf, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues', robust=True)
    # sklearn's confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def train_loop(fold):
    """Train, plot, evaluate on the test set and save the model for one fold.

    Args:
        fold: validation fold index, forwarded to `train_model`.
    """
    print(f"FOLD {fold}", "===================================================", sep="\n")

    model, hist = train_model(fold)
    graph_training(hist)

    # Evaluate on the held-out test frame and report the result.
    test_ds = get_generator(test_datagen, test_df)
    ev = test_model(model, test_ds)
    print("Model EV", ev)

    save_path = os.path.join(OUTPUT_FOLDER, f"Model_{fold}")
    model.save(save_path)

    # Drop the references, clear the current figure and collect garbage.
    del model, test_ds, ev
    plt.clf()
    gc.collect()

In [None]:
# Train and evaluate one model per cross-validation fold. Fold TEST_FOLD (0) is
# the held-out test set, so the training folds are 1..config.FOLDS. This
# replaces five copy-pasted cells that each called train_loop with a literal fold.
for fold in range(1, config.FOLDS + 1):
    train_loop(fold=fold)