In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import sys
import seaborn as sns
import keras
import matplotlib.pyplot as plt
import cv2
import gc
import tensorflow as tf
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Conv2D , Activation, MaxPool2D , Flatten , Dropout , BatchNormalization
from keras.layers import MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from keras.callbacks import ReduceLROnPlateau
from glob import glob
from PIL import Image

In [None]:
def getPaths(birdsUsed, baseDir = "/kaggle/input/100-bird-species"):
    """
    Returns list of paths to each directory containing our bird images.
    The list is interleaved per species, three consecutive entries each:
    [test/A/, train/A/, valid/A/, test/B/, train/B/, valid/B/, ...]
    Parameters:
        birdsUsed: Number of different bird species we want our model to identify
        baseDir: Root folder of the dataset, holding the test/, train/ and valid/ sub-folders
    """
    # glob() makes no promise about ordering, so every listing is sorted before
    # it is truncated. Without this the first `birdsUsed` folders of each split
    # can belong to different species, and zip() would then group unrelated
    # directories as if they were the same bird.
    splitDirs = [sorted(glob("{}/{}/*/".format(baseDir, split)))[:birdsUsed]
                 for split in ("test", "train", "valid")]
    return [path for group in zip(*splitDirs) for path in group]

# The dataset owner changed the directory layout in the final stages of the project, which is why the function below is more convoluted than it should be.

In [None]:
def getSets(paths, downwardScaleFactor, birdsUsed, trackProgress = True):
    """
    Fetches and separates images into training, testing, and cross-validation sets. Each set is
    returned as a python list whose items are the [image, classification array] pairs built by getImage.
    Parameters:
        paths: list of paths to each directory containing bird images, three consecutive
               entries per species
        downwardScaleFactor: By what factor we decrease image quality to reduce inputs. Base images are 224x224
        birdsUsed: Number of different bird species we want our model to identify. Kept for
                   interface compatibility; the species count actually used is len(paths) // 3
        trackProgress: When True, prints the fraction (0.0 - 1.0) of species processed so far
    """

    numSpecies = len(paths) // 3
    img_size = 224 // downwardScaleFactor
    train, test, cross = [], [], []

    if trackProgress:
        print("****** Fetching Images ******")

    lastShown = 0
    for speciesIndex in range(numSpecies):

        if trackProgress:
            # Fraction of species done; dividing the raw path index by the
            # species count would run up to ~3.0 instead of 1.0.
            progress = round(speciesIndex / numSpecies, 1)
            if progress != lastShown:
                lastShown = progress
                print(progress)

        #Setting species identifier (one-hot vector for the current species)
        label = np.zeros(numSpecies, dtype=np.uint8)
        label[speciesIndex] = 1

        for directory in paths[3 * speciesIndex : 3 * speciesIndex + 3]:

            # Sorted so the split below is the same on every run; glob order is arbitrary.
            imageFiles = sorted(glob(os.path.join(directory, "*")))

            # Each image gets its own copy of the label array.
            birdImages = [getImage(imageFile, np.copy(label), img_size) for imageFile in imageFiles]

            #Separating images -> 50% in training set, 20% in testing set, and 30% in cross-validation set
            half = len(birdImages) // 2
            fifth = len(birdImages) // 5
            train += birdImages[:half]
            test  += birdImages[half : half + fifth]
            cross += birdImages[half + fifth :]

    return train, test, cross
        

In [None]:
def getImage(path, species, img_size):
    """
    Reads the image located at the path and returns it, paired with its label,
    as a 2-element object array: [resized image, species].
    Parameters:
        path: path of the image
        species: classification array that describes which species the bird belongs to
        img_size: Number of pixels, width and height, to represent the image as.
    Raises:
        ValueError: if the file at `path` cannot be read as an image
    """

    #Reading image to numpy array
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # rather than raising, which would otherwise surface as an opaque
        # error inside cv2.resize.
        raise ValueError("Could not read image: {}".format(path))
    img = cv2.resize(img, (img_size, img_size))

    # Fill a pre-allocated object array instead of np.array([img, species], dtype=object):
    # when img_size happens to equal len(species) numpy attempts to broadcast the
    # two arrays together and raises instead of storing them side by side.
    pair = np.empty(2, dtype = object)
    pair[0] = img
    pair[1] = species
    return pair

In [None]:
def setsAsNumpyArrays(train, test, cross):
    """
    Wraps the training, testing, and cross-validation sets in object-dtype
    numpy arrays so the later steps can iterate over them uniformly.
    Parameters:
        train: training set
        test: testing set
        cross: cross-validation set
    Returns:
        the converted (train, test, cross) arrays, in that order
    """
    toObjectArray = lambda samples: np.asarray(samples, dtype = object)
    return toObjectArray(train), toObjectArray(test), toObjectArray(cross)

In [None]:
def separateXandY(train, test, cross):
    """
    Splits every (image, classification array) pair of each set into two parallel lists.
    Parameters:
        train: training set
        test: testing set
        cross: cross-validation set
    Returns:
        x_train, y_train, x_cross, y_cross, x_test, y_test (note: cross comes before test)
    """

    def unzipPairs(pairs):
        # One pass over the pairs, collecting features and labels side by side.
        features, labels = [], []
        for feature, label in pairs:
            features.append(feature)
            labels.append(label)
        return features, labels

    x_train, y_train = unzipPairs(train)
    x_test, y_test = unzipPairs(test)
    x_cross, y_cross = unzipPairs(cross)

    return x_train, y_train, x_cross, y_cross, x_test, y_test

In [None]:
def normalizePixels(x_train, x_cross, x_test):
    """
    Converts each image collection to a numpy array and divides every value by 255.
    Parameters:
        x_train: training images
        x_cross: cross-validation images
        x_test:  testing images
    Returns:
        the scaled (x_train, x_cross, x_test) arrays, in that order
    """
    scaled = [np.array(images) / 255 for images in (x_train, x_cross, x_test)]
    return tuple(scaled)

In [None]:
def readyClassifArrs(train, test, cross):
    """
    Turns the three lists of classification arrays into numpy arrays for the model.
    Parameters:
        train: training classif. array
        test:  testing classif. array
        cross: cross-validation classif. array
    Returns:
        the three inputs as numpy arrays, in the order they were passed in
    """
    return tuple(map(np.asarray, (train, test, cross)))

In [None]:
def setDatagen(x_train, rr, zr, wsr, hsr):
    """
    Creates image data generator which attempts to prevent overfitting training set
    by randomly rotating, zooming, shifting and horizontally flipping the images.
    Parameters:
        x_train: training images
        rr: rotation range 0-180
        zr: zoom range 0-1
        wsr: width shift range 0-1
        hsr: height shift range 0-1
    Returns:
        the configured ImageDataGenerator, already fitted on x_train
    """
    datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range = rr,  # randomly rotate images in the range (degrees, 0 to 180)
            zoom_range = zr, # Randomly zoom image
            width_shift_range = wsr,  # randomly shift images horizontally (fraction of total width)
            height_shift_range = hsr,  # randomly shift images vertically (fraction of total height)
            horizontal_flip = True,  # randomly flip images horizontally
            vertical_flip=False)  # never flip images vertically

    datagen.fit(x_train)
    return datagen

In [None]:
def generateModel(downwardScaleFactor, birdsUsed, filters, filterScaling, kernel, kernelScaling, dense, layers):
    """
    Generates a keras sequential model based on the parameters given:
    `layers` blocks of Conv2D + 2x2 max-pooling, then a dense layer and a softmax classifier.
    Parameters:
        downwardScaleFactor: By what factor we decrease image quality to reduce inputs. Base images are 224x224
        birdsUsed: Number of different bird species we want our model to identify
        filters: initial number of filters used in base layer
        filterScaling: factor by which filters increases for each following convolution layer
        kernel: initial kernel size (the first convolution uses a kernel x kernel window)
        kernelScaling: How much the kernel size decreases with each following convolution layer
        dense: Size of final dense layer leading to classification
        layers: number of convolution + pooling blocks in the model
    Raises:
        ValueError: if the kernel size would fall below 1 in any convolution layer
    """

    # Check the whole kernel schedule up front so an impossible configuration
    # fails with a clear message instead of an error from deep inside keras.
    extraBlocks = max(layers - 1, 0)
    smallestKernel = min(kernel, kernel - extraBlocks * kernelScaling)
    if smallestKernel < 1:
        raise ValueError(
            "kernel size would reach {} with kernel={}, kernelScaling={}, layers={}; it must stay >= 1"
            .format(smallestKernel, kernel, kernelScaling, layers))

    model = Sequential()
    img_size = 224 // downwardScaleFactor

    model.add(Conv2D(filters = filters, kernel_size = (kernel,kernel),padding = 'same',activation ='relu', input_shape = (img_size,img_size,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))

    for _ in range(extraBlocks):
        # Each deeper block uses more filters and a smaller kernel than the one before it.
        filters *= filterScaling
        kernel -= kernelScaling
        model.add(Conv2D(filters = filters, kernel_size = (kernel,kernel),padding = 'same',activation ='relu'))
        model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(dense))
    model.add(Activation('relu'))
    model.add(Dense(birdsUsed, activation = "softmax"))

    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

    return model
        

In [None]:
def runModel(model, xt, yt, xc, yc, bs, ep, p, datagen):
    """
    Trains the model on augmented training data and validates it on the
    cross-validation set.
    Parameters:
        model: base model to use
        xt: training images
        yt: training classification array
        xc: cross-validation images
        yc: cross-validation classification array
        bs: batch-size
        ep: epochs
        p: patience (epochs without val_accuracy improvement before the learning rate is reduced)
        datagen: previously constructed image data generator
    Returns:
        (history, model): the keras training history and the trained model
    """

    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience = p,factor=0.3, min_lr=0.000001)
    # Only the training batches go through the augmenting generator. The
    # validation images are passed as-is so val_accuracy (which also drives the
    # learning-rate schedule) is measured on undistorted data and is comparable
    # between epochs.
    history = model.fit(datagen.flow(xt, yt, batch_size = bs),
                        epochs = ep,
                        validation_data = (xc, yc),
                        callbacks = [learning_rate_reduction],
                        verbose = 1)
    return history, model

In [None]:
def testModel(model, x_test, y_test):
    """
    Tests models' accuracy on the testing images
    Parameters:
        model: trained model
        x_test: testing images
        y_test: testing classification array
    """
    
    modelEval = model.evaluate(x_test, y_test, verbose=1)
    return modelEval[0], modelEval[1] * 100

#print("Loss of the model is - " , model.evaluate(x_test, y_test, verbose=0))
#print("Accuracy of the model is - " , model.evaluate(x_test,y_test)[1]*100 , "%")

In [None]:
def runModelTests(resumeAfter = 10):
    """
    Main function to run trials.
    Sweeps bird count (15, 20, 25, 30) x downward scale factor (6, 4, 3, 2); for every
    combination it trains one model per (layers, filter-scale, kernel) setting and writes
    the results of that combination to "birdscale{filenum}.csv".
    Parameters:
        resumeAfter: number of leading (bird count, scale) combinations to skip because
                     their CSV files already exist from an earlier run; 0 runs everything
    Returns:
        list of result dictionaries for every model trained during this call
    """

    allAttempts = []
    scales = [6,4,3,2]
    filenum = 0

    for birds in range(15, 31, 5):
        #Number of distinct labels we're using

        print("\n\nNew Bird Count!\n\n")

        for scale in scales:

            filenum += 1

            if filenum <= resumeAfter: #Picking up where last process concluded
                print("File:", filenum, " Passed")
                continue

            #By what factor we decrease image quality to reduce inputs. Base images are 224x224

            print("\nNew Scale\n")

            paths = getPaths(birds)

            #Readying the data for use
            train, test, cross = getSets(paths, scale, birds, trackProgress = False)
            train, test, cross = setsAsNumpyArrays(train, test, cross)
            xtr, ytr, xcr, ycr, xte, yte = separateXandY(train, test, cross)
            xtr, xcr, xte = normalizePixels(xtr, xcr, xte)
            ytr, ycr, yte = readyClassifArrs(ytr, ycr, yte)

            #Setting up model
            datagen = setDatagen(xtr, 15, 0.2, 0.1, 0.1) # rotation range, zoom range, width shift, height shift

            attempts = []
            for layers in range(2, 5):
                for fs in range(2, 3):
                    for kernel in range(2, 7, 2):

                        try:
                            model = generateModel(scale, birds, 32, fs, kernel, 1, 256, layers)

                            #Running model
                            print("****** Running Model ******")
                            history, model = runModel(model, xtr, ytr, xcr, ycr, 32, 35, 3, datagen) # Batch size, Epochs, Patience
                            loss, acc = testModel(model, xte, yte)

                            # Key order is kept stable because it becomes the CSV column order.
                            perf = {"birds": birds, "scale": scale, "layers": layers,
                                    "filter-scale": fs, "kernels": kernel,
                                    "test-accuracy": acc,
                                    "train-accuracy": history.history['accuracy'][-1],
                                    "val-accuracy": history.history['val_accuracy'][-1]}
                        except Exception as err:
                            # Some grid points cannot be built (e.g. the kernel size shrinks
                            # below 1). Skip them, but say so, and let KeyboardInterrupt /
                            # SystemExit propagate so the sweep can still be stopped.
                            print("*** Model Skipped ({}): {} ***".format(type(err).__name__, err))
                            continue

                        attempts.append(perf)
                        print("*** Model Completed ***")

            # One CSV per (bird count, scale) combination so an interrupted sweep can be resumed.
            df = pd.DataFrame(attempts)
            name = "birdscale{}.csv".format(filenum)
            df.to_csv(name,index=False)
            allAttempts.extend(attempts)

    return allAttempts
    
#attempts = runModelTests()

# Post-test result examination

In [None]:
def examineResults(pathString = "/kaggle/input/model-results/birdscale{}.csv", fileCount = 16):
    """
    Loads the per-combination result files written by the trials and merges them into one dataframe.
    Training and validation accuracy are multiplied by 100 so they are on the same scale as
    the test accuracy, and an "avg" column holding the mean of the three is added.
    Parameters:
        pathString: path template of the result files; "{}" is replaced by the file number
        fileCount: number of result files to read, numbered 1 through fileCount
    Returns:
        dataframe with one row per trained model
    """

    frames = [pd.read_csv(pathString.format(count), index_col=None, header=0)
              for count in range(1, fileCount + 1)]

    df = pd.concat(frames, axis=0, ignore_index=True)
    df["train-accuracy"] = df["train-accuracy"] * 100
    df["val-accuracy"] = df["val-accuracy"] * 100

    df["avg"] = (df["test-accuracy"] + df["train-accuracy"] + df["val-accuracy"]) / 3

    return df

# Load every saved trial result into a single dataframe; the cells below slice and plot it.
df = examineResults()

In [None]:
def separateBirdCounts(df):
    """
    Splits the performance data by the "birds" column into four dataframes,
    one each for 15, 20, 25 and 30 birds (returned in that order).
    Parameters:
        df: dataframe containing all performance data
    """
    subsets = [df[df["birds"] == count] for count in (15, 20, 25, 30)]
    return tuple(subsets)

# Split the combined results by the number of bird species each trial used.
birds15, birds20, birds25, birds30 = separateBirdCounts(df)

In [None]:
def groupsByPerf(group, data):
    """
    For each dataframe in `data`, averages the 'avg' column per distinct value of `group`.
    Parameters:
        group: Column to group by
        data: list of performance dataframes separated by bird count
    Returns:
        list of dataframes (one per input, same order), indexed by the group value, with the
        mean in 'avg' and the group value repeated as a string column named `group`
    """

    def summarise(frame):
        means = frame.groupby(group)['avg'].mean().to_frame()
        # Copy the index into a regular column, then cast that column to text.
        means[group] = means.index
        return means.astype({group: str})

    return [summarise(frame) for frame in data]

In [None]:
def plotSubs(dfs, x, title, xlab):
    """
    Draws a 2x2 grid of bar charts, one panel per bird-count subset
    (15, 20, 25 and 30 birds), each plotting the "avg" column against column `x`.
    Parameters:
        dfs: list of the four dataframes
        x: name of the column we're measuring against performance
        title: title for entire plot
        xlab: x label for the subplots
    """

    [first, second, third, fourth] = dfs

    fig, (topRow, bottomRow) = plt.subplots(2, 2, figsize = (7,7))
    (topLeft, topRight) = topRow
    (bottomLeft, bottomRight) = bottomRow
    grid = ((topLeft, topRight), (bottomLeft, bottomRight))
    fig.suptitle(title)

    panels = zip((topLeft, topRight, bottomLeft, bottomRight),
                 (first, second, third, fourth),
                 ('15 Birds', '20 Birds', '25 Birds', '30 Birds'))
    for axis, frame, caption in panels:
        axis.bar(frame[x], frame["avg"])
        axis.set_title(caption)

    # Same y-range on every panel so the four charts can be compared directly.
    plt.setp(grid, ylim=(70, 85))
    fig.supxlabel(xlab)
    fig.supylabel("Average Model Performance")

# Plotting relationship between downward scale factor and average performance

In [None]:
# Mean "avg" score per downward scale factor, drawn as one panel per bird count.
scalePerfs = groupsByPerf("scale", [birds15, birds20, birds25, birds30])
plotSubs(scalePerfs, "scale",
         "Model Performance by Downward Scale Factor", "Downward Scale Factor")

# Plotting relationship between # of layers and average performance

In [None]:
# Mean "avg" score per number of layers, drawn as one panel per bird count.
layerPerfs = groupsByPerf("layers", [birds15, birds20, birds25, birds30])
plotSubs(layerPerfs, "layers",
         "Model Performance by # of Layers", "# of Layers")

# Plotting relationship between initial # of kernels and average performance

In [None]:
# Mean "avg" score per value of the "kernels" column, one panel per bird count.
# That column stores the `kernel` argument handed to generateModel, i.e. the
# kernel size of the first convolution layer.
kernelPerfs = groupsByPerf("kernels", [birds15, birds20, birds25, birds30])
plotSubs(kernelPerfs, "kernels",
         "Model Performance by Initial # of Kernels", "Initial # of Kernels")

# Taking the maximum performing parameters to test other independent variables in model construction

In [None]:
# Disabled second-round experiment: the whole cell is wrapped in a string literal so
# nothing in it runs. Before re-enabling, note that:
#   - `fs` (used for perf['filter-scale']) is never defined inside furtherTests, so the
#     first completed model would raise NameError;
#   - the `performance` dict literal lists the "kernels" key twice.
"""
def furtherTests(birds):
    
    paths = getPaths(birds)
    filenum = 1
    
    #Readying the data for use
    train, test, cross = getSets(paths, 3, birds, trackProgress = False)
    #print("****** Readying Images ******")
    train, test, cross = setsAsNumpyArrays(train, test, cross)
    xtr, ytr, xcr, ycr, xte, yte = separateXandY(train, test, cross)
    xtr, xcr, xte = normalizePixels(xtr, xcr, xte)
    ytr, ycr, yte = readyClassifArrs(ytr, ycr, yte)
    #Setting up model
    #print("****** Establishing Model ******")
    datagen = setDatagen(xtr, 15, 0.2, 0.1, 0.1) # rotation range, zoom range, width shift, height shift
    performance = {"birds": 30, "scale": 3, "layers": 4, "filter-scale":2, "kernels": 0,
                   "test-accuracy": 0, "train-accuracy": 0, "val-accuracy": 0, "filters": 0,
                    "kernels": 0, "kernel-scaling": 0, "dense": 0}
    attempts = []
    for filters in [32, 45]:
        for kernels in [6,8]:
            for ks in [1,2]:
                for dense in [256, 512, 1024]:
                        model = generateModel(3, birds, filters, 2, kernels, ks, dense, 4)
                        print("****** Running Model ******")
                        history, model = runModel(model, xtr, ytr, xcr, ycr, 32, 35, 3, datagen) # Batch size, Epochs, Patience
                        loss, acc = testModel(model, xte, yte)
                        
                        perf = performance.copy()
                        perf['kernels'] = kernels
                        perf['filters'] = filters
                        perf['kernel-scaling'] = ks
                        perf['dense'] = dense
                        perf['filter-scale'] = fs
                        perf['test-accuracy'] = acc
                        perf['train-accuracy'] = history.history['accuracy'][-1]
                        perf['val-accuracy'] = history.history['val_accuracy'][-1]
                        
                        attempts.append(perf)
                    
            df = pd.DataFrame(attempts)
            name = "SecondTest30Birds{}.csv".format(filenum)
            filenum += 1
            df.to_csv(name,index=False)
            attempts = []
#furtherTests(30)
"""

# Graphing performance over epochs

In [None]:
# Disabled plotting snippet, wrapped in a string literal so it does not run.
# Before re-enabling, note that:
#   - it reads `history`, which this notebook only creates as a local variable inside
#     runModel / runModelTests, so a training history has to be exposed at top level first;
#   - `epochs` is hard-coded to 35 points, the epoch count runModelTests passes to runModel;
#   - the right-hand panel is titled 'Testing Accuracy & Loss' but plots training and
#     cross-validation loss.
"""
epochs = [i for i in range(35)]
fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']
cross_acc = history.history['val_accuracy']
cross_loss = history.history['val_loss']
fig.set_size_inches(20,10)

ax[0].plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , cross_acc , 'ro-' , label = 'Cross Validation Accuracy')
ax[0].set_title('Training & Validation Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")

ax[1].plot(epochs , train_loss , 'g-o' , label = 'Training Loss')
ax[1].plot(epochs , cross_loss , 'r-o' , label = 'Cross Validation Loss')
ax[1].set_title('Testing Accuracy & Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Training & Validation Loss")
plt.show()
"""