## Set up the notebook with imports and constants

Install the pre-downloaded `efficientnet` package from a local Kaggle dataset, so the notebook also works without internet access.

In [None]:
#pip download efficientnet -d ./efficientnet
#import os
#from zipfile import ZipFile
#
#dirName = "./"
#zipName = "packages.zip"

## Create a ZipFile Object
#with ZipFile(zipName, 'w') as zipObj:
#    # Iterate over all the files in directory
#    for folderName, subfolders, filenames in os.walk(dirName):
#        for filename in filenames:
#            if (filename != zipName):
#                # create complete filepath of file in directory
#                filePath = os.path.join(folderName, filename)
#                # Add file to zip
#                zipObj.write(filePath)

In [None]:
# Install efficientnet offline: --no-index disables PyPI and --find-links points pip at the local package directory.
! pip install efficientnet --no-index --find-links=file:///kaggle/input/vgis9-2020-packages/efficientnet

In [None]:
# If a previously trained model is attached as an input dataset, copy it into the working directory.
! [ -f /kaggle/input/vgis2020model/bestmodel.h5 ] && cp /kaggle/input/vgis2020model/bestmodel.h5 /kaggle/working/bestmodel.h5

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import sys
from pathlib import Path
import random
import pickle

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.image import imread
import cv2
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint
import efficientnet.tfkeras as efn

from tqdm import tqdm

# Locations of the competition data, relative to the notebook's working directory.
input_dir = Path('../input')
dataset_dir = input_dir / 'landmark-recognition-2020'

test_image_dir = dataset_dir / 'test'
train_image_dir = dataset_dir / 'train'
train_label_path = dataset_dir / 'train.csv'
# Absolute path of the trained model; later cells skip training when this file exists.
bestmodel_path = Path('/kaggle/working/bestmodel.h5')
    
# Log levels, ordered from least (ERROR) to most (SPAM) verbose.
ERROR = 1
WARN = 2
INFO = 3
DEBUG = 4
SPAM = 5

# Selected log level. NOTE(review): not referenced in the visible cells - confirm it is still needed.
VERBOSITY = INFO

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Variables

In [None]:
validation_ratio = 0.2  # fraction of the training data held out for validation
batch_size = 16         # images per batch for the training/validation generators
max_epochs = 6          # number of epochs passed to model.fit

top_n = 1000            # keep only the top_n classes with the most samples
img_size = (256,256)    # target size the generators resize images to
seed = 496              # random seed used for shuffling the label frame

force_retrain = False   # set to True to train even if a saved model already exists

## Setting up some helper functions

In [None]:
def get_img_path(df, prepend=""):
    """Build the relative image path for every row of ``df``.

    Images are sharded into nested folders named after the first three
    characters of their id, e.g. id ``abcdef`` lives at ``a/b/c/abcdef.jpg``.

    Args:
        df: DataFrame with a string ``id`` column.
        prepend: Optional prefix (e.g. a base directory ending in ``/``).

    Returns:
        A pandas Series of path strings, aligned with ``df``.
    """
    ids = df.id
    shard_dirs = ids.str[0] + "/" + ids.str[1] + "/" + ids.str[2] + "/"
    return prepend + shard_dirs + ids + ".jpg"

In [None]:
def plot_history(history):
    """Plot the loss and accuracy curves recorded during training.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``model.fit``).

    Metrics are grouped by name: keys containing ``loss`` / ``acc`` are split
    into training and validation curves depending on whether they contain
    ``val``. Prints a message and returns early if no training loss exists.
    """
    logs = history.history

    def _select(token, validation):
        # Keys mentioning `token`, restricted to validation or training metrics.
        return [key for key in logs if token in key and ('val' in key) == validation]

    train_loss_keys = _select('loss', False)
    val_loss_keys = _select('loss', True)
    train_acc_keys = _select('acc', False)
    val_acc_keys = _select('acc', True)

    if not train_loss_keys:
        print('Loss is missing in history')
        return

    # Training loss is guaranteed to exist here, so it defines the epoch axis.
    epochs = range(1, len(logs[train_loss_keys[0]]) + 1)

    def _draw(keys, colour, caption):
        # One curve per key; the legend shows the final-epoch value.
        for key in keys:
            values = logs[key]
            plt.plot(epochs, values, colour, label=f"{caption} ({values[-1]:.5f})")

    # Loss figure
    plt.figure(1)
    _draw(train_loss_keys, 'b', 'Training loss')
    _draw(val_loss_keys, 'g', 'Validation loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy figure
    plt.figure(2)
    _draw(train_acc_keys, 'b', 'Training accuracy')
    _draw(val_acc_keys, 'g', 'Validation accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

## Data loading / pre-processing

In [None]:
# Load the training labels (later cells use its `id` and `landmark_id` columns) and preview the first rows.
train_labels = pd.read_csv(train_label_path)
train_labels.head(5)

## Data exploration

In [None]:
def check_for_test(csv_path=None, image_dir=None):
    """Check that the test image folder and the submission CSV list the same ids.

    Args:
        csv_path: CSV file with an ``id`` column. Defaults to
            ``dataset_dir / 'sample_submission.csv'``.
        image_dir: Directory searched recursively for ``*.jpg`` files.
            Defaults to ``test_image_dir``.

    Returns:
        True when every CSV id has an image and every image has a CSV id,
        otherwise False. Each mismatch is printed.
    """
    if csv_path is None:
        csv_path = dataset_dir / 'sample_submission.csv'
    if image_dir is None:
        image_dir = test_image_dir

    # Read ids as strings so all-digit ids keep their leading zeros.
    csv_ids = pd.read_csv(csv_path, dtype={'id': str}).id.values
    test_img_arr = [img.stem for img in Path(image_dir).glob("**/*.jpg")]

    # Sets give constant-time membership checks; the lists keep report order.
    csv_id_set = set(csv_ids)
    folder_id_set = set(test_img_arr)

    x = True
    for _id in csv_ids:
        if _id not in folder_id_set:
            x = False
            print(f"{_id} missing from folder")

    for img in test_img_arr:
        if img not in csv_id_set:
            x = False
            print(f"{img} missing from csv")
    return x

# x = "are" if check_for_test() else "aren't"
# print(f"All test images {x} listed in sample_submission.csv")
## All test images are listed in sample.csv. Will use that

In [None]:
# Summarise the dataset: number of distinct landmark classes and image counts per split.
test_df = pd.read_csv(dataset_dir/"sample_submission.csv")
class_count = len(train_labels["landmark_id"].unique())

train_image_count = len(train_labels.id.values)
test_image_count = len(test_df.id.values)

print(f'''Dataset info:
      \tUnique classes: {class_count:}
      \tImages  : {test_image_count + train_image_count :9,d}
      \t  test  : {test_image_count :9,d}
      \t  train : {train_image_count :9,d}
      ''')

In [None]:
# Make a dataframe sorted by amount of images 
df_by_samples = pd.DataFrame(train_labels['landmark_id'].value_counts())
df_by_samples.reset_index(inplace=True)
df_by_samples.columns=['landmark_id','count']


lt_5_cnt = len(df_by_samples.loc[df_by_samples['count'] < 5])
gt_5_lt_10_cnt = len(df_by_samples.loc[(df_by_samples['count'] > 5) & (df_by_samples['count'] < 10)])
lt_100_cnt = len(df_by_samples.loc[df_by_samples['count'] < 100]) 
print(f"""Classes with:
    <5 samples   : {lt_5_cnt}
    >5<10 samples: {gt_5_lt_10_cnt}
    <500 samples : {lt_100_cnt}""")

### Plotting a bar graph "histogram"

In [None]:
def plot_bars(data, edges, col=None):
    """Draw a bar chart of how many values fall into each bin.

    Args:
        data: Series of values, or a DataFrame when ``col`` is given.
        edges: Ascending bin edges. Bin ``i`` covers ``(edges[i], edges[i+1]]``,
            except the last bin, which is open-ended (``> edges[-2]``) to match
            its ``>x`` label; the final edge only marks where that bin starts.
        col: Column of ``data`` to bin; ``None`` bins ``data`` itself.
    """
    values = data if col is None else data[col]

    bins = {}
    last_idx = len(edges) - 2
    for idx in range(len(edges) - 1):
        lower, upper = edges[idx], edges[idx + 1]
        if idx == last_idx:
            # The label promises "everything above lower", so do not cap at the final edge.
            key = f">{lower}"
            in_bin = values > lower
        else:
            key = f">{lower} <={upper}"
            in_bin = (values > lower) & (values <= upper)
        bins[key] = int(in_bin.sum())

    plt.figure(figsize=(10, 3.5))
    plt.bar(list(bins.keys()), list(bins.values()), width=0.4)
    plt.ylabel('Count')

    
    

In [None]:
# Distribution of classes by number of samples; the edges define the bins shown on the x axis.
plot_bars(df_by_samples, [0,5,10,50,100,7000], 'count')

### Plotting random classes

In [None]:
def plot_n_img(dataset, n :int, drop_dupes=True, title=None):
    """Plot up to ``n`` randomly sampled training images in a square grid.

    Args:
        dataset: DataFrame with ``id`` and ``landmark_id`` columns.
        n: Number of images requested. Clamped to the number of rows
            available, so small datasets no longer raise in ``sample``.
        drop_dupes: When True, sample at most one image per landmark class.
        title: Optional title shown above every image.
    """
    pool = dataset.drop_duplicates(subset=['landmark_id']) if drop_dupes else dataset

    # sample() raises if asked for more rows than exist, so clamp first.
    n = min(n, len(pool))
    if n <= 0:
        return

    ids = pool.sample(n)
    paths = get_img_path(ids, str(train_image_dir.resolve())+'/').values
    grid_size = int(np.ceil(np.sqrt(len(paths))))

    fig = plt.figure(figsize=(grid_size*3, grid_size*3))
    for idx, path in enumerate(paths):
        ax = fig.add_subplot(grid_size, grid_size, idx+1)
        ax.imshow(imread(path))
        if title is not None:
            ax.set_title(title)

    fig.tight_layout()
    plt.show()

def plot_img_from_class(dataset, class_id :int, n :int):
    """Plot ``n`` randomly sampled images of landmark ``class_id``, titled with the class id."""
    in_class = dataset['landmark_id'] == class_id
    plot_n_img(dataset.loc[in_class], n, drop_dupes=False, title=str(class_id))
    
    
    

In [None]:
# Show 16 random training images, at most one per landmark class (drop_dupes defaults to True).
plot_n_img(train_labels, 16)

Because there are so many classes with only a few samples, which could cause problems during training, we take a subset of the dataset and use only the top 1000 classes by sample count.

In [None]:


# Keep only the top_n most populous classes (df_by_samples is sorted by count, descending).
df_by_samples = df_by_samples.head(top_n)
full_train = train_labels.copy() # Make copy for later testing
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
train_labels = train_labels[train_labels.landmark_id.isin(df_by_samples['landmark_id'])].copy()
print(df_by_samples.tail(1))
print(train_labels.shape)

As can be seen, taking the top classes results in classes having at least 59 samples per class, while still leaving us with over half a million images

## Split data into training and validation sets

In [None]:
# Relative image path for every row, as expected by flow_from_dataframe(x_col=...).
train_labels["path"] = get_img_path(df=train_labels)
# Class labels as strings, used as the y_col of the generators.
train_labels["label"] = train_labels["landmark_id"].astype(str)

In [None]:
def get_genny(data, x_col, y_col, base_dir :str, target_size=(256,256), batch_size=32, validation_ratio=0.0, subset=None, seed=496, shuffle=None):
    """Create a Keras dataframe iterator over the images listed in ``data``.

    Args:
        data: DataFrame listing the images.
        x_col: Column holding image paths relative to ``base_dir``.
        y_col: Column holding class labels, or ``None`` for unlabeled data.
        base_dir: Directory the paths in ``x_col`` are relative to.
        target_size: Size images are resized to.
        batch_size: Number of images per batch.
        validation_ratio: Fraction reserved for validation. A value > 0 also
            switches the iterator to categorical (one-hot) labels.
        subset: ``"training"``, ``"validation"`` or ``None``.
        seed: Random seed for shuffling.
        shuffle: Whether to shuffle the sample order. ``None`` (default)
            shuffles labeled iterators, as before, but keeps unlabeled ones in
            dataframe order so ``model.predict`` output lines up with ``data``.

    Returns:
        The iterator returned by ``ImageDataGenerator.flow_from_dataframe``.
    """
    gen = ImageDataGenerator(validation_split=validation_ratio)
    #gen = ImageDataGenerator(validation_split=validation_ratio, horizontal_flip=True)  # Introduce random flips
    #gen = ImageDataGenerator(validation_split=validation_ratio, zoom_range=0.1)  # Random zoom of up to 10%

    class_mode = "categorical" if validation_ratio > 0 else None

    if shuffle is None:
        # Without labels, predictions can only be matched to rows by position,
        # so the order must stay fixed.
        shuffle = class_mode is not None

    genny = gen.flow_from_dataframe(
        data,
        directory = base_dir,
        x_col=x_col,
        y_col=y_col,
        target_size=target_size,
        batch_size=batch_size,
        subset=subset,
        class_mode=class_mode,
        validate_filenames=False,
        shuffle=shuffle,
        seed=seed
    )
    return genny

In [None]:
# flow_from_dataframe() only shuffles after it has split the data, which would leave the
# training and validation sets with different classes, so shuffle the rows up front.
train_labels = train_labels.sample(frac=1, random_state=seed).reset_index(drop=True)

# Both generators share every setting except the subset they draw from.
split_args = dict(
    x_col="path",
    y_col="label",
    base_dir=str(train_image_dir),
    target_size=img_size,
    batch_size=batch_size,
    validation_ratio=validation_ratio,
)
train_gen = get_genny(train_labels, subset="training", **split_args)
valid_gen = get_genny(train_labels, subset="validation", **split_args)

print("Split training set into a training and validation set")

In [None]:
# Build the network only when training is going to happen (no saved model, or retraining forced).
if not bestmodel_path.exists() or force_retrain:
    model = tf.keras.Sequential([
        # Pre-trained EfficientNetB2 backbone without its classification head.
        efn.EfficientNetB2(
            # Derived from img_size so the model always matches what the generators produce.
            input_shape=(*img_size, 3),
            weights='imagenet',
            include_top=False
        ),
        tf.keras.layers.GlobalAveragePooling2D(),
        # One softmax output per retained landmark class.
        tf.keras.layers.Dense(top_n, activation='softmax')
    ])

In [None]:
if force_retrain or not bestmodel_path.exists():
    # Adam is used because it is widely adopted and combines the ideas of momentum and RMSProp.
    normal_adam = tf.keras.optimizers.Adam()
    # Alternative optimizer with a higher learning rate; kept for experiments, not used below.
    modified_adam = tf.keras.optimizers.Adam(learning_rate=0.005)

    # categorical_crossentropy fits the one-hot, many-class labels produced by the generators.
    model.compile(
        optimizer=normal_adam,
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )


In [None]:
image_count = len(train_labels)

# Number of whole batches in each split; a trailing partial batch is dropped.
train_steps = int((image_count * (1 - validation_ratio)) // batch_size)
valid_steps = int((image_count * validation_ratio) // batch_size)

if force_retrain or not bestmodel_path.exists():
    print(f"Fitting model over {max_epochs} epochs with {train_steps} training steps and {valid_steps} validation steps.")

    # Keep only the best model seen so far (save_best_only=True).
    # NOTE(review): this relative path equals bestmodel_path only when the working directory is /kaggle/working.
    model_checkpoint = ModelCheckpoint("bestmodel.h5", save_best_only=True, verbose=1)

    hist = model.fit(
        train_gen,
        steps_per_epoch=train_steps,
        epochs=max_epochs,
        validation_data=valid_gen,
        validation_steps=valid_steps,
        callbacks=[model_checkpoint],
    )
    plot_history(hist)

## Checking the classifier on validation set

In [None]:
def one_hot_to_labels(pred, class_map=None):
    """Convert one-hot / softmax predictions to labels with their probability.

    Args:
        pred: Array of class scores; the last axis indexes the classes.
        class_map: Sequence or dict mapping class index to label. When omitted,
            it is rebuilt from ``train_labels.landmark_id`` in the order Keras
            assigns class indices.

    Returns:
        Tuple ``(pred_labels, pred_prob)``: the label of the highest-scoring
        class per sample, and that class's score.
    """
    pred_idx = np.argmax(pred, axis=-1) # Index of the highest-scoring class along the last axis

    if class_map is None:
        # The generators are fed labels as strings, and Keras assigns class
        # indices in sorted (lexicographic) string order, so '1000' comes
        # before '999'. Sorting numerically, as np.unique does, would map
        # indices to the wrong ids; sorting by str() matches the training order.
        class_map = sorted(np.unique(train_labels.landmark_id.values), key=str)

    pred_labels = [class_map[idx] for idx in pred_idx]
    pred_prob = np.max(pred, axis=-1)

    return pred_labels, pred_prob
    

In [None]:
# Load the saved model from the current working directory.
# NOTE(review): relative path - matches bestmodel_path only when the working directory is /kaggle/working.
best_model = tf.keras.models.load_model("bestmodel.h5")

Get a general evaluation of the trained model's performance on the validation set.

In [None]:
# Iterator over the validation subset, one image per batch, so each prediction can be paired with its true label.
test_gen = get_genny(train_labels, "path", "label", str(train_image_dir), img_size, 1, validation_ratio, "validation") # Validation set but with batch-size 1
#scores = best_model.evaluate(x=test_gen)
#print(f"Validation set classifies with a loss of: {scores[0]} and a categorical_accuracy of {scores[1]}]")

Get predictions on the validation set to allow more exploration of the results

In [None]:
# Restart the iterator so the loop below begins at the first batch.
test_gen.reset()

class_map = {idx: name for name, idx in test_gen.class_indices.items()} # Flip the mapping to get the names from idx


# Optional cache of earlier results; it is only used when quick_run is True.
results_pickle = Path('../input/vgis2020-pickles/results.p')
quick_run = False


if results_pickle.is_file() and quick_run:
    # SECURITY: pickle.load can execute arbitrary code - only load pickle files from a trusted source.
    with results_pickle.open('rb') as f:
        results = pickle.load(f)
else:
    # Each entry is [true label, predicted label, probability of the predicted label].
    results = []
    for step in tqdm(range(len(test_gen))):
        # Batch size is 1, so X holds one image and y its one-hot label.
        X, y = next(test_gen)
        pred = best_model.predict(X)
    
        pred_idx = np.argmax(pred)
        true_idx = np.argmax(y)
        pred_prob = np.max(pred)
    
        results.append([class_map[true_idx], class_map[pred_idx], pred_prob])

    # Save the results to the working directory so they can be reused later.
    with open('results.p', 'wb') as f:
        pickle.dump(results, f)

In [None]:
# Rows whose predicted label differs from the true label.
result_columns = ['target', 'predicted', 'probability']
errs = pd.DataFrame(
    [row for row in results if row[0] != row[1]],
    columns=result_columns,
)

error_rate = (len(errs) / len(results)) * 100
print(f"Testing on the validation set gives {error_rate:0.2f}% incorrectly classified landmarks")


In [None]:
results_df = pd.DataFrame(results, columns = ['target', 'predicted', 'probability'])

# Rank classes by number of CORRECT predictions. Counting every target row
# would only measure how frequent a class is in the validation set.
correct = results_df[results_df['target'] == results_df['predicted']]
results_ranked = (
    correct['target']
    .value_counts()
    .rename_axis('class')
    .reset_index(name='count')
)

# Rank classes by how often they were misclassified (errs is already a DataFrame).
class_err = (
    errs['target']
    .value_counts()
    .rename_axis('class')
    .reset_index(name='count')
)

In [None]:
if class_err.empty:
    # Nothing was misclassified, so there is no worst class to inspect.
    print("No misclassified landmarks to show")
else:
    print(f"The top 5 worst classified classes were {class_err.head(5).iloc[:,0].values} with {class_err.head(5).iloc[:,1].values} misclassifications respectively")
    print(f"A few pictures from the worst prediced class {class_err.iloc[0,0]} have been plotted, as well as some from classes it was mistaken as")

    worst_class = class_err.iloc[0,0]
    confused_with = errs.loc[errs['target'] == worst_class, 'predicted'].drop_duplicates()
    # sample() raises when asked for more rows than exist, so take at most 3.
    mistaken_as = confused_with.sample(min(3, len(confused_with))).values

    plot_img_from_class(train_labels, int(worst_class), 2)

    for mistake in mistaken_as:
        plot_img_from_class(train_labels, int(mistake), 1)



> 

In [None]:
# The five highest-ranked classes: class id in column 0, count in column 1.
top_five = results_ranked.head(5)
top_classes_list = top_five.iloc[:,0].values

print(f"The top 5 correctly classified classes were {top_classes_list} with {top_five.iloc[:,1].values} classifications respectively")
print("A few pictures from the classes have been plotted")

for class_id in top_classes_list:
    plot_img_from_class(train_labels, int(class_id), 2)


# Submission Generation

Here we will run the test images through the trained model and generate a submission.csv


In [None]:
# Load the ids to predict and derive each image's relative path.
sub_df = pd.read_csv(dataset_dir / "sample_submission.csv")
sub_df["path"] = get_img_path(sub_df)

# Unlabeled iterator over the test images, one image per batch.
# NOTE(review): predictions are matched to sub_df rows by position, so the iterator must not shuffle - confirm.
test_gen = get_genny(sub_df, "path", None, str(test_image_dir), img_size, 1)
predictions = best_model.predict(test_gen, verbose=1)

Convert from the one-hot encoding back to categorical labels with probabilities.

In [None]:
# The model's output index i corresponds to the i-th class in the order Keras
# assigned during training. The labels were strings, which Keras sorts
# lexicographically ('1000' before '999'), so the ids are sorted by their
# string form here. A numeric sort would attach the wrong landmark id to most predictions.
classes = np.array(sorted(np.unique(train_labels.landmark_id.values), key=str))

# Passing the class order explicitly decodes every prediction with the mapping above.
predicted_labels, prediction_prob = one_hot_to_labels(predictions, classes)

print(classes.shape)
print(np.shape(predicted_labels))

print(f"{predicted_labels[0]}: {prediction_prob[0]}")

Save predictions as submission

In [None]:
# Submission format: "<landmark_id> <confidence>" for every test image.
result = [
    " ".join((str(label), str(prob)))
    for label, prob in zip(predicted_labels, prediction_prob)
]

In [None]:
sub_df["landmarks"] = result
# drop() returns a new frame, so the result must be assigned back or the helper
# "path" column ends up in the submission. errors="ignore" keeps the cell re-runnable.
sub_df = sub_df.drop(columns="path", errors="ignore")

sub_df.to_csv("submission.csv", index=False)