In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Preview only the first few input files: the mounted dataset may contain a very
# large number of images, and printing every path would flood the cell output.
MAX_LISTED_FILES = 20
input_file_paths = (os.path.join(dirname, filename)
                    for dirname, _subdirs, filenames in os.walk('/kaggle/input')
                    for filename in filenames)
# zip() stops as soon as range() is exhausted, so at most MAX_LISTED_FILES paths are walked
for _position, input_file_path in zip(range(MAX_LISTED_FILES), input_file_paths):
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import cv2
import shutil
import numpy as np
import pandas as pd
from scipy import spatial
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# Turn off pandas' chained-assignment warning for the whole notebook
pd.set_option('mode.chained_assignment', None)

In [None]:
# Dataset locations
TRAIN_DIR = '../input/landmark-recognition-2020/train'
TRAIN_CSV = '../input/landmark-recognition-2020/train.csv'
train_df = pd.read_csv(TRAIN_CSV)


def _image_path(image_id):
    # Images sit in nested folders named after the first three characters of their id
    return os.path.join(TRAIN_DIR, f'{image_id[0]}/{image_id[1]}/{image_id[2]}/{image_id}.jpg')


# One full image path per row of the training table
TRAIN_PATHS = list(map(_image_path, train_df['id']))
train_df['path'] = TRAIN_PATHS

train_df

In [None]:
# Number of training images per landmark (columns renamed by position so the
# result is the same regardless of how value_counts() names its output)
train_df_grouped = train_df['landmark_id'].value_counts().reset_index()
train_df_grouped.columns = ['landmark_id', 'count']

# Select landmarks by in-class frequency: keep those with 150 to 155 images (inclusive)
MIN_IMAGES_PER_LANDMARK = 150
MAX_IMAGES_PER_LANDMARK = 155
selected_landmarks = train_df_grouped[(train_df_grouped['count'] >= MIN_IMAGES_PER_LANDMARK) &
                                      (train_df_grouped['count'] <= MAX_IMAGES_PER_LANDMARK)]

# .copy() so that adding a column below does not write into a slice of train_df
train_df_sub = train_df[train_df['landmark_id'].isin(selected_landmarks['landmark_id'])].copy()

# Relabel landmarks with contiguous class indices 0 .. NUM_CLASSES - 1, numbered in
# order of first appearance. Unlike a "bump the counter whenever the id changes" loop,
# factorize gives one index per landmark even if its rows are not contiguous.
train_df_sub['new_id'] = pd.factorize(train_df_sub['landmark_id'])[0]

NUM_CLASSES = train_df_sub['landmark_id'].nunique()
assert train_df_sub['new_id'].nunique() == NUM_CLASSES, "new_id must map one-to-one onto landmark_id"

print(f"Unique classes found: {NUM_CLASSES}")
train_df_sub

In [None]:
# Options shared by both splits
split_options = dict(random_state = 123, shuffle = True)

# First split: 90% training / 10% validation, stratified by class
X_train, X_val, y_train, y_val = train_test_split(
    train_df_sub[['id', 'path']],
    train_df_sub['new_id'],
    train_size = 0.9,
    stratify = train_df_sub['new_id'],
    **split_options)

# Held-out test set for inference:
# a further 95/5 split of the training portion, again stratified by class
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    train_size = 0.95,
    stratify = y_train,
    **split_options)

# Every selected image must land in exactly one of the three sets
n_split_rows = sum(part.shape[0] for part in (X_train, X_val, X_test))
assert n_split_rows == train_df_sub.shape[0]

for split_name, features, labels in (("Training", X_train, y_train),
                                     ("Validation", X_val, y_val),
                                     ("Test", X_test, y_test)):
    print(f"{split_name} data shape: {features.shape}")
    print(f"{split_name} label shape: {labels.shape}")

In [None]:
# Number of distinct classes present in each label set
for labels_name, labels in (("y_train", y_train), ("y_val", y_val), ("y_test", y_test)):
    print(f"Unique classes on {labels_name}: {labels.nunique()}")

In [None]:
# Class distribution of the training, validation and test labels (one figure each)
for split_name, labels in (("training", y_train), ("validation", y_val), ("test", y_test)):
    fig, ax = plt.subplots(figsize = (10, 3))
    sns.histplot(labels, bins = 75, kde = True, ax = ax)
    ax.set_title(f"Distribution of Landmarks on {split_name} set")
    fig.tight_layout()
plt.show()

In [None]:
NEW_BASE_DIR = "/kaggle/working"


def copy_split_to_class_dirs(split_name, image_ids, image_paths, labels):
    """Copy the images of one split into `<NEW_BASE_DIR>/<split_name>/<label>/<id>.jpg`.

    Arguments:
    split_name: name of the folder created under NEW_BASE_DIR (e.g. "train_sub")
    image_ids: image ids, used as the destination file names
    image_paths: source path of each image, aligned with image_ids
    labels: class label of each image, used as the sub-folder name
    """
    rows = zip(image_ids, image_paths, labels)
    # zip() has no length, so pass the total explicitly to get a real progress bar
    for image_id, src_path, landmark in tqdm(rows, total = len(labels), desc = split_name):
        class_dir = f"{NEW_BASE_DIR}/{split_name}/{landmark}"
        os.makedirs(class_dir, exist_ok = True)
        shutil.copyfile(src = src_path, dst = f"{class_dir}/{image_id}.jpg")


# Training set directory
copy_split_to_class_dirs("train_sub", X_train['id'], X_train['path'], y_train)

# Validation set directory
copy_split_to_class_dirs("val_sub", X_val['id'], X_val['path'], y_val)

# Test set directory
copy_split_to_class_dirs("test_sub", X_test['id'], X_test['path'], y_test)

In [None]:
from tensorflow.keras.utils import image_dataset_from_directory

IMG_SIZE = 224
BATCH_SIZE = 16

# Settings shared by the three datasets.
# NOTE(review): with label_mode='int' the integer labels presumably index the sorted
# class-folder names rather than equal the folder name itself -- confirm via train_ds.class_names.
dataset_options = dict(label_mode = 'int',
                       shuffle = True,
                       image_size = (IMG_SIZE, IMG_SIZE),
                       batch_size = BATCH_SIZE)

# Training tf.data.Dataset
print("Building training dataset...")
train_ds = image_dataset_from_directory(f"{NEW_BASE_DIR}/train_sub", **dataset_options)

# Validation tf.data.Dataset
print("Building validation dataset...")
val_ds = image_dataset_from_directory(f"{NEW_BASE_DIR}/val_sub", **dataset_options)

# Test tf.data.Dataset
print("Building test dataset...")
test_ds = image_dataset_from_directory(f"{NEW_BASE_DIR}/test_sub", **dataset_options)

In [None]:
# Show the first training batch as a grid, each image titled with its integer label
for data_batch, labels_batch in train_ds.take(1):
    ncols = 4
    nrows = data_batch.shape[0] // ncols
    fig, ax = plt.subplots(nrows = nrows,
                           ncols = ncols,
                           figsize = (10, 11),
                           sharex = True,
                           sharey = True)
    for position, (image, label) in enumerate(zip(data_batch, labels_batch)):
        axi = ax.flat[position]
        # Pixel values are divided by 255 before being handed to imshow
        axi.imshow(image/255.)
        axi.set_title(label.numpy())
plt.show()

In [None]:
# Augmentation pipeline; it is also used as the first stage of the model in build_model()
img_augmentation = tf.keras.Sequential(
    [
        # layers.RandomFlip("horizontal") is currently disabled
        layers.RandomTranslation(height_factor = 0.1, width_factor = 0.1),
        layers.RandomRotation(0.02),
        layers.RandomZoom(0.2),
    ],
    name = "img_augmentation",
)

# Preview: nine random augmentations of one image taken from the first training batch
plt.figure(figsize=(9, 9))
for image, label in train_ds.take(1):
    for i in range(9):
        plt.subplot(3, 3, i + 1)
        augmented_image = img_augmentation(image, training = True)
        # Use the last image of the batch: a hard-coded index (previously 15)
        # raises IndexError as soon as the batch holds fewer than 16 images
        plt.imshow(augmented_image[-1].numpy().astype("uint8"))
        plt.axis("off")

In [None]:
# Model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB0

# Folder where trained models are saved
MODELS_DIR = f"{NEW_BASE_DIR}/models"

os.makedirs(MODELS_DIR, exist_ok = True)

# Model instantiator
def build_model(num_classes = None):
    """Build and compile an EfficientNetB0 classifier with a 512-d embedding layer.

    The network is: input -> img_augmentation -> EfficientNetB0 (ImageNet weights,
    frozen) -> global average pooling -> batch norm -> dropout -> Dense(512)
    "embedding_512" -> Dense(num_classes, softmax).

    DROP_CONNECT_RATE, TOP_DROPOUT_RATE and ADAM_LR are read from module level
    when the function is called, so they must be defined before calling it.

    Arguments:
    num_classes: number of output classes (required, positive integer)

    Returns:
    The compiled tf.keras.Model (Adam optimizer, sparse categorical cross-entropy, accuracy).

    Raises:
    ValueError: if num_classes is missing or not positive.
    """
    # Fail early with a clear message instead of an obscure error inside the Dense layer
    if num_classes is None or int(num_classes) < 1:
        raise ValueError(f"num_classes must be a positive integer, got {num_classes!r}")

    inputs = keras.Input(shape = (IMG_SIZE, IMG_SIZE, 3))
    x = img_augmentation(inputs)

    # EfficientNetB0 backbone fed with the augmented images
    backbone = EfficientNetB0(input_tensor = x,
                              weights = 'imagenet',
                              include_top = False,
                              drop_connect_rate = DROP_CONNECT_RATE)

    # Freeze pretrained weights
    backbone.trainable = False

    # Rebuild top
    x = layers.GlobalAveragePooling2D(name = "avg_pool")(backbone.output)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(TOP_DROPOUT_RATE, name = "top_dropout")(x)

    # Embedding (no activation), followed by the classification head
    embedding = layers.Dense(512, name = "embedding_512")(x)
    outputs = layers.Dense(num_classes, activation = "softmax", name = "softmax")(embedding)

    # Compile
    model = tf.keras.Model(inputs, outputs, name = "EfficientNetB0")
    optimizer = tf.keras.optimizers.Adam(learning_rate = ADAM_LR)
    model.compile(optimizer = optimizer,
                  loss = "sparse_categorical_crossentropy",
                  metrics = ["accuracy"])

    return model

In [None]:
def plot_hist(hist):
    """Plot training and validation accuracy per epoch from a Keras History object."""
    # Training curve first, validation second, matching the legend order below
    for metric in ("accuracy", "val_accuracy"):
        plt.plot(hist.history[metric])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

# Regularisation
DROP_CONNECT_RATE = 0.2  # passed to EfficientNetB0 as drop_connect_rate
TOP_DROPOUT_RATE = 0.2   # dropout rate of the rebuilt top

# Optimisation
INIT_LR = 5e-3           # learning rate at step 0
EPOCHS = 5

# Learning-rate schedule for Adam: multiplied by 0.96 every 100 steps, in discrete jumps
ADAM_LR = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = INIT_LR,
    decay_steps = 100,
    decay_rate = 0.96,
    staircase = True)

model = build_model(NUM_CLASSES)

In [None]:
# Train the top of the network (embedding + softmax layers)
model_file_path = os.path.join(MODELS_DIR, "EfficientNetB0_softmax.keras")
callbacks = [
    # Keep only the weights of the epoch with the best validation accuracy
    keras.callbacks.ModelCheckpoint(model_file_path,
                                    save_best_only = True,
                                    monitor = "val_accuracy"),
    # Stop when validation accuracy has not improved for 2 consecutive epochs
    keras.callbacks.EarlyStopping(patience = 2,
                                  monitor = "val_accuracy")]

# No `shuffle` argument: fit() ignores it for tf.data.Dataset input, and train_ds
# is already built with shuffle=True
hist = model.fit(train_ds,
                 epochs = EPOCHS,
                 validation_data = val_ds,
                 callbacks = callbacks)

plot_hist(hist)

In [None]:
# Reload the checkpointed model and report its accuracy on the validation and test sets
model = keras.models.load_model(model_file_path)
for split_name, dataset in (("validation", val_ds), ("test", test_ds)):
    print(f"Predictions on {split_name} set...")
    # evaluate() returns [loss, accuracy]; index 1 is the accuracy
    print(f"{split_name.capitalize()} accuracy: {model.evaluate(dataset)[1]*100:.2f} %")

In [None]:
def get_image(path, resize = False, reshape = False, target_size = None):
    '''
    Read an image from disk and return it in RGB channel order.
    Arguments:
    path: path of the image file
    resize: if True, resize the image to (target_size, target_size)
    reshape: if True, return a tensor of shape [1, target_size, target_size, 3]
    target_size: side length in pixels; required when resize or reshape is True
    Return the image array (or a tensor with a leading batch axis when reshape is True)
    Raises FileNotFoundError if the file cannot be read, ValueError if target_size is missing
    '''
    if (resize or reshape) and target_size is None:
        raise ValueError("target_size is required when resize or reshape is True")

    img = cv2.imread(path)
    # cv2.imread signals a missing/unreadable file by returning None instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    # OpenCV loads images as BGR; convert to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if resize:
        img = cv2.resize(img, dsize = (target_size, target_size))
    if reshape:
        # Add the batch axis expected by the model
        img = tf.reshape(img, [1, target_size, target_size, 3])
    return img

# Get landmark samples
def get_landmark(landmark_id, samples = 16):
    '''
    Plot randomly chosen images of one landmark in a grid of 4 columns.
    Arguments:
    landmark_id: class index of the landmark (value of the 'new_id' column of train_df_sub)
    samples: number of images to show; capped at the number of images available
    Raises ValueError if train_df_sub has no image for landmark_id
    '''
    ncols = 4
    landmark_rows = train_df_sub[train_df_sub['new_id'] == landmark_id]
    if landmark_rows.empty:
        raise ValueError(f"No images found for landmark {landmark_id}")

    # Sampling without replacement cannot draw more images than the class contains
    samples = min(samples, len(landmark_rows))
    random_imgs = np.random.choice(landmark_rows.index, samples, replace = False)
    chosen = landmark_rows.loc[random_imgs]

    # Ceiling division so a sample count that is not a multiple of 4 still fits the grid
    nrows = max(1, -(-samples // ncols))

    plt.figure(figsize = (12, 10))
    plt.suptitle(f"Samples of landmark {landmark_id}", fontsize = 14, y = 0.94, weight = "bold")
    # Columns are read by name rather than by position in the row
    for i, (image_id, image_path) in enumerate(zip(chosen['id'], chosen['path'])):
        plt.subplot(nrows, ncols, i + 1)
        plt.imshow(get_image(image_path))
        plt.title(f"{image_id}")
        plt.axis("off")

# Get image embeddings
def get_embeddings(model, image_paths, input_size, as_df = True):
    '''
    Run every image through the model and collect the outputs.
    Arguments:
    model: model whose predict() output is stored as the embedding
    image_paths: paths of the images to embed (may be empty)
    input_size: side length the images are resized to before prediction
    as_df: if True return a pandas DataFrame, otherwise a dict of lists
    Return a table with 'images_paths' and 'embedded_images'; each embedding is the
    predict() output for a batch holding that single image
    '''
    image_paths = list(image_paths)
    embeddings = {'images_paths': [], 'embedded_images': []}

    if image_paths:
        # Folder two levels above the first image (.../<target_dir>/<class>/<file>)
        target_dir = os.path.split(os.path.split(image_paths[0])[0])[0]
        print(f"Retrieving embeddings for {target_dir} with {model.name}...")

    for image_path in tqdm(image_paths):
        image_batch = get_image(image_path,
                                resize = True,
                                reshape = True,
                                target_size = input_size)
        embeddings['images_paths'].append(image_path)
        # verbose = 0: predict() is called once per image, so its own progress output
        # would otherwise be repeated for every image next to the tqdm bar
        embeddings['embedded_images'].append(model.predict(image_batch, verbose = 0))

    if as_df:
        embeddings = pd.DataFrame(embeddings)

    return embeddings

# Get similarities between query key pair
def get_similarities(query, key):
    '''
    Get cosine similarity matrix between query and key pairs
    Arguments:
    query, key: embedded images; a pandas Series or a list whose items are arrays
                of equal size (e.g. shape (1, n_features) each)
    Return an array of shape (number of queries, number of keys) where entry [i, j]
    is the cosine similarity between query i and key j
    '''
    # Stack the embeddings and flatten each one to a row. Iterating (instead of
    # indexing with query[0]) works for any Series index and for plain lists.
    query_array = np.stack(list(query))
    query_array = query_array.reshape(query_array.shape[0], -1)
    key_array = np.stack(list(key))
    key_array = key_array.reshape(key_array.shape[0], -1)

    print(f"Getting pairwise {query_array.shape[0]} query: {key_array.shape[0]} key similarities...")
    # cdist computes all pairwise cosine distances in one call; similarity = 1 - distance
    similarity = 1 - spatial.distance.cdist(query_array, key_array, 'cosine')
    return similarity

# Plot top ranked images
def plot_similar(similar_imgs, img_paths):
    '''
    Plot top N similar samples from similarity index
    Arguments:
    similar_imgs: indices of the images to plot, used to look up img_paths
    img_paths: image paths; the parent folder name of each path is shown as landmark id
    '''
    ncols = 5
    # Ceiling division (at least one row) so any number of images fits the grid,
    # not only multiples of 5
    nrows = max(1, -(-len(similar_imgs) // ncols))

    plt.figure(figsize = (18, 6))
    for i, img in enumerate(similar_imgs):
        img_path = img_paths[img]
        plt.subplot(nrows, ncols, i + 1)
        plt.imshow(get_image(img_path))
        plt.title(f"Landmark id: {os.path.split(os.path.split(img_path)[0])[1]}")
        plt.axis("off")

In [None]:
# Sub-model that stops at the embedding layer of the trained classifier
embedding_layer = 'embedding_512'
embedding_output = model.get_layer(embedding_layer).output
embedding_model = tf.keras.Model(inputs = model.input,
                                 outputs = embedding_output,
                                 name = "EfficientNetB0_embed512")

# Image files behind the training and validation datasets
train_img_paths, val_img_paths = train_ds.file_paths, val_ds.file_paths

# Retrieving embeddings
train_embeddings = get_embeddings(embedding_model, train_img_paths, IMG_SIZE)
val_embeddings = get_embeddings(embedding_model, val_img_paths, IMG_SIZE)

In [None]:
# Only the last expression of a cell is rendered automatically, so show the preview explicitly
display(train_embeddings.head())

# Cosine similarity of every validation image (rows) against every training image (columns)
val_train_similarity = get_similarities(val_embeddings['embedded_images'],
                                        train_embeddings['embedded_images'])
val_train_similarity.shape

In [None]:
def confidence_top(query = None, key = None, similarity = None, query_image_index = None, top = 5):
    '''
    Return confidence scores for top N predictions
    Arguments:
    query: embeddings table of the query images (not used in the computation; kept so
           existing calls keep working)
    key: embeddings table of the key images; its 'images_paths' entries must look like
         .../<class id>/<file>, the class id folder being an integer
    similarity: similarity matrix with queries on the first axis and keys on the second
    query_image_index = index of query image on similarity matrix query axis
    top: number of most similar key images to report
    Return a DataFrame with one row per match, most similar first, with columns
    'top_similar' (key position), 'image_paths', 'prediction' (class id taken from the
    parent folder name) and 'cos_similarity'
    '''
    # Plain list so that lookups are positional whatever container/index `key` uses
    key_paths = list(key['images_paths'])
    scores = similarity[query_image_index]

    # Positions of the keys sorted by decreasing similarity, truncated to the top N
    similar_n = np.argsort(scores)[::-1][:top]

    confidence_df = {'top_similar': [],
                     'image_paths': [],
                     'prediction': [],
                     'cos_similarity': []}
    for similar in similar_n:
        similar_image_path = key_paths[similar]
        confidence_df['top_similar'].append(similar)
        confidence_df['image_paths'].append(similar_image_path)
        # The predicted class is the name of the folder that contains the image
        confidence_df['prediction'].append(int(os.path.split(os.path.split(similar_image_path)[0])[1]))
        confidence_df['cos_similarity'].append(scores[similar])

    return pd.DataFrame(confidence_df)

In [None]:
query_image_index = 0
top_n = 5

# Path of the queried validation image: .../<landmark id>/<image file>
query_path = val_embeddings['images_paths'][query_image_index]
query_dir, image_id = os.path.split(query_path)
query_landmark_id = os.path.basename(query_dir)

# Training images ranked by decreasing similarity to the query; keep the best top_n
ranked_train_imgs = np.argsort(val_train_similarity[query_image_index])[::-1]
similar_n = ranked_train_imgs[:top_n]

print(f"Queried image: {image_id}")
plt.figure(figsize = (6, 6))
plt.imshow(get_image(query_path))
plt.title(f"Landmark id: {query_landmark_id}")
plt.axis("off")
plot_similar(similar_n, train_embeddings['images_paths'])

In [None]:
# Top matches for the current query image
# (arguments in order: query, key, similarity, query_image_index, top)
confidence_df = confidence_top(val_embeddings,
                               train_embeddings,
                               val_train_similarity,
                               query_image_index,
                               top_n)

confidence_df

In [None]:
query_image_index = 4
top_n = 5

# Path of the queried validation image: .../<landmark id>/<image file>
query_path = val_embeddings['images_paths'][query_image_index]
query_dir, image_id = os.path.split(query_path)
query_landmark_id = os.path.basename(query_dir)

# Training images ranked by decreasing similarity to the query; keep the best top_n
ranked_train_imgs = np.argsort(val_train_similarity[query_image_index])[::-1]
similar_n = ranked_train_imgs[:top_n]

print(f"Queried image: {image_id}")
plt.figure(figsize = (6, 6))
plt.imshow(get_image(query_path))
plt.title(f"Landmark id: {query_landmark_id}")
plt.axis("off")
plot_similar(similar_n, train_embeddings['images_paths'])