### Documentation - In progress

In [None]:
# Create the scratch directory that receives the concatenated train embeddings.
!mkdir -p /tmp/concatenated-embeddings

In [None]:
# List the datasets attached to this notebook.
!ls /kaggle/input/

# Part 1: Extracting Embeddings 

In [None]:
# %%python

### Script to concatenate all embeddings based on given model votes


import numpy as np
import pandas as pd
import pickle
import glob
import os
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

# Relative weight ("votes") of every model in the ensemble. The weights are
# normalised to sum to 1 below.
model_votes = {
    'v2s-15epochs-800t800':1,
    'v2m-15epochs-640t640':1,
    'v2xl-15epochs-512t640':1,
    'v2m-15epochs-732t732': 1,
    'v2l-12epochs-720t720': 1
}

sample_submission = pd.read_csv('../input/landmark-retrieval-2021/sample_submission.csv')
# Any row count other than 1129 is treated as the private run.
# NOTE(review): presumably 1129 is the size of the public sample submission -- confirm.
IS_PRIVATE = sample_submission.shape[0]!=1129
print(IS_PRIVATE)

# Outside the private run a single model is used instead of the full ensemble.
if not IS_PRIVATE:
    model_votes = {
        'v2xl-15n2epochs-720t720':1
    }

# Split the mapping into two parallel lists: names and normalised weights.
model_names = list(model_votes.keys())
sum_votes = sum(model_votes.values())
model_votes = [model_votes[name]/sum_votes for name in model_names]

# Persist both lists so that later cells and the separate Python processes
# started by this notebook can reload them. The context managers make sure
# the files are flushed and closed before anything reads them.
with open("model_names.pkl", "wb") as names_file:
    pickle.dump(model_names, names_file)
with open("model_votes.pkl", "wb") as votes_file:
    pickle.dump(model_votes, votes_file)

def _precomputed_dir(model_name):
    """Return the input directory holding a model's precomputed train arrays."""
    # This one model's dataset directory is named without the image-size suffix.
    if model_name=='v2xl-15n2epochs-720t720':
        model_name = 'v2xl-15n2epochs'
    return f'../input/glr-precomputed-embed-{model_name}'


# One dataset is used only to enumerate the file names; every model's
# directory is then read with those same names.
npy_array_files = glob.glob('../input/glr-precomputed-embed-v2l-12epochs-720t720/*.npy')
npy_array_files = [os.path.basename(path) for path in npy_array_files]

for file in tqdm(npy_array_files):
    # Only the train arrays are processed here.
    if 'train' not in file:
        continue

    if 'name' not in file:
        # Embedding file: scale each model's block by its vote, concatenate
        # along the feature axis and L2-normalise every row.
        weighted_parts = [
            np.load(f'{_precomputed_dir(model_name)}/{file}')*vote
            for model_name, vote in zip(model_names, model_votes)
        ]
        array = normalize(np.concatenate(weighted_parts,axis=1),axis=1)
        del weighted_parts
    else:
        # Name file: every model must list the same images in the same order,
        # otherwise the concatenated rows would mix different images.
        array = None
        for model_name in model_names:
            names = np.load(f'{_precomputed_dir(model_name)}/{file}')
            if array is None:
                array = names
            elif not np.array_equal(array, names):
                raise ValueError(
                    f"{file}: image names of {model_name} differ from those of {model_names[0]}")
    np.save('/tmp/concatenated-embeddings/'+file,array)

In [None]:
# Show the concatenated embedding files that were just written (newest first).
!ls -lth /tmp/concatenated-embeddings/

In [None]:
# Total disk space used by the concatenated embeddings.
!du -sh /tmp/concatenated-embeddings/

In [None]:
import re
import os
import numpy as np
import pandas as pd
import glob
import pickle
from tqdm.notebook import tqdm
from sklearn.preprocessing import normalize

In [None]:
# Recompute the run-mode flag from the sample submission:
# any row count other than 1129 is treated as the private run.
sample_submission = pd.read_csv('../input/landmark-retrieval-2021/sample_submission.csv')
IS_PRIVATE = len(sample_submission) != 1129
print(IS_PRIVATE)

In [None]:
# In the private run every test/index image is used; otherwise only a small
# sub-tree of each image directory is globbed.
if IS_PRIVATE:
    test_images = glob.glob('../input/landmark-retrieval-2021/test/*/*/*/*.jpg')
    index_images = glob.glob('../input/landmark-retrieval-2021/index/*/*/*/*.jpg')
else:
    test_images = glob.glob('../input/landmark-retrieval-2021/test/0/0/*/*.jpg')
    index_images = glob.glob('../input/landmark-retrieval-2021/index/0/0/0/*.jpg')

print(len(test_images),len(index_images))

# Persist the path lists for get_embeddings.py, which runs in a separate
# process. The context managers guarantee the files are flushed and closed
# before that process reads them.
with open("test_images.pkl", "wb") as test_file:
    pickle.dump(test_images, test_file)
with open("index_images.pkl", "wb") as index_file:
    pickle.dump(index_images, index_file)

In [None]:
%%writefile get_embeddings.py

import sys
import tensorflow as tf
import tensorflow_hub as tfhub
from sklearn.preprocessing import normalize
import pickle
import numpy as np

# Side length (pixels) of the square input each known model is run at.
MODEL_IMAGE_SIZE_MAP = {
    'v2m-15epochs-640t640':640,
    'v2l-15epochs-512t640':640,
    'v2m-15epochs-800t800':800,
    'v2m-15epochs-732t732':732,
    'v2xl-15n2epochs-720t720':720,
    'v2s-15epochs-800t800':800,
    'v2l-10n4epochs-720t720':720,
    'v2xl-15epochs-512t640':640,
    'b6-15epochs-800t800':800,
    'v2l-12epochs-720t720':720
}

# The model to run is the single command-line argument. Fail with a readable
# message instead of a bare IndexError/KeyError when it is missing or unknown.
if len(sys.argv) < 2 or sys.argv[1] not in MODEL_IMAGE_SIZE_MAP:
    sys.exit("usage: python get_embeddings.py <model_name>; known models: "
             + ", ".join(sorted(MODEL_IMAGE_SIZE_MAP)))
model_name = sys.argv[1]
IMAGE_SIZE = MODEL_IMAGE_SIZE_MAP[model_name]
IMAGE_SIZE = [IMAGE_SIZE,IMAGE_SIZE]

strategy = tf.distribute.get_strategy()
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
print("REPLICAS: ", strategy.num_replicas_in_sync)
# Test-time augmentations. A trailing '_lr' / '_ud' requests a left-right /
# up-down flip (see test_time_augmentation).
TTA = ['rotate0',  'rotate0_lr']

# Image path lists written by the notebook before this script is launched.
with open("test_images.pkl",'rb') as test_file:
    test_images = pickle.load(test_file)
with open("index_images.pkl",'rb') as index_file:
    index_images = pickle.load(index_file)

def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 image of size IMAGE_SIZE, divided by 255."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

def read_image(image):
    """Read the file at path `image` and return it decoded by decode_image."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

def test_time_augmentation(img,tta=None):
    if tta:
        if tta[-3:]=='_lr':
            img = tf.image.flip_left_right(img)
            tta = tta[:-3]

        if tta[-3:]=='_ud':
            img = tf.image.flip_up_down(img)
            tta = tta[:-3]
    return img
    
def get_test_dataset(image_paths,tta=None):
    """Build a batched, prefetched dataset of decoded (and optionally augmented) images."""
    def augment(image):
        return test_time_augmentation(image,tta)

    paths_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    images_ds = paths_ds.map(read_image, num_parallel_calls = AUTO)
    augmented_ds = images_ds.map(augment, num_parallel_calls = AUTO)
    return augmented_ds.batch(BATCH_SIZE).prefetch(AUTO)


### Loading model from zoo
embed_model = tf.keras.models.load_model(f'../input/google-landmark-model-zoo/{model_name}.h5',
                           custom_objects={'KerasLayer': tfhub.KerasLayer})

print("Predicting on image size",IMAGE_SIZE)


def _tta_embeddings(image_paths):
    """Return L2-normalised embeddings of `image_paths`, summed over every TTA variant.

    Each variant's predictions are L2-normalised before being added, and the
    sum is L2-normalised once more.
    """
    summed = None
    for tta in TTA:
        dataset = get_test_dataset(image_paths,tta)
        variant = normalize(embed_model.predict(dataset,verbose=1),axis=1)
        if summed is None:
            summed = variant
        else:
            summed += variant
    return normalize(summed,axis=1)


index_embeddings = _tta_embeddings(index_images)
test_embeddings = _tta_embeddings(test_images)

# Create the output directory here so the script does not depend on the
# caller having made it beforehand.
import os
os.makedirs(f'/tmp/{model_name}', exist_ok=True)
np.save(f'/tmp/{model_name}/index_embeddings.npy',index_embeddings)
np.save(f'/tmp/{model_name}/test_embeddings.npy',test_embeddings)

In [None]:
# Reload the ensemble model names and their normalised votes from the pickle
# files written earlier, closing each file once it has been read.
with open("model_names.pkl",'rb') as names_file:
    model_names = pickle.load(names_file)
with open("model_votes.pkl",'rb') as votes_file:
    model_votes = pickle.load(votes_file)

In [None]:
# Run the embedding script once per ensemble model, each in its own Python
# process. The script saves its arrays under /tmp/{model_name}, so that
# directory is created first.
for model_name in model_names:
    print("Predicting for",model_name)
    !mkdir -p /tmp/{model_name}
    !python get_embeddings.py {model_name}

In [None]:
%%python

import numpy as np
import pickle
from sklearn.preprocessing import normalize

# Ensemble model names and their normalised votes, written earlier by the notebook.
with open("model_names.pkl",'rb') as names_file:
    model_names = pickle.load(names_file)
with open("model_votes.pkl",'rb') as votes_file:
    model_votes = pickle.load(votes_file)

# Scale each model's index embeddings by its vote, concatenate them along the
# feature axis and L2-normalise every row.
weighted_parts = [
    np.load(f'/tmp/{model_name}/index_embeddings.npy')*vote
    for model_name,vote in zip(model_names,model_votes)
]
index_embeddings = np.concatenate(weighted_parts,axis=1)
del weighted_parts  # release the per-model arrays before normalize() makes its copy
index_embeddings = normalize(index_embeddings,axis=1)
np.save('/tmp/index_embeddings.npy',index_embeddings)

In [None]:
%%python

import numpy as np
import pickle
from sklearn.preprocessing import normalize

# Ensemble model names and their normalised votes, written earlier by the notebook.
with open("model_names.pkl",'rb') as names_file:
    model_names = pickle.load(names_file)
with open("model_votes.pkl",'rb') as votes_file:
    model_votes = pickle.load(votes_file)

# Scale each model's test embeddings by its vote, concatenate them along the
# feature axis and L2-normalise every row.
weighted_parts = [
    np.load(f'/tmp/{model_name}/test_embeddings.npy')*vote
    for model_name,vote in zip(model_names,model_votes)
]
test_embeddings = np.concatenate(weighted_parts,axis=1)
del weighted_parts  # release the per-model arrays before normalize() makes its copy
test_embeddings = normalize(test_embeddings,axis=1)
np.save('/tmp/test_embeddings.npy',test_embeddings)

In [None]:
# Delete the per-model embedding directories; the ensembled arrays have
# already been saved to /tmp/index_embeddings.npy and /tmp/test_embeddings.npy.
for model_name in model_names:
    !rm -r /tmp/{model_name}

In [None]:
# Total disk space currently used under /tmp.
!du -sh /tmp/

# Part 2: Nearest Neighbour Search

In [None]:
import gc
from cuml.neighbors import NearestNeighbors
# Directory the concatenated train embeddings and train image names were saved to in Part 1.
model_root = '/tmp/concatenated-embeddings/'

In [None]:
def _image_id(path):
    """Return the file name of `path` up to (not including) its first dot."""
    file_name = path.split('/')[-1]
    return file_name.split('.')[0]

# Ids are kept in the same order as the path lists.
index_image_ids = list(map(_image_id, index_images))
test_image_ids = list(map(_image_id, test_images))
print(len(index_image_ids),len(test_image_ids))

In [None]:
# Ensembled index embeddings saved by the concatenation cell above.
index_embeddings = np.load('/tmp/index_embeddings.npy')
print(index_embeddings.shape)

In [None]:
# Ensembled test embeddings saved by the concatenation cell above.
test_embeddings = np.load('/tmp/test_embeddings.npy')
print(test_embeddings.shape)

In [None]:
# Train metadata; landmark_map maps a train image id to its landmark id.
train_df = pd.read_csv('../input/glr-validation-metadata/train.csv')
landmark_map = train_df.set_index('id')['landmark_id'].to_dict()

## Pick top 300 neighbours from train set for each image in test and index set 

In [None]:
## For every index and test image, find the landmark_knn nearest train images
## inside each train part: 15 neighbours x 20 parts = 300 candidates per image.
landmark_knn=15
N_TRAIN_PARTS = 20          # number of train-names-{part} / train-predictions-{part} files
NON_LANDMARK_WEIGHT = 0.4   # scale of the distance adjustment derived from the non-landmark score

# Results are held in disk-backed arrays, one row per image and one group of
# landmark_knn columns per train part.
index_distances_file = '/tmp/index_distances.dat'
index_neighbours_file = '/tmp/index_neighbours.dat'
index_distances = np.memmap(index_distances_file, dtype='float32', mode='w+',
                            shape=(len(index_image_ids),landmark_knn*N_TRAIN_PARTS))
index_neighbours = np.memmap(index_neighbours_file, dtype='int32', mode='w+',
                             shape=(len(index_image_ids),landmark_knn*N_TRAIN_PARTS))

test_distances_file = '/tmp/test_distances.dat'
test_neighbours_file = '/tmp/test_neighbours.dat'
test_distances = np.memmap(test_distances_file, dtype='float32', mode='w+',
                            shape=(len(test_image_ids),landmark_knn*N_TRAIN_PARTS))
test_neighbours = np.memmap(test_neighbours_file, dtype='int32', mode='w+',
                             shape=(len(test_image_ids),landmark_knn*N_TRAIN_PARTS))


def _lookup_by_row(mapping, train_names, indices):
    """Return an array shaped like `indices` holding ``mapping[train_names[row]]``.

    Only the distinct rows that occur in `indices` are looked up in Python;
    the values are then broadcast back with NumPy indexing. This also works
    when `indices` is empty.
    """
    unique_rows, inverse = np.unique(indices, return_inverse=True)
    values = np.array([mapping[train_names[row]] for row in unique_rows])
    return values[inverse.reshape(-1)].reshape(indices.shape)


def _search_part(neighbors_model, query_embeddings, train_names, non_landmark_scores):
    """Query one fitted train part; return (adjusted distances, landmark ids)."""
    distances, indices = neighbors_model.kneighbors(query_embeddings)
    # NOTE(review): presumably guards against tiny negative cosine distances -- confirm.
    distances = np.abs(distances)
    neighbours = _lookup_by_row(landmark_map, train_names, indices)
    nlr_scores = _lookup_by_row(non_landmark_scores, train_names, indices)
    # A neighbour's distance is reduced by up to NON_LANDMARK_WEIGHT as its
    # non_landmark_confidence goes from 1 down to 0.
    return distances-NON_LANDMARK_WEIGHT*(1-nlr_scores), neighbours


for part in tqdm(range(N_TRAIN_PARTS)):
    train_names = np.load(model_root+f'train-names-{part}.npy')
    train_embed = np.load(model_root+f'train-predictions-{part}.npy')
    non_landmark_scores = pd.read_csv(f'../input/glr-non-landmark-scores-for-train-images/non_landmark_score-{part}.csv',
                      index_col='id').non_landmark_confidence.to_dict()
    neighbors_model = NearestNeighbors(n_neighbors = landmark_knn, metric = 'cosine')
    neighbors_model.fit(train_embed)

    # Columns of the output arrays that belong to this train part.
    part_columns = slice(part*landmark_knn, (part+1)*landmark_knn)
    for query_embeddings, out_distances, out_neighbours in (
            (index_embeddings, index_distances, index_neighbours),
            (test_embeddings, test_distances, test_neighbours)):
        distances, neighbours = _search_part(neighbors_model, query_embeddings,
                                             train_names, non_landmark_scores)
        out_distances[:,part_columns] = distances
        out_neighbours[:,part_columns] = neighbours
        del distances, neighbours
        gc.collect()

    # Free this part's data before loading the next one.
    del train_names, train_embed, neighbors_model, non_landmark_scores
    gc.collect()

In [None]:
# Output directories for the per-batch landmark CSV files written by the next two cells.
!mkdir -p /tmp/index_landmarks/
!mkdir -p /tmp/test_landmarks/

## Pick top 5 landmarks for each index image

In [None]:
BATCHSIZE = 10000 # Higher Batchsize -> Faster code, high chances of memory error
for start_idx in tqdm(range(0,len(index_distances),BATCHSIZE)):
    stop_idx = start_idx+BATCHSIZE
    batch_dist = index_distances[start_idx:stop_idx]
    batch_lmk = index_neighbours[start_idx:stop_idx]
    batch_ids = index_image_ids[start_idx:stop_idx]
    per_image = batch_dist.shape[1]

    # One row per (image, train neighbour): the neighbour's landmark and distance.
    flat = np.stack([batch_lmk.reshape(-1),batch_dist.reshape(-1)],axis=1)
    candidates = pd.DataFrame(flat,columns=['landmark','distance'])
    candidates['image_id'] = [batch_ids[pos] for pos in candidates.index//per_image]

    # Keep the two closest neighbours of each (image, landmark) pair and add
    # up their (1 - distance) as that landmark's confidence.
    closest_two = candidates.sort_values('distance').groupby(['image_id','landmark']).head(2)
    del flat,candidates
    closest_two['confidence'] = 1-closest_two['distance']
    scored = (closest_two.groupby(['image_id','landmark']).confidence.sum()
              .sort_values(ascending=False).reset_index())

    # The best landmark of every image gets a flat 0.5 bonus.
    bonus = scored.groupby('image_id').head(1).sort_values(['image_id','landmark'])
    bonus['confidence'] = 0.5
    scored = pd.concat([scored,bonus])
    scored = (scored.groupby(['image_id','landmark']).confidence.sum()
              .sort_values(ascending=False).reset_index())

    # Keep the five highest-scoring landmarks per image.
    top_five = scored.groupby('image_id').head(5).sort_values(['image_id','landmark'])

    top_five.to_csv(f'/tmp/index_landmarks/index_landmarks_{start_idx}.csv')
    del closest_two,scored,bonus,top_five
    gc.collect()

## Pick top 5 landmarks for each test image

In [None]:
BATCHSIZE = 10 # Higher Batchsize -> Faster code, high chances of memory error
for start_idx in tqdm(range(0,len(test_distances),BATCHSIZE)):
    window = slice(start_idx,start_idx+BATCHSIZE)
    window_distances = test_distances[window]
    window_landmarks = test_neighbours[window]
    window_ids = test_image_ids[window]
    n_candidates = window_distances.shape[1]

    # Long table with one row per (test image, train neighbour).
    table = pd.DataFrame(
        np.stack([window_landmarks.reshape(-1),window_distances.reshape(-1)],axis=1),
        columns=['landmark','distance'])
    table['image_id'] = [window_ids[row//n_candidates] for row in table.index]

    # Per (image, landmark): confidence is the sum of (1 - distance) over
    # the two closest neighbours.
    nearest = table.sort_values('distance').groupby(['image_id','landmark']).head(2)
    del table
    nearest['confidence'] = 1-nearest['distance']
    ranking = nearest.groupby(['image_id','landmark']).confidence.sum()
    ranking = ranking.sort_values(ascending=False).reset_index()

    # Add 0.5 to the top-ranked landmark of each image, then re-rank.
    leaders = ranking.groupby('image_id').head(1).sort_values(['image_id','landmark'])
    leaders['confidence'] = 0.5
    ranking = pd.concat([ranking,leaders])
    ranking = ranking.groupby(['image_id','landmark']).confidence.sum()
    ranking = ranking.sort_values(ascending=False).reset_index()

    # Save the five best landmarks per image for this batch.
    best = ranking.groupby('image_id').head(5).sort_values(['image_id','landmark'])
    best.to_csv(f'/tmp/test_landmarks/test_landmarks_{start_idx}.csv')
    del nearest,ranking,leaders,best
    gc.collect()

In [None]:
# File names (not full paths) of the per-batch landmark CSVs written above.
test_landmarks_paths = os.listdir('/tmp/test_landmarks/')
index_landmarks_paths = os.listdir('/tmp/index_landmarks/')

In [None]:
# Read every per-batch index CSV and stack them into one frame.
index_landmarks = pd.concat([
    pd.read_csv('/tmp/index_landmarks/'+index_landmarks_path,index_col=0)
    for index_landmarks_path in tqdm(index_landmarks_paths)
])
# Suffix every column except the join key 'landmark' with '_index'.
index_landmarks.columns = [col if col=='landmark' else col+'_index'
                           for col in index_landmarks.columns]
print(index_landmarks.shape)

In [None]:
# Outside the private run, landmark ids are reduced modulo 10 (the test
# landmarks get the same treatment in the retrieval loop below).
# NOTE(review): presumably so the small public sample still produces
# test/index landmark matches -- confirm.
if not IS_PRIVATE:
    index_landmarks['landmark'] = (index_landmarks['landmark']%10).astype(int)

## Finding Direct Neighbours

In [None]:
def db_aug(V, n_neighbors=3):
    """Replace each row of `V` by a weighted average of its nearest rows.

    `V` is L2-normalised, then a cosine nearest-neighbour search of `V`
    against itself picks `n_neighbors` rows per row. Each neighbour is
    weighted by ``sqrt(clip(2 - distance, 0, 2))`` and the weighted sum is
    divided by the sum of the weights. The result is not re-normalised.
    """
    V = normalize(V,axis=1)

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    knn.fit(V)
    distances, indices = knn.kneighbors(V)

    w = np.power(np.clip(2.0 - distances, 0, 2.0), 0.5)

    # Accumulate one neighbour rank at a time so that only a single
    # (rows x features) temporary exists at any moment.
    blended = None
    for rank in range(n_neighbors):
        contribution = w[:, rank, None]*V[indices[:, rank]]
        if blended is None:
            blended = contribution
        else:
            blended += contribution
    blended /= w.sum(axis=1)[:, None]

    return blended

In [None]:
# Augment index and test embeddings together, then keep only the augmented
# test rows; index_embeddings is deliberately left as loaded.
n_index = index_embeddings.shape[0]
all_embeddings = np.concatenate([index_embeddings,test_embeddings])
all_embeddings = db_aug(all_embeddings, n_neighbors=2)
test_embeddings = all_embeddings[n_index:]
print(index_embeddings.shape,test_embeddings.shape)

In [None]:
# Retrieve up to 100 index images per test image (fewer if the index set is smaller).
KNN = min(len(index_embeddings),100)
neighbors_model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
neighbors_model.fit(index_embeddings)
distances, indices = neighbors_model.kneighbors(test_embeddings)
# NOTE(review): presumably guards against tiny negative cosine distances -- confirm.
distances = np.abs(distances)

In [None]:
# Long table of direct retrievals: one row per (test image, retrieved index image).
# The columns are built separately so the distances stay numeric; stacking them
# with the id strings into one array would turn them into strings.
direct_neighbours = pd.DataFrame({
    'image_id_index': np.take(np.array(index_image_ids),indices).reshape(-1),
    'direct_distance': distances.reshape(-1),
})
# Rows are laid out test image by test image, KNN rows each.
direct_neighbours['image_id_test'] = np.take(test_image_ids,direct_neighbours.index.values//KNN)
direct_neighbours['direct_confidence'] = (1-direct_neighbours.direct_distance.astype(float)).clip(0,1)

## Finding Bridged Neighbours 

### Ensembling direct & bridged neighbours with power average 

In [None]:
# Combine, per batch of test images, the direct neighbours with the
# "bridged" ones: index images that share a predicted landmark with the test image.
retrieval_results = []
for test_landmarks_path in tqdm(test_landmarks_paths):
    test_landmarks = pd.read_csv('/tmp/test_landmarks/'+test_landmarks_path,index_col=0)
    test_image_ids_subset = test_landmarks.image_id.unique()
    direct_neighbours_subset = direct_neighbours[direct_neighbours.image_id_test.isin(test_image_ids_subset)]
    if not IS_PRIVATE:
        test_landmarks['landmark'] = (test_landmarks['landmark']%10).astype(int)
    test_landmarks.columns = [x+'_test' if x!='landmark' else x for x in test_landmarks.columns]

    # Bridge test and index images through shared landmarks. The pair's
    # confidence is the smaller of the two landmark confidences.
    merged_landmarks = pd.merge(test_landmarks,index_landmarks,on='landmark')
    # Element-wise minimum: unlike a row-wise apply it still yields a Series
    # when the merge is empty, and it avoids a Python call per row.
    merged_landmarks['confidence'] = np.minimum(merged_landmarks['confidence_test'],
                                                merged_landmarks['confidence_index'])
    merged_landmarks['bridged_confidence'] = merged_landmarks.confidence/3
    # A pair can be bridged by several landmarks; keep the strongest bridge.
    merged_landmarks =  merged_landmarks.sort_values(['confidence','confidence_index'],ascending=False).drop_duplicates(
        ['image_id_test','image_id_index'],keep='first')

    # Outer join so pairs found by only one of the two methods are kept
    # (with a confidence of 0 for the method that missed them).
    merged_landmarks = pd.merge(direct_neighbours_subset[['image_id_index','image_id_test','direct_confidence']],
             merged_landmarks[['image_id_test','image_id_index','bridged_confidence']],
             on=['image_id_index','image_id_test'],
            how='outer').fillna(0)
    # Final score: sum of the cubes of the two confidences.
    merged_landmarks['confidence'] = (merged_landmarks['bridged_confidence'] ** 3) + (merged_landmarks['direct_confidence'] ** 3)
    merged_landmarks =  merged_landmarks.sort_values(['confidence','bridged_confidence'],ascending=False).drop_duplicates(
        ['image_id_test','image_id_index'],keep='first')

    # One row per test image: index image ids joined by spaces, best first.
    subset = merged_landmarks.groupby('image_id_test').image_id_index.apply(
        lambda x: " ".join(x.values)).reset_index()
    subset.columns = ['id','images']
    retrieval_results.append(subset)
retrieval_results = pd.concat(retrieval_results)

In [None]:
# Write the submission file with columns 'id' and 'images', without the DataFrame index.
retrieval_results.to_csv('submission.csv',index=False)