# Intro
Welcome to the [Google Landmark Recognition 2021](https://www.kaggle.com/c/landmark-recognition-2021) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/29762/logos/header.png)

This notebook is a step-by-step guide to getting started with this competition. We focus on:
* the underlying structure of the data,
* a data generator to load the image data on demand during the prediction process.

To illustrate the workflow, we train a simple model built on top of a pretrained network, using only a subset of the training data. We also recommend running the notebook on a GPU.


<span style="color: royalblue;">Please upvote the notebook if it helps you. Feel free to leave a comment on the notebook. Thank you. </span>

# Libraries
We use some standard Python packages together with the scikit-learn and Keras libraries.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

from sklearn.model_selection import train_test_split

from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.optimizers import RMSprop,Adam
from keras.applications import VGG19, VGG16, ResNet50

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
path = '/kaggle/input/landmark-recognition-2021/'
os.listdir(path)

# Load Data

In [None]:
# Read the training table and the sample submission from the competition
# input folder; both are plain CSV files located directly under `path`.
train_data, samp_subm = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'sample_submission.csv')
)

In [None]:
train_data.head()

In [None]:
samp_subm.head()

# Functions

In [None]:
def plot_examples(landmark_id=1):
    """Plot up to 5 example images that share the same landmark_id.

    The images are looked up in the module-level ``train_data`` table and
    read from ``path + 'train/'``; the sub-folder of an image is formed by
    the first three characters of its id joined with ``/``.

    Parameters
    ----------
    landmark_id : int
        Value of the ``landmark_id`` column to show examples for.

    Notes
    -----
    If fewer than 5 images exist for the landmark, the unused axes are
    hidden instead of raising an IndexError. An image that cannot be read
    is reported in the axis title and skipped.
    """
    n_examples = 5

    # Filter the (large) table once rather than once per plotted image.
    matches = train_data.loc[train_data['landmark_id'] == landmark_id, 'id']
    image_ids = matches.head(n_examples).tolist()

    fig, axs = plt.subplots(1, n_examples, figsize=(25, 12))
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()

    for ax, image_id in zip(axs, image_ids):
        file = image_id + '.jpg'
        subpath = '/'.join(image_id[0:3])
        img = cv2.imread(path + 'train/' + subpath + '/' + file)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            ax.set_title('could not read: ' + file)
            continue
        # OpenCV decodes images in BGR channel order, matplotlib expects RGB;
        # without the conversion red and blue are swapped in the plot.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title('landmark_id: ' + str(landmark_id))

    # Hide axes for which no example image exists.
    for ax in axs[len(image_ids):]:
        ax.axis('off')

# Overview
First we look at the size of the dataset:

In [None]:
print('Samples train:', len(train_data))
print('Samples test:', len(samp_subm))

In [None]:
train_data.head()

There are 81313 unique classes:

In [None]:
len(train_data['landmark_id'].unique())

For each test image, we have to predict one landmark label and a corresponding confidence score. 

In [None]:
samp_subm.head()

# Find Image
We take the first image of the training set and plot it. The first three characters of the image id, joined by `/`, form the subpath under which the image is stored.

In [None]:
train_data.head()

In [None]:
image_id = train_data.loc[0, 'id']
file = image_id+'.jpg'
subpath = '/'.join([char for char in image_id[0:3]]) 

In [None]:
file

In [None]:
subpath

Is the file located in the subpath?

In [None]:
file in os.listdir(path+'train/'+subpath)

In [None]:
path

Plot the image:

In [None]:
img = cv2.imread(path+'train/'+subpath+'/'+file)
# OpenCV decodes images in BGR channel order while matplotlib expects RGB;
# convert for display only so that `img` itself stays as loaded.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.show()

Look at the image shape:

In [None]:
img.shape

# Plot Some Examples
We plot some examples of images with the same **landmark_id** in a row.

In [None]:
plot_examples(landmark_id = 138982)

In [None]:
plot_examples(landmark_id = 126637)

In [None]:
plot_examples(landmark_id = 83144)

In [None]:
plot_examples(landmark_id = 83145)

In [None]:
idx = train_data[train_data['landmark_id']==83145].index[0]
print(idx)
print('-------')
print(train_data.loc[idx])
print('-------')
image_id = train_data.loc[idx, 'id']
image_id

# Split Data
We define train, validation and test data.

In [None]:
train_data.index[0:3]

In [None]:
list(train_data.index)[:15]

In [None]:
# Dry run of the split on the first 15 row indices only, printed so the
# resulting train / validation / test index lists can be inspected by eye.
list_IDs_trainA, list_IDs_valA = train_test_split(
    list(train_data.index)[:15],
    test_size=0.33,
    random_state=2021,
)
print('---- list_IDs_trainA -----', list_IDs_trainA, sep='\n')
print('---list_IDs_valA-----', list_IDs_valA, sep='\n')
print('-----------')

list_IDs_testA = list(samp_subm.index)[:15]
print('---- list_IDs_testA-----', list_IDs_testA, sep='\n')
print('-----------')

In [None]:
train_data.iloc[train_data.index[0:3]]

In [None]:
# Only the first 100000 rows of the training table are used to keep the run
# short (500000 is noted as another subset size - presumably one that was
# tried earlier). 33% of that subset is held out for validation.
subset_IDs = list(train_data.index)[:100000]
list_IDs_train, list_IDs_val = train_test_split(
    subset_IDs,
    test_size=0.33,
    random_state=2021,
)
# Every row of the sample submission is a test sample.
list_IDs_test = list(samp_subm.index)

In [None]:
print('Number train samples:', len(list_IDs_train))
print('Number val samples:', len(list_IDs_val))
print('Number test samples:', len(list_IDs_test))

# Data Generator

We use a data generator to load the data on demand.

In [None]:
img_size = 32      # images are resized to img_size x img_size pixels
img_channel = 3    # colour channels per image
batch_size = 64    # samples per generated batch

# The generator feeds the raw `landmark_id` values to the model as sparse
# labels, so the output layer needs one unit for every id from 0 up to the
# largest id. The number of *distinct* ids is smaller than the largest id
# (ids are not contiguous), so sizing the layer by it would put most labels
# out of range and make the predicted argmax meaningless as a landmark id.
# NOTE(review): this enlarges the final Dense layer; if memory becomes a
# problem, remap the ids to 0..n-1 in the generator and map predictions back.
num_classes = int(train_data['landmark_id'].max()) + 1

In [None]:
num_classes

# DataGenerator

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that reads and resizes images from disk per batch.

    Parameters
    ----------
    path : str
        Folder prefix of the images (must end with a separator). If the
        string contains ``'train'`` the labels are read from ``data``,
        otherwise all labels are 0.
    list_IDs : sequence
        Index labels of ``data`` that this generator iterates over.
    data : pandas.DataFrame
        Table with an ``id`` column (and a ``landmark_id`` column when
        labels are read).
    img_size : int
        Images are resized to ``img_size x img_size``.
    img_channel : int
        Number of channels of the output array.
    batch_size : int
        Maximum number of samples per batch.
    """

    def __init__(self, path, list_IDs, data, img_size, img_channel, batch_size):
        # Let the Keras base class initialise whatever state it needs.
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.img_size = img_size
        self.img_channel = img_channel
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches (the last one may be partial)."""
        n_samples = len(self.list_IDs)
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the tuple ``(X, y)`` for batch number ``index``."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def __data_generation(self, list_IDs_temp):
        """Load the images (scaled to [0, 1]) and labels for the given IDs.

        Raises
        ------
        FileNotFoundError
            If an image file is missing or cannot be decoded.
        """
        # Size the arrays by the actual number of samples: allocating a full
        # batch_size would pad the last batch with all-zero images labelled 0
        # and make predictions return more rows than there are samples.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_samples, 1), dtype=int)

        # Labels are only read for the train folder; y stays 0 otherwise.
        has_labels = 'train' in self.path

        for i, ID in enumerate(list_IDs_temp):
            image_id = self.data.loc[ID, 'id']
            # Images live in nested folders named after the first three
            # characters of the id, e.g. "abc123" -> a/b/c/abc123.jpg
            subpath = '/'.join(image_id[0:3])
            file_path = self.path + subpath + '/' + image_id + '.jpg'

            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread returns None instead of raising; fail with a
                # message that names the file rather than inside cv2.resize.
                raise FileNotFoundError('Could not read image: ' + file_path)

            img = cv2.resize(img, (self.img_size, self.img_size))
            X[i] = img / 255
            if has_labels:
                y[i] = self.data.loc[ID, 'landmark_id']
        return X, y

Use the DataGenerator class to define the data generators for train, validation and test data:

In [None]:
# Settings shared by all three generators.
generator_settings = dict(
    img_size=img_size,
    img_channel=img_channel,
    batch_size=batch_size,
)

# Train and validation samples both come from the train folder / train table;
# the test generator reads the test folder and the sample submission table.
train_generator = DataGenerator(path + 'train/', list_IDs_train, train_data, **generator_settings)
val_generator = DataGenerator(path + 'train/', list_IDs_val, train_data, **generator_settings)
test_generator = DataGenerator(path + 'test/', list_IDs_test, samp_subm, **generator_settings)

In [None]:
train_generator.data[:5]

In [None]:
# data_generation(list_IDs_train)


# Model

Load pretrained model:

In [None]:
# Local weight file of ResNet50 without its classification head ("notop").
weights = '../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

# Convolutional base only; the classifier is added on top further below.
conv_base = ResNet50(
    include_top=False,
    weights=weights,
    input_shape=(img_size, img_size, img_channel),
)
# Fine-tune the pretrained layers together with the new head.
conv_base.trainable = True

Define Model

In [None]:
# Pretrained convolutional base followed by a small classification head.
model = Sequential()
model.add(conv_base)
model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# One softmax unit per class index.
model.add(Dense(num_classes, activation='softmax'))

# `learning_rate` is the current name of the optimizer argument; the old
# `lr` alias is deprecated and rejected by newer Keras releases.
# The sparse loss/metric take integer class labels (no one-hot encoding).
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss="sparse_categorical_crossentropy",
              metrics=['sparse_categorical_accuracy'])

model.summary()

In [None]:
epochs = 1

In [None]:
# `Model.fit` accepts a keras Sequence directly; `fit_generator` is the
# deprecated spelling of the same call and is absent from newer releases.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=epochs, workers=4)

In [None]:
print("Train done")

In [None]:
# list all data in history
print(history.history.keys())

In [None]:
history.history['loss']

In [None]:
plt.plot(history.history['loss'])

# Predict Test Data

In [None]:
# `Model.predict` accepts a keras Sequence directly; `predict_generator` is
# the deprecated spelling of the same call.
y_pred = model.predict(test_generator, verbose=1)

In [None]:
y_pred.shape

In [None]:
# Build all "<class index> <score>" strings in one vectorised pass instead of
# writing the frame row by row with .loc (slow) and computing argmax twice.
n_test = len(samp_subm)
# Use only the first n_test rows in case the generator returned extra
# (padded) rows for the last batch.
test_probs = y_pred[:n_test]
categories = test_probs.argmax(axis=1)
scores = test_probs.max(axis=1).round(2)
samp_subm['landmarks'] = [
    str(category) + ' ' + str(score)
    for category, score in zip(categories, scores)
]

In [None]:
samp_subm.head()

# Export

In [None]:
samp_subm.to_csv('submission.csv', index=False)