In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
import seaborn as sns

# Data parameters: every path used by the notebook is derived from these two roots.
input_dir = os.path.join('..', 'input')
output_dir = os.path.join('..', 'output')

dataset_dir = os.path.join(input_dir, 'landmark-recognition-2020')
train_dir = os.path.join(dataset_dir, 'train')
train_labelmap_dir = os.path.join(dataset_dir, 'train.csv')
test_dir = os.path.join(dataset_dir, 'test')
test_labelmap_dir = os.path.join(dataset_dir, 'sample_submission.csv')

# Label tables: one row per image.
train_df = pd.read_csv(train_labelmap_dir)
test_df = pd.read_csv(test_labelmap_dir)
num_data = len(train_df)

print("The number of train images is :", train_df.shape[0])
print("The number of test images is :", test_df.shape[0])
print('The total number of images is :', train_df.shape[0]+test_df.shape[0])

# Number of images per landmark_id, most frequent class first.
landmark = train_df.landmark_id.value_counts()
landmark_df = pd.DataFrame({'landmark_id':landmark.index, 'frequency':landmark.values})

# `between` includes both bounds by default. The positional boolean that used to be
# passed here is rejected by newer pandas versions, so rely on the default instead.
print("Amount of classes with less than 5 trainning samples:", (landmark_df['frequency'].between(0,4)).sum())
print("Amount of classes with between 5 and 10 training samples:", (landmark_df['frequency'].between(5,10)).sum())

print(landmark_df)
plt.figure(figsize=(20,6))
# Log-scaled counts because a few classes dominate the dataset.
plt.hist(train_df.landmark_id, bins=81312,log=True);
plt.title('Images per class', fontsize=16)
plt.xlabel('Class number')
plt.ylabel('Number of images')
plt.show()

plt.figure(figsize = (10, 8))
plt.title('Landmark ID Distribuition')
# NOTE(review): `distplot` is deprecated in recent seaborn releases; kept as-is so the
# cell still runs on older installs. Consider `histplot(..., kde=True)` once the
# seaborn version is pinned.
sns.distplot(landmark_df['landmark_id'])
plt.show()

print("The data distribution will affect the training process negatively, since some classes have a very large number of samples when compared with other classes. \n For example the largest class contains 6272 images where there are 4749 classes which contain only 2 images, meaning that the largest class will have higher impact \n when trying to do some predictions after training the model. \n in other words, when trying to predict a sample from the small classes, 99% of the times, the classifier will predict it as it belongs to the large class, which is refered to as generalization. ") 


# Visualize different images

In [None]:
### Visualize random images from the dataset

def get_image_from_number(num):
    """Load one training image and its landmark id from a row of `train_df`.

    Parameters
    ----------
    num : int
        Index label of the row in `train_df` to load.

    Returns
    -------
    tuple
        ``(image, label)`` where ``image`` is the picture converted from
        OpenCV's BGR order to RGB and ``label`` is the row's ``landmark_id``.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the image file.
    """
    # Select the two columns by name. Unpacking the whole row only works while
    # train_df has exactly two columns, and a later cell adds "filename" and "label".
    fname = train_df.loc[num, "id"] + ".jpg"
    label = train_df.loc[num, "landmark_id"]
    # Images are stored in nested folders named after the first three characters of the id.
    image_path = os.path.join(train_dir, fname[0], fname[1], fname[2], fname)
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: " + image_path)
    RGB_img = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    return RGB_img, label


# Show a single row of randomly drawn training images, each titled with its label.
fig = plt.figure(figsize=(16, 8))

columns = 4
rows = 1
for slot in range(columns * rows):
    n = np.random.randint(num_data)
    img, lbl = get_image_from_number(n)
    # Subplot positions are 1-based.
    ax = fig.add_subplot(rows, columns, slot + 1)
    ax.imshow(img)
    ax.set_title("Label = " + str(lbl))
plt.show()

In [None]:
# Add two helper columns to the data frame: "filename" is the image path relative to
# the train folder (used to retrieve images on demand) and "label" is the *string*
# version of "landmark_id", which is what the categorical generators consume.
train_df["filename"] = train_df.id.str[0]+"/"+train_df.id.str[1]+"/"+train_df.id.str[2]+"/"+train_df.id+".jpg"
train_df["label"] = train_df.landmark_id.astype(str)

# Due to limited resources, keep only the most frequent classes.
from collections import Counter

number_of_classes_to_keep = 1000
c = train_df.landmark_id.values
count = Counter(c).most_common(number_of_classes_to_keep)
keep_labels = [i[0] for i in count]
train_keep = train_df[train_df.landmark_id.isin(keep_labels)]

# Shuffle the kept rows and rebuild a clean 0..n-1 index. The shuffle is seeded so the
# row order, and therefore any split taken from it downstream, is identical on every
# Restart & Run All.
train_keep = train_keep.sample(frac=1, random_state=42).reset_index(drop=True)
print(train_keep)


# Training model

Importing the needed libraries

In [None]:
from keras.applications import VGG19
from keras.layers import *
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from tensorflow import keras
import tensorflow as tf
from keras.layers import Dropout
from sklearn.utils import class_weight


**The used settings**

In [None]:
val_split = 0.2  # fraction of the kept training data held out for validation
batch_size = 32
learning_rate = 0.005
# Use the `learning_rate` keyword: the old `lr` alias is deprecated and rejected by
# recent Keras releases.
opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
epochs = 5  # number of passes over the training subset

The data generation and preprocessing step, before the training

In [None]:
# Training pipeline: rescale to [0, 1] plus random augmentation.
datagen = ImageDataGenerator(validation_split=val_split,
                             rescale=1.0/255.0,
                             rotation_range=40,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=True
                            )

# Validation pipeline: same split fraction and rescaling but NO augmentation, so the
# validation metrics are measured on unmodified images. Both generators split the same
# frame with the same fraction, so their training/validation subsets do not overlap.
val_source = ImageDataGenerator(validation_split=val_split,
                                rescale=1.0/255.0)

train_datagen = datagen.flow_from_dataframe(
    train_keep, # Pandas dataframe containing the filepaths relative to directory (or absolute paths if directory is None) and classes label
    directory=train_dir + "/",
    x_col="filename",
    y_col="label",
    weight_col=None,
    target_size=(224, 224),
    color_mode="rgb",
    classes=None,
    class_mode="categorical",
    batch_size=batch_size,
    shuffle=True,
    subset="training",
    interpolation="nearest",
    validate_filenames=False)

val_datagen = val_source.flow_from_dataframe(
    train_keep, # Pandas dataframe containing the filepaths relative to directory (or absolute paths if directory is None) and classes label
    directory=train_dir + "/",
    x_col="filename",
    y_col="label",
    weight_col=None,
    target_size=(224, 224),
    color_mode="rgb",
    classes=None,
    class_mode="categorical",
    batch_size=batch_size,
    # Keep a fixed order: shuffling brings nothing for evaluation, and it would
    # misalign predictions with the iterator's `.filenames` / `.labels` lists.
    shuffle=False,
    subset="validation",
    interpolation="nearest",
    validate_filenames=False)

The network architecture used to train the model

In [None]:
# VGG-style stack described as (filters, number of conv layers) per stage.
# Every stage ends with a 2x2 max-pool, so 224x224 inputs shrink to 3x3 after six stages.
conv_stages = [(64, 2), (128, 2), (256, 4), (256, 4), (256, 4), (256, 4)]

model = Sequential()
model.add(Input(shape=(224,224,3)))
model.add(BatchNormalization())
for filters, n_convs in conv_stages:
    for _ in range(n_convs):
        # Each convolution gets a ReLU. Without an activation, consecutive
        # conv + batch-norm layers collapse into a single linear map between pools.
        model.add(Conv2D(filters, kernel_size = (3,3), padding = "same", activation = "relu"))
        model.add(BatchNormalization())
    model.add(MaxPooling2D())

# Classifier head: two wide dense layers with light dropout, then one output per kept class.
model.add(Flatten())
model.add(Dense(4096, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(4096, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(number_of_classes_to_keep, activation="softmax"))
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


model.compile(
    optimizer=opt,
    loss = 'categorical_crossentropy', 
    metrics=['categorical_accuracy']
)

Preparing the class weights to be fed to the network when the training starts


In [None]:
# Balanced weights computed on the string labels the generators were built from.
label_values = train_keep.label.values
present_labels = np.unique(label_values)
balanced = class_weight.compute_class_weight(class_weight='balanced',
                                             classes=present_labels,
                                             y=label_values)
weight_by_label = dict(zip(present_labels, balanced))

# Key the weights by the class index Keras actually assigned to each label.
# Enumerating numerically sorted landmark ids does not match that order, because the
# generator indexes classes by their sorted *string* labels (e.g. "1000" before "200").
class_weights = {class_idx: weight_by_label[label_name]
                 for label_name, class_idx in train_datagen.class_indices.items()}
class_weights

# Training the model

In [None]:
# Whole batches per epoch for each subset (a trailing partial batch is dropped).
n_kept_samples = len(train_keep)
train_steps = int(n_kept_samples*(1-val_split))//batch_size
val_steps = int(n_kept_samples*val_split)//batch_size

# Keep the best checkpoint seen so far according to the callback's default monitor.
model_checkpoint = ModelCheckpoint("best_model.h5", save_best_only=True, verbose=1)

history = model.fit(
    train_datagen,
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=val_datagen,
    validation_steps=val_steps,
    class_weight=class_weights,
    callbacks=[model_checkpoint],
)

# Also persist the weights as they are after the final epoch.
model.save("model.h5")

# Validate model

Plotting the accuracy and loss metrics

In [None]:
print(history.history.keys())

# One figure per metric: (history key, figure title, y-axis label).
# The matching validation curve is stored under the same key prefixed with "val_".
for metric_key, figure_title, y_label in (
    ('categorical_accuracy', 'model accuracy', 'accuracy'),
    ('loss', 'model loss', 'loss'),
):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

Running a prediction on the validation data, just to get an insight into how the model is performing.

In [None]:
# Score the validation subset with a dedicated iterator that neither shuffles nor
# augments. Row k of `predict` then corresponds to `.filenames[k]` / `.labels[k]`,
# which is what the comparison below relies on. The class-index mapping is inferred
# from the same `train_keep` labels as during training.
eval_gen = ImageDataGenerator(validation_split=val_split,
                              rescale=1.0/255.0).flow_from_dataframe(
    train_keep,
    directory=train_dir + "/",
    x_col="filename",
    y_col="label",
    target_size=(224, 224),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=batch_size,
    shuffle=False,
    subset="validation",
    interpolation="nearest",
    validate_filenames=False)

# `steps` must be passed by keyword: the second positional argument of predict() is
# batch_size. len(eval_gen) covers every validation image, including the last partial batch.
predict = model.predict(eval_gen, steps=len(eval_gen), verbose=1)

good_preds = []
bad_preds = []

val_filenames = eval_gen.filenames
val_label = eval_gen.labels
assert len(predict) == len(val_label), "predictions and labels are misaligned"

# class index -> landmark id (the dict keys are the string labels, in index order)
label_map = list(map(int, eval_gen.class_indices.keys()))
cla = np.argmax(predict, axis=1)

for idx, res in enumerate(predict):
    predicted_id = label_map[cla[idx]]
    true_id = label_map[val_label[idx]]
    # [relative filename, predicted landmark id, true landmark id, confidence of the prediction]
    record = [val_filenames[idx], predicted_id, true_id, res[cla[idx]]]
    if predicted_id != true_id:
        bad_preds.append(record)
    else:
        good_preds.append(record)

# Guard the percentage against an empty validation subset.
acc = np.round(100*(len(predict)-len(bad_preds))/len(predict),2) if len(predict) else float("nan")
print("wrong predictions: ", len(bad_preds), " right predictions: ", len(good_preds), " acc: ", acc)

In [None]:
### plot some of the best predictions
fig=plt.figure(figsize=(16, 8))

# Rank by confidence as a NUMBER. Converting the mixed records to an array turns every
# field into a string, so the sort key casts back to float (this also keeps the cell
# re-runnable once good_preds is already an array).
good_preds = np.array(sorted(good_preds, key = lambda x: float(x[3]), reverse=True))

columns = 4
rows = 1
# Start at rank 0 so the single most confident prediction is included, and stop early
# when there are fewer records than grid slots.
for i in range(min(columns*rows, len(good_preds))):
    n = good_preds[i,0]
    image_path = os.path.join(train_dir,n)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: " + image_path)
    RGB_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    lbl = good_preds[i,2]
    lbl2 = good_preds[i,1]
    fig.add_subplot(rows, columns, i+1)  # subplot positions are 1-based
    plt.imshow(RGB_img)
    plt.title("Label = " + str(lbl) + " \nClassified:" + str(lbl2) + " \nConfidence:" + str(good_preds[i,3]))
plt.show()

In [None]:
### plot the worst predictions

fig=plt.figure(figsize=(16, 8))

# "Worst" = wrong predictions the model was most confident about. Sort on the numeric
# confidence: after conversion to an array every field is a string, so cast in the key.
bad_preds = np.array(sorted(bad_preds, key = lambda x: float(x[3]), reverse=True))

columns = 4
rows = 1
# Start at rank 0 so the most confident mistake is included, and stop early when there
# are fewer records than grid slots.
for i in range(min(columns*rows, len(bad_preds))):
    n = bad_preds[i,0]
    image_path = os.path.join(train_dir,n)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: " + image_path)
    RGB_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    lbl = bad_preds[i,2]
    lbl2 = bad_preds[i,1]
    fig.add_subplot(rows, columns, i+1)  # subplot positions are 1-based
    plt.imshow(RGB_img)
    # Show the confidence of THIS wrong prediction (taken from bad_preds, not good_preds).
    plt.title("Label = " + str(lbl) + " \nClassified:" + str(lbl2) + " \nConfidence:" + str(bad_preds[i,3]))
plt.show()

# Explaining the predictions

In [None]:
# Frequency table of the kept classes, most frequent first.
train_val = train_keep.landmark_id.value_counts()
train_keep_df = pd.DataFrame({'landmark_id':train_val.index, 'frequency':train_val.values})
train_keep_df.reset_index(inplace=True)

print("Top 5 training classes with most data:")
for rank in range(5):
    print("label:", train_keep_df.landmark_id[rank], "has", train_keep_df.frequency[rank], "instances in training set" )

# Re-index by landmark id so a class frequency can be looked up directly.
train_keep_df.set_index("landmark_id", inplace = True)

# For the top-ranked wrong and right validation predictions, report how many training
# images their true class has (column 2 of each record is the true landmark id).
for heading, ranked_preds in (
    ("\nTop 5 classes with the worst prediction", bad_preds),
    ("\nTop 5 classes with the best prediction", good_preds),
):
    print(heading)
    for rank in range(5):
        label = ranked_preds[rank, 2]
        label_counts = train_keep_df.loc[int(label)]
        print("label:", label, "has", label_counts["frequency"], "instances in training set" )

# Using the trained model to predict the given test data

Preprocessing the test data before running the prediction

In [None]:
# Reuse the dataset paths defined at the top of the notebook instead of repeating
# absolute "/kaggle/..." paths.
sub = pd.read_csv(test_labelmap_dir)
# Relative path of every test image: nested folders named after the first three id characters.
sub["filename"] = sub.id.str[0]+"/"+sub.id.str[1]+"/"+sub.id.str[2]+"/"+sub.id+".jpg"

# Checkpoint written by the ModelCheckpoint callback during training.
best_model = load_model("best_model.h5")

test_gen = ImageDataGenerator(rescale=1.0/255.0).flow_from_dataframe(
    sub,
    directory=test_dir + "/",
    x_col="filename",
    y_col=None,
    weight_col=None,
    target_size=(224, 224),
    color_mode="rgb",
    classes=None,
    class_mode=None,
    batch_size=1,
    # Must stay False: the predictions are later written back to `sub` row by row,
    # so the iterator has to yield the images in the frame's order.
    shuffle=False,
    subset=None,
    interpolation="nearest",
    validate_filenames=False)

Performing the prediction

In [None]:
# `predict` accepts generators directly; `predict_generator` is deprecated.
y_pred_one_hot = best_model.predict(test_gen, verbose=1, steps=len(sub))
y_pred = np.argmax(y_pred_one_hot, axis=-1)
y_prob = np.max(y_pred_one_hot, axis=-1)
# Map class index -> landmark id. The training generator inferred its classes from the
# string labels, so index k is the k-th label in *string* sort order; a numeric sort of
# the ids does not give the same order. Sort as strings, then convert back to int.
y_uniq = np.array([int(label_name)
                   for label_name in sorted(train_keep.landmark_id.astype(str).unique())])
y_pred = [y_uniq[Y] for Y in y_pred]

Submitting the prediction results

In [None]:
# Fill the "landmarks" column with "<landmark_id> <confidence>" in one assignment
# instead of one `.loc` write per row. A length mismatch between the predictions and
# the submission frame raises here rather than being silently truncated.
sub["landmarks"] = [str(pred) + " " + str(prob) for pred, prob in zip(y_pred, y_prob)]
# errors="ignore" keeps the cell re-runnable after "filename" has already been dropped.
sub = sub.drop(columns="filename", errors="ignore")
sub.to_csv("submission.csv", index=False)


# Visualizing best and worst classifications from the test data

In [None]:
# Build the table in one shot; growing a DataFrame cell by cell with `.loc` is slow.
# One row per test image: id, predicted class (as a string) and its confidence.
df = pd.DataFrame({
    "id": sub["id"].values,
    "class": [str(pred) for pred in y_pred],
    "prob": np.asarray(y_prob, dtype=float),
})
# Ascending confidence: head() holds the least confident rows, tail() the most confident.
df = df.sort_values(by=['prob'])


**Showing the 4 classifications with high confidence**

In [None]:
# The last rows of the confidence-sorted table are the most confident test predictions.
good_predections = df.tail(4).reset_index()
fig = plt.figure(figsize=(16, 8))
columns = 4
rows = 1

for i in range(columns*rows):
    image_id = good_predections["id"][i]
    # Test images live in nested folders named after the first three id characters.
    image_path = "/".join([test_dir, image_id[0], image_id[1], image_id[2], image_id + ".jpg"])
    fig.add_subplot(rows, columns, i+1)
    image = cv2.imread(image_path)
    RGB_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(RGB_img)
    predicted_class = str(good_predections["class"][i])
    confidence = str(good_predections["prob"][i])
    plt.title(" \nClassified as:" + predicted_class + " \nConfidence:" + confidence)



**Showing the 4 classifications with low confidence**

In [None]:
# The first rows of the confidence-sorted table are the least confident test predictions.
bad_predections= df.head(4).reset_index()
fig=plt.figure(figsize=(16, 8))
columns = 4
rows = 1

for i in range(0, columns*rows ):
    image_id = bad_predections["id"][i]
    # Test images live in nested folders named after the first three id characters.
    image_path = str(test_dir+"/"+image_id[0]+"/"+image_id[1]+"/"+image_id[2]+"/"+image_id+".jpg")
    fig.add_subplot(rows, columns, i+1)
    # Read with the default flag. A colour-conversion code is not a valid imread flag;
    # the BGR -> RGB conversion is done by cvtColor below.
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: " + image_path)
    RGB_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(RGB_img)
    plt.title(" \nClassified as:" + str(bad_predections["class"][i]) + " \nConfidence:" + str(bad_predections["prob"][i]))

    