In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import MobileNetV2
from keras.utils import to_categorical
from keras.layers import Dense
from keras import Model
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [None]:
train = pd.read_csv("/kaggle/input/landmark-recognition-2020/train.csv")

# Relative image path: three nested folders named after the first three
# characters of the id, followed by "<id>.jpg".
train_ids = train["id"]
train["filename"] = (
    train_ids.str[0] + "/" + train_ids.str[1] + "/" + train_ids.str[2] + "/" + train_ids + ".jpg"
)

# String copy of the landmark id; this is the column the image generators
# read as the class label.
train["label"] = train["landmark_id"].astype(str)
train

In [None]:
sub = pd.read_csv("/kaggle/input/landmark-recognition-2020/sample_submission.csv")

# Same path layout as the training images: <c0>/<c1>/<c2>/<id>.jpg, where
# c0..c2 are the first three characters of the id.
sub_ids = sub["id"]
sub["filename"] = (
    sub_ids.str[0] + "/" + sub_ids.str[1] + "/" + sub_ids.str[2] + "/" + sub_ids + ".jpg"
)
sub

In [None]:
# Landmark id of every training image, one entry per row of `train`.
y = train.landmark_id.values

# Number of distinct landmarks. np.max(y) would give the largest id, which
# equals the class count only if the ids happen to be exactly 1..K.
n_classes = len(np.unique(y))
print(n_classes)

# Distribution of the landmark ids over the training rows.
plt.hist(y)
plt.show()

In [None]:
from collections import Counter

# (landmark_id, number_of_images) pairs for the 1000 most frequent landmarks,
# ordered from most to least frequent.
# NOTE(review): the model trained below keeps its existing output layer, so
# 1000 presumably has to match that layer's width -- confirm.
label_frequencies = Counter(y)
count = label_frequencies.most_common(1000)

# How many landmarks were selected, and the least frequent one among them.
print(len(count), count[-1])

In [None]:
# Keep only the rows whose landmark is one of the most frequent landmarks
# selected in `count` (one entry per kept landmark).
keep_labels = [landmark_id for landmark_id, _ in count]
is_kept = train.landmark_id.isin(keep_labels)
train_keep = train[is_kept]

In [None]:
# Fraction of the kept training rows held out for validation; passed to
# ImageDataGenerator(validation_split=...) and used to derive the step counts.
val_rate = 0.3
# Number of images per batch for the training and validation generators.
batch_size = 32

In [None]:
# A single generator object provides both subsets; validation_split decides
# which share of the rows goes to the "validation" subset.
# NOTE(review): no rescale / preprocessing_function is configured, so images
# are fed with the pixel values as loaded -- confirm this matches what the
# pretrained model expects.
gen = ImageDataGenerator(validation_split=val_rate)

# Settings shared by the training and validation iterators; only `subset`
# differs between the two calls below.
flow_kwargs = dict(
    directory="/kaggle/input/landmark-recognition-2020/train/",
    x_col="filename",
    y_col="label",
    weight_col=None,
    target_size=(256, 256),
    color_mode="rgb",
    classes=None,
    class_mode="categorical",
    batch_size=batch_size,
    shuffle=True,
    interpolation="nearest",
    validate_filenames=False,
)

train_gen = gen.flow_from_dataframe(train_keep, subset="training", **flow_kwargs)
val_gen = gen.flow_from_dataframe(train_keep, subset="validation", **flow_kwargs)

In [None]:
model = load_model("/kaggle/input/common-keras-pretrained-models/MobileNetV2.h5")

# Freeze every layer except the last one, so only the final layer's weights
# are updated during training.
for layer in model.layers[:-1]:
    layer.trainable = False

model.summary()

In [None]:
# Multi-class setup: the generators yield categorical (one-hot) targets, so
# use categorical cross-entropy and report categorical accuracy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)

In [None]:
# Training schedule.
epochs = 5  # maximum number of passes over the training subset

# Whole batches per epoch for each subset, derived from the split fraction.
n_kept_rows = len(train_keep)
train_steps = int(n_kept_rows * (1 - val_rate)) // batch_size
val_steps = int(n_kept_rows * val_rate) // batch_size

In [None]:
# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
model_checkpoint = ModelCheckpoint(
    "best_model.h5",
    save_best_only=True,
    verbose=1,
)

history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[model_checkpoint],
)

# Also keep the model as it stands after the final epoch.
model.save("model.h5")

In [None]:
# load_model is imported again here; the same import also appears in the
# notebook's import cell.
from keras.models import load_model
# Load the model file written by the ModelCheckpoint callback during training.
best_model = load_model("best_model.h5")

In [None]:
# Iterator over the test images listed in `sub`, yielding images only
# (class_mode=None) one at a time (batch_size=1).
#
# shuffle must be False here: predictions are later written back to `sub`
# by position, so the iterator has to yield the images in the same order as
# the rows of `sub`. With shuffle=True the predictions come back in a random
# order and end up attached to the wrong ids.
test_gen = ImageDataGenerator().flow_from_dataframe(
    sub,
    directory="/kaggle/input/landmark-recognition-2020/test/",
    x_col="filename",
    y_col=None,
    weight_col=None,
    target_size=(256, 256),
    color_mode="rgb",
    classes=None,
    class_mode=None,
    batch_size=1,
    shuffle=False,
    subset=None,
    interpolation="nearest",
    validate_filenames=False)

In [None]:
print("Predicting on all available data...")

# One step per submission row (the test iterator is built with batch_size=1).
n_test_steps = len(sub)
y_pred_one_hot = best_model.predict_generator(
    test_gen,
    steps=n_test_steps,
    verbose=1,
)

In [None]:
# Per image: index of the highest-scoring class and that class's score.
y_pred = y_pred_one_hot.argmax(axis=-1)
y_prob = y_pred_one_hot.max(axis=-1)
print(y_pred.shape, y_prob.shape)

In [None]:
# Translate predicted class indices back into landmark ids.
#
# The generators were built from the *string* "label" column, and Keras
# orders inferred classes alphanumerically ("10" sorts before "2"). That
# order differs from the numeric order of np.unique(landmark_id), so the
# index -> id table is taken from the generator's own class_indices mapping.
index_by_label = train_gen.class_indices
y_uniq = np.array(
    [int(label) for label in sorted(index_by_label, key=index_by_label.get)]
)
print(y_uniq)

# Derived from the raw scores rather than from the current y_pred, so
# re-running this cell does not try to remap ids that were already mapped.
y_pred = [y_uniq[Y] for Y in np.argmax(y_pred_one_hot, axis=-1)]

In [None]:
# Each submission entry is "<landmark_id> <confidence>".
# The column is assigned in one go instead of row by row through .loc; a
# length mismatch between the predictions and `sub` raises immediately.
sub["landmarks"] = [
    str(landmark_id) + " " + str(confidence)
    for landmark_id, confidence in zip(y_pred, y_prob)
]

# The helper path column is not part of the submission format.
sub = sub.drop(columns="filename")
sub.to_csv("submission.csv", index=False)
sub