In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm as tqdm
from collections import Counter

import cv2
import copy
import csv
import os
import pathlib
import random

# Locate the competition dataset relative to the notebook's working directory
input_path = os.path.join('..', 'input')
dataset_path = os.path.join(input_path, 'landmark-recognition-2020')

# Every resource used below lives directly under the dataset folder
train_path, test_path, train_csv_path, submission_csv_path = (
    os.path.join(dataset_path, leaf)
    for leaf in ('train', 'test', 'train.csv', 'sample_submission.csv')
)

# Read the training labels and the sample submission into dataframes
train = pd.read_csv(train_csv_path)
submission = pd.read_csv(submission_csv_path)

# Report the size of both tables
print("Training dataset has {} images".format(len(train)))
print("Submission dataset has {} rows and {} columns \n".format(*submission.shape))

In [None]:
# Create dictionary with image id, label and file path for every training image.
# The columns are pulled out once and iterated as plain Python values instead
# of calling train['id'][i] several times per row, which is slow on large
# frames and only works when the index is the default 0..n-1 range.
image_ids = train['id'].tolist()

data_label_dict = {
    'image_name': image_ids,
    'landmark_id': train['landmark_id'].tolist(),
    # Images are stored as <train_path>/<c0>/<c1>/<c2>/<id>.jpg, where c0..c2
    # are the first three characters of the image id
    'image_dir': [
        "{}/{}/{}/{}/{}.jpg".format(train_path, image_id[0], image_id[1], image_id[2], image_id)
        for image_id in tqdm(image_ids)
    ],
}

# Convert to dataframe
train_pathlabel = pd.DataFrame(data_label_dict)

In [None]:
# Rebuild the full table from the raw dictionary so this cell can be re-run
# safely (train_pathlabel is overwritten with the reduced set at the end)
train_pathlabel = pd.DataFrame(data_label_dict)

# Reduce set to input
top_classes = 1000     # number of most frequent landmark classes to keep
top_class_cap = 1000   # maximum images kept for the single most frequent class
shuffle_seed = 42      # fixed seed: the saved CSV (and the positional
                       # train/validation split made from it) is reproducible

id_sorted = train_pathlabel.landmark_id.values
count = Counter(id_sorted).most_common(top_classes)
keep_classes = [landmark for landmark, _ in count]
temp_train = train_pathlabel[train_pathlabel.landmark_id.isin(keep_classes)]
temp_train = temp_train.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# Images per class in the reduced set, most frequent first
value_count = (
    temp_train.landmark_id.value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='count')
)

# Cap the most frequent class so it does not dominate the set
top = value_count.landmark_id.iloc[0]
top_only = temp_train.loc[temp_train.landmark_id == top]
reduced_top = top_only[:top_class_cap]

without = temp_train[temp_train.landmark_id != top]
final_train = pd.concat([without, reduced_top], ignore_index=True)

# Shuffle again so the capped class is not grouped at the end of the table
final_train = final_train.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
train_pathlabel = final_train
# Save to csv
train_pathlabel.to_csv("train_data.csv", index=False)
train_pathlabel

In [None]:
# Variables from data (reduced training set)
nr_classes = train_pathlabel.landmark_id.nunique()
nr_image_dir = train_pathlabel.image_dir.nunique()
class_count = train_pathlabel.landmark_id.value_counts()
# Size of the smallest class. This must come from the per-class counts;
# taking landmark_id.min() would return the smallest class *id* instead.
worst_classes = class_count.min()

# Per-class image counts of the full, unreduced training set
full_class_count = train.landmark_id.value_counts()

print("There are {} images from {} classes in the training dataset".format(nr_image_dir, nr_classes))
print("Minimum images in each class {}".format(worst_classes))

# Series.between is inclusive on both ends, so the second range starts at 6
# to avoid counting classes with exactly 5 images in both buckets.
print("\nClasses with 5 or less images in full set:", (full_class_count.between(0, 5)).sum())
print("Classes with between 6 and 10 images in full set:", (full_class_count.between(6, 10)).sum())

print("\nClasses with 5 or less images in reduced set:", (class_count.between(0, 5)).sum())
print("Classes with between 6 and 10 images in reduced set:", (class_count.between(6, 10)).sum())

In [None]:
# Histogram of full set in custom bins
plt.figure(figsize = (12, 8))

# Number of images per class in the full training set
images_per_class = train.landmark_id.value_counts()

# Bin labels and their half-open edges [low, high); the last bin is open-ended
order = ['1-5','5-10','10-50','50-100','100-200','200-500','>=500']
bin_edges = [0, 5, 10, 50, 100, 200, 500, np.inf]

# Build the frame explicitly instead of relying on the column name produced by
# DataFrame(value_counts()), which differs between pandas versions.
# The 'landmark_id' column holds the image count of each class.
plot = pd.DataFrame({'landmark_id': images_per_class.to_numpy()}, index=images_per_class.index)
plot['Number of images'] = pd.cut(
    plot['landmark_id'], bins=bin_edges, labels=order, right=False
).astype(str)

# reindex keeps the bin order and shows empty bins as 0 instead of raising
plot['Number of images'].value_counts().reindex(order, fill_value=0).plot(kind = 'bar', width = 0.8)
plt.xlabel('Number of images')
plt.ylabel('Classes')
plt.title('Distribution of classes')
plt.show()

In [None]:
# Plot the image count of the 100 most frequent classes in the reduced set
fig = plt.figure(figsize = (12,8))

# Class ids ordered from most to least frequent, limited to the first 100
landmark_ids = train_pathlabel.landmark_id
top_100_order = landmark_ids.value_counts().sort_values(ascending=False).iloc[:100].index

sns.countplot(x=landmark_ids, order=top_100_order)

plt.title("Top 100 Classes in the Dataset")
plt.xlabel("LandMark Id")
plt.ylabel("Frequency")
# Class ids are long numbers, so the tick labels are drawn vertically
plt.xticks(rotation=90)

plt.show()

In [None]:
# Show four random images from reduced set
images = []

for _ in range(4):
    # Pick a random file path, read it with OpenCV (BGR) and convert to RGB
    # so matplotlib shows the colours correctly
    picked_path = random.choice(train_pathlabel.image_dir)
    bgr = cv2.imread(picked_path)
    images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

# 2x2 grid, filled row by row
f, ax = plt.subplots(2,2, figsize=(20,15))
for axis, img in zip(ax.flat, images):
    axis.imshow(img)
    axis.axis('off')

---

# Model training

In [None]:
import tensorflow as tf
import keras
from tensorflow.keras import activations
from keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten, BatchNormalization, Dropout
from keras.models import Model, Sequential
from keras.applications.vgg16 import preprocess_input, decode_predictions
from keras.applications import VGG16
from keras.optimizers import SGD, Adam
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
from keras.utils.np_utils import to_categorical

# Reload the reduced training table written earlier in the notebook.
# All columns are read as strings (dtype=str).
data_csv_path = "./train_data.csv"
data = pd.read_csv(filepath_or_buffer=data_csv_path, dtype=str)

# Preview the first rows
data.head(5)

In [None]:
# Set up parameters
batch_size = 128
validation_split = 0.2
img_height = 64
img_width = 64
target_shape=(img_height,img_width)

# Create generators.
# Both generators take the split fraction from `validation_split`, so the
# training and validation subsets always come from the same partition of `data`.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=validation_split)
validation_datagen = ImageDataGenerator(rescale=1./255, validation_split=validation_split)

# Load images to generator
train_generator = train_datagen.flow_from_dataframe(
        dataframe = data,
        x_col = "image_dir",
        y_col = "landmark_id",
        target_size = target_shape,
        color_mode = "rgb",
        class_mode = "categorical",
        batch_size = batch_size,
        subset = 'training'
)

# Validation images are served one at a time and unshuffled, so predictions
# stay aligned with the generator's `classes` and `filenames` lists
validation_generator = validation_datagen.flow_from_dataframe(
        dataframe = data,
        x_col = "image_dir",
        y_col = "landmark_id",
        target_size = target_shape,
        shuffle = False,
        color_mode = "rgb",
        class_mode = "categorical",
        batch_size = 1,
        subset = 'validation'
)

In [None]:
# Model and training Parameters
epochs          = 15
epoch_shuffle   = False
steps_per_epoch = train_generator.samples // batch_size
# The validation generator has its own batch size (1), so its step count must
# be derived from that value rather than from the training batch size.
valid_per_epoch = validation_generator.samples // validation_generator.batch_size
loss_func   = "categorical_crossentropy"

In [None]:
# Load the saved model from the attached input dataset
model_file = "../input/rvgismodel/model.h5"
model = keras.models.load_model(model_file)

In [None]:
# Run the model over the validation generator, one step per validation sample
n_validation_steps = validation_generator.samples
predict = model.predict(validation_generator, steps=n_validation_steps)

In [None]:
# Per-sample winning class index and its score
predicted_class_indices = predict.argmax(axis=1)
class_prob = predict.max(axis=1)

# Ground truth and bookkeeping exposed by the validation generator
labels = validation_generator.class_indices
val_classes = validation_generator.classes
image_paths = validation_generator.filenames

# Invert the label -> index mapping so class indices can be turned back into labels
labels_dict = {index: label for label, index in labels.items()}
predictions = [labels_dict[index] for index in predicted_class_indices]

In [None]:
# Collect every validation prediction in one table, then split it into
# correct and incorrect predictions with a boolean mask (instead of filling
# three dictionaries with duplicated append code).
n_predictions = predict.shape[0]

all_dict = {
    'image_path': list(image_paths[:n_predictions]),
    # True label of each image, mapped from class index back to landmark id
    'landmark_id': [labels_dict[k] for k in val_classes[:n_predictions]],
    'prediction': list(predictions[:n_predictions]),
    'probability': list(class_prob[:n_predictions]),
}

# Save to dataframe
all_preds = pd.DataFrame(all_dict)

# A prediction is correct when the predicted class index equals the true one
is_correct = (
    np.asarray(val_classes[:n_predictions])
    == np.asarray(predicted_class_indices[:n_predictions])
)
good_preds = all_preds[is_correct].reset_index(drop=True)
bad_preds = all_preds[~is_correct].reset_index(drop=True)

# Column-oriented dictionaries of the two subsets, kept under their original names
good_dict = good_preds.to_dict('list')
bad_dict = bad_preds.to_dict('list')

# Guard against an empty validation set to avoid dividing by zero
perc = len(good_preds.landmark_id)/len(all_preds.landmark_id)*100 if len(all_preds) else 0.0

print("Predictions: {}".format(len(all_preds.landmark_id)))
print("Correct predictions: {}".format(len(good_preds.landmark_id)))
print("Incorrect predictions: {}".format(len(bad_preds.landmark_id)))
print("Correct percentage: {}%".format(perc))

In [None]:
# Per-class tallies: correct, incorrect and total number of validation images
best_preds = (
    good_preds.landmark_id.value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='correct')
)
worst_preds = (
    bad_preds.landmark_id.value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='incorrect')
)
all_preds_count = (
    all_preds.landmark_id.value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='predictions')
)

# Left-join both tallies onto the totals; a class missing from one tally
# gets NaN there, which is replaced by 0
merged = all_preds_count.merge(best_preds, how='left', on='landmark_id')
merged_all = merged.merge(worst_preds, how='left', on='landmark_id').fillna(0)

# Share of correct / incorrect predictions per class, in percent with two decimals
merged_all = merged_all.assign(
    correct_ratio=((merged_all.correct / merged_all.predictions) * 100).round(decimals=2),
    incorrect_ratio=((merged_all.incorrect / merged_all.predictions) * 100).round(decimals=2),
)

merged_all.head(10)

In [None]:
# Classes ordered from the highest to the lowest share of correct predictions
best_ratio = merged_all.sort_values(by='correct_ratio', ascending=False)
best_ratio.head(n=10)

In [None]:
# Classes ordered from the highest to the lowest share of incorrect predictions
worst_ratio = merged_all.sort_values(by='incorrect_ratio', ascending=False)
worst_ratio.head(n=10)

In [None]:
# Show up to 6 images with correct classification
good_images = []
good_names = []
good_probs = []
# sample() raises when asked for more rows than exist, so cap the request
good_img_df = good_preds.sample(n=min(6, len(good_preds)))

for row in good_img_df.itertuples(index=False):
    img = cv2.imread(row.image_path)
    if img is None:
        # cv2.imread returns None for unreadable files instead of raising
        raise FileNotFoundError("Could not read image: {}".format(row.image_path))
    good_images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    good_names.append(row.landmark_id)
    good_probs.append(row.probability)

f, ax = plt.subplots(2,3, figsize=(20,15))
# Hide every axis first so unused grid cells stay blank when fewer than 6 images exist
for axis in ax.flat:
    axis.axis('off')
for axis, img, name, prob in zip(ax.flat, good_images, good_names, good_probs):
    axis.imshow(img)
    axis.set_title("Landmark: {} | Prob: {}".format(name, np.round(prob, 3)))

In [None]:
# Show up to 6 images with incorrect classification
bad_images = []
bad_names = []
bad_probs = []
# sample() raises when asked for more rows than exist, so cap the request
img_df = bad_preds.sample(n=min(6, len(bad_preds)))

for row in img_df.itertuples(index=False):
    img = cv2.imread(row.image_path)
    if img is None:
        # cv2.imread returns None for unreadable files instead of raising
        raise FileNotFoundError("Could not read image: {}".format(row.image_path))
    bad_images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    bad_names.append(row.landmark_id)
    bad_probs.append(row.probability)

f, ax = plt.subplots(2,3, figsize=(20,15))
# Hide every axis first so unused grid cells stay blank when fewer than 6 images exist
for axis in ax.flat:
    axis.axis('off')
for axis, img, name, prob in zip(ax.flat, bad_images, bad_names, bad_probs):
    axis.imshow(img)
    axis.set_title("Landmark: {} | Prob: {}".format(name, np.round(prob, 3)))

---

# Test and submit

In [None]:
# Create test dictionary and load to dataframe.
# The id column is extracted once and iterated as plain strings rather than
# indexing submission['id'][i] several times per row, which is slow and only
# works when the index is the default 0..n-1 range.
test_ids = submission['id'].tolist()

test_dict = {
    # Test images are stored as <test_path>/<c0>/<c1>/<c2>/<id>.jpg, where
    # c0..c2 are the first three characters of the image id
    'filename': [
        test_path + '/' +
        image_id[0] + '/' + image_id[1] + '/' +
        image_id[2] + '/' + image_id + ".jpg"
        for image_id in tqdm(test_ids)
    ]
}

test_pathlabel = pd.DataFrame(test_dict)
test_pathlabel.head()

In [None]:
# Test images only need the same 1/255 rescaling used for the other generators
test_datagen = ImageDataGenerator(rescale=1./255)

# Unlabelled, unshuffled, one image per batch, so predictions keep the row
# order of test_pathlabel. File existence is not checked up front.
test_flow_options = dict(
        dataframe = test_pathlabel,
        x_col = "filename",
        y_col = None,
        class_mode = None,
        target_size = target_shape,
        color_mode = "rgb",
        batch_size = 1,
        shuffle = False,
        validate_filenames = False,
)
test_generator = test_datagen.flow_from_dataframe(**test_flow_options)

In [None]:
# Run the model over the test generator, one step per test sample
n_test_steps = test_generator.samples
test_predict = model.predict(test_generator, steps=n_test_steps)

In [None]:
# Build the submission table and write it to disk.
# Results go into a new frame and locally named arrays, so `test_pathlabel`
# and the validation `class_prob` are left untouched and the cell can be re-run.
test_class_indices = np.argmax(test_predict, axis=1)
test_class_probs = np.max(test_predict, axis=1)

# The model outputs class *indices*; labels_dict maps them back to the
# landmark ids, the same way the validation predictions are decoded.
landmarks = [
    str(labels_dict[class_index]) + " " + str(class_prob_i)
    for class_index, class_prob_i in zip(test_class_indices, test_class_probs)
]

# Predictions are in the row order of `submission` (the generator is
# unshuffled with batch size 1), so each image id pairs with its own prediction.
submission_df = pd.DataFrame({
    "id": submission["id"].to_numpy(),
    "landmarks": landmarks,
})
submission_df.to_csv("submission.csv", index=False)
submission_df