In [None]:
# Loading data

from __future__ import print_function
from __future__ import division
import os
import pandas as pd
import matplotlib as mpl
import numpy as np
import cv2
from PIL import *
from numpy import interp

import random


import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator

# Root of the competition dataset as mounted inside a Kaggle kernel.
path = '/kaggle/input/landmark-recognition-2021'
train_images = f'{path}/train'
test_images = f'{path}/test'


def _image_path(images_dir, image_id):
    """Return the JPEG path of ``image_id`` below ``images_dir``.

    The path is built as ``<images_dir>/<c0>/<c1>/<c2>/<image_id>.jpg`` where
    ``c0..c2`` are the first three characters of the id.
    """
    return os.path.join(images_dir, image_id[0], image_id[1], image_id[2], image_id + '.jpg')


# Paths are derived from train_images / test_images (absolute) instead of a
# second, hard-coded '../input/...' string, so they no longer depend on the
# notebook's working directory and there is a single place to change the root.
train_df = pd.read_csv(f'{path}/train.csv')
train_df['path'] = train_df['id'].apply(lambda f: _image_path(train_images, f))

# The test ids are read from the sample submission file.
test_df = pd.read_csv(f'{path}/sample_submission.csv')
test_df['path'] = test_df['id'].apply(lambda f: _image_path(test_images, f))

In [None]:
### Exploring the dataset

# Images per class, indexed by landmark_id and sorted from largest to smallest class.
counts = train_df['landmark_id'].value_counts()
# One entry per distinct landmark_id, so its length is the number of classes.
num_classes = len(counts)

print('Number of classes:', num_classes)
print('Number of images in training set:', train_df.shape[0])

print('Distribution of images in classes:')
print(counts.describe())

# Show a histogram of the number of instances per class
# takes a while to compute
#hist = plt.figure(figsi)
#ax = train_df.plot.hist(bins=num_classes, alpha=1)
#plt.ylim([0,800])
#plt.show()

# How many classes have fewer than 5 training samples, and how many have 5 to 10?
# listoflists holds, for every landmark_id, the list of train_df index labels of its rows.
listoflists = train_df.groupby('landmark_id').apply(lambda g: g.index.tolist())

lessthan5 = [rows for rows in listoflists if len(rows) < 5]
between510 = [rows for rows in listoflists if 4 < len(rows) < 11]

print (len(lessthan5), "classes have less than 5 training samples")
print (len(between510), "classes have between 5 and 10 training samples")

# Show 4 sample images from 4 random classes (16 images in total).
samplelist = []
for _ in range(4):
    # counts is indexed by landmark_id, so a random position selects a random class.
    random_label = counts.index[np.random.randint(0, num_classes)]
    randclass = train_df[train_df['landmark_id'] == random_label]
    for _ in range(4):
        # Draw (with replacement) a random image row of that class.
        randimg = randclass.iloc[np.random.randint(0, len(randclass))]
        samplelist.append(randimg)

fig, axes = plt.subplots(4, 4, figsize=(15, 15))
for ax, sample in zip(axes.flat, samplelist):
    ax.axis('off')
    # Address the row by column name; positional access on a label-indexed
    # Series is deprecated in pandas and breaks if the column order changes.
    ax.set_title(f"landmark id:{sample['landmark_id']} ", fontsize=10)
    image = cv2.imread(sample['path'])
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        ax.text(0.5, 0.5, 'image not readable', ha='center', va='center')
        continue
    # OpenCV decodes to BGR while matplotlib expects RGB; convert so colours are correct.
    ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.show()

3.1.D Describe if/how you think the data distribution will affect training of a classifier:
I think the data distribution could make it very difficult to train a good classifier. Because a few classes contain far more images than the others, the model may "cheat" by always predicting one of the classes with many images. During training the results would then appear good, with high accuracy and low loss, but in the prediction phase the model would predict the same label for all test images.

In [None]:
# Training a model

def PathToimages(pathlist, size=None, rgb=True):
    """Load image files from disk and resize each one to a square.

    Args:
        pathlist: iterable of image file paths.
        size: side length in pixels of the returned images. When omitted, the
            notebook-level ``image_size`` setting is used (looked up at call
            time, so it may be defined after this function).
        rgb: when True (the default) the channels are reordered from OpenCV's
            BGR to RGB, which is the order matplotlib displays and the order
            ImageNet-pretrained Keras models were trained on. Pass False to
            keep OpenCV's BGR order.

    Returns:
        A list of ``(size, size, 3)`` arrays in the same order as ``pathlist``.

    Raises:
        FileNotFoundError: if a file is missing or cannot be decoded. Skipping
            it silently would misalign the images with their labels.
    """
    if size is None:
        size = image_size

    images = []
    for imagefile in pathlist:
        img_pix = cv2.imread(imagefile, cv2.IMREAD_COLOR)
        if img_pix is None:
            # cv2.imread returns None instead of raising; fail with the
            # offending path rather than an opaque error inside cv2.resize.
            raise FileNotFoundError(f'could not read image: {imagefile}')
        img_pix = cv2.resize(img_pix, (size, size))
        if rgb:
            img_pix = cv2.cvtColor(img_pix, cv2.COLOR_BGR2RGB)
        images.append(img_pix)

    return images

# Parameters: hyper-parameters and data-selection settings used by the cells below.

# Number of passes over the training data given to model.fit.
epochs = 30
# Mini-batch size used when feeding images to the model.
batch_size = 16
# Step size passed to the Adam optimizer.
learning_rate= 0.001
# Side length in pixels: images are resized to image_size x image_size and the
# network input shape is (image_size, image_size, 3).
image_size = 128
# Fraction (0-1, not a percentage) of the shuffled, selected images that is
# loaded for training + validation; the remainder is held out for the predict cell.
train_split = 0.7
# Fraction (0-1, not a percentage) of that training portion that
# train_test_split sets aside as the validation set.
val_split = 0.2
# Number of landmark classes selected for training; also the width of the
# softmax output layer and of the one-hot label vectors.
num_of_classes = 50



imagelist = []      # image paths of all selected classes
actuallabels = []   # original landmark_id of every selected image
temp_labels = []    # contiguous class index (0, 1, 2, ...) used as training label

# Select the first num_of_classes classes (in order of first appearance in
# train_df) that have more than 50 and fewer than 150 images, so that the
# chosen classes have a similar number of samples.
# The class sizes are computed once up front instead of re-filtering the whole
# frame (twice) for every candidate label. This counts rows per class, which
# equals the number of distinct paths as long as image ids are unique.
class_sizes = train_df['landmark_id'].value_counts()

i = 0
for label in train_df['landmark_id'].unique():
    if i == num_of_classes:
        break
    if 50 < class_sizes.loc[label] < 150:
        # NOTE: the loop variable is deliberately not called `path`; that name
        # holds the dataset root directory set in the data-loading cell.
        for image_path in train_df.loc[train_df['landmark_id'] == label, 'path']:
            imagelist.append(image_path)
            actuallabels.append(label)
            temp_labels.append(i)
        i = i + 1



# Shuffle paths and class indices together so they stay aligned. A dedicated,
# seeded generator makes the train / held-out split reproducible across
# kernel restarts without touching the global `random` state.
SHUFFLE_SEED = 101
shuf = list(zip(imagelist, temp_labels))
if not shuf:
    raise ValueError('no images selected: no class matched the size criteria')
random.Random(SHUFFLE_SEED).shuffle(shuf)

imagelist, labels = zip(*shuf)

# The first train_split share is used for training + validation; the rest is
# left untouched here and serves as the test set in the predict cell.
imagenum = round(len(imagelist)*train_split)

trainimagelist = imagelist[:imagenum]
print("resizing ", len(trainimagelist), "images")
train_data = PathToimages(trainimagelist)
trainlabels = labels[:imagenum]

print('Images: ', len(train_data))
print('Image labels: ', len(trainlabels))

# Scale pixels to [0, 1]. float32 halves the memory of the default float64
# array and is the precision the network computes in anyway.
X_data = np.asarray(train_data, dtype=np.float32) / 255
Y_data = to_categorical(trainlabels, num_classes=num_of_classes)

X_train, X_val, Y_train, Y_val = train_test_split(X_data, Y_data, test_size=val_split, random_state=101)



# Training-time augmentation: only a random zoom of up to +/-20 % is enabled;
# flips, rotation, shifts and shear are explicitly switched off.
# The generator is taken from the public tf.keras namespace rather than the
# private tensorflow.python.keras module, so it comes from the same Keras
# implementation as the model it feeds.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    horizontal_flip=False,
    vertical_flip=False,
    rotation_range=0.0,
    zoom_range=0.2,
    width_shift_range=0.0,
    height_shift_range=0.0,
    shear_range=0.0,
    fill_mode="nearest",
)



# Backbone: DenseNet201 with ImageNet weights, without its classification
# head; global average pooling turns the feature map into one vector per image.
pretrained_model = tf.keras.applications.DenseNet201(input_shape=(image_size, image_size, 3),
                                                      include_top=False,
                                                      weights='imagenet',
                                                      pooling='avg')

inputs = pretrained_model.input

# Classification head: dropout on the backbone features followed by three
# identical Dense(512) -> Dense(128) -> Dropout(0.2) stages.
features = tf.keras.layers.Dropout(0.25)(pretrained_model.output)
for _ in range(3):
    features = tf.keras.layers.Dense(512, activation='relu')(features)
    features = tf.keras.layers.Dense(128, activation='relu')(features)
    features = tf.keras.layers.Dropout(0.20)(features)

output = tf.keras.layers.Dense(num_of_classes, activation='softmax')(features)

model = tf.keras.Model(inputs=inputs, outputs=output)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss="categorical_crossentropy",
              metrics=['acc'])

# Train on augmented batches; validation uses the un-augmented hold-out arrays.
history = model.fit(datagen.flow(X_train, Y_train, batch_size=batch_size),
                    validation_data=(X_val, Y_val),
                    epochs=epochs)


In [None]:
# plot results

def _plot_history(train_values, val_values, metric, out_file):
    """Plot one training curve against its validation curve and save it.

    Args:
        train_values: per-epoch values of the metric on the training data.
        val_values: per-epoch values of the metric on the validation data.
        metric: human-readable metric name used in title and legend.
        out_file: path the figure is written to.
    """
    epoch_axis = range(len(train_values))
    fig, ax = plt.subplots()
    ax.plot(epoch_axis, train_values, 'r', label=f'Training {metric}')
    ax.plot(epoch_axis, val_values, 'b', label=f'Validation {metric}')
    ax.set_title(f'Training and validation {metric}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric.capitalize())
    ax.legend(loc=0)
    # Save the figure that holds the curves *before* showing it. Opening a new
    # figure first (as plt.figure() did) writes an empty image to disk.
    fig.savefig(out_file)
    plt.show()


acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

# The x axis is built inside the helper, so the `epochs` hyper-parameter is
# no longer overwritten with a range object by this cell.
_plot_history(acc, val_acc, 'accuracy', '/kaggle/working/accuracy.png')
_plot_history(loss, val_loss, 'loss', '/kaggle/working/loss.png')


In [None]:
## predict

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn import metrics



# Everything after the training slice of the shuffled selection was never
# used for training or validation and serves as the test set.
testimagepaths = imagelist[len(trainimagelist):]
testimagelist = PathToimages(testimagepaths)
testlabels = labels[len(trainlabels):]

test_data = np.asarray(testimagelist, dtype=np.float32) / 255

# Predict on the array itself. Routing it through datagen.flow() would
# shuffle the samples (flow defaults to shuffle=True) and apply random zoom,
# so the predictions would no longer line up with testlabels.
predictedlabels = model.predict(test_data, batch_size=batch_size)

# Most probable class per image and the probability assigned to it.
highestacc = list(np.argmax(predictedlabels, axis=1))
howconfident = list(np.max(predictedlabels, axis=1))

# Per-class scores, restricted to the classes that were actually predicted.
precision, recall, fscore, support = score(testlabels, highestacc, labels=np.unique(highestacc))

print(metrics.confusion_matrix(testlabels, highestacc))
print(metrics.classification_report(testlabels, highestacc, digits=3))





# Show every correctly classified test image, plus every misclassified one
# that the model was more than 90 % confident about.
for i in range(len(highestacc)):
    plt.axis('Off')
    if testlabels[i] == highestacc[i]:
        print("success")
    elif not howconfident[i] > 0.9:
        # Wrong but unsure prediction: nothing to display for this image.
        continue
    title = ( 'true label: '  + str(testlabels[i]) + ' predicted label: '+  str(highestacc[i]) + ' confidence: ' + str(howconfident[i]))
    plt.title(title, fontsize=10)
    plt.imshow(testimagelist[i])
    plt.show()


#loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
#dot_img_file = './model_1.png'
#tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)