# Google Landmark Recognition 2020

**Let's perform Exploratory Data Analysis to understand the data better**



# 1. Let's begin by importing libraries and packages

In [None]:
import os

import random
import seaborn as sns
import cv2


import pandas as pd
pd.set_option('display.max_colwidth', 1000)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import PIL
import IPython.display as ipd
import glob
import h5py
import plotly.graph_objs as go
import plotly.express as px
from PIL import Image, ImageDraw
from tempfile import mktemp


from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, LinearAxis, Range1d
from bokeh.models.tools import HoverTool
from bokeh.palettes import BuGn4
from bokeh.plotting import figure, output_notebook, show
from bokeh.transform import cumsum
from math import pi

output_notebook()

from IPython.display import Image, display
import warnings
warnings.filterwarnings("ignore")

# 2. **Loading Data**

In [None]:
# Root folder of the competition data; every other path below derives from it.
BASE_PATH = '../input/landmark-recognition-2020'

# Top-level entries of the dataset folder.
image_samples = os.listdir(BASE_PATH)

TRAIN_DIR = f'{BASE_PATH}/train'
TEST_DIR = f'{BASE_PATH}/test'
# Backward-compatible alias for the original, misspelled name.
TRST_DIR = TEST_DIR

print('Reading Data ...')
train = pd.read_csv(f'{BASE_PATH}/train.csv')
submission = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')
print('Reading data is completed')

The dataset comprises the following important files:

**train.csv**: This file contains the image ids and their targets
* id: image id
* landmark_id: target landmark id

In [None]:
# Render the first ten training rows, then report the table's (rows, columns) shape.
display(train.iloc[:10])
print("Shape of train_data : ", train.shape)

In [None]:
# Render the first five rows of the sample submission and print its shape.
display(submission.iloc[:5])
print("Shape of Sample Submission", submission.shape)

# 3. Performing Exploratory Data Analysis

# Target Distribution (Number of images per landmark_id)

In [None]:
# Ten most frequent landmarks.
# value_counts() orders classes by descending frequency, so the first ten
# entries are the largest classes.
landmark = train['landmark_id'].value_counts()

# Build the plotting table directly from the first ten entries, with the
# numeric ids turned into string labels.
landmark_df = pd.DataFrame({
    'landmark_id': [f'landmark_id_{x}' for x in landmark.index[:10]],
    'frequency': landmark.values[:10],
})
print(landmark_df.iloc[:5])

fig = px.bar(
    landmark_df,
    x="frequency",
    y="landmark_id",
    color='landmark_id',
    hover_data=["landmark_id", "frequency"],
    height=500,
    title='Number of Images per landmark_id (Top 10 landmark_ids)',
)

fig.show()

**The most frequent landmark_id is 138982 and the frequency is 6272**

# Let's look at the least frequent landmarks

In [None]:
# display bottom 10 landmarks

# value_counts() orders classes by descending frequency, so the last ten
# entries are the rarest classes.
landmark = train.landmark_id.value_counts()
# .copy() so the column assignment below writes to an independent frame.
landmark_df = pd.DataFrame({'landmark_id': landmark.index, 'frequency': landmark.values}).tail(10).copy()

# Turn the numeric ids into string labels for the chart.
landmark_df['landmark_id'] = landmark_df.landmark_id.apply(lambda x: f'landmark_id_{x}')


# The title names the bottom 10: this chart shows the least frequent classes.
fig = px.bar(landmark_df, x="frequency", y = "landmark_id", color='landmark_id', hover_data = ["landmark_id", "frequency"],
            height = 500, title = 'Number of Images per landmark_id (Bottom 10 landmark_ids)'
            )

fig.show()

**Many of the least frequent landmarks have a frequency of only 2**

In [None]:
# Missing values per column of the training table.
# 'Total' is the number of nulls in each column; 'Percent' is that number
# divided by the row count (a fraction, not multiplied by 100).
total = train.isna().sum().sort_values(ascending=False)
percent = train.isna().sum().div(train.isna().count()).sort_values(ascending=False)

# Passing a dict to concat uses its keys as the resulting column names.
missing_train_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_train_data.head()

In [None]:
#Class distribution

# Histogram/density of the raw landmark_id values.
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot/displot; it is kept here so the rendered figure stays the same.
plt.figure(figsize = (10, 8))
plt.title('Category Distribution')
sns.distplot(train['landmark_id'])

plt.show()

In [None]:
# Count the rare classes. The comparison is inclusive (<= 20), so the message
# says "at most 20" rather than "under 20".
class_counts = train['landmark_id'].value_counts()
print("Number of classes with at most 20 occurrences",
      (class_counts <= 20).sum(),
      'out of total number of categories', len(class_counts))

# 4. Visualization of Images

In [None]:
import PIL
from PIL import Image, ImageDraw

def display_images(images, title=None, ncols=5):
    """Show training images in a grid, each labelled with its id and landmark_id.

    Parameters
    ----------
    images : iterable of str
        Image ids without file extension. Each image is read from
        ``TRAIN_DIR/<c0>/<c1>/<c2>/<id>.jpg``, where ``c0``..``c2`` are the
        first three characters of the id.
    title : str, optional
        Figure-level title; skipped when falsy.
    ncols : int, optional
        Number of grid columns. The default of 5 gives the original 5x5,
        18x22-inch layout for 25 images.

    Notes
    -----
    Uses the notebook-level ``TRAIN_DIR`` path, the ``train`` DataFrame and
    PIL's ``Image``. An id that is not present in ``train`` is labelled
    ``unknown`` instead of raising.
    """
    images = list(images)

    # Ceiling division, with at least one row so an empty input still renders.
    nrows = max(1, -(-len(images) // ncols))
    # Scale the original 18x22 size of the 5x5 grid to the actual grid shape.
    f, ax = plt.subplots(nrows, ncols,
                         figsize=(18 * ncols / 5, 22 * nrows / 5),
                         squeeze=False)
    if title:
        f.suptitle(title, fontsize=30)

    # Resolve all landmark ids in one pass over `train` instead of one scan per image.
    known = train.loc[train.id.isin(images), ['id', 'landmark_id']].drop_duplicates('id')
    landmark_by_id = dict(zip(known.id, known.landmark_id))

    cells = ax.ravel()
    for cell, image_id in zip(cells, images):
        image_path = os.path.join(TRAIN_DIR, f'{image_id[0]}/{image_id[1]}/{image_id[2]}/{image_id}.jpg')
        # The context manager closes the file even if drawing fails.
        with Image.open(image_path) as image:
            cell.imshow(image)

        landmark_id = landmark_by_id.get(image_id, 'unknown')
        cell.set_title(f"ID: {image_id}\nLandmark_id: {landmark_id}", fontsize=12)

    # Hide the axes of every cell, including those left without an image.
    for cell in cells:
        cell.axis('off')

    plt.show()

In [None]:
# Pick 25 random training rows and show their images in the grid.
samples = train.sample(n=25)['id'].to_numpy()
display_images(samples)

**Visualizing the landmark with the largest number of images**

In [None]:
# Take the most frequent landmark from the data instead of hard-coding its id.
most_common_landmark_id = train.landmark_id.value_counts().idxmax()
most_common_rows = train[train.landmark_id == most_common_landmark_id]

# Cap the sample at the class size so sampling cannot fail on a small class.
samples = most_common_rows.sample(min(25, len(most_common_rows))).id.values

display_images(samples)


In [None]:
# Per-class image counts as a two-column table (landmark_id, count),
# ordered from the most to the least frequent landmark.
lands = (
    train.landmark_id.value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='count')
)

In [None]:
# One row per landmark in `lands`, so its length is the number of classes.
print("Number of classes {}".format(len(lands)))

In [None]:
# Summing the per-class counts gives the total number of training rows.
print("Total of examples in train set = ", lands.loc[:, 'count'].sum())

In [None]:
# Minimum number of images a landmark needs to count as a "top" class.
NUM_THRESHOLD = 50

# Ids of all landmarks with at least NUM_THRESHOLD images.
top_lands = set(lands.loc[lands['count'] >= NUM_THRESHOLD, 'landmark_id'])
print("Number of TOP classes {}".format(len(top_lands)))

In [None]:
# Keep only the training rows whose landmark belongs to the top classes.
new_train = train.loc[train['landmark_id'].isin(top_lands)]
print("Total of examples in subset of train: {}".format(len(new_train)))

# Graphical Visualization of Landmarks Vs. Counts

In [None]:
# Class size against frequency rank on log-log axes.
# `lands` is ordered by descending count with a 0-based index, and a log axis
# cannot place position 0, so the counts are re-indexed with ranks starting
# at 1 to give the most frequent landmark a valid x position.
ax = pd.Series(
    lands['count'].to_numpy(),
    index=np.arange(1, len(lands) + 1),
).plot(loglog=True, grid=True)
# Trailing semicolon suppresses the repr of the list returned by ax.set().
ax.set(xlabel="Landmark rank (1 = most frequent)", ylabel="Count");

**References**
https://www.kaggle.com/rohitsingh9990/glr-eda-all-you-need-to-know/data

https://www.kaggle.com/rsmits/keras-landmark-or-non-landmark-identification

https://www.kaggle.com/codename007/a-very-extensive-landmark-exploratory-analysis