## Google Landmark Recognition 2020

### Import required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook as tqdm

import glob
import cv2
import os

from colorama import Fore, Back, Style

# Reset matplotlib's rcParams to their defaults, then apply the built-in
# 'dark_background' style so every figure below is drawn on a dark canvas.
plt.rcdefaults()
plt.style.use('dark_background')

import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

### Declare path variables

In [None]:
# Root folders: the input directory sits one level above the working
# directory, and the competition data lives in its own sub-folder there.
INPUT_PATH = os.path.join('..', 'input')
DATASET_PATH = os.path.join(INPUT_PATH, 'landmark-recognition-2020')

# The image folders and the CSV files are all direct children of the dataset folder.
TRAIN_IMAGE_PATH, TEST_IMAGE_PATH, TRAIN_CSV_PATH, SUBMISSION_CSV_PATH = (
    os.path.join(DATASET_PATH, child)
    for child in ('train', 'test', 'train.csv', 'sample_submission.csv')
)

### Load CSV files

In [None]:
# Read the training table and report its (rows, columns) dimensions.
train = pd.read_csv(TRAIN_CSV_PATH)
print("training dataset has {} rows and {} columns".format(*train.shape))

# Read the sample submission table and report its dimensions as well.
submission = pd.read_csv(SUBMISSION_CSV_PATH)
print("submission dataset has {} rows and {} columns \n".format(*submission.shape))

### Folder Structure

In [None]:
# Illustrate the folder layout: the first three characters of an image id
# name the three nested directories that contain the image file.
# The id is read once, positionally, so every message refers to the same image.
sample_id = train['id'].iloc[9]

print(Fore.YELLOW + f"If you want to access image {sample_id}, you should traverse as shown below:\n",Style.RESET_ALL)

print(Fore.GREEN + f"Image name: {sample_id}\n",Style.RESET_ALL)

print(Fore.BLUE + f"First folder to look inside: {sample_id[0]}")
print(Fore.BLUE + f"Second folder to look inside: {sample_id[1]}")
print(Fore.BLUE + f"Third folder to look inside: {sample_id[2]}",Style.RESET_ALL)

## Build dictionary to store image paths & labels

In [None]:
def _image_path(root, image_id):
    """Return the path of ``image_id``.jpg below ``root``.

    Images are nested three directories deep, one level per leading
    character of the id: ``root/<id[0]>/<id[1]>/<id[2]>/<id>.jpg``.
    """
    return '/'.join((root, image_id[0], image_id[1], image_id[2], image_id + ".jpg"))


# Reset the colour right after the banner so the blue does not leak into the
# progress bar and the DataFrame preview printed below.
print(Fore.BLUE + f"{'---'*20} \n Mapping for Training Data \n {'---'*20}", Style.RESET_ALL)

# Iterate over the raw id values instead of doing several label-based Series
# lookups per row; the resulting paths and labels are the same.
data_label_dict = {
    'image': [_image_path(TRAIN_IMAGE_PATH, image_id) for image_id in tqdm(train['id'].values)],
    'target': list(train['landmark_id'].values),
}

#Convert to dataframe
train_pathlabel = pd.DataFrame(data_label_dict)
print(train_pathlabel.head())

print(Fore.BLUE + f"{'---'*20} \n Mapping for Test Data \n {'---'*20}",Style.RESET_ALL)

# The test ids come from the sample submission; there are no labels for them.
data_label_dict = {
    'image': [_image_path(TEST_IMAGE_PATH, image_id) for image_id in tqdm(submission['id'].values)],
}

test_pathlabel = pd.DataFrame(data_label_dict)
print(test_pathlabel.head())

In [None]:
# Distinct landmark ids found in the training table.
train['landmark_id'].unique()

In [None]:
# How many distinct landmark classes the training table contains.
print(f"There are {train['landmark_id'].nunique()} landmarks in the training dataset")

In [None]:
# Number of training images per landmark id, most frequent first.
train['landmark_id'].value_counts()

### Check image shapes of the first 10 files

In [None]:
def _print_image_shapes(paths):
    """Print the array shape of each image in ``paths`` as decoded by OpenCV.

    Files that cannot be read are reported instead of raising, and the loop
    follows the paths themselves, so fewer than ten entries is fine.
    """
    for path in paths:
        im = cv2.imread(path)
        if im is None:
            # cv2.imread returns None (it does not raise) for a missing or unreadable file.
            print(f"Could not read image: {path}")
            continue
        print(im.shape)


files = train_pathlabel.image[:10]
print(Fore.BLUE + "Shape of files from training dataset",Style.RESET_ALL)
_print_image_shapes(files)


print("------------------------------------")
print("------------------------------------")
print("------------------------------------")

files = test_pathlabel.image[:10]
print(Fore.BLUE + "Shape of files from test dataset",Style.RESET_ALL)
_print_image_shapes(files)

## Exploratory Data Analysis

### Density plot for class distribution 

In [None]:
# Kernel density estimate over the numeric landmark ids, drawn on one 12x8 axes.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill`;
# kept as-is because the installed seaborn version is not visible here.
_, density_ax = plt.subplots(figsize=(12, 8))

sns.kdeplot(train['landmark_id'], color="yellow", shade=True, ax=density_ax)
density_ax.set_xlabel("LandMark IDs")
density_ax.set_ylabel("Probability Density")
density_ax.set_title('Class Distribution - Density plot')

plt.show()

### Top 10 class categories

In [None]:
fig = plt.figure(figsize = (12,8))

# value_counts() already returns the ids sorted by descending frequency, so
# the first ten entries are the ten most common landmarks. Compute it once
# and reuse it for the bar order instead of recounting the whole column.
count = train.landmark_id.value_counts().iloc[:10]

# `order` restricts the bars to those ten ids and keeps them in descending order.
sns.countplot(x=train.landmark_id, order=count.index)

plt.xlabel("LandMark Id")
plt.ylabel("Frequency")
plt.title("Top 10 Classes in the Dataset")

plt.show()

Note that landmark id '138982' has more than 6000 images, while each of the next 9 classes in this chart has fewer than 2500 images.


### Viewing some landmarks 

In [None]:
# Ids of the six most frequent landmarks (value_counts sorts by descending count).
top6 = train.landmark_id.value_counts().index[:6]

images = []

for landmark_id in top6:
    # Pick the second listed image of this landmark.
    path = train_pathlabel.loc[train_pathlabel.target == landmark_id, 'image'].values[1]
    img = cv2.imread(path)
    # OpenCV decodes images in BGR channel order while matplotlib's imshow
    # expects RGB; convert so the colours are displayed correctly.
    images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# Show the samples in a 3x2 grid, filled row by row, without axis ticks.
f, ax = plt.subplots(3,2, figsize=(20,15))
for i, img in enumerate(images):
    ax[i//2, i%2].imshow(img)
    ax[i//2, i%2].axis('off')

### Top 50 Class Categories

In [None]:
fig = plt.figure(figsize = (12,8))

# value_counts() already returns the ids sorted by descending frequency, so
# the first fifty entries are the fifty most common landmarks. Compute it
# once and reuse it for the bar order instead of recounting the whole column.
count = train.landmark_id.value_counts().iloc[:50]

# `order` restricts the bars to those fifty ids and keeps them in descending order.
sns.countplot(x=train.landmark_id, order=count.index)

# Fifty tick labels only fit on the x-axis when rotated.
plt.xticks(rotation = 90)

plt.xlabel("LandMark Id")
plt.ylabel("Frequency")
plt.title("Top 50 Classes in the Dataset")

plt.show()

In [None]:
# Ids of the fifty most frequent landmarks (value_counts sorts by descending count).
top50 = train.landmark_id.value_counts().index[:50]

images = []

for landmark_id in top50:
    # Pick the second listed image of this landmark.
    path = train_pathlabel.loc[train_pathlabel.target == landmark_id, 'image'].values[1]
    img = cv2.imread(path)
    # OpenCV decodes images in BGR channel order while matplotlib's imshow
    # expects RGB; convert so the colours are displayed correctly.
    images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# Show the samples in a 10x5 grid, filled row by row, without axis ticks.
f, ax = plt.subplots(10,5, figsize=(20,15))
for i, img in enumerate(images):
    ax[i//5, i%5].imshow(img)
    ax[i//5, i%5].axis('off')

### Bottom 10 Class Categories

In [None]:
fig = plt.figure(figsize = (10,6))

# value_counts() sorts by descending frequency, so the last ten entries are
# the ten rarest landmarks. iloc makes the positional slice explicit, and the
# result is reused for the bar order so the plot and its order share one source.
count = train.landmark_id.value_counts().iloc[-10:]

sns.countplot(x=train.landmark_id, order=count.index)

plt.xlabel("LandMark Id")
plt.ylabel("Frequency")
plt.title("Bottom 10 Classes in the Dataset")

plt.show()

Just 2 images per class for the bottom 10 classes

Observations from the whole analysis done above: 
*     There are 81313 unique landmark_ids 
*     There is only one landmark which has more than 6000 images
*     Number of images per landmark_id ranges from 2 to 6272.

### Histogram of grayscale images
* We load the images in grayscale here and generate their histograms
* Since each image is stored as a 2D matrix, we flatten it to a 1D array with the ravel() method before plotting

In [None]:
# First four training images; `files` is reused by the histogram cells below.
files = train_pathlabel.image[:4]

fig = plt.figure(figsize = (20,9))

for i in range(4):
    # Load as a single-channel grayscale image. Without the flag cv2.imread
    # returns three colour channels, and ravel() would then mix all of them
    # into one histogram that is not a grayscale histogram.
    img=cv2.imread(files[i], cv2.IMREAD_GRAYSCALE)
    plt.subplot(2,2,i+1)
    # ravel() flattens the 2D pixel matrix into the 1D array plt.hist bins.
    plt.hist(img.ravel(), bins = 256,color = 'gold')

plt.suptitle("Histogram for Grayscale Images",fontsize = 25)
plt.show()

### Histogram of grayscale images with bins = 8

Usually, the intensity values of an image range over [0–255] in an 8-bit representation (2⁸ levels).
But images can also be stored with 16 or 32 bits per value (2¹⁶ or 2³² levels), and so on. In such cases the intensity range is so large that it is hard to represent each intensity value in a histogram.

We use binning to overcome the above problem. Here we quantize the range into several buckets. For example,
If we quantize 0-255 into 8 bins, here our bins will be: 0-31, 32-63, 64-95, 96-127, 128-159, 160-191, 192-223, 224-255

In [None]:
fig = plt.figure(figsize = (20,9))

for i in range(4):
    # Load as a single-channel grayscale image (the default is 3-channel colour).
    img=cv2.imread(files[i], cv2.IMREAD_GRAYSCALE)
    plt.subplot(2,2,i+1)
    # Fix the range to the full 8-bit span so the eight bins are exactly
    # 0-31, 32-63, ..., 224-255 for every image, rather than eight equal
    # slices of whatever min..max that particular image happens to have.
    plt.hist(img.ravel(), bins = 8, range = (0, 256), color = "coral")

# This is a plain (non-cumulative) histogram, and 8 is the number of bins.
plt.suptitle("Histogram for Grayscale Images - 8 Bins",fontsize = 25)
plt.show()

### Cumulative Histogram

The cumulative histogram is a special histogram that can be derived from the normal histogram. We find the count of each intensity value from 0–255 and then accumulate the counts, so that each bin holds the running total of all counts up to and including that intensity.

In [None]:
fig = plt.figure(figsize = (20,9))

for i in range(4):
    # Load as a single-channel grayscale image; the default 3-channel load
    # would accumulate the counts of all colour channels together.
    img=cv2.imread(files[i], cv2.IMREAD_GRAYSCALE)
    plt.subplot(2,2,i+1)
    # cumulative=True makes each bin hold the running total of all lower bins.
    plt.hist(img.ravel(), bins = 256,color = 'magenta',cumulative = True)

plt.suptitle("Cumulative Histogram for Grayscale Images",fontsize = 25)
plt.show()

### Histogram of Color Images

In color images, we have 3 color channels representing RGB. In Combined Color Histogram the intensity count is the sum of all three color channels.

In [None]:
fig = plt.figure(figsize = (20,9))

for i in range(4):
    # OpenCV decodes images in BGR channel order. Convert to RGB so that
    # channel indices 0, 1, 2 really are red, green and blue; otherwise the
    # red and blue histograms would be swapped relative to their labels.
    img=cv2.cvtColor(cv2.imread(files[i]), cv2.COLOR_BGR2RGB)
    plt.subplot(2,2,i+1)
    # Combined histogram over all three channels, then one per channel.
    plt.hist(img.ravel(), bins = 256, color = 'orange', label = 'Total')
    plt.hist(img[:, :, 0].ravel(), bins = 256, color = 'red', alpha = 0.5, label = 'Red_Channel')
    plt.hist(img[:, :, 1].ravel(), bins = 256, color = 'Green', alpha = 0.5, label = 'Green_Channel')
    plt.hist(img[:, :, 2].ravel(), bins = 256, color = 'Blue', alpha = 0.5, label = 'Blue_Channel')
    plt.xlabel('Intensity Value')
    plt.ylabel('Count')
    # Labels are attached to each histogram above, so the legend cannot drift
    # out of step with the plotting order.
    plt.legend()

plt.suptitle("Color Histograms",fontsize = 25)
plt.show()