In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot  as plt # data visualization
import cv2
import os

# Global matplotlib defaults applied to every figure created in this notebook:
# title size for individual axes and for the figure-level title (suptitle).
plt.rc('axes', titlesize=14)     # fontsize of the axes title
plt.rc('figure', titlesize=18)  # fontsize of the figure title


# Kaggle input locations.
# The image directory is derived from DATASET_DIR so that the competition
# root is spelled out in exactly one place and the two cannot drift apart.
DATASET_DIR = '/kaggle/input/landmark-recognition-2020'
TRAIN_IMAGE_DIR = f'{DATASET_DIR}/train'
# Wikipedia category table for the training landmarks (a separate Kaggle dataset).
WIKIPEDIA_CATS_PATH = '/kaggle/input/wikipedia-categories-for-glr-2020/gldv2-train-category.csv'


def make_clickable(val):
    """Render ``val`` as an HTML anchor that opens in a new browser tab.

    Used in this notebook as a pandas ``Styler.format`` formatter for a
    column of URLs.

    Parameters
    ----------
    val : object
        The link target; it is converted with ``str`` and used both as the
        ``href`` and as the visible link text.

    Returns
    -------
    str
        ``<a target="_blank" href="...">...</a>`` with ``val`` HTML-escaped.
    """
    import html  # local import keeps this helper self-contained

    # Escape the value so that a quote, '<' or '&' inside it cannot close the
    # href attribute early or inject markup into the rendered table.
    safe = html.escape(str(val), quote=True)
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(safe, safe)

def plot_images(image_list,rows,cols,title):
    """Draw up to ``rows * cols`` training images in a grid.

    Parameters
    ----------
    image_list : sequence of str
        Image ids. Each image is read from
        ``TRAIN_IMAGE_DIR/<id[0]>/<id[1]>/<id[2]>/<id>.jpg``.
    rows, cols : int
        Grid shape; only the first ``rows * cols`` ids are drawn.
    title : str
        Figure-level title (suptitle).

    Notes
    -----
    Grid cells without an image (fewer ids than cells, or a file that
    OpenCV could not read) are left blank instead of raising.
    """
    # squeeze=False always returns a 2-D array of Axes, so flattening also
    # works for a 1x1 grid, where subplots would otherwise return a bare Axes.
    fig, axes = plt.subplots(rows, cols, figsize=(25, 5 * rows), squeeze=False)
    axes = axes.flatten()

    # Hide ticks/frames on every cell up front so unused cells stay empty.
    for axis in axes:
        axis.set_axis_off()

    for axis, idx in zip(axes, image_list[:rows * cols]):
        path = TRAIN_IMAGE_DIR + '/{}/{}/{}/{}.jpg'.format(idx[0], idx[1], idx[2], idx)
        image = cv2.imread(path)
        axis.set_title(idx)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising; skip it instead of crashing in cvtColor.
            continue
        # OpenCV loads images as BGR, matplotlib expects RGB.
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    fig.suptitle(title)

    
def get_image_list(landmark_id):
    """Return the image ids of one landmark.

    Filters the notebook-level ``train`` frame to the rows whose
    ``landmark_id`` equals the argument and returns the ``.values`` array
    of their ``id`` column.
    """
    matching_rows = train.query("landmark_id == @landmark_id")
    id_column = matching_rows["id"]
    return id_column.values


def show_landmark(landmark_id, rows=5, cols=5, shuffle=True):
    """Plot a grid of training images that belong to one landmark.

    Parameters
    ----------
    landmark_id : int
        Landmark whose images are looked up with ``get_image_list``.
    rows, cols : int, default 5
        Grid shape passed on to ``plot_images``.
    shuffle : bool, default True
        If true, the images are put in random order before the first
        ``rows * cols`` of them are drawn.
    """
    image_list = get_image_list(landmark_id=landmark_id)
    if shuffle:
        # permutation() returns a shuffled copy, so the array handed back by
        # pandas is never modified in place (an in-place shuffle fails if
        # that array is read-only).
        image_list = np.random.permutation(image_list)
    # A landmark can have fewer images than the grid has cells; report the
    # number that is actually drawn rather than the grid capacity.
    n_shown = min(rows * cols, len(image_list))
    plot_images(
        image_list,
        rows=rows,
        cols=cols,
        title=f'{landmark_id} - # {n_shown} of {len(image_list)}',
    )
    

# Load the training index (the code in this notebook relies on its `id` and
# `landmark_id` columns) and the Wikipedia category table for the landmarks.
train = pd.read_csv(f'{DATASET_DIR}/train.csv')
cats = pd.read_csv(WIKIPEDIA_CATS_PATH)
# Quick sanity check of what was loaded.
print("Shape of train_data :", train.shape)
print("Number of unique landmarks :", train["landmark_id"].nunique())



# Landmarks with the highest number of images in the training set

In [None]:
# Count images per landmark, keep the ten landmarks with the most images,
# then attach their Wikipedia category information.
top_landmarks = (
    train
    .groupby("landmark_id")
    .agg({'id': 'count'})
    .reset_index()
    .rename(columns={"id": "count"})
    .sort_values(by="count", ascending=False)
    .reset_index(drop=True)
    .head(10)
    .merge(cats, on="landmark_id")
)
top_landmarks

## Category:Media_contributed_by_the_ETH-Bibliothek (138982)

In [None]:
# `landmark_id` has to be a notebook-level variable: the query string below
# looks it up through `@landmark_id`.
landmark_id=138982
# Show a 3x5 grid of images of this landmark (shuffled by default).
show_landmark(landmark_id=landmark_id,rows=3,cols=5)
# Display the landmark's category rows with the `category` column rendered as clickable links.
cats.query("landmark_id == @landmark_id").style.format({'category': make_clickable})

## Category:Corktown,_Toronto (126637)

In [None]:
# `landmark_id` has to be a notebook-level variable: the query string below
# looks it up through `@landmark_id`.
landmark_id=126637
# Show a 3x5 grid of images of this landmark (shuffled by default).
show_landmark(landmark_id=landmark_id,rows=3,cols=5)
# Display the landmark's category rows with the `category` column rendered as clickable links.
cats.query("landmark_id == @landmark_id").style.format({'category': make_clickable})

## Category:Noraduz_Cemetery (20409) 	

In [None]:
# `landmark_id` has to be a notebook-level variable: the query string below
# looks it up through `@landmark_id`.
landmark_id=20409
# Show a 3x5 grid of images of this landmark (shuffled by default).
show_landmark(landmark_id=landmark_id,rows=3,cols=5)
# Display the landmark's category rows with the `category` column rendered as clickable links.
cats.query("landmark_id == @landmark_id").style.format({'category': make_clickable})

# Wikipedia Instances

In [None]:
def get_image_list_by_istance(df, instance):
    """Return the image ids in ``df`` that carry the given instance label.

    Keeps the rows of ``df`` whose ``instance`` column equals ``instance``
    and returns the ``.values`` array of their ``id`` column.
    """
    matching_rows = df.query("instance == @instance")
    id_column = matching_rows["id"]
    return id_column.values


def show_instance(df, instance, rows=5, cols=5, shuffle=True):
    """Plot a grid of training images that share one Wikipedia instance label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``instance`` and ``id`` columns, searched with
        ``get_image_list_by_istance``.
    instance : str
        Instance label to select.
    rows, cols : int, default 5
        Grid shape passed on to ``plot_images``.
    shuffle : bool, default True
        If true, the images are put in random order before the first
        ``rows * cols`` of them are drawn.
    """
    image_list = get_image_list_by_istance(df, instance)
    if shuffle:
        # permutation() returns a shuffled copy, so the array handed back by
        # pandas is never modified in place (an in-place shuffle fails if
        # that array is read-only).
        image_list = np.random.permutation(image_list)
    # An instance can have fewer images than the grid has cells; report the
    # number that is actually drawn rather than the grid capacity.
    n_shown = min(rows * cols, len(image_list))
    plot_images(
        image_list,
        rows=rows,
        cols=cols,
        title=f'{instance} - # {n_shown} of {len(image_list)}',
    )

In [None]:
# Normalise the `instance` labels: cast every value to str and trim
# surrounding whitespace, so labels that differ only by padding compare equal.
# NOTE(review): str() also turns a missing value (NaN) into the literal
# string "nan" — confirm that is intended.
cats["instance"] = cats["instance"].map(lambda x: str(x).strip() )

In [None]:
# Number of landmarks per instance label, the 50 most common labels first.
(
    cats
    .groupby(["instance"])
    .agg({"landmark_id": "count"})
    .reset_index()
    .sort_values(by="landmark_id", ascending=False)
    .head(50)
)

In [None]:
# Attach the Wikipedia category columns (including `instance`) to the training rows.
# Guarded so that re-running this cell is a no-op: a second merge would create
# suffixed duplicates (`instance_x`, `instance_y`, ...) and break the
# `instance == ...` queries used by the cells that follow.
if "instance" not in train.columns:
    train = train.merge(cats, on="landmark_id")

## Instance: church building 

In [None]:
# Choose the instance label to browse, then show a 3x5 grid of images
# carrying it (shuffled by default).
instance="church building"
show_instance(train,instance,rows=3,cols=5)

## Instance: castle

In [None]:
# Choose the instance label to browse, then show a 3x5 grid of images
# carrying it (shuffled by default).
instance="castle"
show_instance(train,instance,rows=3,cols=5)

## Instance: lighthouse

In [None]:
# Choose the instance label to browse, then show a 3x5 grid of images
# carrying it (shuffled by default).
instance="lighthouse"
show_instance(train,instance,rows=3,cols=5)