# Import libraries

In [None]:
import pandas as pd
import numpy as np
import os
from os import listdir
import cv2
import matplotlib.pyplot as plt
import glob
%matplotlib inline  
# To store resulting plots/graphs in the notebook document below the respective code cells

!pip install chart_studio
# Plotly / cufflinks stack; the `.iplot(...)` call on a pandas Series later in this
# notebook relies on cufflinks being imported.
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
import cufflinks
# Put cufflinks into offline mode and set its defaults (public sharing flag, 'pearl' theme).
# NOTE(review): presumably this lets charts render in the notebook without a Plotly account — confirm.
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

import seaborn as sns
# Default seaborn look for the static charts.
sns.set(style='whitegrid')

import pydicom

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Matplotlib style sheet applied to all subsequent figures.
plt.style.use('fivethirtyeight')
# NOTE(review): no figure is created in this cell before this call, so it appears to have nothing to display.
plt.show()

In [None]:
# Show which files and folders the competition input directory contains.
input_entries = os.listdir('../input/landmark-recognition-2020/')
print(input_entries)

# Training Data

In [None]:
# Root of the competition data; the trailing slash lets file names be appended directly.
BASE_DIR = '../input/landmark-recognition-2020/'

# Training labels and the sample submission template.
train_csv_path = f'{BASE_DIR}train.csv'
sample_csv_path = f'{BASE_DIR}sample_submission.csv'
train_df = pd.read_csv(train_csv_path)
sample_df = pd.read_csv(sample_csv_path)

# Report the row count, then preview the first rows as the cell output.
n_train_rows = len(train_df)
print('Number of training examples {}'.format(n_train_rows))
train_df.head()

In [None]:
# Report the (rows, columns) of the sample submission, then preview its first rows.
sample_shape = sample_df.shape
print('sample_submission shape {}'.format(sample_shape))
sample_df.head()

## Train data info

In [None]:
# Column dtypes and non-null counts of the training labels.
print('## train_info ##')
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_df.info()

#### Clearly we don't have any missing values

## Number of unique landmarks

In [None]:

# Number of distinct landmark_id values present in the training labels.
landmarks = len(pd.unique(train_df['landmark_id']))
print('Number of unique landmarks in train {}'.format(landmarks))

In [None]:
print('Top few landmark_ids by count')

# Ten most frequent landmark ids as a two-column frame: landmark_id, count.
z = (
    train_df.landmark_id
    .value_counts()
    .head(10)
    .rename_axis('landmark_id')
    .reset_index(name='count')
)
# Turn the numeric ids into string labels so they read as categories, not numbers.
z['landmark_id'] = z['landmark_id'].apply(lambda landmark: f'id_{landmark}')

# Cell output: the table shaded by value.
z.style.background_gradient(cmap='Oranges')

# Let's visualize some distributions

In [None]:
# distribution of landmark_ids

# Images per landmark, ordered from most to least frequent.
landmark_counts = train_df['landmark_id'].value_counts().sort_values(ascending=False)

# Styling for the interactive horizontal bar chart.
bar_options = dict(
    kind='barh',
    xTitle='Count',
    yTitle='landmark_id',
    linecolor='black',
    opacity=0.7,
    color='orange',
    theme='pearl',
    bargap=0,
    gridcolor='white',
    title='[Interactive] Distribution of landmark_ids from training set',
)
landmark_counts.iplot(**bar_options)

In [None]:
# distribution of top few landmark_ids based on count

sns.set_color_codes("pastel")

# One wide axes holding a bar per landmark id from the top-10 table `z`.
fig, ax = plt.subplots(figsize=(14, 5))
ax.set_title('Top few landmark_id(s) based on count')
sns.barplot(x='landmark_id', y='count', data=z, ax=ax)
plt.show()

In [None]:
print('Bottom few landmark_ids based on count')

# Ten least frequent landmark ids as a two-column frame: landmark_id, count.
z_ = (
    train_df.landmark_id
    .value_counts()
    .tail(10)
    .rename_axis('landmark_id')
    .reset_index(name='count')
)
# Turn the numeric ids into string labels so they read as categories, not numbers.
z_['landmark_id'] = z_['landmark_id'].apply(lambda landmark: f'id_{landmark}')

# Cell output: the landmark_ids with the lowest counts, shaded by value.
z_.style.background_gradient(cmap='Oranges')

In [None]:
# distribution of bottom few landmark_ids based on count

sns.set_color_codes("pastel")

# One wide axes holding a bar per landmark id from the bottom-10 table `z_`.
fig, ax = plt.subplots(figsize=(14, 5))
ax.set_title('Bottom few landmark_id(s) based on count')
sns.barplot(x='landmark_id', y='count', data=z_, ax=ax)
plt.show()

In [None]:
#density plot

# Histogram + density estimate of the raw landmark_id values in the training labels.
plt.figure(figsize=(9,5))
plt.title('landmark_id distribution')
plt.ylabel('Density')
sns.distplot(train_df.landmark_id, label='Train landmark_ids',color='#fdc029')
# The label passed above is only rendered once a legend is requested.
plt.legend()
plt.show()

# Scatter plot for Number of images for each landmark_id

In [None]:
#scatter plot
# Image count for every landmark id as a two-column frame: landmark_id, count.
temp = (
    train_df.landmark_id
    .value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='count')
)

# One point per landmark: id on the x axis, number of images on the y axis.
plt.figure(figsize=(14,8))
scatter_ax = sns.scatterplot(x='landmark_id', y='count', data=temp)
scatter_ax.set_ylabel('# of images')
scatter_ax.set_xlabel('landmark id')
scatter_ax.set_title('Number of images for each landmark category')

In [None]:
# Landmarks that have fewer than 100 training images; computed once and reused
# for both the absolute count and the share of all landmarks.
rare_landmark_count = len(temp[temp['count']<100])
print('Count of landmark_ids which are in less than 100 images {}'.format(rare_landmark_count))
percentage = rare_landmark_count/landmarks * 100
print('{0:.2f}% of landmark_ids with less than 100 reference images'.format(percentage))

# Let's plot some random images from train

In [None]:
# Paths of all training images (four directory levels below train/).
# glob returns matches in an arbitrary, filesystem-dependent order; sorting makes
# the listing, and anything indexed from it, reproducible across runs.
train_list = sorted(glob.glob('../input/landmark-recognition-2020/train/*/*/*/*'))

In [None]:
# Show a 3x4 grid of consecutive training images starting at a random offset.
f, axes = plt.subplots(3, 4, figsize=(48, 20))
rnd = np.random.choice(100)

# axes.flat walks the grid row by row, so no manual row/column bookkeeping is needed.
# Slicing (rather than indexing) tolerates a file list shorter than rnd + 12.
for ax, image_path in zip(axes.flat, train_list[rnd:rnd + axes.size]):
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        ax.axis('off')
        continue
    # OpenCV loads pixels in BGR order while matplotlib expects RGB; without the
    # conversion the red and blue channels are swapped in the displayed image.
    ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

### More is coming...

### References:
- [https://www.kaggle.com/huangxiaoquan/google-landmarks-v2-exploratory-data-analysis-eda/notebook](https://www.kaggle.com/huangxiaoquan/google-landmarks-v2-exploratory-data-analysis-eda/notebook)
- [https://www.kaggle.com/codename007/a-very-extensive-landmark-exploratory-analysis](https://www.kaggle.com/codename007/a-very-extensive-landmark-exploratory-analysis)

# If you like my kernel, do upvote :)