## Google Landmark Recognition - Basic EDA

### Imports

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

### Utility Functions

In [None]:
# Root directory of the competition dataset.
BASE_PATH = '../input/landmark-recognition-2021'


@np.vectorize
def get_train_filepath(data_id):
    """Build the path of the training JPEG that belongs to ``data_id``.

    The first three characters of the id are used as three nested
    directory names below ``<BASE_PATH>/train``; the file itself is named
    ``<data_id>.jpg``. Being wrapped in ``np.vectorize``, the function also
    accepts array-likes of ids and maps over them element-wise.
    """
    shard_dirs = (data_id[0], data_id[1], data_id[2])
    filename = f"{data_id}.jpg"
    return os.path.join(BASE_PATH, 'train', *shard_dirs, filename)

@np.vectorize
def get_test_filepath(data_id):
    """Build the path of the test JPEG that belongs to ``data_id``.

    The first three characters of the id are used as three nested
    directory names below ``<BASE_PATH>/test``; the file itself is named
    ``<data_id>.jpg``. Being wrapped in ``np.vectorize``, the function also
    accepts array-likes of ids and maps over them element-wise.
    """
    shard_dirs = (data_id[0], data_id[1], data_id[2])
    filename = f"{data_id}.jpg"
    return os.path.join(BASE_PATH, 'test', *shard_dirs, filename)

@np.vectorize
def get_img_dim(filepath):
    """Return the ``(width, height)`` in pixels of the image at ``filepath``.

    Because the wrapped function returns a 2-tuple, the vectorized call
    returns two arrays (widths, heights) when given an array-like of paths.

    Raises whatever ``PIL.Image.open`` raises for a missing or unreadable
    file.
    """
    # Open through a context manager so the underlying file handle is
    # released right away instead of lingering until garbage collection;
    # this matters when the function is mapped over a very large file list.
    with Image.open(filepath) as img:
        return img.size

## Training Data

In [None]:
# Load the training labels from the dataset root defined once in BASE_PATH,
# rather than repeating the directory literal here.
train_df = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
train_df.head()

In [None]:
# Show the (rows, columns) size of the training table.
train_df.shape

In [None]:
%%time

train_df["filepath"] = get_train_filepath(train_df['id'])
train_df[['width', 'height']] = np.array(get_img_dim(train_df["filepath"])).T
train_df['aspect_ratio'] = train_df['width'] / train_df['height']
train_df.head()

In [None]:
# Number of missing values in each column of the training table.
train_df.isna().sum()

In [None]:
# Summary statistics of the image dimensions, rounded to two decimals.
(
    train_df[["width", "height", "aspect_ratio"]]
    .describe()
    .round(2)
)

In [None]:
# Density of training image widths and heights.
# Each curve is drawn with its own explicit label so that every legend entry
# is bound to the right line. Relabelling a wide-form plot afterwards with
# plt.legend(["Width", "Height"]) assigns names by artist draw order, which
# is not guaranteed to match the column order and can swap the two labels.
plt.figure(figsize=(12, 8))
for column, label in (("width", "Width"), ("height", "Height")):
    sns.kdeplot(x=train_df[column], label=label)
plt.xlabel("Pixels", fontsize=16)
plt.ylabel("Density", fontsize=16)
plt.legend(loc="upper left", fontsize=16)
plt.show()

In [None]:
# Density of training image aspect ratios, with one x tick per integer ratio.
plt.figure(figsize = (12, 8))
sns.kdeplot(data = train_df["aspect_ratio"], common_norm = False)
plt.xlabel("Aspect Ratio", fontsize = 16)
plt.ylabel("Density", fontsize = 16)
# Use the vectorized Series reductions: the builtin min()/max() walk the
# whole column element by element in Python and give order-dependent
# results if a NaN is present, whereas Series.min()/max() skip NaN.
ratio_lo = int(train_df["aspect_ratio"].min())
ratio_hi = int(train_df["aspect_ratio"].max())
plt.xticks(np.arange(ratio_lo, ratio_hi + 1, 1.0))
plt.show()

In [None]:
# Number of training images per landmark_id.
landmark_vc = train_df[["landmark_id"]].value_counts()
print(f"Total unique landmarks: {train_df['landmark_id'].nunique()}")
print("\nLandmarks with most samples:\n")
# Ten landmarks with the largest image counts.
landmark_vc.sort_values(ascending=False).iloc[:10]

In [None]:
print("\nLandmarks with least samples:\n")
# Ten landmarks with the smallest image counts.
landmark_vc.sort_values(ascending=True).iloc[:10]

In [None]:
# A landmark counts as "frequent" when it holds at least 0.1% of all
# training rows.
min_samples = len(train_df) * 0.001
mask = landmark_vc >= min_samples
print(f"Count of landmarks with at least 0.1% of total samples: {mask.sum()}")
print(f"Count of landmarks with less than 0.1% of total samples: {(~mask).sum()}")

In [None]:
# Density of the per-landmark sample counts.
# The x axis carries the number of samples a landmark has and the y axis is
# the estimated density, so the axes are labelled accordingly (the previous
# y label, "Number of Samples", described the x axis instead).
plt.figure(figsize = (12, 8))
sns.kdeplot(data = landmark_vc, common_norm = False)
plt.xlabel("Number of Samples per Landmark", fontsize = 16)
plt.ylabel("Density", fontsize = 16)
plt.title("Most landmarks have very few samples", fontsize = 20)
plt.show()

## Testing Data

In [None]:
# Load the test ids from the sample submission file, using the dataset root
# defined once in BASE_PATH rather than repeating the directory literal here.
test_df = pd.read_csv(os.path.join(BASE_PATH, 'sample_submission.csv'))
test_df.head()

In [None]:
# Show the (rows, columns) size of the test table.
test_df.shape

In [None]:
%%time

test_df["filepath"] = get_test_filepath(test_df['id'])
test_df[['width', 'height']] = np.array(get_img_dim(test_df["filepath"])).T
test_df['aspect_ratio'] = test_df['width'] / test_df['height']
test_df.head()

In [None]:
# Number of missing values in each column of the test table.
test_df.isna().sum()

In [None]:
# Summary statistics of the image dimensions, rounded to two decimals.
(
    test_df[["width", "height", "aspect_ratio"]]
    .describe()
    .round(2)
)

In [None]:
# Density of test image widths and heights.
# Each curve is drawn with its own explicit label so that every legend entry
# is bound to the right line. Relabelling a wide-form plot afterwards with
# plt.legend(["Width", "Height"]) assigns names by artist draw order, which
# is not guaranteed to match the column order and can swap the two labels.
plt.figure(figsize=(12, 8))
for column, label in (("width", "Width"), ("height", "Height")):
    sns.kdeplot(x=test_df[column], label=label)
plt.xlabel("Pixels", fontsize=16)
plt.ylabel("Density", fontsize=16)
plt.legend(loc="upper left", fontsize=16)
plt.show()

In [None]:
# Density of test image aspect ratios, with one x tick per integer ratio.
plt.figure(figsize = (12, 8))
sns.kdeplot(data = test_df["aspect_ratio"], common_norm = False)
plt.xlabel("Aspect Ratio", fontsize = 16)
plt.ylabel("Density", fontsize = 16)
# Use the vectorized Series reductions: the builtin min()/max() walk the
# whole column element by element in Python and give order-dependent
# results if a NaN is present, whereas Series.min()/max() skip NaN.
ratio_lo = int(test_df["aspect_ratio"].min())
ratio_hi = int(test_df["aspect_ratio"].max())
plt.xticks(np.arange(ratio_lo, ratio_hi + 1, 1.0))
plt.show()

## Saving the outputs

In [None]:
# Write both enriched tables to the working directory, without the index
# column.
for out_name, frame in (("train_df.csv", train_df), ("test_df.csv", test_df)):
    frame.to_csv(out_name, index=False)