In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Progress-bar helper; `tqdm` is used later to wrap the per-landmark copy loop.
from tqdm.autonotebook import tqdm
# Registers tqdm's progress_* methods on pandas objects.
# NOTE(review): no progress_apply/progress_map call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [None]:
# Root of the competition input data; train.csv and the train/ image tree are read from here.
base_dir = '/kaggle/input/landmark-recognition-2020'

Analyze training dataset

In [None]:
# Load the training labels table (train.csv) from the competition input directory.
train = pd.read_csv(os.path.join(base_dir,'train.csv'))

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
train.duplicated().sum()

In [None]:
# Histogram of rows-per-landmark counts, i.e. how many landmark_ids have a given number of rows.
train['landmark_id'].value_counts().hist()


In [None]:
# Per-column missing-value summary for the training table.
# 'Total' is the number of nulls; 'Percent' is the null fraction in [0, 1], not scaled to 100.
null_flags = train.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = (null_flags.sum() / null_flags.count()).sort_values(ascending=False)
missing_train_data = pd.concat(
    [total, percent],
    keys=['Total', 'Percent'],
    axis=1,
)
missing_train_data.head()

In [None]:
# The 10 most frequent landmark_id values with their row counts (value_counts sorts descending),
# moved from the index into regular columns.
temp_data= pd.DataFrame(train['landmark_id'].value_counts().head(10)).reset_index()

In [None]:
# Name the two columns explicitly so the plotting and filtering cells can rely on them.
temp_data.columns=['landmark_id','count']

In [None]:
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
import glob
# Silence every Python warning from this point on (this also hides deprecation notices from the plotting calls below).
warnings.filterwarnings('ignore')

In [None]:
# Bar chart of the most frequent landmark_ids collected in `temp_data`.
fig, ax = plt.subplots(figsize=(8, 9))
ax.set_title('Most frequent Landmarks')
sns.set_color_codes("pastel")
sns.barplot(
    data=temp_data,
    x="landmark_id",
    y="count",
    label="Count",
    ax=ax,
)
plt.show()

In [None]:
# The 8 least frequent landmark_id values with their row counts
# (value_counts sorts descending, so the tail holds the rarest ids).
temp = train.landmark_id.value_counts().tail(8).reset_index()
temp.columns = ['landmark_id', 'count']
temp

In [None]:
# Bar chart of the least frequent landmark_ids collected in `temp`.
fig, ax = plt.subplots(figsize=(9, 8))
ax.set_title('Least frequent landmarks')
sns.set_color_codes("pastel")
sns.barplot(
    data=temp,
    x="landmark_id",
    y="count",
    label="Count",
    ax=ax,
)
plt.show()

In [None]:
# Distribution of the raw landmark_id values across all training rows.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Landmark ID Distribuition')
sns.distplot(train['landmark_id'], ax=ax)
plt.show()

In [None]:
# Count landmark classes that have 20 or fewer rows. The threshold test is inclusive (<= 20),
# so the message says "20 or fewer" rather than "under 20".
landmark_counts = train['landmark_id'].value_counts()
print("Number of classes with 20 or fewer occurrences",(landmark_counts <= 20).sum(),'out of total number of categories',len(train['landmark_id'].unique()))

In [None]:
# All training image paths (three single-character shard directories deep).
# glob returns matches in arbitrary order; sorting keeps the sample grid below reproducible across runs.
train_list = sorted(glob.glob(os.path.join(base_dir,'train/*/*/*/*')))

In [None]:
# Show the first training images in an 8-row x 7-column grid, filled column by column.
plt.rcParams["axes.grid"] = False
n_rows, n_cols = 8, 7
f, axarr = plt.subplots(n_rows, n_cols, figsize=(24, 22))

# Slicing (instead of a fixed range(56)) avoids an IndexError when fewer images are available.
for i, image_path in enumerate(train_list[:n_rows * n_cols]):
    example = cv2.imread(image_path)
    if example is None:
        # cv2.imread returns None for unreadable files; leave that cell empty instead of crashing.
        continue
    # OpenCV loads BGR; reverse the channel axis to get RGB for matplotlib.
    example = example[:, :, ::-1]

    grid_col, grid_row = divmod(i, n_rows)
    axarr[grid_row, grid_col].imshow(example)

In [None]:
# Keep only the rows whose landmark_id is one of the most frequent ids listed in `temp_data`.
sample_train = train[train['landmark_id'].isin(temp_data['landmark_id'])].reset_index(drop = True)

In [None]:
# Display the sampled subset (pandas truncates the rendering of long frames).
sample_train

In [None]:
# Number of distinct landmark classes in the full training table.
train.landmark_id.nunique()

In [None]:
# Number of distinct landmark classes kept in the sampled subset.
sample_train.landmark_id.nunique()

In [None]:
import shutil

In [None]:
# Distinct landmark ids of the sampled subset; one class folder is created per id below.
landmark_id = list(sample_train.landmark_id.unique())

In [None]:
def create_folder_structure(base_dir,landmark_id, mode="train"):
    """
    Create a fresh ``<base_dir>/<mode>/<landmark id>/`` directory tree.

    WARNING: if ``<base_dir>/<mode>`` already exists it is deleted with
    everything inside it before being recreated.

    :param base_dir: parent directory under which the ``mode`` folder is created
    :param landmark_id: iterable of landmark ids; one sub-folder is created per id
    :param mode: name of the split folder, e.g. "train" or "validation"
    :return: path of the created split folder (``base_dir + "/" + mode``)
    """
    base_dir = base_dir + "/" + mode
    # Start from a clean slate so files from an earlier run do not leak into this one.
    if os.path.exists(base_dir):
        shutil.rmtree(base_dir)
    os.makedirs(base_dir)
    # The original loop iterated over the misspelled name `landmard_id` and raised NameError.
    for id_ in landmark_id:
        # exist_ok tolerates repeated ids in the input.
        os.makedirs(os.path.join(base_dir, str(id_)), exist_ok=True)
    return base_dir

**Create folder structure for sample dataset**

In [None]:
# Build empty train/ and validation/ trees (one folder per sampled landmark id) in the working directory.
# Note: create_folder_structure removes an existing train/ or validation/ folder before recreating it.
output_dir = '/kaggle/working'
train_dir = create_folder_structure(output_dir,landmark_id,mode='train')
validation_dir = create_folder_structure(output_dir,landmark_id,mode='validation')

In [None]:
def get_file_path(input_dir,path_id,mode='train'):
    """Return the .jpg path of an image id inside the sharded input tree.

    The first three characters of ``path_id`` name three nested directories:
    ``<input_dir>/<mode>/<c0>/<c1>/<c2>/<path_id>.jpg``.
    """
    first, second, third = path_id[0], path_id[1], path_id[2]
    shard_dir = "/".join([input_dir, mode, first, second, third])
    return shard_dir + "/" + path_id + ".jpg"
    

In [None]:
def copy(dataframe,output_dir,id_,mode='train'):
    """Copy every file listed in ``dataframe['file_path']`` into ``<output_dir>/<mode>/<id_>``."""
    target_dir = "/".join([output_dir, mode, str(id_)])
    for _, record in dataframe.iterrows():
        shutil.copy(record['file_path'], target_dir)

In [None]:
def copy_files(input_dir,output_dir,dataframe,landmark_id):
    """Split each landmark's images 80/20 and copy them into the sample tree.

    For every id in ``landmark_id`` the matching rows of ``dataframe`` are
    split into a training part (80 %, fixed ``random_state`` so the split is
    repeatable) and a validation part (the remaining rows). The image files
    are then copied to ``<output_dir>/train/<id>`` and
    ``<output_dir>/validation/<id>`` respectively.

    :param input_dir: root of the source dataset (contains the ``train`` image tree)
    :param output_dir: root of the destination tree; the class folders must already exist
    :param dataframe: table with at least the columns ``id`` and ``landmark_id``
    :param landmark_id: iterable of landmark ids to process
    """
    for id_ in tqdm(landmark_id):
        print('Landmark with id: {}'.format(id_))
        # Build the mask from the frame that was passed in. The original used the global
        # `sample_train`, which only worked when the caller happened to pass that same object.
        class_rows = dataframe[dataframe['landmark_id'] == id_]
        train_rows = class_rows.sample(frac = 0.8,random_state = 1)
        validation_rows = class_rows[~class_rows['id'].isin(train_rows['id'])]
        # Both splits are read from the source "train" folder; only the destination differs.
        # assign() returns new frames, so no column is written onto a filtered slice.
        train_rows = train_rows.assign(
            file_path=[get_file_path(input_dir, image_id, mode = 'train') for image_id in train_rows['id']]
        )
        validation_rows = validation_rows.assign(
            file_path=[get_file_path(input_dir, image_id, mode = 'train') for image_id in validation_rows['id']]
        )
        # Copy training files
        copy(train_rows,output_dir,id_,mode = 'train')
        # Copy validation files
        copy(validation_rows,output_dir,id_,mode = 'validation')
    
    

In [None]:
# Populate the train/ and validation/ class folders with the sampled images.
copy_files(base_dir,output_dir,sample_train,landmark_id)

In [None]:
# Collect the copied files (<split>/<landmark id>/<file>) to verify the size of each split.
sample_train_list = glob.glob(os.path.join(output_dir,'train/*/*'))
sample_validation_list = glob.glob(os.path.join(output_dir,'validation/*/*'))

In [None]:
# Report how many files ended up in each split, one line per split.
print(
    'Number of training images: {}'.format(len(sample_train_list)),
    "Number of validation images: {}".format(len(sample_validation_list)),
    sep="\n",
)

In [None]:
from zipfile import ZipFile
import os
from os.path import basename
# Pack everything under output_dir into a single archive for download.
archive_path = '/kaggle/working/sample_google_landmark_retrieval.zip'
# create a ZipFile object
with ZipFile(archive_path, 'w') as zipObj:
    # Iterate over all the files in directory
    for folderName, subfolders, filenames in os.walk(output_dir):
        for filename in filenames:
            # create complete filepath of file in directory
            filePath = os.path.join(folderName, filename)
            # The archive lives inside the directory being walked; never add it to itself.
            if os.path.abspath(filePath) == os.path.abspath(archive_path):
                continue
            # Store the path relative to output_dir so the train/validation/<landmark_id>
            # layout (i.e. the labels) survives; basename() flattened everything into one folder.
            zipObj.write(filePath, os.path.relpath(filePath, output_dir))

In [None]:
import datetime
def print_info(archive_name):
    """Print the total uncompressed size (in MB, 1 MB = 1,000,000 bytes) of a zip archive.

    :param archive_name: path of the zip file to inspect
    :return: None; the size is written to stdout
    """
    # Open the archive in a context manager so the file handle is released afterwards.
    with ZipFile(archive_name) as zf:
        size = sum(zinfo.file_size for zinfo in zf.infolist())
    zip_mb = float(size)/1000000 #MB
    print('Archive size: {}'.format(zip_mb))


In [None]:
# Report the total uncompressed size of the archive that was just written.
print_info(archive_name=os.path.join(output_dir,'sample_google_landmark_retrieval.zip'))

**Generate Download Link**

In [None]:
from IPython.display import FileLink
# Render a clickable download link for the archive.
# NOTE(review): the path is relative, so this presumably relies on the working directory being /kaggle/working — confirm.
FileLink(r'sample_google_landmark_retrieval.zip')