## Load dataset

In [None]:
import numpy as np
import pandas as pd
import re
import requests
import shutil
import cv2
from PIL import Image
import os
from tqdm import tqdm

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the train and test CSVs; both paths are relative to the current
# working directory.
train_data = pd.read_csv('google-landmarks-dataset/train.csv')
test_data = pd.read_csv('google-landmarks-dataset/test.csv')

In [None]:
train_data.head(5)

## Sample the data

In [None]:
# Keep only the rows whose landmark_id is one of '1000' .. '3000' (inclusive).
# NOTE(review): the candidate ids are built as strings, so this only matches
# if the landmark_id column holds strings rather than ints — confirm the dtype.
landmark_list = [str(x) for x in range(1000, 3001)]
train_data_sample = train_data[train_data['landmark_id'].isin(landmark_list)]

In [None]:
print('sample train data:', len(train_data_sample))

In [None]:
# Bar colors handed to the bar plots drawn by plot_distribution.
colors = np.array(['#4285f4','#34a853','#fbbc05','#ea4335'])
# Bucket labels, in the left-to-right order the bars are displayed.
order = ['1-5','5-10','10-50','50-100','100-200','200-500','>=500']
# One figure with two side-by-side axes (ax1 and ax2).
f, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,5))

def plot_distribution(data_f, data_k, axis):
    """Plot how many landmark classes fall into each images-per-class bucket.

    Args:
        data_f: DataFrame with a ``landmark_id`` column, one row per image.
        data_k: Title shown above the chart.
        axis: Matplotlib axes the bar chart is drawn on.

    Relies on the module-level ``order`` (bucket display order) and
    ``colors`` (bar colors).
    """
    # Number of images for every landmark class. Working on the Series
    # directly avoids depending on the column name that
    # ``pd.DataFrame(value_counts())`` produces, which differs across pandas
    # versions.
    images_per_class = data_f.landmark_id.value_counts()

    # Left-closed buckets: [1, 5), [5, 10), ..., [200, 500), [500, inf).
    bucket_edges = [1, 5, 10, 50, 100, 200, 500, np.inf]
    bucket_labels = ['1-5', '5-10', '10-50', '50-100',
                     '100-200', '200-500', '>=500']
    buckets = pd.cut(images_per_class, bins=bucket_edges,
                     labels=bucket_labels, right=False)

    # reindex (rather than .loc) keeps buckets that contain no class as a
    # zero-height bar instead of raising KeyError.
    classes_per_bucket = (
        buckets.astype(str).value_counts().reindex(order, fill_value=0)
    )
    classes_per_bucket.plot(kind='bar', color=colors, width=0.8, ax=axis)
    axis.set_xlabel('Number of images')
    axis.set_ylabel('Number of classes')
    axis.set_title(data_k)

plot_distribution(train_data, 'Original', ax1)
plot_distribution(train_data_sample, 'Sample', ax2)

### Rewrite urls

In [None]:
# Default edge size requested for downloaded images.
TARGET_SIZE = 96


def reso_overwrite(url_tail, resolution=TARGET_SIZE):
    """Return the size token to use in place of *url_tail*.

    When *url_tail* starts with ``s`` followed by at least one digit, the
    result is ``s<resolution>``; any other value is handed back untouched.
    """
    if re.match('s[0-9]+', url_tail) is None:
        return url_tail
    return 's{}'.format(resolution)

In [None]:
def join_url(parsed_url, s_reso):
    """Rebuild a URL with its second-to-last path segment replaced.

    Args:
        parsed_url: Sequence of URL pieces as produced by ``url.split('/')``;
            must contain at least two items.
        s_reso: Replacement for the second-to-last piece.

    Returns:
        The pieces joined with ``/`` after the substitution.

    Raises:
        IndexError: If *parsed_url* has fewer than two items.
    """
    # Work on a copy so the caller's sequence is left untouched.
    parts = list(parsed_url)
    parts[-2] = s_reso
    return '/'.join(parts)

In [None]:
def overwrite_urls(df):
    """Return a copy of *df* whose URLs request TARGET_SIZE images.

    Rows whose ``url`` contains no ``/`` are dropped. For every remaining
    row the second-to-last URL segment is passed through ``reso_overwrite``
    and the URL is reassembled with ``join_url``.

    Args:
        df: DataFrame with a string ``url`` column.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    has_path = df.url.apply(lambda x: len(x.split('/')) > 1)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame instead of a slice of the caller's data.
    df = df[has_path].copy()

    # Split each URL once and rewrite its size segment.
    df['url'] = [
        join_url(parts, reso_overwrite(parts[-2], TARGET_SIZE))
        for parts in (url.split('/') for url in df.url)
    ]
    return df

In [None]:
train_data_sample_resize = overwrite_urls(train_data_sample)

In [None]:
train_data_sample_resize.url.iloc[0]

### Split train, test and validation data from train_data_sample_resize

In [None]:
#
# Sanity check: run the test/valid/train split on a single landmark
#
# The ratios are set here because this cell runs before the cell that
# otherwise defines them; without this a top-to-bottom run raises NameError.
# Keep the values in sync with that later cell.
ratio_test = 0.1
ratio_valid = 0.2

sample_randmak_id = train_data_sample_resize.landmark_id.iloc[0]
sample_li = train_data_sample_resize[train_data_sample_resize.landmark_id == sample_randmak_id]
print(len(sample_li))

# select test set
sample_li_test = sample_li.sample(frac=ratio_test)
print(len(sample_li_test))
sample_li = sample_li[~sample_li.id.isin(sample_li_test.id)]
print(len(sample_li))

# select valid set (a fraction of what is left after removing the test rows)
sample_li_valid = sample_li.sample(frac=ratio_valid)
print(len(sample_li_valid))
sample_li = sample_li[~sample_li.id.isin(sample_li_valid.id)]
print(len(sample_li))

In [None]:
# Empty accumulators for the three splits; the per-landmark loop below
# appends rows to them.
train_train = pd.DataFrame(columns=['id', 'url', 'landmark_id'])
train_test = pd.DataFrame(columns=['id', 'url', 'landmark_id'])
train_valid = pd.DataFrame(columns=['id', 'url', 'landmark_id'])
# Fraction of each landmark's rows sampled into the test split.
ratio_test = 0.1
# Fraction of the rows remaining after the test split that go to validation.
ratio_valid = 0.2

In [None]:
# Split every landmark's rows into test / valid / train pieces, collecting
# the pieces in lists and concatenating once at the end. DataFrame.append
# was removed in pandas 2.0 and copies the whole frame on every call.
train_parts, test_parts, valid_parts = [], [], []
# groupby yields each landmark's rows in one pass instead of re-filtering the
# full frame for every landmark id.
for landmark_id, li in train_data_sample_resize.groupby('landmark_id', sort=False):
    # select test set
    li_test = li.sample(frac=ratio_test)
    li = li[~li.id.isin(li_test.id)]
    # select valid set (a fraction of what is left after the test rows)
    li_valid = li.sample(frac=ratio_valid)
    li = li[~li.id.isin(li_valid.id)]

    train_parts.append(li)
    test_parts.append(li_test)
    valid_parts.append(li_valid)

# Keep whatever the accumulators already hold, as repeated append did.
train_train = pd.concat([train_train] + train_parts)
train_test = pd.concat([train_test] + test_parts)
train_valid = pd.concat([train_valid] + valid_parts)

In [None]:
len(train_train), len(train_test), len(train_valid)

### Fetch images

In [None]:
def create_dir(dir_path):
    """Create *dir_path* (including missing parents) if it does not exist.

    Prints ``Created: <dir_path>`` only after the directory has actually
    been made; does nothing when the path already exists.
    """
    if os.path.exists(dir_path):
        return
    # exist_ok tolerates another process creating the directory between the
    # check above and this call.
    os.makedirs(dir_path, exist_ok=True)
    print('Created: {}'.format(dir_path))

In [None]:
# One download directory per split, relative to the current working
# directory; each is created here if it does not exist yet.
train_train_images_dir = 'train_train_images'
train_valid_images_dir = 'train_valid_images'
train_test_images_dir = 'train_test_images'
create_dir(train_train_images_dir)
create_dir(train_valid_images_dir)
create_dir(train_test_images_dir)

In [None]:
def fetch_image(url, dir_path, timeout=30):
    """Download *url* into ``<dir_path>/test.jpg`` and return that path.

    Args:
        url: Address of the image to download.
        dir_path: Existing directory that receives the temporary file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block forever.

    Returns:
        Path of the written file (always named ``test.jpg``; the caller is
        expected to rename it).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    dummy = os.path.join(dir_path, 'test.jpg')
    # The context manager closes the streamed response so its connection is
    # released even if writing the file fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # NOTE(review): the HTTP status is not checked, so the body of an
        # error response is saved as if it were an image.
        # Have the raw stream undo any transfer compression (gzip/deflate).
        response.raw.decode_content = True
        with open(dummy, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    return dummy

In [None]:
def fetch_images(data, dir_path):
    """Download every image listed in *data* into *dir_path*.

    Each image is saved as ``<id>.jpg``. Images whose target file already
    exists are skipped, so an interrupted run can be resumed.

    Args:
        data: DataFrame with string ``id`` and ``url`` columns.
        dir_path: Existing directory that receives the images.
    """
    # Walk ids and urls in lockstep; total keeps the progress bar's length.
    for image_id, url in tqdm(zip(data['id'], data['url']), total=len(data)):
        target = os.path.join(dir_path, image_id + '.jpg')
        # Skip if already fetched
        if os.path.exists(target):
            continue
        # fetch_image writes to a temporary name; move it to its final name
        dummy = fetch_image(url, dir_path)
        os.rename(dummy, target)

In [None]:
fetch_images(train_train, train_train_images_dir)
fetch_images(train_valid, train_valid_images_dir)
fetch_images(train_test, train_test_images_dir)

### Preprocessing

In [None]:
def create_dirs(dataset, root_dir):
    """Create one sub-directory of *root_dir* per landmark id in *dataset*.

    Args:
        dataset: DataFrame with a ``landmark_id`` column.
        root_dir: Directory under which the per-landmark folders are made.

    Prints ``Created: <n>`` with the number of directories newly created.
    """
    num_dirs = 0
    # Visit each distinct id once instead of once per image row.
    for landmark_id in dataset.landmark_id.unique():
        # str() lets numeric ids be used as folder names as well.
        landmark_id_dir = os.path.join(root_dir, str(landmark_id))
        if not os.path.exists(landmark_id_dir):
            os.makedirs(landmark_id_dir)
            num_dirs += 1
    print('Created:', num_dirs)

In [None]:
create_dirs(train_test, train_test_images_dir)
create_dirs(train_valid, train_valid_images_dir)
create_dirs(train_train, train_train_images_dir)