In [None]:
import os, json, random, cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import tensorflow as tf
import re
import math
from tqdm import tqdm

In [None]:
class BASE_CFG:
    """Plain namespace of notebook-wide settings, read as class attributes."""

    # --- Data location and export selection ---
    # Root directory of the competition input files.
    comp_path = "/kaggle/input/landmark-retrieval-2021"
    # Selects which block of ten stratified folds this run exports
    # (0 -> folds 0-9, 1 -> 10-19, 2 -> 20-29, 3 -> 30-39).
    FOLD = 3

    # --- Label space (both names hold the same value) ---
    n_labels = 81313
    NUMBER_OF_CLASSES = 81313

    # --- Image geometry ---
    IMAGE_SIZE = 256
    OBJ_HEIGHT = 256
    OBJ_WIDTH = 256
    CHANNELS = 0

    # --- Remaining settings; not referenced by the visible cells ---
    BATCH_SIZE = 256
    EPOCHS = 10
    LEARNING_RATE = 1e-4
    NET = 0
    dtype = 'float32'
    VAL_CLASS_NUM = 1000

In [None]:
# Instantiate the settings namespace and wrap the competition directory in a
# Path so later cells can join file names onto it with the `/` operator.
CFG = BASE_CFG()
comp_path = pathlib.Path(CFG.comp_path)
print(comp_path)

# DATASET MAKING

In [None]:
# Name of the Kaggle dataset to publish; also used as the staging directory
# name under /tmp. The suffix is the fold block selected by CFG.FOLD.
DATASET_NAME = f'landmark-retrieval-2021-stratify-fold{CFG.FOLD}'

In [None]:
!rm -r /tmp/{DATASET_NAME}
os.makedirs(f'/tmp/{DATASET_NAME}', exist_ok=True)

In [None]:
# Read the Kaggle API credentials from the attached input dataset and expose
# them through the two environment variables set below.
creds_path = pathlib.Path('../input/statking-kaggle-api/kaggle.json')
kaggle_creds = json.loads(creds_path.read_text())
for env_name, cred_field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = kaggle_creds[cred_field]

In [None]:
# Initialise the staging directory with the Kaggle CLI, then load the
# dataset-metadata.json found there.
!kaggle datasets init -p /tmp/{DATASET_NAME}
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)
# Fill in the dataset id (owner/name) and title, then write the metadata back.
dataset_meta['id'] = f'deepkim/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)
print(dataset_meta)

# Keep a second copy of the metadata as meta.json and list the directory.
!cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
!ls /tmp/{DATASET_NAME}

# Create the dataset on Kaggle from the staging directory.
# NOTE(review): `-u` presumably makes the dataset public — confirm against the Kaggle CLI help.
!kaggle datasets create -u -p /tmp/{DATASET_NAME}

In [None]:
#os.listdir("/tmp/landmark-retrieval-2021-tfrecords-size256")

In [None]:
# Show what is currently in the staging directory.
!ls /tmp/{DATASET_NAME}


In [None]:
# Load the competition's training table; later cells read its `id` and
# `landmark_id` columns.
train = pd.read_csv(comp_path / "train.csv")

In [None]:
# Map each landmark_id to the number of training rows that carry it.
landmark_id_count_dict = train['landmark_id'].value_counts().to_dict()

In [None]:
# Sorted distinct landmark ids; an id's position in this list is its
# contiguous class index. Computed in one pass instead of running the same
# groupby/count over every column twice.
class_pair_dict_values = sorted(train['landmark_id'].dropna().unique().tolist())
class_pair_dict_keys = list(range(len(class_pair_dict_values)))

In [None]:
# Forward map: contiguous class index -> original landmark_id.
class_pair_dict = dict(zip(class_pair_dict_keys, class_pair_dict_values))
# Reverse map: original landmark_id -> contiguous class index.
reverse_class_pair_dict = dict(zip(class_pair_dict_values, class_pair_dict_keys))

In [None]:
# Replace each original landmark_id with its contiguous class index.
train['fixed_landmark_id'] = train['landmark_id'].map(reverse_class_pair_dict)

In [None]:
# Attach, to every row, how many training rows share its landmark_id.
train['landmark_id_count'] = train['landmark_id'].map(landmark_id_count_dict)

In [None]:
# Number of distinct image ids in the table.
train.id.nunique()

In [None]:
# Display the table with the two columns added above.
train

In [None]:
from sklearn.model_selection import StratifiedKFold

# Assign every row to one of 40 folds, stratified on the contiguous class id.
# `split` yields positional row indices, so the assignment is collected in a
# position-indexed array instead of being written through label-based `.loc`,
# which only lines up with positions while `train` has a default RangeIndex.
skf = StratifiedKFold(n_splits=40)
fold_of_row = np.full(len(train), -1, dtype=np.int64)
for fold, (_, val_idx) in enumerate(skf.split(train, y=train['fixed_landmark_id'])):
    fold_of_row[val_idx] = fold
train['fold'] = fold_of_row

In [None]:
# Register tqdm's pandas integration.
# NOTE(review): the visible cells below use plain `.map`, not the progress_* variants.
tqdm.pandas()

In [None]:
def _relative_train_path(image_id):
    """Build the path of an image relative to the dataset root; the first three id characters name the nested folders."""
    return "train/{}/{}/{}/{}.jpg".format(image_id[0], image_id[1], image_id[2], image_id)

train['file_path'] = train['id'].map(_relative_train_path)

In [None]:
def _kaggle_train_path(image_id):
    """Build the image path under ../input/landmark-retrieval-2021; the first three id characters name the nested folders."""
    return "../input/landmark-retrieval-2021/train/{}/{}/{}/{}.jpg".format(image_id[0], image_id[1], image_id[2], image_id)

train['kaggle_file_path'] = train['id'].map(_kaggle_train_path)

In [None]:
# Preview the first rows, including the fold and path columns.
train.head()

In [None]:
# For each of the 40 folds, print how many rows each class contributes,
# smallest counts first.
for fold in range(40):
    rows_in_fold = train['fold'] == fold
    per_class_counts = train.loc[rows_in_fold]['fixed_landmark_id'].value_counts()
    print(per_class_counts.sort_values())

In [None]:
# Force a garbage-collection pass before the TFRecord export.
import gc
gc.collect()

In [None]:
# Display the final table before it is written to disk.
train

In [None]:
# NOTE(review): pickle is imported here but not used in the visible cells.
import pickle
# Persist the fold-annotated table in the current working directory.
# The file name is spelled "startified" as written.
train.to_csv("startified_train.csv", index=False)


In [None]:
def _bytes_feature(value):
  """Wrap one bytes value (or an eager tensor holding one) in a bytes_list Feature."""
  eager_tensor_cls = type(tf.constant(0))
  # BytesList cannot unpack an EagerTensor, so take its underlying value first.
  raw = value.numpy() if isinstance(value, eager_tensor_cls) else value
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
  """Wrap one float / double value in a float_list Feature."""
  single_float = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=single_float)

def _int64_feature(value):
  """Wrap one bool / enum / int / uint value in an int64_list Feature."""
  single_int = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=single_int)

def serialize_example(image, image_name, label):
    """Serialize one sample as a tf.train.Example.

    The record has three features: 'image' and 'image_id' (bytes) and
    'target' (int64). Returns the serialized Example string.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_id': _bytes_feature(image_name),
        'target': _int64_feature(label),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def create_real_train_tf_records(fold  = 0):
    """Write every training image assigned to `fold` into one TFRecord file.

    Reads the module-level `train` table and `DATASET_NAME`. The output goes
    to /tmp/{DATASET_NAME}/ and its file name carries the fold number and the
    number of rows in that fold. Each record holds the file contents read
    from the image path, the encoded image id, and the contiguous class index.

    Args:
        fold: Value of the `fold` column to export.
    """
    df = train.loc[train['fold']==fold].reset_index(drop=True)
    tfr_filename = f'/tmp/{DATASET_NAME}/landmark-2021-train-{fold}-{df.shape[0]}.tfrec'
    with tf.io.TFRecordWriter(tfr_filename) as writer:
        # Walk only the two columns that are needed; iterrows() would build a
        # full Series for every row and yield an index that is never used.
        for image_id, target in zip(df['id'], df['fixed_landmark_id']):
            image_path = "../input/landmark-retrieval-2021/train/{}/{}/{}/{}.jpg".format(image_id[0],image_id[1],image_id[2],image_id)
            image_encoded = tf.io.read_file(image_path)
            image_name = str.encode(image_id)
            writer.write(serialize_example(image_encoded, image_name, target))

In [None]:
# Each run exports one block of ten consecutive folds:
# FOLD 0 -> 0-9, 1 -> 10-19, 2 -> 20-29, 3 -> 30-39.
# Fail fast on an unsupported value instead of leaving `foldrange` undefined,
# which would only surface later as a NameError.
if CFG.FOLD not in range(4):
    raise ValueError(f"CFG.FOLD must be 0, 1, 2 or 3, got {CFG.FOLD!r}")
first_fold = 10 * int(CFG.FOLD)
foldrange = range(first_fold, first_fold + 10)

In [None]:
import joblib
# Export the selected folds with up to 8 parallel jobs, one call per fold.
# tqdm wraps the iterable that feeds the jobs; the results are discarded.
_ = joblib.Parallel(n_jobs=8)(
        joblib.delayed(create_real_train_tf_records)(fold) for fold in tqdm(foldrange))

In [None]:
from datetime import datetime
# Timestamp (YYYYmmdd-HHMMSS) used as the version message for the upload.
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
print(version_name)

In [None]:
!cp /kaggle/working/train.csv /tmp/{DATASET_NAME}/train.csv


In [None]:
# Final check of the staging directory contents before uploading.
!ls /tmp/{DATASET_NAME}

In [None]:
# Upload the staging directory as a new dataset version, using the timestamp
# as the version message.
# NOTE(review): `-r zip` / `-q` presumably select zipped directory upload and quiet output — confirm against the Kaggle CLI help.
!kaggle datasets version -m {version_name} -p /tmp/{DATASET_NAME} -r zip -q