# Script for preparing data

This script downloads the data and extracts features from it using the MegaDescriptor model. We first load the required packages and specify where the data and the extracted features will be stored.

In [None]:
import sys
sys.path.append('..')
import os
import torch
from wildlife_datasets import datasets
from sides_matching import WD, get_extractor, get_normalized_features, get_transform

# Folders (relative to this notebook) holding the raw datasets and the
# extracted feature files, respectively.
root_data = '../data'
root_features = '../features'

# Datasets to process. Each entry of `data` is a (folder name, dataset class)
# pair; the folder name is identical to the class name in `wildlife_datasets`.
dataset_names = ('AmvrakikosTurtles', 'ReunionTurtles', 'ZakynthosTurtles')
data = [(name, getattr(datasets, name)) for name in dataset_names]

Then we download the data. If an error appears, Kaggle is probably not set up. In that case, either download the data manually or follow the link in the error message.

In [None]:
# Download each configured dataset into its own sub-folder of `root_data`.
for dataset_name, dataset_class in data:
    dataset_class.get_data(os.path.join(root_data, dataset_name))

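Now we extract the features with the MegaDescriptor (large flavour) model. For every dataset, features are computed for both grayscale and colour images; a combination is skipped if its feature file already exists.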

In [None]:
# Model identifier handed to `get_extractor` and image size handed to
# `get_transform`.
model_name = 'hf-hub:BVRA/MegaDescriptor-L-384'
img_size = 384
# Only unflipped images are processed. The same flag feeds both the transform
# and the feature file name so the two cannot drift apart.
flip = False

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The extractor arguments (model_name, batch_size, device) do not change
# between iterations, so it is built at most once and shared by all datasets.
# It is created lazily so that nothing is built when every feature file
# already exists.
extractor = None
for dataset_name, dataset_class in data:
    root = os.path.join(root_data, dataset_name)
    for grayscale in [True, False]:
        file_name = os.path.join(root_features, f'features_{dataset_name}_flip={flip}_grayscale={grayscale}.npy')
        # Skip combinations whose feature file is already present.
        if os.path.exists(file_name):
            continue
        if extractor is None:
            extractor = get_extractor(model_name=model_name, batch_size=32, device=device)
        transform = get_transform(flip=flip, grayscale=grayscale, img_size=img_size, normalize=True)
        d = dataset_class(root)
        # Use 'bbox' loading when `d.df` contains 'bbox', otherwise 'full'.
        # NOTE(review): presumably `d.df` is a pandas DataFrame, so this tests
        # for a 'bbox' column - confirm.
        img_load = 'bbox' if 'bbox' in d.df else 'full'
        dataset = WD(d.df, d.root, transform=transform, img_load=img_load)
        # NOTE(review): presumably `get_normalized_features` writes its result
        # to `file_name`; make sure the target folder exists beforehand.
        os.makedirs(root_features, exist_ok=True)
        # The result is kept in `features` as in the original cell, although
        # nothing in this cell reads it.
        features = get_normalized_features(file_name, dataset, extractor)