# Script for preparing data

This script downloads the data and extracts features with the MegaDescriptor and ALIKED models. We first load the required packages and specify where the data and the extracted features will be stored.

In [None]:
import sys
sys.path.append('..')
import os
from wildlife_datasets import datasets
from sides_matching import get_features, get_transform
from sides_matching import amvrakikos, reunion_green, reunion_hawksbill, zakynthos

# Folder holding the raw datasets and folder receiving the extracted features.
root_data = '/data/wildlife_datasets/data'
root_features = '../features'

# One entry per evaluated dataset: (name used in the feature file names,
# folder with the raw data, loader imported from sides_matching).
# Both Reunion entries read from the same 'ReunionTurtles' folder.
data = [
    (name, os.path.join(root_data, folder), loader)
    for name, folder, loader in (
        ('Amvrakikos', 'AmvrakikosTurtles', amvrakikos),
        ('ReunionGreen', 'ReunionTurtles', reunion_green),
        ('ReunionHawksbill', 'ReunionTurtles', reunion_hawksbill),
        ('Zakynthos', 'ZakynthosTurtles', zakynthos),
    )
]

Then we download the data. If an error appears, Kaggle is probably not set up. In that case, either download the data manually or follow the link in the error message.

In [None]:
# Dataset classes to download; each one gets the subfolder of root_data that is
# named after the class, matching the folders referenced in `data`.
dataset_classes = (
    datasets.AmvrakikosTurtles,
    datasets.ReunionTurtles,
    datasets.ZakynthosTurtles,
)
for dataset_class in dataset_classes:
    dataset_class.get_data(os.path.join(root_data, dataset_class.__name__))

Now we extract the features with the MegaDescriptor (large flavour) model and with the ALIKED extractor. Feature files that already exist are not recomputed.

In [None]:
import timm
import torch
from wildlife_tools.features import DeepFeatures, AlikedExtractor, SiftExtractor

model_name = 'hf-hub:BVRA/MegaDescriptor-L-384'
img_size = 384
batch_size = 32

device = 'cuda' if torch.cuda.is_available() else 'cpu'
os.makedirs(root_features, exist_ok=True)

# The MegaDescriptor backbone and the Aliked extractor do not depend on the
# dataset, flip or grayscale setting. They are created lazily on first use and
# then reused, so the pretrained weights are loaded at most once per run and
# not at all when every feature file already exists.
model = None
aliked_extractor = None

for name, root, dataset_class in data:
    for flip in [True, False]:
        for grayscale in [True, False]:
            suffix = f'{name}_flip={flip}_grayscale={grayscale}'
            file_name_mega = os.path.join(root_features, f'MegaDescriptor_{suffix}.pickle')
            file_name_aliked = os.path.join(root_features, f'Aliked_{suffix}_{img_size}.pickle')
            need_mega = not os.path.exists(file_name_mega)
            need_aliked = not os.path.exists(file_name_aliked)
            if not (need_mega or need_aliked):
                # Both feature files are present; skip building the dataset.
                continue

            transform = get_transform(flip=flip, grayscale=grayscale, img_size=img_size, normalize=True)
            dataset = dataset_class(root, transform=transform, load_label=True)

            # MegaDescriptor
            if need_mega:
                if model is None:
                    model = timm.create_model(model_name, num_classes=0, pretrained=True)
                extractor = DeepFeatures(model, batch_size=batch_size, device=device)
                # NOTE(review): get_features presumably stores the result in
                # the given file -- confirm in sides_matching.
                features = get_features(file_name_mega, dataset, extractor)

            # Aliked
            if need_aliked:
                if aliked_extractor is None:
                    aliked_extractor = AlikedExtractor(batch_size=batch_size, device=device)
                features = get_features(file_name_aliked, dataset, aliked_extractor)