# Load Flickr8k metadata

In [None]:
import json
import shutil
import os

In [None]:
# Root data directory. The mini dataset directory is created underneath it.
# Ends with '/' because paths in this notebook are built by string concatenation.
base_dn = '/Users/pbos/projects/spokenLanguage/data/'

In [None]:
# Directory of the full Flickr8k dataset: dataset.json, wav2capt.txt,
# wav2spk.txt, the images and the wavs are all read from below this path.
flickr8k_dn = '/Users/pbos/projects/spokenLanguage/data/flickr8k/'

In [None]:
# Read the complete Flickr8k metadata file and parse it into `metadata`.
metadata_path = flickr8k_dn + 'dataset.json'
with open(metadata_path) as metadata_file:
    metadata = json.loads(metadata_file.read())

In [None]:
# Sanity check: the full metadata is expected to list exactly 8000 images.
n_images = len(metadata['images'])
assert n_images == 8000

In [None]:
# Number of images per split label in the full dataset (shown as cell output).
all_splits = [image['split'] for image in metadata['images']]
{label: all_splits.count(label) for label in ('train', 'test', 'val')}

# Build small dataset

Let's build a smaller dataset out of this.

We need to make sure the small dataset includes images of every split, i.e. training, testing and validation. We could pick images of each split out of the full dataset to stay as close to flickr8k as possible. However, since it's just a training set, we can also simply overwrite the `split` labels in the metadata ourselves.

In the original dataset, the fractions are 6:1:1 for train:test:val, so trying to stay close to that may make sense.

In the original flickr1d build script (v1 and v2) we copied images and didn't actually use them. Now we use these images to rebuild the feature file.

In [None]:
# Name of the mini dataset: used for its directory name and stored as the
# 'dataset' field of the metadata written to dataset.json.
name = "flickr1h"

In [None]:
# Total number of images taken from the start of the full dataset.
mini_size = 100
# Number of images labelled 'test' (the block just before the 'val' block).
N_test = 12
# Number of images labelled 'val' (the last N_val images of the mini dataset).
N_val = 12

In [None]:
# Output directory of the mini dataset.
# os.path.join avoids the doubled separator that plain string formatting
# produces when base_dn already ends with '/'. The empty last component keeps
# the trailing separator that the later `data_dn + ...` concatenations rely on.
data_dn = os.path.join(base_dn, name, '')

In [None]:
# Subdirectories of data_dn for the audio files and the images. Both end with
# '/' because file paths are built by concatenating data_dn + subdir + filename.
audio_subdir = 'flickr_audio/'
image_subdir = 'flickr8k_images/'

In [None]:
# Create the output directories (including missing parents).
# exist_ok=True lets this cell be re-run without raising FileExistsError when
# the directories are already present from an earlier run.
for subdir in (image_subdir, audio_subdir):
    os.makedirs(data_dn + subdir, exist_ok=True)

## Images and image metadata

In [None]:
# Copy the first `mini_size` images listed in the full metadata into the
# image directory of the mini dataset, keeping their filenames.
source_image_dn = flickr8k_dn + '/Flickr8k_Dataset/Flicker8k_Dataset/'
target_image_dn = data_dn + image_subdir
for ix in range(mini_size):
    image_fn = metadata['images'][ix]['filename']
    shutil.copyfile(source_image_dn + image_fn, target_image_dn + image_fn)

In [None]:
# Metadata of the mini dataset: its name plus the entries of the first
# `mini_size` images. Each entry is copied (shallow dict copy) so that
# overwriting 'split' labels on the mini dataset does not silently change the
# entries of the full `metadata` as well.
miniset_meta = {'dataset': name,
                'images': [dict(image) for image in metadata['images'][:mini_size]]}

In [None]:
# Assign the split labels of the mini dataset by position:
#   first (mini_size - N_test - N_val) images -> 'train'
#   next N_test images                        -> 'test'
#   last N_val images                         -> 'val'
# The train portion is labelled explicitly as well. Otherwise those images
# keep whatever split they had in the full dataset, and the mini dataset can
# end up with more test/val images than N_test/N_val.
n_train = mini_size - N_test - N_val
for position, image in enumerate(miniset_meta['images']):
    if position < n_train:
        image['split'] = 'train'
    elif position < n_train + N_test:
        image['split'] = 'test'
    else:
        image['split'] = 'val'

In [None]:
# Show the split label of every image in the mini dataset, in order.
split_labels = [entry['split'] for entry in miniset_meta['images']]
print(split_labels)

In [None]:
# Store the mini dataset metadata as JSON in the mini dataset directory.
with open(data_dn + 'dataset.json', 'w') as meta_file:
    meta_file.write(json.dumps(miniset_meta))

## Wav files - wav2capt and wav2spk metadata

In [None]:
# Select the wav2capt entries that belong to images of the mini dataset.
# Every kept entry is the list of whitespace-separated fields of one line;
# the second field is matched against the image filenames.
miniset_img_filenames = [im['filename'] for im in miniset_meta['images']]
# A set gives constant-time membership tests while scanning the whole file.
img_filename_lookup = set(miniset_img_filenames)
with open(flickr8k_dn + 'wav2capt.txt', 'r') as fh:
    # Each line is split only once. Lines with fewer than two fields (e.g.
    # blank lines) are skipped instead of raising an IndexError.
    wav2capt = [fields for fields in map(str.split, fh)
                if len(fields) > 1 and fields[1] in img_filename_lookup]

In [None]:
# Write the selected wav2capt entries, one space-separated entry per line.
with open(data_dn + audio_subdir + 'wav2capt.txt', 'w') as capt_file:
    capt_file.writelines(' '.join(fields) + '\n' for fields in wav2capt)

In [None]:
# Select the wav2spk entries whose wav file (first field) is part of the
# mini dataset, i.e. appears as first field of a selected wav2capt entry.
miniset_wav_filenames = [wav[0] for wav in wav2capt]
# A set gives constant-time membership tests while scanning the whole file.
wav_filename_lookup = set(miniset_wav_filenames)
with open(flickr8k_dn + 'wav2spk.txt', 'r') as fh:
    # Each line is split only once. Blank lines yield an empty field list and
    # are skipped instead of raising an IndexError.
    wav2spk = [fields for fields in map(str.split, fh)
               if fields and fields[0] in wav_filename_lookup]

In [None]:
# Write the selected wav2spk entries, one space-separated entry per line.
with open(data_dn + audio_subdir + 'wav2spk.txt', 'w') as spk_file:
    spk_file.writelines(' '.join(fields) + '\n' for fields in wav2spk)

Then finally copy over the actual wavs:

In [None]:
# Copy every wav file named in the selected wav2spk entries (first field)
# from the full dataset into the audio directory of the mini dataset.
for entry in wav2spk:
    wav_fn = entry[0]
    source_wav = flickr8k_dn + '/flickr_audio/wavs/' + wav_fn
    target_wav = data_dn + audio_subdir + wav_fn
    shutil.copyfile(source_wav, target_wav)

## Preprocess

In [None]:
# %pip install soundfile

In [None]:
import platalea.utils.preprocessing as prep

In [None]:
# Run platalea's Flickr8k preprocessing on the mini dataset directory.
# NOTE(review): presumably this produces the feature files from the copied
# images and wavs — confirm against platalea.utils.preprocessing.
prep.preprocess_flickr8k(data_dn, audio_subdir, image_subdir)

## Remove files

We don't need the actual images and wavs after preprocessing into features, so we can safely remove them again before publishing our dataset somewhere.

In [None]:
import glob

In [None]:
# Delete the whole image directory, then every .wav file directly inside the
# audio directory (other files in the audio directory are left in place).
image_dn = data_dn + image_subdir
shutil.rmtree(image_dn)
wav_pattern = data_dn + audio_subdir + "*.wav"
for wav_path in glob.glob(wav_pattern):
    os.remove(wav_path)

## Add default config.yml file

In [None]:
# Write the default configuration file, one "key    value" setting per line.
# writelines() does not add line separators, so every entry carries its own
# '\n'; without them both settings would be fused together on a single line.
with open(data_dn + 'config.yml', 'w') as fh:
    fh.writelines(["flickr8k_meta        dataset.json\n",
                   "audio_features_fn    mfcc_features.pt\n"])

# Done!