In [27]:
import os
import os.path as osp
import shutil
import pandas as pd
import numpy as np
from collections import defaultdict
import random
import json
from tqdm import tqdm

In [None]:
# save all images from all cameras to one folder
def copy_images_from_dataset(dataset_path, dst_path):
    if not os.path.isdir(dst_path):
        os.mkdir(dst_path)
    for car_name in tqdm(os.listdir(dataset_path), desc='Copying images'):
        car_path = osp.join(dataset_path, car_name)
        for camera_name in os.listdir(car_path):
            cur_camera_path = osp.join(car_path, camera_name)
            for image_name in os.listdir(cur_camera_path):
                shutil.copy(osp.join(cur_camera_path, image_name), dst_path)

In [28]:
dataset_path = osp.join('..', '..', 'veri')
dst_path = osp.join('dataset', 'veri_images')
copy_images_from_dataset(dataset_path, dst_path)

NameError: name 'copy_images_from_dataset' is not defined

In [None]:
# sorted(os.listdir(dst_path), key=lambda s: int(s.split('_')[2]))[-15:]

['0247_c002_00089940_0.jpg',
 '0691_c009_00089940_0.jpg',
 '0691_c009_00089945_0.jpg',
 '0247_c002_00089950_0.jpg',
 '0691_c009_00089960_0.jpg',
 '0247_c002_00089960_0.jpg',
 '0326_c009_00089965_0.jpg',
 '0247_c002_00089970_0.jpg',
 '0326_c009_00089970_0.jpg',
 '0326_c009_00089975_0.jpg',
 '0247_c002_00089980_0.jpg',
 '0326_c009_00089980_0.jpg',
 '0326_c009_00089985_0.jpg',
 '0326_c009_00089990_0.jpg',
 '0247_c002_00089990_0.jpg']

In [30]:
image_folder = dst_path

# Dict to save info about all images
data = defaultdict(lambda: defaultdict(list))
for filename in tqdm(os.listdir(image_folder), desc='Scanning images'):
    if filename.endswith('.jpg'):
        car_id, camera_id, image_id, _ = filename.split('_')
        data[car_id][camera_id].append(filename)

# Split images into train and test (query + galley)
train_data: list[str] = []
query_data: list[str] = []
gallery_data: list[str] = []
random.seed(0)
for car_id, cameras in tqdm(data.items(), desc='Making split'):
    for camera_id, images in cameras.items():
        # if at least two images for this camera are present, add some of them to test
        if len(images) >= 2:
            # 1 in 4 chance to add first image to query, 3 in 4 to add it to gallery
            if random.randint(0, 3) == 0:
                query_data.append(images[0])
            else:
                gallery_data.append(images[0])
            # second image always goes to gallery so it always has images of cars that are in query
            gallery_data.append(images[1])
            # the rest goes to train
            train_data.extend(images[2:])
        else:
            train_data.extend(images)

len(train_data), len(gallery_data), len(query_data)            

Scanning images: 100%|██████████| 49357/49357 [00:00<00:00, 642397.15it/s]
Making split: 100%|██████████| 776/776 [00:00<00:00, 95963.08it/s]


(35737, 11936, 1684)

In [None]:
# save split image names to respective files
with open(os.path.join('dataset', 'VeRi', 'name_train.txt'), 'w+') as f:
    f.write('\n'.join(train_data))
with open(os.path.join('dataset', 'VeRi', 'name_test.txt'), 'w+') as f:
    f.write('\n'.join(gallery_data))
with open(os.path.join('dataset', 'VeRi', 'name_query.txt'), 'w+') as f:
    f.write('\n'.join(query_data))

In [31]:
set(gallery_data).isdisjoint(query_data)

True

In [57]:
all_images = train_data + query_data + gallery_data
len(all_images)

49357

In [None]:
# transform keypoin_orient (viewpoints) into one file (since our split does not match the one in the
# original paper and therefore their train/test dplit does not make sence for us)
df1 = pd.read_csv(os.path.join('dataset', 'VeRi', 'keypoint_orient_train.txt'), sep=' ', header = None)
df2 = pd.read_csv(os.path.join('dataset', 'VeRi', 'keypoint_orient_test.txt'), sep=' ', header = None)
df = pd.concat([df1, df2])
all_keys = np.unique(df[0])
len(all_keys), len(np.intersect1d(all_images, all_keys))

(49356, 49294)

Somehow there are 62 images missing from viewpoint files... How can this be?

Anyway, save what we have to one file (the model has native missing viewpoint support)

In [80]:
all_views = {img_name: df.loc[df[0] == img_name, 1:].to_numpy() for img_name in tqdm(all_images) if img_name in all_keys}

100%|██████████| 49357/49357 [05:29<00:00, 149.60it/s]


In [90]:
all_views = {img_name: arr[0].tolist() for img_name, arr in all_views.items()}

In [None]:
with open(os.path.join('dataset', 'VeRi', 'viewpoints.txt'), "w") as f: 
    json.dump(all_views, f)

In [6]:
with open(os.path.join('dataset', 'VeRi', 'viewpoints.txt'), 'r') as f:
    viewpoints = json.load(f)

viewpoints['0208_c016_00038985_0.jpg']

[11,
 67,
 77,
 117,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 29,
 33,
 -1,
 -1,
 114,
 12,
 63,
 14,
 107,
 46,
 177,
 44,
 113,
 89,
 196,
 92,
 155,
 88,
 156,
 123,
 4]

In [1]:
%pwd
%ls ../../veri_splits/

'/home/serg_fedchn/Homework/6_semester/НИР/object-reidentification/baseline_reworked'

In [6]:
import os.path as osp
true_splits_path = '../../veri_splits' 
with open(osp.join(true_splits_path, 'veri_train_list.txt'), 'r') as f:
    train_list = set(map(lambda s: s.strip().split()[0], f.readlines()))
with open(osp.join(true_splits_path, 'veri_test_list.txt'), 'r') as f:
    test_list = set(map(lambda s: s.strip(), f.readlines()))
with open(osp.join(true_splits_path, 'veri_query_list.txt'), 'r') as f:
    query_list = set(map(lambda s: s.strip(), f.readlines()))

In [15]:
len(train_list), len(test_list),  len(query_list)

(37778, 11579, 1678)

In [16]:
len(train_list) + len(test_list)

49357

In [20]:
gallery_list = test_list - query_list

In [18]:
len(query_list.intersection(test_list)), len(train_list.intersection(test_list))

(1678, 0)

In [19]:
%pwd

'/home/serg_fedchn/Homework/6_semester/НИР/object-reidentification/baseline_reworked'

In [32]:
with open(osp.join('dataset', 'VeRi', 'original_veri_train_list.txt'), 'w') as f:
    f.write('\n'.join(train_list))
with open(osp.join('dataset', 'VeRi', 'original_veri_test_list.txt'), 'w') as f:
    f.write('\n'.join(test_list))
with open(osp.join('dataset', 'VeRi', 'original_veri_query_list.txt'), 'w') as f:
    f.write('\n'.join(query_list))

In [None]:
with open('dataset/VeRi/original_veri_train_list.txt', 'r') as f:
    lines = f.read()


944449