# Find Indexes that should be skipped due to bad data
This file focuses  on preprocessing data that should be skipped
specifically the indexes that have bad image data or bad joint data

In [1]:
import sys;sys.path.insert(0, '..')
import numpy as np
from src.dog_data_set import DogPoseDataSet
from tqdm.notebook import tqdm
import torchvision.transforms as transforms
from src.data_utils import is_bad_training_sample, is_bad_evaluation_sample
from src.crop import CropAroundBoundingBox

train_skip_file_path = "../data/annotations/train_skip_stanford_StanfordExtra_v12_new_split.npy"

TransformsBasic = transforms.Compose([
    CropAroundBoundingBox(),
])

dog_pose_train_data_set = DogPoseDataSet(
    images_dir = "../data/Images/", 
    np_split_file="../data/annotations/train_stanford_StanfordExtra_v12_new_split.npy", 
    annotations_json_file="../data/annotations/StanfordExtra_v12.json", transform=TransformsBasic)

train_data_bad_indexes = []

for index, sample in tqdm(enumerate(dog_pose_train_data_set)):
    img_index = sample["img_index"]

    if (is_bad_training_sample(sample)):
        train_data_bad_indexes.append(img_index)
        
    
train_data_bad_indexes_numpy = np.array(train_data_bad_indexes, dtype=int)

print(f"Number of Bad indexes for training data is {len(train_data_bad_indexes)}")

print(f"Saving file to {train_skip_file_path}")

np.save(train_skip_file_path, train_data_bad_indexes_numpy) 

loaded_train_skip = np.load(train_skip_file_path)

print(loaded_train_skip)

0it [00:00, ?it/s]

Number of Bad indexes for training data is 82
Saving file to ../data/annotations/train_skip_stanford_StanfordExtra_v12_new_split.npy
[1413  186 1776  629  663  992 1119 1167   83 1236 1827  275  617  254
  778 1739 1966 2302 1963 2033 2525  122 2484 3065 3182 3239 3425 3959
 4613 4344 4407 4491 4553 4586 4763 5382 5232 5239 5231 4946 5385 5916
 5965 6598 6626 7242 7299 6200 6743 6828 7207 7376 7232 7662 7962 8161
  113  469 1757 2071 2199 2615 6192 7378 7658 7799 8586 8680 8784 8974
 8987 9009 9014 9027 9071 9091 9169 9276 9322 9470 9487 9770]


# Find Indexes that should be skipped for testing data
Testing sample should skipped if it has `is_multiple_dogs` set to `True`
as well as bad joint or bad image data


In [2]:
test_skip_file_path = "../data/annotations/test_skip_stanford_StanfordExtra_v12_new_split.npy"

dog_pose_test_data_set = DogPoseDataSet(
    images_dir = "../data/Images/", 
    np_split_file="../data/annotations/test_stanford_StanfordExtra_v12_new_split.npy", 
    annotations_json_file="../data/annotations/StanfordExtra_v12.json", transform=TransformsBasic)

test_data_bad_indexes = []

for index, sample in tqdm(enumerate(dog_pose_test_data_set)):
    img_index = sample["img_index"]

    if (is_bad_evaluation_sample(sample)):
        test_data_bad_indexes.append(img_index)
        
    
test_data_bad_indexes_numpy = np.array(test_data_bad_indexes, dtype=int)

print(f"Number of Bad indexes for testing data is {len(test_data_bad_indexes)}")

print(f"Saving file to {test_skip_file_path}")

np.save(test_skip_file_path, test_data_bad_indexes_numpy) 

loaded_test_skip = np.load(test_skip_file_path)

print(loaded_test_skip)

0it [00:00, ?it/s]

Number of Bad indexes for testing data is 20
Saving file to ../data/annotations/test_skip_stanford_StanfordExtra_v12_new_split.npy
[ 944    0  463 1485 1860 2280 2256 2310 3233 2946 3515 4000 4362 4736
 5447 5606 6417 7175 8063 7631]


# Find Indexes that should be skipped for validation data
validation sample should skipped if it has `is_multiple_dogs` set to `True`
as well as bad joint or bad image data

In [3]:
val_skip_file_path = "../data/annotations/val_skip_stanford_StanfordExtra_v12_new_split.npy"

dog_pose_val_data_set = DogPoseDataSet(
    images_dir = "../data/Images/", 
    np_split_file="../data/annotations/val_stanford_StanfordExtra_v12_new_split.npy", 
    annotations_json_file="../data/annotations/StanfordExtra_v12.json", transform=TransformsBasic)

val_data_bad_indexes = []

for index, sample in tqdm(enumerate(dog_pose_val_data_set)):
    img_index = sample["img_index"]

    if (is_bad_evaluation_sample(sample)):
        val_data_bad_indexes.append(img_index)
        
    
val_data_bad_indexes_numpy = np.array(val_data_bad_indexes, dtype=int)

print(f"Number of Bad indexes for validation data is {len(val_data_bad_indexes)}")

print(f"Saving file to {val_skip_file_path}")

np.save(val_skip_file_path, val_data_bad_indexes_numpy) 

loaded_val_skip = np.load(val_skip_file_path)

print(len(loaded_val_skip))

0it [00:00, ?it/s]

Number of Bad indexes for validation data is 1
Saving file to ../data/annotations/val_skip_stanford_StanfordExtra_v12_new_split.npy
1


In [4]:
skip_indexes = np.load("../data/annotations/val_skip_stanford_StanfordExtra_v12_new_split.npy")
skip_indexes
skip_set = set(skip_indexes.flatten())

val_split_np = np.load("../data/annotations/val_stanford_StanfordExtra_v12_new_split.npy")

clean_split_np = []

for index in val_split_np:
    if (index not in skip_set):
        print(f"Good Index {index}")
        clean_split_np.append(index)
    else:
        print(f"Bad Index {index}")

clean_split_np


Good Index 10835
Good Index 10836
Good Index 10837
Good Index 10838
Good Index 10839
Good Index 10840
Good Index 10841
Good Index 10842
Good Index 10843
Good Index 10844
Good Index 10845
Good Index 10846
Good Index 10847
Good Index 10848
Good Index 10849
Good Index 10850
Good Index 10851
Good Index 10852
Good Index 10853
Good Index 10854
Good Index 10855
Good Index 10856
Good Index 10857
Good Index 10858
Good Index 10859
Good Index 10860
Good Index 10861
Good Index 10862
Good Index 10863
Good Index 10864
Good Index 10865
Good Index 10866
Good Index 10867
Good Index 10868
Good Index 10869
Good Index 10870
Good Index 10871
Good Index 10872
Good Index 10873
Good Index 10874
Good Index 10875
Good Index 10876
Good Index 10877
Good Index 10878
Good Index 10879
Good Index 10880
Good Index 10881
Good Index 10882
Good Index 10883
Good Index 10884
Good Index 10885
Good Index 10886
Good Index 10887
Good Index 10888
Good Index 10889
Good Index 10890
Good Index 10891
Good Index 10892
Good Index 108

[10835,
 10836,
 10837,
 10838,
 10839,
 10840,
 10841,
 10842,
 10843,
 10844,
 10845,
 10846,
 10847,
 10848,
 10849,
 10850,
 10851,
 10852,
 10853,
 10854,
 10855,
 10856,
 10857,
 10858,
 10859,
 10860,
 10861,
 10862,
 10863,
 10864,
 10865,
 10866,
 10867,
 10868,
 10869,
 10870,
 10871,
 10872,
 10873,
 10874,
 10875,
 10876,
 10877,
 10878,
 10879,
 10880,
 10881,
 10882,
 10883,
 10884,
 10885,
 10886,
 10887,
 10888,
 10889,
 10890,
 10891,
 10892,
 10893,
 10894,
 10895,
 10896,
 10897,
 10898,
 10899,
 10900,
 10901,
 10902,
 10903,
 10904,
 10905,
 10906,
 10907,
 10908,
 10909,
 10910,
 10911,
 10912,
 10913,
 10914,
 10915,
 10916,
 10917,
 10918,
 10919,
 10920,
 10921,
 10922,
 10923,
 10924,
 10925,
 10926,
 10927,
 10928,
 10929,
 10930,
 10931,
 10932,
 10933,
 10934,
 10935,
 10936,
 10937,
 10938,
 10939,
 10940,
 10941,
 10942,
 10943,
 10944,
 10945,
 10946,
 10947,
 10948,
 10949,
 10950,
 10951,
 10952,
 10953,
 10954,
 10955,
 10956,
 10957,
 10958,
 10959,
