# Shrinking dataset for initial model exploration

The input directory must contain the images and an `_annotations.csv` file with labels in the RetinaNet CSV format (one bounding box per row: `file_name,x1,y1,x2,y2,label`, no header row).

### Import dependencies

In [116]:
import numpy as np
import pandas as pd
import os, shutil

### Setup paths and constants

In [117]:
# directory holding the full dataset (images plus _annotations.csv); must already exist
input_dataset_path = '../dataset/test/'
# directory that receives the reduced dataset
output_dataset_path = '../dataset_small/test/'

assert os.path.isdir(input_dataset_path), 'input dataset directory not found: ' + input_dataset_path
# create the output directory on first run instead of failing when it is missing
os.makedirs(output_dataset_path, exist_ok=True)

output_size = 200 # number of images to return

### Read _annotations.csv file

In [118]:
# preview the annotations path; input_dataset_path already ends in '/', so joining
# with another '/' produces a doubled slash in the displayed string
'/'.join([input_dataset_path, '_annotations.csv'])

'../dataset/test//_annotations.csv'

In [119]:
# load the header-less annotations CSV, naming its six columns explicitly
labels = ['file_name', 'x1', 'y1', 'x2', 'y2', 'label']
bboxes_df = pd.read_csv(
    input_dataset_path + '_annotations.csv',
    sep=',',
    header=None,
    names=labels,
)
# summarise column dtypes and non-null counts of the loaded frame
bboxes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1884 entries, 0 to 1883
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1884 non-null   object
 1   x1         1884 non-null   int64 
 2   y1         1884 non-null   int64 
 3   x2         1884 non-null   int64 
 4   y2         1884 non-null   int64 
 5   label      1884 non-null   object
dtypes: int64(4), object(2)
memory usage: 88.4+ KB


In [120]:
# keep only the bounding boxes whose label is exactly 'car'
bboxes_df = bboxes_df.loc[bboxes_df['label'].eq('car')]

In [121]:
# cap output_size at the number of distinct image names still present,
# so that sampling without replacement cannot ask for more than exist

unique_file_names = bboxes_df['file_name'].unique().size
output_size = min(output_size, unique_file_names)

### Pick a random sample of images (file_name)

- Sample output_size images from bboxes_df.
- Create a mapping from each sampled file_name to a short name: img1.jpg, img2.jpg, ...
- Construct output df with only labels for those images.

In [122]:
# fixed seed so that re-running the notebook selects the same subset of images
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# sample output_size distinct file names (no image is picked twice)
file_names = bboxes_df['file_name'].unique()
file_names_subset = rng.choice(file_names, size=output_size, replace=False)
print('length of subset: {}\n'.format(len(file_names_subset)))

# create name mapping. Example: '1478732900171504336_jpg.rf.7423b62a846b66cadd0ef67daa59cb83.jpg' -> 'img1.jpg'
# numbering starts at 1 and follows the order of the sampled array
file_names_mapping = {name: 'img{}.jpg'.format(i) for i, name in enumerate(file_names_subset, start=1)}
print('Name Mapping:\n')
# preview at most three entries; slicing avoids an IndexError when fewer than
# three images were sampled
for name in file_names_subset[:3]:
    print(name + ' -> ' + file_names_mapping[name])

length of subset: 200

Name Mapping:

1478732704040480944_jpg.rf.247052eee42a713b33df0938aad4ac67.jpg -> img1.jpg
1478896462764592863_jpg.rf.f61181266143bd8d959f1130fae5ed98.jpg -> img2.jpg
1478896308515808431_jpg.rf.78681db0a7ecc78e32e958401642b959.jpg -> img3.jpg


In [123]:
# keep only the annotation rows that belong to one of the sampled images
bboxes_subset_df = bboxes_df.loc[bboxes_df['file_name'].isin(file_names_subset)]

# sanity check: the subset must contain exactly output_size distinct file names
assert len(bboxes_subset_df['file_name'].unique()) == output_size

### Copy images and write the output _annotations.csv file

In [124]:
# copy every sampled image into the output directory under its short name
# from file_names_mapping (the originals stay in place)
for original_name in bboxes_subset_df['file_name'].unique():
    source_path = input_dataset_path + original_name
    target_path = output_dataset_path + file_names_mapping[original_name]
    shutil.copy(source_path, target_path)

In [125]:
# replace each original file name in the frame with its short alias
bboxes_subset_df = bboxes_subset_df.assign(
    file_name=lambda frame: frame['file_name'].map(file_names_mapping)
)

# save the annotations without a header row or an index column
bboxes_subset_df.to_csv(
    output_dataset_path + '_annotations.csv',
    header=False,
    index=False,
)