In [1]:
import os
os.chdir('/Users/nick/Documents/school/research/EfficientLPR')
import numpy as np
import pandas as pd
import shutil
from cv2 import cv2
from tqdm import tqdm

In [2]:
def mkdir(dir_):
    if not os.path.exists(dir_):
        os.mkdir(dir_)
        
def remove_extraneous(data):
    new_data = data[['file', 'top', 'left', 'height', 'width', 'body-type', 'categorical-color']]
    new_data.columns = ['file', 'top', 'left', 'height', 'width', 'body', 'color']
    return new_data

def tlhw_to_corners(data):
    top, left, height, width = [data[[col]].values for col in TLHW]
    x1, y1 = left, top
    x2, y2 = left + width, top + height
    assert np.all(x1 < x2) and np.all(y1 < y2), "All x1 must be < x2 and all y1 must be < y2"
    assert np.all(x1 >= 0) and np.all(y1 >= 0), "All x1 and y1 must be >= 0"
    data[TLHW] =  np.hstack([x1,y1,x2,y2])
    data.columns = OUT_HEADER
    return data

def get_body_names(data):
    return data[['body']].values

def get_colors_names(data):
    return data[['color']].values

def align_stanford_classes(data, all_bodies):
    data = data.values
    class_id_idx = 5
    rows = pd.read_csv(f'{input_dir}/stanford-cars/names.csv', header=None).values
    bodies = pd.Series([row[0].split(' ')[-2].lower() for row in rows])
    
    # coerce some names
    for old, new in mappings.items():
        bodies = bodies.replace(old, new)
    
    bodies = bodies.values
    # replace class id with class name
    class_ids = data[:,class_id_idx].astype(int)
    class_names = bodies[class_ids -1]
    data[:,class_id_idx] = class_names

    filtered_data = [data[i] for i in range(len(data)) if data[i,class_id_idx] in all_bodies]
    
    dummy_color =  np.repeat('black', (len(filtered_data)))
    filtered_data = np.hstack([filtered_data, np.expand_dims(dummy_color,1)])
    assert np.all([x in all_bodies for x in filtered_data[:,class_id_idx]]), "Bodies must all be in the specified set of bodies"

    # set bounding boxes correctly
    filtered_data = pd.DataFrame(filtered_data)
    filtered_data.columns = OUT_HEADER
    return filtered_data


def split(data, prop=0.9):
    split_point = int(len(data) * prop)
    return data.iloc[:split_point], data.iloc[split_point:]

def shuffle(data):
    return data.sample(frac=1).reset_index(drop=True) # "drop" prevents old index from being prepended to columns

def get_colored_cars(all_colors):
    fnames = [f for f in os.listdir(f'{input_dir}/car-colors/train') if not f.startswith('.')]
    fnames = np.expand_dims(fnames,1 )
    # set dummy bbox such that [x1,y1] < [x2,y2]
    dummy_bbox = np.expand_dims([20,20,200,200], 0)
    dummy_bboxes = np.repeat(dummy_bbox, (len(fnames)), 0)
    dummy_body = np.expand_dims(np.repeat('coupe', (len(fnames))),1)
    colors = np.expand_dims(list(map(lambda x: x[0].split('_')[0], fnames)), 1)
    print(set([x[0] for x in colors]))
    assert np.all([x[0] in all_colors for x in colors]), "All colors must be one of {}".format(all_colors)
    data = pd.DataFrame(np.hstack([fnames, dummy_bboxes, dummy_body, colors]))
    data.columns = OUT_HEADER
    return data


def copy_images(data, in_dir, out_dir):
    data.reset_index(drop=True, inplace=True)
    for idx, row in enumerate(tqdm(data.values)):
        fname = row[0]
        x1, y1, x2, y2 = [int(x) for x in row[1:5]]
        Xs = np.array([x1,x2])
        Ys = np.array([y1,y2])
        
        in_path = f'{in_dir}/{fname}'
        out_path = f'{out_dir}/{fname}'

        img = cv2.imread(in_path)
        H, W = img.shape[:2]

        if (
            not np.all(np.hstack([Xs, Ys]) >= 0) or
            not np.all(Xs <= W) or 
            not np.all(Ys <= H)
        ):
            print('Dropped row with out of bounds bbox: {}'.format(fname))
            data.drop([idx], inplace=True)
        cv2.imwrite(out_path, img)
    return data
        
def save_dataset(data, ds_name, folder, set_name):
    out_dir = output_dir + '/' + ds_name
    img_dir = out_dir + '/' + folder

    for dir_ in [out_dir, img_dir]:
        if not os.path.exists(dir_):
            os.mkdir(dir_)

    # copy images to outfolder
    data = copy_images(data, f'{input_dir}/{ds_name}/{folder}', img_dir)
    
    # append folder/ to fname
    data[['file']] = folder + '/' + data[['file']]

    # save csv
    data.to_csv(f'{out_dir}/{set_name}_annotations.csv', index=False, header=False)

def merge_4x4s(data):
    data = data.replace('4x4', 'suv')
    return data

def merge_greys(data):
    data = data.replace('grey', 'silver')
    return data

def nzvd_pipeline(data):
    data = remove_extraneous(data)
    data = tlhw_to_corners(data)
    data = merge_4x4s(data)
    data = merge_greys(data)
    data = shuffle(data)
    return data

In [4]:
input_dir = 'data/raw'
output_dir = 'data/processed'

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

TLHW = ['top', 'left', 'height', 'width']
XYXY = ['x1', 'y1', 'x2', 'y2']
OUT_HEADER = ['file', *XYXY, 'body', 'color']

mappings = {
    'supercab': 'ute',
    'cab': 'ute',
    'minivan': 'people-mover',
    'wagon': 'station-wagon',
}

nzvd_train = pd.read_csv(f'{input_dir}/nzvd/train_labels.csv')
nzvd_test = pd.read_csv(f'{input_dir}/nzvd/test_labels.csv')

stan = pd.read_csv(f'{input_dir}/stanford-cars/anno_train.csv', header=None)
stan.columns = ['file', *TLHW, 'class']

nzvd_train, nzvd_test = nzvd_pipeline(nzvd_train), nzvd_pipeline(nzvd_test)

all_data = nzvd_train.append(nzvd_test, ignore_index=True)
all_bodies = np.unique(get_body_names(all_data))
all_colors = np.unique(get_colors_names(all_data))

stan = align_stanford_classes(stan, all_bodies)
stan = shuffle(stan)

colored_cars = get_colored_cars(all_colors)
colored_cars = shuffle(colored_cars)

nzvd_train, nzvd_val = split(nzvd_train)
assert abs(len(nzvd_val) * 9 - len(nzvd_train)) < 10

stan_train, stan_val = split(stan)
assert abs(len(stan_val) * 9 - len(stan_train)) < 10

colored_train, colored_val = split(colored_cars)
assert abs(len(colored_val) * 9 - len(colored_train)) < 10

assert np.all(np.equal(nzvd_train.columns, nzvd_test.columns)), "Column names must match"
assert np.all(np.equal(nzvd_train.columns, stan.columns)), "Column names must match"

save_dataset(nzvd_train, 'nzvd', 'train', 'train')
save_dataset(nzvd_val, 'nzvd', 'train', 'val')
save_dataset(nzvd_test, 'nzvd', 'test', 'test')
save_dataset(stan_train, 'stanford-cars', 'train', 'train')
save_dataset(stan_val, 'stanford-cars', 'train', 'val')
save_dataset(colored_train, 'car-colors', 'train', 'train')
save_dataset(colored_val, 'car-colors', 'train', 'val')

classes_out = pd.DataFrame(np.stack([all_bodies, list(range(len(all_bodies)))], axis=1))
colors_out = pd.DataFrame(np.stack([all_colors, list(range(len(all_colors)))], axis=1))
classes_out.to_csv(output_dir +'/classes.csv', index=False, header=False)
colors_out.to_csv(output_dir + '/colors.csv', index=False, header=False)

3%|▎         | 10/355 [00:00<00:03, 91.24it/s]{'orange', 'white', 'purple', 'silver', 'blue', 'brown', 'black', 'green', 'yellow', 'red', 'cream', 'gold'}
100%|██████████| 355/355 [00:03<00:00, 114.01it/s]
100%|██████████| 40/40 [00:00<00:00, 112.51it/s]
100%|██████████| 100/100 [00:00<00:00, 118.33it/s]
 34%|███▍      | 2336/6788 [00:37<01:28, 50.14it/s]Dropped row with out of bounds bbox: 07389.jpg
100%|██████████| 6788/6788 [01:54<00:00, 59.37it/s]
100%|██████████| 755/755 [00:17<00:00, 42.08it/s]
  4%|▍         | 50/1137 [00:03<00:51, 21.15it/s]Dropped row with out of bounds bbox: green_54.jpeg
 80%|████████  | 915/1137 [00:48<00:16, 13.80it/s]Dropped row with out of bounds bbox: red_99.jpeg
 95%|█████████▍| 1077/1137 [00:55<00:02, 20.24it/s]Dropped row with out of bounds bbox: cream_46.jpeg
100%|██████████| 1137/1137 [00:58<00:00, 19.51it/s]
100%|██████████| 127/127 [00:08<00:00, 15.54it/s]


In [5]:
# verify BBs
from cv2 import cv2
import pandas as pd
import os
os.chdir('/Users/nick/Documents/school/research/EfficientLPR')
data = pd.read_csv('data/processed/stanford-cars/train_annotations.csv')
data.columns = ['file', 'x1', 'y1', 'x2', 'y2', 'body', 'color']
print(data[['body']].count())

index = 500
sample = data.values[index,:]
fname = sample[0]
print(sample[5:])
a, b, c, d = [int(x) for x in sample[1:5]]
img = cv2.imread('data/processed/stanford-cars/' + fname)
img = cv2.circle(img, (a, b), 5, (250, 0,0)) #x1
img = cv2.circle(img, (c, d), 5, (0, 250,0)) #y1
cv2.imwrite('sample.png', img)

body    6786
dtype: int64
['hatchback' 'black']


True

In [9]:
os.chdir('/Users/nick/Documents/school/research/EfficientLPR')
data = pd.read_csv('data/processed/car-colors/train_annotations.csv')

index = 10
sample = data.values[index, :]
print(sample)
img = cv2.imread('data/processed/car-colors/' + sample[0])
cv2.imwrite('sample.png', img)

['train/silver_76.jpeg' 20 20 200 200 'coupe' 'silver']


True