In [1]:
import os
import sys
import logging
import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

%matplotlib inline

# Emit INFO-level records so the summaries logged by later cells show up in the output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('PrepData')

# Put the notebook's parent directory on the import path. The membership
# check keeps sys.path free of duplicates when this cell is re-run.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Root of the data directory, relative to the notebook; download_datasets()
# builds its zip and extraction paths from it.
BASE_DATA_DIR = '../data/'

# Project-local helpers (download_file / extract_zip are used below).
# NOTE(review): presumably importable only because the first cell appends the
# parent directory to sys.path - confirm.
from vehicle_detector.utils import fs_utils

def download_datasets():
    '''
    Fetch and unpack the vehicle and non-vehicle image archives.

    For each archive URL, fs_utils.download_file is called with
    <BASE_DATA_DIR>/zips as the download directory, and the path it returns is
    handed to fs_utils.extract_zip with <BASE_DATA_DIR>/images as the
    extraction directory.
    '''
    # Both archives share the same download and extraction directories.
    zip_dir = os.path.join(BASE_DATA_DIR, 'zips')
    image_dir = os.path.join(BASE_DATA_DIR, 'images')

    archive_urls = (
        'https://s3.amazonaws.com/udacity-sdc/Vehicle_Tracking/vehicles.zip',
        'https://s3.amazonaws.com/udacity-sdc/Vehicle_Tracking/non-vehicles.zip',
    )

    for archive_url in archive_urls:
        archive_path = fs_utils.download_file(archive_url, zip_dir)
        fs_utils.extract_zip(archive_path, image_dir)

        
# Run the download + extraction as soon as this cell executes.
# NOTE(review): presumably this is what populates ../data/images/, which the
# cells below read from - confirm against fs_utils.
download_datasets()

In [3]:
def get_image_paths(dir_path):
    '''
    Recursively collect every file under dir_path whose name ends with '.png'
    (case-sensitive) and return the paths as a sorted list.
    '''
    png_paths = [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith('.png')
    ]
    png_paths.sort()
    return png_paths

def _color_histogram(image_path):
    '''
    Load the image at image_path and return its joint 3-channel histogram
    (256 bins per channel).

    Raises IOError if OpenCV cannot read the file (cv2.imread returns None).
    '''
    image = cv2.imread(image_path)
    if image is None:
        raise IOError('Could not read image: %s' % image_path)
    return cv2.calcHist([image], [0, 1, 2], None, [256, 256, 256], [0, 256, 0, 256, 0, 256])


def filter_similar_images(image_paths, intersection_thresh=750):
    '''
    Given a list of image paths, return a list of image paths with similar images removed.

    Implemented by calculating a colour histogram for each image and measuring
    the overlap (cv2.HISTCMP_INTERSECT) with the histogram of the most recently
    *kept* image. If the overlap is greater than intersection_thresh the image
    is considered a near-duplicate and is ignored; otherwise it is kept and
    becomes the new reference.

    Parameters:
        image_paths: sequence of image file paths, in the order they should be compared.
        intersection_thresh: overlap score above which an image is dropped.

    Returns:
        A new list of the kept paths, in their original order. The first path
        is always kept. Lists with fewer than two paths are returned as-is
        (as a copy) without reading any image.

    Raises:
        IOError: if an image that needs to be compared cannot be read.
    '''
    image_paths = list(image_paths)
    if len(image_paths) < 2:
        # Nothing to compare against: avoids indexing past the end of the list.
        return image_paths

    filtered_images = [image_paths[0]]
    prev_hist = _color_histogram(image_paths[0])

    with tqdm(total=len(image_paths)) as pbar:
        pbar.set_description('Filtering similar images')
        # Count the first (always kept) image so the bar finishes at its total.
        pbar.update(1)

        for curr_path in image_paths[1:]:
            curr_hist = _color_histogram(curr_path)

            intersection = cv2.compareHist(prev_hist, curr_hist, cv2.HISTCMP_INTERSECT)
            if intersection > intersection_thresh:
                logger.debug('Ignoring %s - Score %d', os.path.basename(curr_path), intersection)
            else:
                filtered_images.append(curr_path)
                # Only a kept image becomes the reference for later comparisons.
                prev_hist = curr_hist
            pbar.update(1)
    return filtered_images

In [4]:
# Directories holding the GTI vehicle crops.
GTI_data = [
    '../data/images/vehicles/GTI_MiddleClose',
    '../data/images/vehicles/GTI_Far',
    '../data/images/vehicles/GTI_Left',
    '../data/images/vehicles/GTI_Right',
]

unfiltered_gti_images = []
filtered_gti_images = []

for data_path in tqdm(GTI_data, desc='GTI data'):
    # Every PNG found under this directory, before any filtering.
    this_path_images = get_image_paths(data_path)
    unfiltered_gti_images += this_path_images

    # The dataset comes from a video sequence, so drop near-duplicate frames.
    filtered_images = filter_similar_images(this_path_images)
    filtered_gti_images += filtered_images

    logger.info('Loaded %d images from %s', len(filtered_images), os.path.basename(data_path))

logger.info(
    'Using %d images from a total of %d image in GTI Dataset',
    len(filtered_gti_images),
    len(unfiltered_gti_images),
)

INFO:PrepData:Loaded 218 images from GTI_MiddleClose
INFO:PrepData:Loaded 514 images from GTI_Far
INFO:PrepData:Loaded 519 images from GTI_Left
INFO:PrepData:Loaded 469 images from GTI_Right
INFO:PrepData:Using 1720 images from a total of 2826 image in GTI Dataset





In [6]:
non_vehicle_dirs = [
    '../data/images/non-vehicles/GTI',
    '../data/images/non-vehicles/Extras',
]

# KITTI vehicle images are used as-is; only the GTI images were run through
# the similarity filter.
kitti_vehicle_data = get_image_paths('../data/images/vehicles/KITTI_extracted')

# Positive samples: GTI (filtered or unfiltered) followed by KITTI.
filtered_vehicle_data = filtered_gti_images + kitti_vehicle_data
unfiltered_vehicle_data = unfiltered_gti_images + kitti_vehicle_data

# Negative samples: every PNG from each non-vehicle directory, in directory order.
all_non_vehicle_data = [
    image_path
    for directory in non_vehicle_dirs
    for image_path in get_image_paths(directory)
]

logger.info('Total Positive Samples %s (Filtered)', len(filtered_vehicle_data))
logger.info('Total Positive Samples %s (Not Filtered)', len(unfiltered_vehicle_data))
logger.info('Total Negative Samples %s (Not Filtered)', len(all_non_vehicle_data))

INFO:PrepData:Total Positive Samples 7686
INFO:PrepData:Total Negative Samples 8968


In [8]:
import json

# Manifest using the similarity-filtered vehicle images.
file_paths = dict(
    vehicle_images=filtered_vehicle_data,
    non_vehicle_images=all_non_vehicle_data,
)
with open('../data/filtered_images_path.json', 'w') as fp:
    json.dump(file_paths, fp)

# Same manifest with the unfiltered vehicle images swapped in; the
# non-vehicle list is unchanged.
file_paths.update(vehicle_images=unfiltered_vehicle_data)
with open('../data/unfiltered_images_path.json', 'w') as fp:
    json.dump(file_paths, fp)