### Image Download & Processing Functions:

The functions below downloads images specified by query from Microsoft Cognitive Services API and processes the files to remove corrupted & duplicate images.

In [None]:
from requests import exceptions
import argparse
import requests
import cv2
import os
import itertools as it
import shutil
from collections import Counter, defaultdict
import numpy as np
import random
import string
from PIL import Image

In [4]:
# set path to image download directory:

path = '/path-to-image-directory/'

In [17]:
# Run Image Download & Process:

Image_Download_Process(path)
        
# Uncomment to run functions independently:

#Bad_Pics(path)
#Check_Duplicates(path)
#Name_Class(path)

Found 846 files in directory.


Type class name: buck


50 files processed...
100 files processed...
150 files processed...
200 files processed...
250 files processed...
300 files processed...
350 files processed...
400 files processed...
450 files processed...
500 files processed...
550 files processed...
600 files processed...
650 files processed...
700 files processed...
750 files processed...
800 files processed...
Renamed 846 files

Example: /home/ubuntu/project-5/data/images/temp/buck0845.jpg


In [9]:
# process image files in directory_path to detect corrupted or None types:

def Bad_Pics(path):

    bad_pics = []
    i = 0

    for fname in os.listdir(path):
        try:
            img = Image.open(path + fname) # open image file
            img.verify() # verify file is an image
            cv2.imread(fname)
        except:
            bad_pics.append(fname)
        i += 1
    print("total files:", i)
    print("bad files:", len(bad_pics))

    if len(bad_pics) != 0:
        user_answer = input('Remove bad files? [y] or [n]')

        if user_answer == 'y':
            ctr = 0
            for bp in bad_pics:
                try:
                    print("Deleting... {}".format(bp))
                    os.remove(path + bp)
                except:
                    if bp == '.ipynb_checkpoints':
                        shutil.rmtree(path + bp)     # remove checkpoint file if present
                ctr += 1
            print('Deleted {} files.'.format(ctr))
        else:
            print('No files deleted.')

In [14]:
# Group images by size to increase computational efficiency:

def Image_Optimizer(name_size):

    short_list = []
    files_investigate = {}
    counts = dict()

    # find unique image size values & store in a dictionary:

    size_list = [n[1] for m,n in enumerate(name_size)]  

    poss_dupes = set(size_list)     

    for m, n in enumerate(name_size):
        if n[1] in poss_dupes:           
            counts[n[1]] = counts.get(n[1], 0) + 1

    # remove items with values = 1 (no need to process images of unique sizes) to use as a checksum:

    short_list = [c for c in counts if counts[c] > 1]  

    # create a dictionary where keys = file size and values = lists of file names.
    # this reduces processing by ensuring images are only compared to others within groups of identical sizes

    for ns in name_size:
        if ns[1] in files_investigate:
             files_investigate[ns[1]].append(ns[0])
        else:
            files_investigate[ns[1]] = [ns[0]]

    # dictionary filters `files_investigate` for values > 1:

    filtered_dict = defaultdict(list)

    for k, v in files_investigate.items():
        if len(v) > 1:
            filtered_dict[k].append(v)

    if len(short_list) == len(filtered_dict):
        print('>> Created {} groups of images to compare\n'.format(len(short_list)))
        return(filtered_dict)
    else:
        print("Error detected.")

In [None]:
# check for redundant images in <path> directory:

def Check_Duplicates(path):
    
    duplicates_list, corrupted_list, size_list, name_size = [], [], [], []
    comp_dict = {}

    # create list of all files in directory & check for errors:
    
    img_list = [i for i in os.listdir(path)] 
    
    for i, j in enumerate(img_list):
        read = cv2.imread(path + j)
    
        # create list of tuples in filename, size format:

        try:
            temp = read.shape
            size_list = (j, temp)
            name_size.append(size_list)

        except:
            if j == '.ipynb_checkpoints':
                shutil.rmtree(path + j)     # remove jupyter labs checkpoint file if present
            else:
                print('Bad image found:', j)
    
    print('Original image list size:', len(name_size))
    
    # optimize processing by organizing images into groups of equal size:
    
    prepped_images = Image_Optimizer(name_size)   
    
    # conduct pairwise comparison of images sharing a key:
    
    for k, v in prepped_images.items():
        v = sum(v, [])                        # flatten values list
        img_combos = it.combinations(v, 2)    # create list of pairwise combinations from values

        print('Image combinations being processed:', len(list(img_combos)))

        for i, j in enumerate(img_combos):
            try:
                original = cv2.imread(path + j[0])
                duplicate = cv2.imread(path + j[1])

                if original.shape == duplicate.shape:    # double check that image dimensions equal 
                  
                # compute image differences and split by channel:
                
                    difference = cv2.subtract(original, duplicate)
                    b, g, r = cv2.split(difference)

                    if cv2.countNonZero(b) == 0 and cv2.countNonZero(g) == 0 and cv2.countNonZero(r) == 0:
                        print('Images are completely Equal:', j)
                        duplicates_list.append(j)    # append duplicate filenames to list
            
            except:
                print('Bad file(s) detected:', fname)
                corrupted_list.append(fname)
                
    print('Duplicates: ', len(duplicates_list))
    print('Corrupted: ', len(corrupted_list))
    
    if len(duplicates_list) > 0:
        
        d_list = [i[0] for i in duplicates_list]
        
        for dl in duplicates_list:
            if dl[0] in comp_dict:
                comp_dict[dl[0]].append(dl[1])
            else:
                comp_dict[dl[0]] = [dl[1]]
        
        # compile a list of values from comp_dict to delete:
        
        dump_list = []
        
        for cd, v in comp_dict.items():
            dump_list.extend(v)
        
        print('Dump List:',len(dump_list))
        
        # remove duplicate values and convert to ordered data structure
        
        dump_list = list(set(dump_list))    
        print(dump_list)
        
        # user confirmation to delete files:
        
        user_answer = input('Remove duplicate files? [y] or [n]')

        if user_answer == 'y':
            ctr = 0
            for d in dump_list:
                os.remove(path + d)
                ctr += 1
            print('Deleted {} files.'.format(ctr))

In [16]:
# Rename files to be recognized by Keras - be sure to include trailing / in path:

def Name_Class(path):
    
    backup_list = []

    print('Found {} files in directory.'.format(len(os.listdir(path))))
    prefix = input('Type class name:')

    dataset = path
    name = prefix

    for fname in os.listdir(dataset):
        backup_list.append(fname)
    
    i = 0
    for fname in os.listdir(dataset):
        if i < 10:
            dst = name + '000' + str(i) + '.jpg'
        elif (i >= 10 and i < 100):
            dst = name + '00' +  str(i) + '.jpg'
        else:
            dst = name + '0' + str(i) + '.jpg'
    
    # check to prevent overwriting existing files and if so, add a random letter suffix
    
        if dst in backup_list:
            dst = str(dst).split('.')
            suffix = random.choice(string.ascii_lowercase)
            dst = str(dst[0]) + str(suffix) + '.jpg'
            
        src = dataset + fname
        dst = dataset + dst
        os.rename(src, dst)
        
        i += 1
        if i % 50 == 0:
            print('{} files processed...'.format(i))
    
    print('Renamed {} files\n'.format(i))
    print('Example: {}'.format(dst))
    

In [7]:
def Image_Download_Process(path):
    
    # function to download images from Microsoft Cognitive Services API
    # code adapted from Adrian Rosebrock post: https://www.pyimagesearch.com/2018/04/09/how-to-quickly-build-a-deep-learning-image-dataset/
  
    API_KEY = "867-5309"
    MAX_RESULTS = int(input('Total images to download:'))
    GROUP_SIZE = 50
    image_offset = int(input('Specify offset:'))
    output_path = path
    URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"

    # build exceptions filter
  
    EXCEPTIONS = set([IOError, FileNotFoundError, exceptions.RequestException, exceptions.HTTPError,
                      exceptions.ConnectionError, exceptions.Timeout])

    # store the search term and set headers and search parameters:

    term = input('Search term:')
    headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
    params = {"q": term, "offset": image_offset, "count": GROUP_SIZE}

    # initiate search:
    
    print("[INFO] searching Bing API for '{}'".format(term))
    search = requests.get(URL, headers=headers, params=params)
    search.raise_for_status()

    # estimate results returned by the Bing API:
    
    results = search.json()
    estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
    print("[INFO] {} total results for '{}'".format(estNumResults,
        term))
    
    total = 0

    # loop over the estimated number of results in `GROUP_SIZE` groups
    
    for offset in range(0, estNumResults, GROUP_SIZE):
        # update the search parameters using the current offset, then
        # make the request to fetch the results
        print("[INFO] making request for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))
        params["offset"] = offset
        search = requests.get(URL, headers=headers, params=params)
        search.raise_for_status()
        results = search.json()
        print("[INFO] saving images for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))

    # loop over the results:
    
        for v in results["value"]:
            
            try:
                # make a request to download the image
                print("[INFO] fetching: {}".format(v["contentUrl"]))
                r = requests.get(v["contentUrl"], timeout=30)

                # build the path to the output image
                ext = v["contentUrl"][v["contentUrl"].rfind("."):]
                p = os.path.sep.join([output_path, "{}{}".format(
                    str(total).zfill(8), ext)])

                # write the image to disk
                f = open(p, "wb")
                f.write(r.content)
                f.close()

            # catch errors that may cause download to abort:
            
            except Exception as e:
                if type(e) in EXCEPTIONS:
                    print("[INFO] skipping: {}".format(v["contentUrl"]))
                    continue
            image = cv2.imread(p)
            
            # if the image is `None` then we remove it:
            
            if image is None:
                print("[INFO] deleting: {}".format(p))
                os.remove(p)
                continue
              
            total += 1
    print("\n Download Complete")
    
    # call function to check & delete downloaded images with errors:
    
    Bad_Pics(output_path)  
    
    # call function to compare downloaded images with those existing in the directory:
    
    Check_Duplicates(output_path)  
    
    # call function to rename new files in directory to prevent overwrites:
    
    Name_Class(output_path) 

Images to download: 1450
Specify offset: 50
Search term: whitetail buck


[INFO] searching Bing API for 'whitetail buck'
[INFO] 777 total results for 'whitetail buck'
[INFO] making request for group 0-50 of 777...
[INFO] saving images for group 0-50 of 777...
[INFO] fetching: http://api.ning.com/files/6M8H373xlgT0Rz6a4IFWuMl02bpeo4RJ4093TlKyOQxqa3DgIccuBQcObga9WIkb*arjz6*5pyaSJ7Fnhpu-nej59CxqtGyG/WhitetailBuckDeerWallpaperHD.jpg
[INFO] fetching: https://i1.wp.com/homesteadinghuntress.com/wp-content/uploads/2017/09/whitetail-deer-pinterest.jpg?resize=735%2C1102&ssl=1
[INFO] fetching: http://www.gameandfishmag.com/files/2013/11/1311-NAW_JohnFordWhitetail.jpg
[INFO] fetching: http://media.spokesman.com/photos/2013/11/17/jj_WT_buck_running.jpg
[INFO] fetching: http://wallpapercave.com/wp/dfMQZNf.jpg
[INFO] fetching: http://4.bp.blogspot.com/_SnuEJgrQl4g/S7A0E9ZePjI/AAAAAAAABoc/dgdu5sEKrQI/s1600/340+White-tailed+Deer+Copyright+John+Ford.jpg
[INFO] fetching: https://i.pinimg.com/736x/7e/84/24/7e842451515da1b531f1daf786a98e7f--whitetail-bucks-shotguns.jpg
[INFO] fe