<a href="https://colab.research.google.com/github/sandonli/Gecko-Binary-Classifier/blob/main/GeckoDatasetRetrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports:

In [None]:
import requests, re, time
import torch, torchvision
from torch import nn, optim
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import argparse, imghdr, pickle, posixpath, re
import signal, socket, threading
import urllib.parse, urllib.request
import datetime, os, sys, logging, hashlib
from pathlib import Path
from os import listdir
from os.path import isfile, join

%mkdir -p data/gold_dust_day_gecko
%mkdir -p data/giant_day_gecko

The following code comes from https://github.com/ostrolucky/Bulk-Bing-Image-downloader, and was used to gather the dataset. It has been adapted for the purposes of this project.

In [None]:
# Destination folder for the downloaded images. To fetch the other dataset,
# comment out the active assignment and uncomment the one right below it.
output_dir = '/content/data/giant_day_gecko'
#output_dir = '/content/data/gold_dust_day_gecko'

# Headers attached to every HTTP request issued by the downloader.
urlopenheader = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) '
        'Gecko/20100101 Firefox/60.0'
    ),
}

# State shared between the search loop and the download threads.
tried_urls = []   # URLs whose image was saved; also compared against the limit
image_md5s = {}   # MD5 hex digest of a saved image -> filename it was saved as
in_progress = 0   # number of download() calls currently holding a pool slot

# Keep the adult filter enabled unless explicitly overridden.
adult_filter = True

# Default timeout, in seconds, for sockets created from here on.
socket.setdefaulttimeout(2)

def download(pool_sema: threading.Semaphore, img_sema: threading.Semaphore, url: str, output_dir: str, limit: int):
    """Download one image URL into *output_dir*, skipping invalid or duplicate images.

    Intended to run in its own thread. The function uses the module-level
    ``tried_urls`` list, ``image_md5s`` dict, ``in_progress`` counter and
    ``urlopenheader`` request headers.

    Args:
        pool_sema: Semaphore limiting how many downloads run at once; held
            for the whole duration of the download.
        img_sema: Semaphore serialising the limit check, the file write and
            the update of ``tried_urls``.
        url: Address of the image to fetch.
        output_dir: Directory the image is written to.
        limit: Stop saving once ``len(tried_urls)`` reaches this number;
            ``None`` disables the check.

    Any exception raised while fetching or saving is reported as a
    ``FAIL:`` line instead of propagating, so a bad URL never kills the
    thread without releasing its semaphore slot.
    """
    global in_progress

    if url in tried_urls:
        print('SKIP: Already checked url, skipping')
        return
    pool_sema.acquire()
    in_progress += 1
    acquired_img_sema = False
    # Fallback label for the FAIL message in case the URL cannot be parsed.
    filename = url

    try:
        # Parsing happens inside the try block so that a malformed URL still
        # reaches the ``finally`` clause and releases the pool slot.
        path = urllib.parse.urlsplit(url).path
        filename = posixpath.basename(path).split('?')[0]  # Strip GET parameters from filename
        name, ext = os.path.splitext(filename)
        name = name[:36].strip()
        filename = name + ext

        request = urllib.request.Request(url, None, urlopenheader)
        with urllib.request.urlopen(request) as response:
            image = response.read()
        # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
        if not imghdr.what(None, image):
            print('SKIP: Invalid image, not saving ' + filename)
            return

        md5_key = hashlib.md5(image).hexdigest()
        if md5_key in image_md5s:
            print('SKIP: Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
            return

        # Pick a filename that is not taken yet; an existing file with the
        # same content means the image was already downloaded earlier.
        i = 0
        while os.path.exists(os.path.join(output_dir, filename)):
            with open(os.path.join(output_dir, filename), 'rb') as existing:
                existing_md5 = hashlib.md5(existing.read()).hexdigest()
            if existing_md5 == md5_key:
                print('SKIP: Already downloaded ' + filename + ', not saving')
                return
            i += 1
            filename = "%s-%d%s" % (name, i, ext)

        image_md5s[md5_key] = filename

        img_sema.acquire()
        acquired_img_sema = True
        if limit is not None and len(tried_urls) >= limit:
            return

        with open(os.path.join(output_dir, filename), 'wb') as imagefile:
            imagefile.write(image)
        print(" OK : " + filename)
        tried_urls.append(url)
    except Exception:
        print("FAIL: " + filename)
    finally:
        pool_sema.release()
        if acquired_img_sema:
            img_sema.release()
        in_progress -= 1

def fetch_images_from_keyword(pool_sema: threading.Semaphore, img_sema: threading.Semaphore, keyword: str,
                              output_dir: str, filters: str, limit: int):
    """Page through Bing image search results for *keyword* and download them.

    Each result page is requested from Bing's async image endpoint, the
    image URLs are extracted with a regular expression, and one thread
    running ``download`` is started per URL. The function relies on the
    module-level ``in_progress``, ``tried_urls``, ``urlopenheader`` and
    ``adlt`` names.

    Args:
        pool_sema: Passed through to ``download``.
        img_sema: Passed through to ``download``.
        keyword: Search string.
        output_dir: Directory the images are written to.
        filters: Value for the ``qft`` query parameter; ``None`` sends it empty.
        limit: Stop starting new downloads once ``len(tried_urls)`` reaches
            this number; ``None`` disables the check.

    Returns when a page yields no links, when a page ends with the same
    link as the previous page, or when the limit is reached.
    """
    current = 0
    last = ''
    while True:
        time.sleep(0.1)

        # Hold off on requesting another page while many downloads are running.
        if in_progress > 10:
            continue

        request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword) + '&first=' + str(
            current) + '&count=35&adlt=' + adlt + '&qft=' + ('' if filters is None else filters)
        request = urllib.request.Request(request_url, None, headers=urlopenheader)
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf8')
        links = re.findall('murl&quot;:&quot;(.*?)&quot;', html)

        if not links:
            print('FAIL: No search results for "{0}"'.format(keyword))
            return
        # A page ending with the same link as the previous one means the
        # results are repeating, so there is nothing new to fetch.
        if links[-1] == last:
            return
        for link in links:
            if limit is not None and len(tried_urls) >= limit:
                return
            t = threading.Thread(target=download, args=(pool_sema, img_sema, link, output_dir, limit))
            t.start()
            current += 1
        last = links[-1]

def backup_history(*args):
    """Pickle the download history into ``download_history.pickle`` in ``output_dir``.

    Two objects are dumped back to back into the same file: first the list
    of tried URLs, then the MD5-to-filename dictionary.

    Args:
        *args: Ignored except for their presence. When any positional
            argument is given (as happens when this function is invoked as
            a signal handler), the process exits with status 0 after the
            history has been written.
    """
    # Snapshot both containers first: download threads may still be adding
    # entries, and a dict must not change size while it is being pickled.
    copied_tried_urls = list(tried_urls)
    copied_image_md5s = dict(image_md5s)
    with open(os.path.join(output_dir, 'download_history.pickle'), 'wb') as download_history:
        pickle.dump(copied_tried_urls, download_history)
        pickle.dump(copied_image_md5s, download_history)
    print('history_dumped')
    if args:
        exit(0)

if __name__ == "__main__":
    # Command-line interface of the bulk downloader.
    # NOTE(review): inside a Jupyter/Colab kernel sys.argv usually carries
    # "-f <connection file>", which this parser would read as --search-file.
    # It is harmless while args.search_string is set below - confirm.
    parser = argparse.ArgumentParser(description='Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help='Keyword to search', required=False)
    parser.add_argument('-f', '--search-file', help='Path to a file containing search strings line by line',
                        required=False)
    parser.add_argument('-o', '--output', help='Output directory', required=False)
    parser.add_argument('--adult-filter-on', help='Enable adult filter', action='store_true', required=False)
    parser.add_argument('--adult-filter-off', help='Disable adult filter', action='store_true', required=False)
    parser.add_argument('--filters',
                        help='Any query based filters you want to append when searching for images, e.g. +filterui:license-L1',
                        required=False)
    parser.add_argument('--limit', help='Make sure not to search for more than specified amount of images.',
                        required=False, type=int)
    parser.add_argument('--threads', help='Number of threads', type=int, default=20)
    args = parser.parse_args()

    # Project-specific overrides of the parsed arguments.
    args.limit = 700
    args.search_string = "Giant Day Gecko" # When downloading other dataset, comment this line and uncomment the line below
    #args.search_string = "Gold Dust Day Gecko"

    if (not args.search_string) and (not args.search_file):
        parser.error('Provide Either search string or path to file containing search strings')
    if args.output:
        output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_dir_origin = output_dir

    # Save the history (and exit) when the run is interrupted with Ctrl+C.
    signal.signal(signal.SIGINT, backup_history)

    # Resume from a previous run if a history file exists.
    # NOTE: pickle.load can execute arbitrary code; only load history files
    # written by this script.
    try:
        with open(os.path.join(output_dir, 'download_history.pickle'), 'rb') as download_history:
            tried_urls = pickle.load(download_history)
            image_md5s = pickle.load(download_history)
    except (OSError, IOError):
        tried_urls = []

    # Value for the "adlt" query parameter: the module-level default first,
    # then the command-line switches, which take precedence.
    adlt = '' if adult_filter else 'off'
    if args.adult_filter_off:
        adlt = 'off'
    elif args.adult_filter_on:
        adlt = ''

    pool_sema = threading.BoundedSemaphore(args.threads)
    img_sema = threading.Semaphore()
    if args.search_string:
        fetch_images_from_keyword(pool_sema, img_sema, args.search_string, output_dir, args.filters, args.limit)
    elif args.search_file:
        try:
            inputFile = open(args.search_file)
        except (OSError, IOError):
            print("FAIL: Couldn't open file {}".format(args.search_file))
            exit(1)
        with inputFile:
            for keyword in inputFile:
                # Drop the trailing newline and ignore blank lines.
                keyword = keyword.strip()
                if not keyword:
                    continue
                output_sub_dir = os.path.join(output_dir_origin, keyword.replace(' ', '_'))
                if not os.path.exists(output_sub_dir):
                    os.makedirs(output_sub_dir)
                # img_sema was missing from this call, which made every
                # search-file run fail with a TypeError.
                fetch_images_from_keyword(pool_sema, img_sema, keyword, output_sub_dir, args.filters, args.limit)
                backup_history()
                time.sleep(10)

I chose to download 700 images for each class as a buffer against duplicates, bad images, and the image scraper not downloading all the way up to the limit. The following code comes from https://github.com/KiranKumarChilla/Removing-Duplicate-Docs-Using-Hashing-in-Python and was used to remove any exact duplicates in the dataset. The code was modified for the purposes of this project.

In [None]:
# Folder whose exact-duplicate images will be removed. To de-duplicate the
# other dataset, comment out the active assignment and uncomment the one
# right below it.
input_files_path = r'/content/data/giant_day_gecko'
#input_files_path = r'/content/data/gold_dust_day_gecko'

# Full paths of the regular files located directly inside input_files_path.
input_files = [
    os.path.join(input_files_path, entry)
    for entry in listdir(input_files_path)
    if isfile(join(input_files_path, entry))
]

# Both are filled in by rmv_dup_process and map a file hash to a file path.
inp_dups = dict()
unique_inps = dict()

# Calculates the MD5 hash value of a file; decrease the block size if the input files are large
def calculate_hash_val(path, blocksize=65536):
    """Return the MD5 hex digest of the file at *path*.

    Args:
        path: Path of the file to hash.
        blocksize: Number of bytes read per chunk, so the whole file never
            has to be held in memory. A value that is ``None``, zero or
            negative reads the file in a single call.

    Returns:
        The digest as a string of hexadecimal digits.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = blocksize if blocksize and blocksize > 0 else -1
    hasher = hashlib.md5()
    with open(path, 'rb') as afile:
        # read() returns b'' at end of file, which stops the iteration.
        for buf in iter(lambda: afile.read(chunk_size), b''):
            hasher.update(buf)
    return hasher.hexdigest()

# Copies the entries of dict1 into dic_unique, skipping keys already present in dict2 or dic_unique
def find_dups(dic_unique, dict1, dict2=None):
    """Copy entries of *dict1* into *dic_unique* unless the key is excluded.

    A key is skipped when it is already present in *dic_unique* or when it
    appears in *dict2*.

    Args:
        dic_unique: Dictionary updated in place.
        dict1: Source of the candidate entries.
        dict2: Optional container of keys to leave out; omitting it (or
            passing ``None``) excludes nothing.
    """
    # A None default avoids sharing one mutable dict object across calls.
    excluded = {} if dict2 is None else dict2
    for key, value in dict1.items():
        if key not in excluded and key not in dic_unique:
            dic_unique[key] = value

# Identifying unique files
def find_unique_files(dic_unique, dict1):
    """Add every entry of *dict1* whose key is new to *dic_unique*.

    Entries already present in *dic_unique* keep their current value.
    The dictionary is updated in place and nothing is returned.
    """
    for digest, file_path in dict1.items():
        dic_unique.setdefault(digest, file_path)

def remove_duplicate_files(all_inps, unique_inps):
    """Delete every file that is not the recorded copy for its hash.

    Args:
        all_inps: Mapping of file path -> hash of that file.
        unique_inps: Mapping of hash -> path of the copy to keep.

    A file survives only when its hash is a key of *unique_inps* and the
    path stored there is the file's own path; every other file in
    *all_inps* is removed from disk.
    """
    for file_name, digest in all_inps.items():
        is_kept_copy = digest in unique_inps and unique_inps[digest] == file_name
        if not is_kept_copy:
            os.remove(file_name)

# Main function of this cell: it calls the other functions to hash the input files and remove the duplicates
def rmv_dup_process(input_files):
    """Hash every file in *input_files* and delete the exact duplicates.

    Each path is hashed with ``calculate_hash_val``. The module-level
    ``inp_dups`` dictionary records hash -> path, so a later file with the
    same hash replaces an earlier one there. The hashes are then merged
    into the module-level ``unique_inps`` and ``remove_duplicate_files``
    deletes every file that is not the recorded copy for its hash.

    Args:
        input_files: Iterable of file paths to process.

    If a path does not exist, a message is printed and ``sys.exit()`` is
    called before anything is deleted.
    """
    hash_by_path = {}

    for candidate in input_files:
        if not Path(candidate).exists():
            print('%s is not a valid path, please verify' % candidate)
            sys.exit()
        digest = calculate_hash_val(candidate)
        inp_dups[digest] = candidate
        hash_by_path[candidate] = digest

    find_unique_files(unique_inps, inp_dups)
    remove_duplicate_files(hash_by_path, unique_inps)

# Run the de-duplication over the paths collected in input_files, but only
# when this code is executed directly rather than imported.
if __name__ == '__main__':
    rmv_dup_process(input_files)

The next two cells were used to check the size of each dataset and to mount Google Drive.

In [None]:
# Dataset folders whose contents are counted below.
img_folder_path = '/content/data/giant_day_gecko'
img_folder_path2 = '/content/data/gold_dust_day_gecko'

# Every directory entry is counted, whatever kind of entry it is.
dirListing = os.listdir(img_folder_path)
dirListing2 = os.listdir(img_folder_path2)

for label, listing in (('Giant Day Gecko Images: ', dirListing),
                       ('Gold Dust Day Gecko Images: ', dirListing2)):
    print(label + str(len(listing)))

In [None]:
 # Mount Google Drive at /content/gdrive (only available inside Google Colab).
 from google.colab import drive
 drive.mount('/content/gdrive')

Mounted at /content/gdrive
