# Search Google Images

Is it possible to scrape the web, either using a google API or a more hacky solution, to automatically get images of a certain thing from the web? e.g. 10000 images of a dog?

The way in which we do this is as follows:
    - Use the `requests` library to query an image search service for something, e.g. cats. (The code below uses the Bing Image Search API rather than scraping Google Images.)
    - Parse the response for image URLs.
    - Then we get the images (also with `requests`) by looping through the URLs.

In [None]:
import requests
from IPython.display import HTML

from PIL import Image
from io import BytesIO

import os

In [None]:
class ImageIterator:
    """
    Iterator over the URLs of images matching a search term, up to a
    maximum number.

    URLs are fetched lazily from the Bing image search endpoint in pages of
    at most ``MAX_IMAGES``; a new page is only requested once the current
    one has been consumed.

    e.g.
    ```
    for url in ImageIterator("cat", 64):
        print(url)
    ```
    """

    # These are class variables rather than belonging to an object, as they are
    # always the same.
    end_point = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
    # NOTE(review): this subscription key is committed in source, so anyone who
    # can read the file can use it. Consider loading it from configuration or
    # the environment instead, and rotating this value.
    key = "b65626c787084c74955266b935f8d1ff"
    # This is the maximum number of images that bing lets us fetch at one
    # go.
    MAX_IMAGES = 150
    # Seconds to wait for the search API before giving up, so that a stalled
    # connection cannot hang the iteration forever.
    TIMEOUT = 10.

    def __init__(self, search_term, max_values):
        """
        Input:
        search_term: str of what to search for, e.g. "cat"
        max_values: int
            Stop iterating after this many values, may stop
            before this if fewer than this are returned by
            bing.
        """

        self.search_term = search_term
        self.max_values = max_values

    def _search(self, total, skip):
        """
        Request one page of results and return an iterator over its URLs.

        Input:
        total: int, number of results to ask for in this request.
        skip: int, number of results to skip (the pagination offset).

        Side effects: stores the reported number of matches in
        ``self.total_images`` and lowers ``self.max_values`` to it if fewer
        matches exist than were asked for.

        Raises whatever ``requests`` raises for connection problems,
        timeouts and non-2xx responses.
        """
        headers = {"Ocp-Apim-Subscription-Key": self.key}
        params = {"q": self.search_term, "count": total, "offset": skip}
        response = requests.get(
            self.end_point,
            headers=headers,
            params=params,
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        search_results = response.json()
        self.total_images = search_results['totalEstimatedMatches']
        self.max_values = min(self.max_values, self.total_images)
        return (el['contentUrl'] for el in search_results['value'])

    def __iter__(self):
        """
        Reset the counters, fetch the first page and return ``self``.
        """
        self.page_number = 0
        self.images_returned = 0
        self._exhausted = False
        if self.max_values <= 0:
            # Nothing was asked for: do not spend an API call on it.
            self.search_results = iter(())
            self._exhausted = True
        else:
            first_page = min(self.MAX_IMAGES, self.max_values)
            self.search_results = self._search(first_page, 0)
        return self

    def __next__(self):
        """
        Return the next image URL, fetching another page when needed.

        Raises StopIteration once ``max_values`` URLs have been returned or
        the API has no more results to give.
        """
        if self._exhausted or self.images_returned >= self.max_values:
            raise StopIteration

        try:
            url = next(self.search_results)
        # This means that the current page has finished, but the user still
        # wants more, which means we have to call the API again with some
        # pagination.
        except StopIteration:
            # images_returned counts only URLs already handed out, so it is
            # exactly the offset of the first result we have not seen yet.
            remaining_images = self.max_values - self.images_returned
            next_return = min(self.MAX_IMAGES, remaining_images)
            self.search_results = self._search(next_return, self.images_returned)
            try:
                url = next(self.search_results)
            except StopIteration:
                # The API returned an empty page: remember that, so later
                # calls keep raising instead of querying the API again.
                self._exhausted = True
                raise

        self.images_returned += 1
        return url


In [None]:
def process_image(url, directory=None, name=None, size=400):
    """
    Gets the image from a url, makes it square and turns it black
    and white, then either returns it or saves it to a directory.

    Input:
    url: str, address of the image to download.
    directory: str or None, directory to save into. When None the
        file is written relative to the current working directory.
    name: str or None, file name to save under. When None nothing is
        saved and the processed image is returned instead.
    size: int, width and height in pixels of the output image. The
        image is stretched to size x size, the aspect ratio is not kept.

    Returns the processed PIL image when ``name`` is None, otherwise None.
    On any failure (download, decoding, saving) a message is printed and
    None is returned, so one bad url does not abort a whole batch.
    """

    try:
        response = requests.get(url, timeout=1.)
        # Do not try to decode the body of an error response as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img = img.resize((size, size)).convert('L')
        if name is None:
            return img
        if directory is None:
            file_name = name
        else:
            file_name = os.path.join(directory, name)
        img.save(file_name)
    # Deliberately broad, this is best effort, but unlike a bare ``except``
    # it lets KeyboardInterrupt / SystemExit through so a long download
    # loop can still be interrupted.
    except Exception:
        print("Can't do {}".format(url))

In [None]:
# Root directory for the downloaded images; do_for_term saves each search
# term's images in its own sub-directory of this path.
base_dir = "../data/convnet"

In [None]:
def do_for_term(term, total=1000):
    """
    Run the whole thing for a search term.

    Searches for ``term``, then downloads each result, converts it to a
    128x128 greyscale image and saves it as
    ``<base_dir>/<safe_term>/<safe_term>_<i>.jpg``, where ``safe_term`` is
    ``term`` with spaces replaced by underscores. The index of each image
    is printed as a progress indicator.

    Input:
    term: str of what to search for, e.g. "cat"
    total: int, maximum number of images to fetch.
    """

    safe_term = term.replace(" ", "_")
    target_dir = os.path.join(base_dir, safe_term)

    # makedirs also creates base_dir itself when it is missing, and
    # exist_ok makes re-running for the same term harmless.
    os.makedirs(target_dir, exist_ok=True)

    for i, url in enumerate(ImageIterator(term, total)):
        print(i)
        name = "{}_{}.jpg".format(safe_term, i)
        process_image(url, size=128, directory=target_dir, name=name)

In [None]:
# Fetch images for the search term "cat", using do_for_term's default total.
do_for_term("cat")

In [None]:
# Fetch images for the search term "dog", using do_for_term's default total.
do_for_term("dog")

In [None]:
# Fetch images for the search term "monkey", using do_for_term's default total.
do_for_term("monkey")

In [None]:
# Fetch images for "donald trump"; do_for_term replaces the space with an
# underscore when building the directory and file names.
do_for_term("donald trump")

In [None]:
# Fetch images for the search term "donkey", using do_for_term's default total.
do_for_term("donkey")