# Google Images Web Scraper

Reference: <https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d>


In [36]:
#set up the driver

import os,time,io,hashlib,requests
from selenium import webdriver
from PIL import Image

# Put the path for your ChromeDriver here
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run anywhere else.
DRIVER_PATH = '/Users/sarawang/server/chromedriver'
# Opens a module-level browser session as soon as this cell runs. It is separate
# from the session search_and_download() creates for itself, and it stays open
# until wd.quit() is called in the last cell.
wd = webdriver.Chrome(executable_path=DRIVER_PATH) #should open up a 'ghost browser'

In [37]:
#Searching for a particular phrase & get the image links
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:float=1):
    """Collect image source URLs from a Google Images search by driving ``wd``.

    The function loads the image-search page for ``query``, scrolls to the
    bottom, clicks each thumbnail that has not been processed yet and records
    the ``src`` of every ``img.n3VNCb`` element whose value contains ``http``.
    When a batch of thumbnails is exhausted it clicks the ``.mye4qd`` element
    (if present) to request more results and keeps going.

    Args:
        query: Search phrase; it is URL-encoded before being put in the URL.
        max_links_to_fetch: Stop once at least this many distinct URLs are known.
        wd: An already-open Selenium driver (Selenium 3-style locator API, as
            used throughout this file).
        sleep_between_interactions: Seconds to pause after each scroll/click.

    Returns:
        A set of URL strings. It can hold fewer than ``max_links_to_fetch``
        entries when the page stops yielding new thumbnails, and slightly more
        because one click may expose several images at once.
    """
    # Local import keeps the block self-contained without touching file-level imports.
    from urllib.parse import quote_plus

    # Give up after this many consecutive rounds without any new thumbnail,
    # otherwise the loop would spin forever once the results run out.
    max_stalled_rounds = 3

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query (encoded so spaces, '&', '#', ... cannot break the URL)
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_rounds = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        # NOTE(review): the CSS class names look site-generated; expect to update them.
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        if number_results <= results_start:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f"Found: {len(image_urls)} image links, no more results available.")
                break
        else:
            stalled_rounds = 0
            print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: a thumbnail that cannot be clicked is skipped
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Batch exhausted without reaching the target: ask the page for more.
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements returns an empty list (instead of raising) when the
            # button is absent, so the check below is meaningful.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls

#Downloading the images
def persist_image(folder_path:str,url:str,timeout:float=30):
    """Download the image at ``url`` and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the downloaded
    bytes plus ``.jpg``, so identical payloads map to the same file. The image
    is converted to RGB and written with JPEG quality 85.

    Failures are reported with a printed ``ERROR`` line and never raised, so one
    bad URL does not abort a batch.

    Args:
        folder_path: Existing directory to write into.
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses as download failures instead of handing
        # an error page to the image decoder.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

#putting it all together
def search_and_download(search_term:str,driver_path:str,number_images:int,target_path='./scraped_images'):
    """Search for ``search_term`` and save the images that are found.

    A sub-folder of ``target_path`` named after the lower-cased search term
    (spaces replaced by underscores) is created if needed. A Chrome driver is
    opened from ``driver_path`` only for the duration of the URL collection and
    is closed before the downloads start.

    Args:
        search_term: Phrase to search for.
        driver_path: Location of the ChromeDriver executable.
        number_images: How many image URLs to ask fetch_image_urls() for.
        target_path: Root directory that receives the per-term sub-folder.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # fetch_image_urls may hand back None; treat that as "no URLs" instead of
    # failing with a TypeError when iterating.
    for elem in res or []:
        persist_image(target_folder,elem)

In [38]:
# Run the search and download the results.
# search_and_download() writes into ./scraped_images/<search term with underscores>,
# resolved against the notebook's working directory. On the author's machine that is
# /Users/sarawang/Documents/Cornell Tech/Fall 2020/Applied Machine Learning/Project/scraped_images

# Search terms: 'fridge leftovers', 'food storage containers fridge'

term='food storage containers fridge'
n_images = 150

# Echo the term so the captured output records which search produced it.
print(term)
search_and_download(search_term=term,driver_path=DRIVER_PATH,number_images=n_images) 

food storage containers fridge
Found: 100 search results. Extracting links from 0:100
Found: 150 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcoSCO-nw9z9tGx5DjoO8AcctdscQWOrhkSA&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/e35dad0de7.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTVMd4sWZZrir25eISkiaugfmIDMDsKELMPzw&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/fc6359dac8.jpg
ERROR - Could not save https://www.thehomeedit.com/wp-content/uploads/2020/05/04125C-04129C-04128C-04232C-04233C-04234C-04236C-04237C-ENV-1-3-scaled.jpg - cannot identify image file <_io.BytesIO object at 0x7f8b28898e50>
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRIX7XpXWWdySsugu79X8p793EYV1fkdoJjcw&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/539ff521ae.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSMBzz7x6KL4MqUiD5n6RWzzR0rJf11HUcMaA&usqp

SUCCESS - saved https://www.tupperware.com/media/catalog/product/cache/9acef7d1ef9b6ecea35dddea8ea8fdff/s/t/stackables_enviro_1000x960.jpg - as ./scraped_images/food_storage_containers_fridge/9d2a3329f9.jpg
SUCCESS - saved https://ae01.alicdn.com/kf/HTB1jLfKOVXXXXbzXFXXq6xXFXXXK/Reusable-Silicone-Vacuum-Food-Sealer-Bags-Wraps-Fridge-Food-Storage-Containers-Refrigerator-Bag-Kitchen-Colored-Ziplock.jpg - as ./scraped_images/food_storage_containers_fridge/98ccbfc1e5.jpg
SUCCESS - saved https://m.media-amazon.com/images/I/51ZxUuflQXL.jpg - as ./scraped_images/food_storage_containers_fridge/de8e1ecdbe.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTX5b4nV2CqwSTvUrCqSD8JOrzZNv9Z7cz5Xg&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/999d3f1379.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSsw5iH4GN8TbGmRbt5s78kSkNSuv7H0ZtUwQ&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/0a7d3b58c0.jpg
SUCCESS - saved https://

SUCCESS - saved https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/fridge-png-1560878124.png?resize=480:* - as ./scraped_images/food_storage_containers_fridge/f246f3cc62.jpg
SUCCESS - saved https://i.insider.com/59496033e592ed34008b59b5?width=592&format=jpeg - as ./scraped_images/food_storage_containers_fridge/a2ee20617c.jpg
SUCCESS - saved https://cdn.vox-cdn.com/thumbor/Lgu0RQ5zJerwCcV_lurmB0jqlfs=/1400x1400/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/19996923/shutterstock_710030311.jpg - as ./scraped_images/food_storage_containers_fridge/e7f0974b50.jpg
SUCCESS - saved https://image.made-in-china.com/202f0j00BqVRcodlGwkN/Food-Storage-Container-Fridge-Organizer-Case-with-Removable-Drain-Tray-to-Keep-Fresh-for-Produce-Fruits-Vegetables-Meat-and-Fish.jpg - as ./scraped_images/food_storage_containers_fridge/c7615a88ef.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQPF8sefnpLTVnQLc3GmVip2eLfMU8-k2HsmA&usqp=CAU - as ./scraped_images/f

SUCCESS - saved https://ae01.alicdn.com/kf/HTB19XVNVcbpK1RjSZFyq6x_qFXap/17PCS-Plastic-Storage-Bins-Refrigerator-Storage-Box-Food-Storage-Containers-with-Lid-Kitchen-Fridge-Cabinet-Freezer.jpg - as ./scraped_images/food_storage_containers_fridge/d5c5af6014.jpg
SUCCESS - saved https://i.insider.com/51250df66bb3f74e58000007?width=600&format=jpeg&auto=webp - as ./scraped_images/food_storage_containers_fridge/88a5a5842b.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/81SrvCybN5L._AC_SL1327_.jpg - as ./scraped_images/food_storage_containers_fridge/71c708ef49.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToivEKvSkPcB5dlIKiA3ghvj6zN1jniE6hew&usqp=CAU - as ./scraped_images/food_storage_containers_fridge/611b77ecf8.jpg
SUCCESS - saved https://hellonutritarian.com/wp-content/uploads/2016/01/3B-Refrigerator-Organization-Healthy-Vegan-Meal-Prep-Food-Storage-Containers-Marie-Kondo-Kitchen-organization-Batch-food-prep-Hello-Nutritarian.jpg - as ./scra

KeyboardInterrupt: 

In [None]:
# Closes the module-level `wd` session opened in the setup cell; the session used
# inside search_and_download() is managed by its own `with` block.
wd.quit() #close the driver down