## Crawling random images

In [1]:
from __future__ import print_function
from math import ceil
import sys
import os
import json
from cffi.api import basestring
import urllib.request
from PIL import Image
import numpy as np
import imageio

import random


## ImagesDownloader: take a list of links, download the images and store them in a folder

In [2]:
def check_folder_existance(folderpath, throw_error_if_no_folder=False, display_msg=True):
    """Make sure that a folder exists.

    Nothing happens when ``folderpath`` already exists. When it is missing,
    the folder is either created or reported as an error, depending on
    ``throw_error_if_no_folder``.

    Args:
        folderpath: path of the folder to check.
        throw_error_if_no_folder: if True and the folder is missing, print an
            error message and stop the program instead of creating the folder.
        display_msg: if True, print a notice when a missing folder is created.

    Raises:
        SystemExit: with exit status 1 when the folder is missing and
            ``throw_error_if_no_folder`` is True.
    """
    if os.path.exists(folderpath):
        return

    if throw_error_if_no_folder:
        print("Error: folder '" + folderpath + "' does not exist")
        # sys.exit() rather than the interactive-only exit() helper; the
        # non-zero status tells the calling process that this is a failure.
        sys.exit(1)

    # exist_ok=True: the folder may have been created by someone else between
    # the existence check above and this call.
    os.makedirs(folderpath, exist_ok=True)
    if display_msg:
        print("Target folder '", folderpath, "' does not exist...")
        print(" >> Folder created")


class ImagesDownloader(object):
    """Download a list of images and save them to the specified folder.

    Every image is stored under the last path component of its URL. Links
    that could not be downloaded are collected in ``failed_links`` and written
    to ``failed_list.txt`` inside the target folder.
    """

    # Class-level defaults are kept for backward compatibility only; each
    # instance gets its own lists in __init__ so state is never shared
    # between downloaders.
    images_links = []
    failed_links = []
    default_target_folder = 'images'

    def __init__(self):
        self.images_links = []
        self.failed_links = []
        print("Preparing to download images...")

    def download(self, links, target_folder='./data'):
        """Download images from a list of links.

        Args:
            links: non-empty sequence of image URLs (strings).
            target_folder: folder receiving the files; it is created when
                missing. A single trailing '/' is ignored.

        Raises:
            SystemExit: with exit status 1 when ``links`` is empty.
        """

        # check links and folder:
        if not links:
            print("Error: Empty list, no links provided")
            sys.exit(1)
        self.images_links = links
        # start every run with a clean slate so the summary and
        # failed_list.txt only describe this call
        self.failed_links = []
        check_folder_existance(target_folder)
        if target_folder.endswith('/'):
            target_folder = target_folder[:-1]

        # start downloading:
        print("Downloading files...")
        progress = 0
        images_nbr = len(self.images_links)
        for link in self.images_links:
            # links read back from a text file may still carry a newline
            link = link.strip()
            target_file = target_folder + '/' + link.split('/')[-1]
            try:
                # urlretrieve replaces the legacy URLopener class, which is
                # deprecated and no longer available in recent Python versions
                urllib.request.urlretrieve(link, target_file)

                #Resize the image and save
                #image = imageio.imread(target_file)
                #image_resized = np.array(Image.fromarray(image).resize( (128, 128)))
                #imageio.imwrite(target_file, image_resized)

            except (OSError, ValueError):
                # OSError covers network/HTTP/file errors (URLError derives
                # from it); ValueError is raised for malformed URLs. Either
                # way the download is best-effort: record and carry on.
                self.failed_links.append(link)
            progress = progress + 1
            print("\r >> Download progress: ", (progress * 100 / images_nbr), "%...", end="")
            sys.stdout.flush()

        print("\r >> Download progress: ", (progress * 100 / images_nbr), "%")
        print(" >> ", (progress - len(self.failed_links)), " images downloaded")

        # save failed links:
        if self.failed_links:
            # the context manager guarantees the file is flushed and closed
            with open(target_folder + "/failed_list.txt", 'w') as failed_file:
                for link in self.failed_links:
                    failed_file.write(link + "\n")
            print(" >> Failed to download ", len(self.failed_links),
                  " images: access not granted ",
                  "(links saved to: '", target_folder, "/failed_list.txt')")

## WebCrawler: fetch image links from various search engines and download the images

In [6]:
class WebCrawler(object):
    """Fetch image links from various search engines and download them.

    ``api_keys`` maps an engine name to its ``(key, secret)`` pair. For an
    engine called ``name`` the crawler calls the method ``fetch_from_<name>``
    with the key, the secret and the number of links requested.
    """

    # Class-level defaults are kept for backward compatibility only; each
    # instance gets its own values in __init__.
    api_keys = []
    images_links = []
    keywords = ''

    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.images_links = []

    @staticmethod
    def error(msg):
        """Display an error message and exit with a non-zero status."""
        print("Error: ", msg)
        # sys.exit() rather than the interactive-only exit() helper
        sys.exit(1)

    def collect_links_from_web(self, number_links_per_engine, remove_duplicated_links=False):
        """Collect image links from every configured engine.

        The result is stored in ``self.images_links``.

        Args:
            number_links_per_engine: number of links requested from each
                engine; non-positive values fall back to 100.
            remove_duplicated_links: if True, drop repeated links while
                keeping the first occurrence of each.

        Raises:
            SystemExit: when an engine has no matching ``fetch_from_<engine>``
                method.
        """
        # validate params:
        number_links = int(number_links_per_engine)
        if number_links <= 0:
            print("Warning: number_links_per_engine must be positive, value changed to default (100 links)")
            number_links = 100

        # call methods for fetching image links in the selected search engines:
        print("Start fetching...")
        extracted_links = []
        for engine, keys in self.api_keys.items():
            method = getattr(self, 'fetch_from_' + engine, None)
            if method is None:
                self.error('funciton fetch_from_' + engine + '() not defined')
            temporary_links = method(keys[0], keys[1], number_links)
            extracted_links += temporary_links
            print("\r >> ", len(temporary_links), " links extracted", end="\n")

        # remove duplicated links:
        if remove_duplicated_links:
            links_count = len(extracted_links)
            # dict.fromkeys keeps the first occurrence of each link and, unlike
            # set(), does not shuffle the order of the result
            extracted_links = list(dict.fromkeys(extracted_links))
            print(" >> ", links_count - len(extracted_links),
                " duplicated links removed, ", len(extracted_links), " kept")

        # store the links into the global list:
        self.images_links = extracted_links

    @staticmethod
    def _photo_links(response):
        """Build the static image URL of every 'photo' element of a response.

        NOTE(review): the response is presumably an ElementTree-like element;
        only ``iter()``, ``tag`` and ``get()`` are used here.
        """
        return [
            "https://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg".format(
                photo.get('farm'), photo.get('server'), photo.get('id'), photo.get('secret'))
            for photo in response.iter() if photo.tag == 'photo'
        ]

    def fetch_from_flickr(self, api_key, api_secret, number_links=50):
        """Fetch links to random images from Flickr.

        A random start page is drawn, then as many consecutive result pages
        as needed are requested (at most 30 items per page).

        Args:
            api_key: Flickr API key.
            api_secret: Flickr API secret.
            number_links: maximum number of links to return.

        Returns:
            A list of at most ``number_links`` image URLs; empty when
            ``number_links`` is not positive.
        """
        number_links = int(number_links)
        if number_links <= 0:
            # nothing requested; also avoids dividing by zero below
            return []

        from flickrapi import FlickrAPI

        random.seed()
        rand_page = random.randint(1, 50)   # first result page to read

        # calculate number of pages:
        items_per_page = min(number_links, 30)   # max 200 for flikr
        pages_nbr = int(ceil(number_links / items_per_page))
        links = []

        print("Carwling Flickr Search...")
        flickr = FlickrAPI(api_key, api_secret)
        for offset in range(pages_nbr):
            # advance one page per iteration so the same page is not
            # requested over and over
            response = flickr.photos_search(api_key=api_key,
                                            page=rand_page + offset,
                                            per_page=items_per_page,
                                            media='photos',
                                            sort='relevance')
            links.extend(self._photo_links(response))
            marker = " >> " if offset == 0 else "\r >> "
            print(marker, len(links), " links extracted...", end="")

        # reduce the number of links if the last page overshot the request:
        return links[:number_links]

    def save_urls(self, filename):
        """Save the collected links to ``filename``, one link per line.

        The parent folder is created when it does not exist yet.
        """
        folder = os.path.dirname(filename)
        # a bare file name has no folder part; there is nothing to create then
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(filename, 'w') as links_file:
            for link in self.images_links:
                links_file.write(link + '\n')
        print("\nLinks saved to '", filename, "'")

    def load_urls(self, filename):
        """Load links from ``filename`` (one link per line).

        Surrounding whitespace, including the line break, is removed and
        blank lines are skipped.

        Raises:
            SystemExit: when ``filename`` is not an existing file.
        """
        if not os.path.isfile(filename):
            self.error("Failed to load URLs, file '" + filename + "' does not exist")
        with open(filename) as links_file:
            stripped = (line.strip() for line in links_file)
            self.images_links = [link for link in stripped if link]
        print("\nLinks loaded from ", filename)

    def download_images(self, target_folder='./data'):
        """Download the collected images into ``target_folder``.

        An empty ``target_folder`` falls back to './data'.
        """
        print(" ")
        downloader = ImagesDownloader()
        if not target_folder:
            target_folder = './data'
        downloader.download(self.images_links, target_folder)

## Set the Flickr API keys and crawl the web

In [7]:
# NOTE(review): the Flickr credentials are hard-coded in the source. They can
# now be overridden through the FLICKR_API_KEY / FLICKR_API_SECRET environment
# variables; the literals below are kept only as a backward-compatible
# fallback and should be rotated and removed.
api_keys = {
    'flickr': (
        os.environ.get('FLICKR_API_KEY', '24c5ab910f81210e9d70652814f367da'),
        os.environ.get('FLICKR_API_SECRET', 'd52bd1a9550316d2'),
    ),
}
images_nbr = 10                     # number of images to fetch
download_folder = "D:/XAMPP/htdocs/Package/Image_Retrieval/data/" # folder in which the images will be stored


In [8]:
crawler = WebCrawler(api_keys)

# os.path.join does not double the separator when download_folder already
# ends with '/', unlike `download_folder + "/links.txt"`.
links_file = os.path.join(download_folder, "links.txt")

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(images_nbr, remove_duplicated_links=True)

# 2. Save URLs to download them later (optional):
crawler.save_urls(links_file)

#  (alternative to the previous line) Load URLs from a file instead of the web:
#crawler.load_urls(links_file)

# 3. Download the images:
crawler.download_images(target_folder=download_folder)

Start fetching...
Carwling Flickr Search...
 >>  10  links extracted
 >>  0  duplicated links removed,  10  kept

Links saved to ' ./data2/links.txt '
 
Preparing to download images...
Downloading files...
 >> Download progress:  100.0 %
 >>  10  images downloaded
