# AstroBin Data Gathering

In [None]:
import requests
import json
import glob
import string
import os
from secrets import ASTROBIN_KEY, ASTROBIN_SECRET

## Create directories

In [None]:
# Bodies to gather images for; each one gets its own sub-directory per split.
planets = ['sun', 'mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune', 'pluto']
# Root of the on-disk layout: data/<split>/<planet>
path = 'data/'
# Create every leaf directory individually. os.makedirs also creates the
# missing parents (including `path` itself), and checking each leaf rather
# than only `path` repairs a partially-created tree on a later run.
for split in ('train/', 'valid/', 'raw/'):
    for planet in planets:
        target = path + split + planet
        if not os.path.isdir(target):
            os.makedirs(target)

## Search

In [None]:
# Use HTTPS: every request carries the API key and secret as query
# parameters, so plain HTTP would expose them in cleartext.
astrobin_url = 'https://www.astrobin.com'
# Path of the image endpoint, appended to astrobin_url.
api_url = '/api/v1/image/'
# Credentials merged into every search query.
base_params = {'api_key': ASTROBIN_KEY, 'api_secret': ASTROBIN_SECRET}

In [None]:
def search(queries, max_page_retries=3):
    """Run each query against the image endpoint and collect all result objects.

    For every query dict the first page is requested from
    ``astrobin_url + api_url`` with the credentials from ``base_params``
    added. Further pages are fetched by following ``meta['next']`` of each
    response until it is empty.

    Args:
        queries: iterable of dicts holding the filter parameters of one search
            each. The dicts are not modified.
        max_page_retries: number of consecutive failed requests for the same
            "next" page after which the remaining pages of that query are
            abandoned.

    Returns:
        A list with the ``objects`` of every fetched page of every query.
        Duplicates across queries are not removed.
    """
    results = []
    for query in queries:
        query_results = []
        print('Querying... ' + str(query))
        # Merge the credentials into a copy: the caller's dict stays untouched
        # and the key/secret never show up in the log lines printed below.
        params = dict(query)
        params.update(base_params)
        search_request = requests.get(astrobin_url+api_url, params=params)
        if search_request.status_code == 200:
            page_results = json.loads(search_request.text)
            print('Attempting to fetch ' + str(page_results['meta']['total_count']) + ' results...')
            query_results.extend(page_results['objects'])
            failed_attempts = 0
            while page_results['meta']['next']:
                search_request = requests.get(astrobin_url+page_results['meta']['next'])
                if search_request.status_code == 200:
                    page_results = json.loads(search_request.text)
                    query_results.extend(page_results['objects'])
                    print(page_results['meta']['offset'])
                    failed_attempts = 0
                else:
                    # page_results is unchanged here, so the next iteration
                    # retries the same page; bound the retries so a persistent
                    # failure cannot loop forever.
                    failed_attempts += 1
                    print('NEXT PAGE FAILED: ' + str(page_results['meta']['offset']))
                    if failed_attempts >= max_page_retries:
                        print('Giving up on remaining pages after ' + str(failed_attempts) + ' failed attempts.')
                        break
            else:
                # Only reached when the loop ended because 'next' was empty.
                print('No more pages!')

        else:
            print('SEARCH FAILED')
            print(query)

        print('Fetched ' + str(len(query_results)) + ' results for query ' + str(query))
        results.extend(query_results)

    print('Fetched ' + str(len(results)) + ' results total.')
    return results

In [None]:
def build_metadata(results):
    """Index search results by their ``'id'`` field, dropping duplicates.

    Args:
        results: iterable of result dicts, each containing an ``'id'`` key.

    Returns:
        A dict mapping each distinct id to the first result that carried it.
    """
    metadata = {}
    for result in results:
        # setdefault keeps the first occurrence and ignores later duplicates.
        metadata.setdefault(result['id'], result)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Found ' + str(len(metadata)) + ' unique metadata entries for ' + str(len(results)) + ' results.')
    return metadata

## Image Downloading

In [None]:
def download_image(details, image_dir=''):
    """Fetch one image and store it as ``<image_dir><id>.jpg``.

    Args:
        details: dict providing ``'url_real'`` (the URL to download) and
            ``'id'`` (used as the file name).
        image_dir: prefix prepended to the file name; it is concatenated
            as-is, so it should end with a path separator.

    Raises:
        Exception: if the HTTP response status is not 200.
    """
    response = requests.get(details['url_real'])
    if response.status_code != 200:
        raise Exception('IMAGE REQUEST FAILED: ' + str(details['id']))

    target = image_dir+str(details['id'])+'.jpg'
    with open(target, 'wb') as image:
        image.write(response.content)

In [None]:
def download_all_images(metadata, directory):
    """Download every image in ``metadata`` that is not already on disk.

    An image counts as present when ``directory`` contains ``<id>.jpg``.
    Failures of individual downloads are printed and skipped so one bad image
    does not abort the whole run.

    Args:
        metadata: dict mapping image ids to the detail dicts passed on to
            ``download_image``. Ids may be ints (fresh search results) or
            strings (metadata reloaded from JSON).
        directory: directory prefix, concatenated as-is with the file names,
            so it should end with a path separator.
    """
    print('Attempting to download '+str(len(metadata))+' images...')
    found_images = glob.glob(directory+'*.jpg')
    # Take the id from the file's base name only, so digits or punctuation in
    # the directory path cannot leak into it.
    all_images = {os.path.splitext(os.path.basename(filename))[0] for filename in found_images}
    downloaded_images = 0
    for image_id, image_details in metadata.items():
        # Normalise to str: ids parsed from file names are strings, while ids
        # from a fresh search are ints and would never match otherwise.
        image_key = str(image_id)
        if image_key not in all_images:
            try:
                download_image(image_details, directory)
            except Exception as e:
                print(e)
            else:
                downloaded_images += 1
                # In case duplicate results weren't already removed.
                all_images.add(image_key)
    print('Found '+str(len(found_images))+' and downloaded '+str(downloaded_images)+' images for a total of '+str(len(all_images))+' images.')

## Image Search and Downloading

In [None]:
def download_planet_images(planet):
    """Gather metadata for ``planet`` and download its images.

    Metadata is cached in ``<path>raw/<planet>/<planet>_metadata.json``. When
    that file exists it is loaded; otherwise a title search and a description
    search for ``planet`` are run, de-duplicated and written to the file.
    The images are then downloaded into ``<path>raw/<planet>/``.

    Args:
        planet: name of the body, used both as search term and directory name.
    """
    planet_dir = path+'raw/'+planet+'/'
    metadata_file = planet_dir+planet+'_metadata.json'
    if not os.path.exists(metadata_file):
        # No cache yet: query the API and persist what came back.
        queries = [{'title__icontains': planet}, {'description__icontains': planet}]
        metadata = build_metadata(search(queries))
        with open(metadata_file, 'w') as outfile:
            json.dump(metadata, outfile)
    else:
        with open(metadata_file) as datafile:
            metadata = json.load(datafile)
    download_all_images(metadata, planet_dir)

In [None]:
# Gather metadata and images for every body listed in `planets`, one at a time.
for planet in planets:
    download_planet_images(planet)