This notebook collects job-posting data from Stack Overflow Jobs and Berlin Startup Jobs and saves each raw response to a timestamped file.

In [2]:
import os
from datetime import datetime, timezone
from functools import partial

from tornado import gen
from tornado import httpclient
from tornado import ioloop

In [3]:
@gen.coroutine
def main(targets):
    """Fetch every target URL and save each response body to a timestamped file.

    Targets are processed one at a time, in iteration order. A target whose
    fetch produced no response is reported and skipped, so one failing
    source does not abort the remaining ones.

    Args:
        targets: Mapping of target name to a dict with the keys ``"url"``
            (address to fetch), ``"folder"`` (output directory) and
            ``"extension"`` (file extension without the leading dot).
    """
    for name, values in targets.items():
        print(f"{name}")
        url = values["url"]
        folder = values["folder"]
        extension = values["extension"]

        # Taken before the request, so the file name marks when the fetch
        # started. Timezone-aware replacement for the deprecated utcnow().
        now = datetime.now(timezone.utc)
        response = yield fetcher(url)
        if response is None:
            print(f"\tSkipped: no response for {url}")
            continue

        # Create the output directory on demand; an empty folder means the
        # current working directory, which makedirs() would reject.
        if folder:
            os.makedirs(folder, exist_ok=True)
        # os.path.join works whether or not `folder` ends with a separator.
        file_name = os.path.join(folder, f"{now:%Y-%m-%d-%H-%M}.{extension}")
        with open(file_name, mode="wb") as f:
            f.write(response.body)

        print(f"\tFetched and saved as: {file_name}")

In [9]:
@gen.coroutine
def fetcher(url):
    """Fetch ``url`` asynchronously with Tornado's ``AsyncHTTPClient``.

    Args:
        url: Address to request.

    Returns:
        The response object produced by ``AsyncHTTPClient.fetch``, or
        ``None`` when the request raised ``httpclient.HTTPError`` (the error
        is printed). Callers must check for ``None`` before using the result.
    """
    print(f"\tFetching: {url}")

    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
    except httpclient.HTTPError as err:
        print(f"Error: {err}")
        # Without an explicit value here the name `response` would be
        # unbound and the return below would raise UnboundLocalError.
        response = None
    return response

In [5]:
# Registry of sources to fetch, keyed by target name; filled in by the cells below.
targets = dict()

In [6]:
# Register the stackoverflow target: the jobs feed, stored as XML.
# NOTE(review): the query string looks like a 100 km radius around Berlin -- confirm.
url = "https://stackoverflow.com/jobs/feed?l=Berlino%2c+Germania&d=100&u=Km"
folder = "../data/raw/stackoverflow/"
targets["stackoverflow"] = dict(url=url, folder=folder, extension="xml")

In [7]:
# Register the berlinstartupjobs target: the site's wp-json posts endpoint
# (category 9, 100 posts per page), stored as JSON.
url = "http://berlinstartupjobs.com/wp-json/wp/v2/posts?categories=9&per_page=100"
folder = "../data/raw/berlinstartupjobs/"
targets["berlinstartupjobs"] = dict(url=url, folder=folder, extension="json")

In [8]:
# Schedule main(targets=targets) on the current IOLoop. Nothing here starts
# the loop, so this assumes it is already running (as in a notebook kernel).
io_loop = ioloop.IOLoop.current()
io_loop.add_callback(main, targets=targets)

stackoverflow
	Fetching: https://stackoverflow.com/jobs/feed?l=Berlino%2c+Germania&d=100&u=Km
	Fetched and saved as: ../data/raw/stackoverflow/2017-10-27-07-51.xml
berlinstartupjobs
	Fetching: http://berlinstartupjobs.com/wp-json/wp/v2/posts?categories=9&per_page=100
	Fetched and saved as: ../data/raw/berlinstartupjobs/2017-10-27-07-51.json
