Skip to content
Permalink
master
Switch branches/tags
Go to file
If downloading the index files stopped in the middle because of an error,
a second run was needed to download all files again. This took a lot
of time. The second run is now faster because existing index files are
not downloaded again.
1 contributor

Users who have contributed to this file

executable file 38 lines (31 sloc) 963 Bytes
#!/usr/bin/env python3
import os
import urllib.request
from xml.dom.minidom import parse
def collect(prefix):
    """Mirror the index tree rooted at ``prefix`` into local directories.

    For every prefix reached, the matching local directory is created, its
    ``index.xml`` is made available, and the child prefixes listed in that
    index are visited in turn (depth-first, in document order).
    """
    pending = [prefix]
    while pending:
        current = pending.pop()
        ensuredirs(current)
        listing = ensure_index_file(current)
        # Push children reversed so pop() hands them back in document order,
        # giving the same visiting sequence as a recursive descent.
        pending.extend(reversed(extract_children(listing)))
def ensuredirs(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Calling this for a directory that already exists is a no-op.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.isdir() test followed by os.makedirs().
    os.makedirs(path, exist_ok=True)
def ensure_index_file(prefix):
    """Return the local path of the ``index.xml`` belonging to ``prefix``.

    The path is ``prefix`` with ``index.xml`` appended. The file is downloaded
    only when no regular file exists at that path yet, so index files fetched
    by an earlier run are reused.
    """
    index_path = prefix + "index.xml"
    if os.path.isfile(index_path):
        # Already fetched earlier; nothing to do.
        return index_path
    download(prefix, index_path)
    return index_path
def extract_children(path):
    """Return the child prefixes listed in the index file at ``path``.

    The file is parsed as XML. For every ``CommonPrefixes`` element, the text
    of its first ``Prefix`` element is collected, in document order.

    Args:
        path: location of a downloaded index XML file.

    Returns:
        A list of prefix strings; empty when the document has no
        ``CommonPrefixes`` elements.
    """
    # Binary mode lets the XML parser apply the encoding declared in the
    # document instead of the platform's locale encoding, and the ``with``
    # block closes the handle that was previously leaked.
    with open(path, "rb") as datasource:
        doc = parse(datasource)
    return [
        common_prefix.getElementsByTagName("Prefix")[0].firstChild.data
        for common_prefix in doc.getElementsByTagName("CommonPrefixes")
    ]
def download(prefix, path):
    """Fetch the bucket listing for ``prefix`` and store it at ``path``.

    The listing is first written to ``path + ".part"`` and moved to ``path``
    only after the transfer has finished. An interrupted download therefore
    never leaves a truncated file at ``path`` that a later run would mistake
    for a complete index.

    Args:
        prefix: key prefix to list; it is percent-encoded before being placed
            in the query string.
        path: destination file for the downloaded listing.

    Returns:
        ``path``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer;
        the temporary file is removed before the exception propagates.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # "/" stays unescaped so ordinary prefixes yield the same URL as before;
    # characters such as spaces, "&" or "#" would otherwise corrupt the query.
    url = (
        "https://multimedia-commons.s3-us-west-2.amazonaws.com/?delimiter=/&prefix="
        + quote(prefix, safe="/")
    )
    print("Download " + url)
    partial_path = path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Replacing in one step means ``path`` only ever holds a full download.
        os.replace(partial_path, path)
    finally:
        # Only still present when the transfer or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return path
# Script entry point: start collecting index files at the "data/" prefix.
collect("data/")