Skip to content
Newer
Older
100644 76 lines (57 sloc) 2.21 KB
1fafc4a @scottyrdc first checkin interface generator sources
authored
1 # Copyright(c)2008 Internet Archive. Software license GPL version 3.
2
3 # modified from the IACL bulk download example script by RSA 11/09
4 # takes the IACL booklist file supplied to the IACL4OPLC project
5 # and downloads all the books and covers.
6
7 # Multiple downloaders can be run at the same time using the startnum and endnum
8 # parameters, which will help to speed up the download process.
9
10 import csv
11 import os
12 import commands
13 import sys
14
import subprocess  # argument-list process spawning; replaces the shell string fed to `commands`

# Zero-based, inclusive range of *valid* booklist entries handled by this run.
# Give each concurrently running downloader its own non-overlapping range.
startnum = 0
endnum = 999999

BOOKLIST_FILE = 'iaclBookList.txt'
BOOK_DIR = 'djvu'
COVER_DIR = 'covers'

# Rate limiting keeps the script nicer to the cluster.
WGET_OPTIONS = ['--limit-rate=250k',
                '--user-agent=IA Bulk Download Script',
                '-q']


def _fetch(url, dlpath):
    """Download *url* to *dlpath* with wget unless *dlpath* already exists.

    The data is first written to ``dlpath + '.part'`` and renamed only after
    wget exits with status 0.  wget's ``-O`` creates the output file even when
    the transfer fails, so writing straight to *dlpath* would make a failed
    download look like a finished one on the next run.

    Raises:
        RuntimeError: wget exited with a non-zero status.
        OSError: wget could not be started (for example, it is not installed).
    """
    if os.path.exists(dlpath):
        print("\talready downloaded, skipping...")
        return

    target_dir = os.path.dirname(dlpath)
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir)

    partial = dlpath + '.part'
    # An argument list (no shell) means ids taken from the booklist can never
    # be interpreted as shell syntax.
    status = subprocess.call(['wget', url, '-O', partial] + WGET_OPTIONS)
    if status != 0:
        if os.path.exists(partial):
            os.remove(partial)
        raise RuntimeError("wget exited with status %s for %s" % (status, url))
    os.rename(partial, dlpath)


def main(booklist=BOOKLIST_FILE, start=None, end=None):
    """Download the book and cover for every booklist entry in [start, end].

    Each usable line of *booklist* has the form ``/b/<olid> <iaid>``; lines
    that do not match are ignored and do not consume an entry number.  Books
    are stored as ``djvu/<iaid>.djvu`` and covers as ``covers/<iaid>.jpg``,
    relative to the current working directory.

    Args:
        booklist: path of the booklist file.
        start: zero-based number of the first valid entry to process;
            defaults to the module-level ``startnum``.
        end: zero-based number of the last valid entry to process (inclusive);
            defaults to the module-level ``endnum``.

    Raises:
        RuntimeError: a download failed (propagated from ``_fetch``).
    """
    if start is None:
        start = startnum
    if end is None:
        end = endnum

    with open(booklist, 'r') as handle:
        lines = handle.readlines()

    filenum = 0
    for line in lines:
        fields = line.split()
        if len(fields) != 2:  # format is <olid> <iaid>
            continue
        iaid = fields[1]
        print(iaid)
        key_parts = fields[0].split('/')
        print(key_parts)
        if len(key_parts) != 3:  # format is '/b/<olid>'
            continue
        olid = key_parts[2]

        # Entries numbered below `start` belong to another downloader.
        if filenum >= start:
            dirnum = "%09d" % filenum

            print("downloading book #%s, id=%s" % (dirnum, iaid))
            _fetch("http://www.archive.org/download/%s/%s.djvu" % (iaid, iaid),
                   os.path.join(BOOK_DIR, "%s.djvu" % iaid))

            # get covers
            print("downloading cover #%s, id=%s" % (dirnum, iaid))
            _fetch("http://covers.openlibrary.org/b/OLID/%s-M.jpg" % olid,
                   os.path.join(COVER_DIR, "%s.jpg" % iaid))

        filenum += 1
        if filenum > end:
            break


if __name__ == '__main__':
    main()
Something went wrong with that request. Please try again.