This repository has been archived by the owner on Sep 12, 2019. It is now read-only.
/
data-mining-the-internet-archive.html
337 lines (310 loc) · 39.1 KB
/
data-mining-the-internet-archive.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
<!DOCTYPE html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ --><!--[if lt IE 7]><html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--><!--[if IE 7]><html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--><!--[if IE 8]><html class="no-js lt-ie9" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en"><!--<![endif]--><head><meta charset="utf-8"/><title>The Programming Historian</title><!-- Mobile viewport optimized: h5bp.com/viewport --><meta content="width=device-width" name="viewport"/><link href="http://fonts.googleapis.com/css?family=Lato:300,700|Crete+Round" rel="stylesheet" type="text/css"/><link href="http://programminghistorian.org/wp-content/themes/ph-wp-theme/style.css" rel="stylesheet"/><!-- Modernizr and Friends --><script src="http://programminghistorian.org/wp-content/themes/ph-wp-theme/javascripts/modernizr.min.js"></script><script>
// Conditionally load the polyfill scripts listed under `nope`.
// `Modernizr.mq('only all')` is the documented way to ask "does this browser
// support media queries at all?". The previous call, `Modernizr.mq()`, passed
// no query string, so the test could not report support and the polyfills
// were requested regardless of browser capability.
Modernizr.load([
  {
    test: Modernizr.mq('only all'),
    // Fetched only when the test above is falsy.
    nope: [
      'http://programminghistorian.org/wp-content/themes/ph-wp-theme/javascripts/respond.min.js',
      'http://programminghistorian.org/wp-content/themes/ph-wp-theme/javascripts/selectivizr.min.js'
    ]
  }
]);
</script><script type="text/javascript">//<![CDATA[
// Google Analytics for WordPress by Yoast v4.3.5 | http://yoast.com/wordpress/google-analytics/
// Reuse an existing command queue if one was already created on the page.
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-2752866-8']);
_gaq.push(
  ['_setCustomVar', 2, 'author', 'adam-crymble', 3],
  ['_trackPageview']
);
// Inject the ga.js loader asynchronously ahead of the first <script> element,
// choosing the host prefix that matches the page's protocol.
(function () {
  var hostPrefix = document.location.protocol === 'https:' ? 'https://ssl' : 'http://www';
  var firstScript = document.getElementsByTagName('script')[0];
  var loader = document.createElement('script');
  loader.type = 'text/javascript';
  loader.async = true;
  loader.src = hostPrefix + '.google-analytics.com/ga.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
}());
//]]></script><link href="http://programminghistorian.org/lessons/data-mining-the-internet-archive/feed" rel="alternate" title="The Programming Historian » Data Mining the Internet Archive Collection Comments Feed" type="application/rss+xml"/><script src="http://programminghistorian.org/wp-includes/js/jquery/jquery.js?ver=1.11.0" type="text/javascript"></script><script src="http://programminghistorian.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1" type="text/javascript"></script><script src="http://programminghistorian.org/wp-content/themes/ph-wp-theme/javascripts/bigfoot.min.js?ver=3.9.1" type="text/javascript"></script><link href="http://programminghistorian.org/xmlrpc.php?rsd" rel="EditURI" title="RSD" type="application/rsd+xml"/><link href="http://programminghistorian.org/wp-includes/wlwmanifest.xml" rel="wlwmanifest" type="application/wlwmanifest+xml"/><link href="http://programminghistorian.org/lessons/georeferencing-qgis" rel="prev" title="Georeferencing in QGIS 2.0"/><link href="http://programminghistorian.org/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown" rel="next" title="Sustainable Authorship in Plain Text using Pandoc and Markdown"/><meta content="WordPress 3.9.1" name="generator"/><link href="http://programminghistorian.org/lessons/data-mining-the-internet-archive" rel="canonical"/><link href="http://programminghistorian.org/?p=2417" rel="shortlink"/><style id="syntaxhighlighteranchor" type="text/css"></style><meta content="Caleb McDaniel" name="author"/><meta content="Data Mining the Internet Archive Collection" name="title"/><meta content="2014-03-03" name="date"/><meta content="William J Turkel" name="reviewers"/><meta content="default" name="layout"/></head><body class="single single-lesson postid-2417">
<!-- Prompt IE 6 users to install Chrome Frame. Remove this if you support IE 6.
chromium.org/developers/how-tos/chrome-frame-getting-started -->
<!--[if lt IE 7]><p class=chromeframe>Your browser is <em>ancient!</em> <a href="http://browsehappy.com/">Upgrade to a different browser</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to experience this site.</p><![endif]-->
<script type="text/javascript">
// Run once the DOM is ready. jQuery passes itself as the first argument,
// so `$` is a safe local alias here even if another library owns the global `$`.
jQuery(function ($) {
  // Initialise the bigfoot plugin.
  $.bigfoot();
});
</script>
<div role="main">
<article>
<div class="content">
<h2>Lesson Goals</h2>
<p>The collections of the <a href="http://archive.org/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">Internet Archive</a> (IA) include many digitized sources of interest to historians, including <a href="https://archive.org/details/jstor_ejc" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">early JSTOR journal content</a>, <a href="https://archive.org/details/johnadamsBPL" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">John Adams’s personal library</a>, and the <a href="https://archive.org/details/jcbhaiti" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">Haiti collection</a> at the John Carter Brown Library. In short, to quote Programming Historian <a href="http://activehistory.ca/2013/09/the-internet-archive-rocks-or-two-million-plus-free-sources-to-explore/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://activehistory.ca']);" target="_blank">Ian Milligan</a>, “The Internet Archive rocks.”</p>
<p>In this lesson, you’ll learn how to download files from such collections using a Python module specifically designed for the Internet Archive. You will also learn how to use another Python module designed for parsing MARC XML records, a widely used standard for formatting bibliographic metadata.</p>
<p>For demonstration purposes, this lesson will focus on working with the digitized version of the <a href="http://archive.org/details/bplscas" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">Anti-Slavery Collection</a> at the Boston Public Library in Copley Square. We will first download a large collection of MARC records from this collection, and then use Python to retrieve and analyze bibliographic information about items in the collection. For example, by the end of this lesson, you will be able to create a list of every named place from which a letter in the antislavery collection was written, which you could then use for a mapping project or some other kind of analysis.</p>
<h2>For Whom Is This Useful?</h2>
<p>This intermediate lesson is good for users of the Programming Historian who have completed general lessons on downloading files and performing text analysis on them, but would like an applied example of these principles. It will also be of interest to historians or archivists who work with the MARC format or the Internet Archive on a regular basis.</p>
<h2>Before You Begin</h2>
<p>We will be working with two Python modules that are not included in Python’s standard library.</p>
<p>The first, <a href="https://pypi.python.org/pypi/internetarchive" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://pypi.python.org']);" target="_blank">internetarchive</a>, provides programmatic access to the Internet Archive. The second, <a href="https://pypi.python.org/pypi/pymarc/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://pypi.python.org']);" target="_blank">pymarc</a>, makes it easier to parse MARC records.</p>
<p>The easiest way to download both is to use <code>pip</code>, the Python package manager. Begin by installing <code>pip</code> using <a href="http://programminghistorian.org/lessons/installing-python-modules-pip/">this Programming Historian lesson</a>. Then issue the following commands at the command line. To install <code>internetarchive</code>:</p>
<pre class="bash">
sudo pip install internetarchive
</pre>
<p>To install <code>pymarc</code>:</p>
<pre class="bash">
sudo pip install pymarc
</pre>
<p>Now you are ready to go to work!</p>
<h2>The Antislavery Collection at the Internet Archive</h2>
<p>The Boston Public Library’s anti-slavery collection at Copley Square contains not only the letters of William Lloyd Garrison, one of the icons of the American abolitionist movement, but also large collections of letters by and to reformers somehow connected to him. And by “large collection,” I mean large. According to the library’s estimates, there are over 16,000 items at Copley.</p>
<p>As of this writing, approximately 7,000 of those items have been digitized and uploaded to the <a href="http://archive.org/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">Internet Archive</a>. This is good news, not only because the Archive is committed to making its considerable cultural resources free for download, but also because each uploaded item is paired with a wealth of metadata suitable for machine-reading.</p>
<p>Take <a href="http://archive.org/details/lettertowilliaml00doug" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">this letter</a> sent by Frederick Douglass to William Lloyd Garrison. Anyone can read the <a href="http://archive.org/stream/lettertowilliaml00doug/39999066767938#page/n0/mode/2up" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">original manuscript</a> online, without making the trip to Boston, and that alone may be enough to revolutionize and democratize future abolitionist historiography. But you can also download <a href="http://archive.org/download/lettertowilliaml00doug" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">multiple files</a> related to the letter that are rich in metadata, like a <a href="http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_dc.xml" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">Dublin Core</a> record and a fuller <a href="http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);">MARCXML</a> record that uses the <a href="http://www.loc.gov/marc/bibliographic/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.loc.gov']);" target="_blank">Library of Congress’s MARC 21 Format for Bibliographic Data</a>.</p>
<p>Stop and think about that for a moment: every item uploaded from the Collection contains these things. Right now, that means historians have access to rich metadata, full images, and partial descriptions for <a href="http://archive.org/search.php?query=collection%3Abplscas&amp;sort=-publicdate" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">thousands of antislavery letters, manuscripts, and publications</a>.</p>
<h2>Accessing an IA Collection in Python</h2>
<p>Internet Archive (IA) collections and items all have a unique identifier, and URLs to collections and items all look like this:</p>
<pre class="brush: plain; title: ; notranslate" title="">
http://archive.org/details/[IDENTIFIER]
</pre>
<p>So, for example, here is a URL to the Archive item discussed above, Douglass’s letter to Garrison:</p>
<pre class="brush: plain; title: ; notranslate" title="">
http://archive.org/details/lettertowilliaml00doug
</pre>
<p>And here is a URL to the entire antislavery collection at the Boston Public Library:</p>
<pre class="brush: plain; title: ; notranslate" title="">
http://archive.org/details/bplscas/
</pre>
<p>Because the URLs are so similar, the only way to tell that you are looking at a collection page, instead of an individual item page, is to examine the page layout. An item page usually has a lefthand sidebar that says “View the Book” and lists links for reading the item online or accessing other file formats. A collection page will probably have a “Spotlight Item” in the lefthand sidebar instead. You can browse to different collections through the <a href="https://archive.org/details/texts" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">eBook and Texts</a> portal, and you may also want to read a little bit about <a href="http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://blog.archive.org']);" target="_blank">the way that items and item URLs are structured</a>.</p>
<p>Once you have a collection’s identifier—in this case, <code>bplscas</code>—seeing all of the items in the collection is as easy as navigating to the Archive’s <a href="https://archive.org/advancedsearch.php" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">advanced search</a> page, selecting the id from the drop down menu next to “Collection,” and hitting the search button. Performing that search with <code>bplscas</code> selected returns <a href="https://archive.org/search.php?query=collection%3A%28bplscas%29" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">this page</a>, which as of this writing showed 7,029 results.</p>
<p>We can also <a href="https://pypi.python.org/pypi/internetarchive#searching-from-python" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://pypi.python.org']);" target="_blank">search the Archive using the Python module that we installed</a>, and doing so makes it easier to iterate over all the items in the collection for purposes of further inspection and downloading.</p>
<p>For example, let’s modify the sample code from the module’s documentation to see if we can tell, with Python, how many items are in the digital Antislavery Collection. The sample code looks like this:</p>
<pre class="python">
import internetarchive
search = internetarchive.Search('collection:nasa')
print search.num_found
</pre>
<p>All we should need to modify is the collection identifier, from <code>nasa</code> to <code>bplscas</code>. After starting your computer’s Python interpreter, try entering each of the above lines, followed by enter, but modify the collection id in the second command:</p>
<pre class="python">
search = internetarchive.Search('collection:bplscas')
</pre>
<p>After hitting enter on the <span class="reserved">print</span> command, you should see a number that matches the number of results you <a href="http://archive.org/search.php?query=collection%3Abplscas" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">saw when doing the advanced search for the collection in the browser</a>.</p>
<h2>Accessing an IA Item in Python</h2>
<p>The <code>internetarchive</code> module also allows you to access individual items using their identifiers. Let’s try that using the <a href="https://pypi.python.org/pypi/internetarchive#downloading-from-python" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://pypi.python.org']);" target="_blank">documentation’s sample code for downloading an item</a>, modifying it in order to get the Douglass letter we discussed earlier.</p>
<p>If you are still at your Python interpreter’s command prompt, you don’t need to <span class="tech">import internetarchive</span> again, so instead just enter the second and third lines from the sample code, followed each time by enter, but changing the sample identifier <span class="var">stairs</span> to our item identifier, <span class="var">lettertowilliaml00doug</span> (note that the character before the two zeroes is a lowercase L, not the number 1):</p>
<pre class="python">
item = internetarchive.Item('lettertowilliaml00doug')
item.download()
</pre>
<p>Depending on your Internet connection speed, it will now probably take a minute or two for the command prompt to return, because your computer is downloading all of the files associated with that item, including some pretty large images. But when it’s done downloading, you should see a new directory on your computer whose name is the item identifier. To check, first exit your Python interpreter:</p>
<pre class="python">
exit()
</pre>
<p>Then list the contents of the current directory to see if a folder now appears named <code>lettertowilliaml00doug</code>. If you list the contents of that folder, you should see a list of files similar to this:</p>
<pre class="brush: plain; title: ; notranslate" title="">
39999066767938.djvu
39999066767938.epub
39999066767938.gif
39999066767938.pdf
39999066767938_abbyy.gz
39999066767938_djvu.txt
39999066767938_djvu.xml
39999066767938_images.zip
39999066767938_jp2.zip
39999066767938_scandata.xml
lettertowilliaml00doug_archive.torrent
lettertowilliaml00doug_dc.xml
lettertowilliaml00doug_files.xml
lettertowilliaml00doug_marc.xml
lettertowilliaml00doug_meta.mrc
lettertowilliaml00doug_meta.xml
lettertowilliaml00doug_metasource.xml
</pre>
<p>Now that we know how to use the Search and Item functions in the <code>internetarchive</code> module, we can turn to thinking about how to make this process more effective for downloading lots of information from the collection for further analysis.</p>
<h2>Downloading MARC Records from a Collection</h2>
<p>Downloading one item is nice, but what if we want to look at thousands of items in a collection? We’re in luck, because the <code>internetarchive</code> module’s Search function allows us to iterate over all the results in a search.</p>
<p>To see how, let’s first start our Python interpreter again. We’ll need to import our module again, and perform our search again:</p>
<pre class="python">
import internetarchive
search = internetarchive.Search('collection:bplscas')
</pre>
<p>Now let’s enter the documentation’s sample code for printing out the item identifier of every item returned by our search:</p>
<pre class="python">
for result in search.results():
print result['identifier']
</pre>
<p>Note that after entering the first line, your Python interpreter will automatically print the ellipsis on line two. This is because you have started a <span class="tech">for loop</span>, and Python is expecting there to be more. It wants to know what you want to do for each result in the search. That’s also why, once you hit enter on the second line, you’ll see a third line with another ellipsis, because Python doesn’t know whether you are finished telling it what to do with each result. Hit enter again to end the for loop and execute the command.</p>
<p>You should now see your terminal begin to print out the identifiers for each result returned by our <span class="var">bplscas</span> search—in this case, all 7,029 of them! You can interrupt the print out by hitting <code>Ctrl-C</code> on your keyboard, which will return you to the prompt.</p>
<p>If you didn’t see identifiers printing out to your screen, but instead saw an error like this, you may have forgotten to enter a few spaces before your <span class="reserved">print</span> command:</p>
<pre class="python">
for result in search.results():
print result['identifier']
  File "&lt;stdin&gt;", line 2
    print result['identifier']
        ^
IndentationError: expected an indented block
</pre>
<p>Remember that whitespace matters in Python, and you need to indent the lines in a <span class="reserved">for loop</span> so that Python can tell which command(s) to perform on each item in the loop.</p>
<h2>Understanding the for loop</h2>
<p>The <span class="reserved">for loop</span>, expressed in plain English, tells Python to do something to each thing in a collection of things. In the above case, we printed the identifier for each result in the results of our collection search. Two additional points about the <span class="reserved">for loop</span>:</p>
<p>First, the word we used after <span class="reserved">for</span> is what’s called a <span class="tech">local variable</span> in Python. It serves as a placeholder for whatever instance or item we are going to be working with inside the loop. Usually it makes sense to pick a name that describes what kind of thing we are working with—in this case, a search result—but we could have used other names in place of that one. For example, try running the above for loop again, but substitute a different name for the local variable, such as:</p>
<pre class="python">
for item in search.results():
print item['identifier']
</pre>
<p>You should get the same results. Notice that we did not replace <span class="tech">search.results()</span> with <span class="tech">search.items()</span>. That’s because that instance of the word “results” is not a local variable, but a function (<a href="http://programminghistorian.org/lessons/code-reuse-and-modularity" target="_blank">remember those?</a>) defined in the internetarchive module.</p>
<p>The second thing to note about the <span class="reserved">for loop</span> is that the indented block could have contained other commands. In this case, we printed each individual search result’s identifier. But we could have chosen to do, for each result, anything that we could do to an individual Internet Archive item.</p>
<p>For example, earlier we downloaded all the files associated with the item <span class="var">lettertowilliaml00doug</span>. We could have done that to each item returned by our search by changing the line <code>print result['identifier']</code> in our <span class="reserved">for loop</span> to <code>result.download()</code>.</p>
<p>We probably want to think twice before doing that, though—downloading all the files for each of the 7,029 items in the <span class="var">bplscas</span> collection is a lot of files. Fortunately, the download function in the <code>internetarchive</code> module also allows you to <a href="https://pypi.python.org/pypi/internetarchive#downloading-from-python" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://pypi.python.org']);" target="_blank">download specific files associated with an item</a>. If we had only wanted to download the MARC XML record associated with a particular item, we could have instead done this:</p>
<pre class="python">
item = internetarchive.Item('lettertowilliaml00doug')
marc = item.file('lettertowilliaml00doug_marc.xml')
marc.download()
</pre>
<p>Because Internet Archive <a href="https://archive.org/about/faqs.php#140" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">item files are named according to specific rules</a>, we can also figure out the name of the MARC file we want just by knowing the item’s unique identifier. And armed with that knowledge, we can proceed to …</p>
<h2>Download All the MARC XML Files from a Collection</h2>
<p>For the next section, we’re going to move from using the Python shell to writing a Python script that downloads the MARC record from each item in the BPL Antislavery Collection. Try putting this script into Komodo or your preferred text editor:</p>
<pre class="python">
#!/usr/bin/python
import internetarchive
search = internetarchive.Search('collection:bplscas')
for result in search.results():
itemid = result['identifier']
item = internetarchive.Item(itemid)
marc = item.file(itemid + '_marc.xml')
marc.download()
print "Downloading " + itemid + " ..."
</pre>
<p>This script looks a lot like the experiments we have done above with the Frederick Douglass letter, but since we want to download the MARC record for each item returned by our collection search, we are using an <span class="var">itemid</span> variable to account for the fact that the identifier and filename will be different for each result.</p>
<p>Before running this script (which, I should note, is going to download thousands of small XML files to your computer), make a directory where you want those MARC records to be stored and place the above script in that directory. Then run the script from within the directory so that the files will be downloaded in an easy-to-find place.</p>
<p>(Note that if you receive what looks like a <span class="tech">ConnectionError</span> on your first attempt, check your Internet connection, wait a few minutes, and then try running the script again.)</p>
<p>If all goes well, when you run your script, you should see the program begin to print out status updates telling you that it is downloading MARC records. But allowing the script to run its full course will probably take a couple of hours, so let’s stop the script and look a little more closely at ways to improve it. Pressing CONTROL-C while in your terminal window should make the script stop.</p>
<h2>Building Error Reporting into the Script</h2>
<p>Since downloading all of these records will take some time, we are probably going to want to walk away from our computer for a while. But the chances are high that during those two hours, something could go wrong that would prevent our script from working.</p>
<p>Let’s say, for example, that we had forgotten that we already downloaded an individual file into this directory. Or maybe your computer briefly loses its Internet connection, or some sort of outage happens on the Internet Archive server that prevents the script from getting the file it wants.</p>
<p>In those and other error cases, Python will raise an “exception” telling you what the problem is. Unfortunately, an exception will also crash your script instead of continuing on to the next item.</p>
<p>To prevent this, we can use what’s called a <span class="reserved">try</span> statement in Python, which does exactly what it sounds like. The statement will try to execute a certain snippet of code until it hits an exception, in which case you can give it some other code to execute instead. You can read more about <a href="http://docs.python.org/2/tutorial/errors.html#handling-exceptions" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://docs.python.org']);" target="_blank">handling exceptions</a> in the Python documentation, but for now let’s just update our above script so that it looks like this:</p>
<pre class="python">
#!/usr/bin/python
import internetarchive
import time
error_log = open('bpl-marcs-errors.log', 'a')
search = internetarchive.Search('collection:bplscas')
for result in search.results():
itemid = result['identifier']
item = internetarchive.Item(itemid)
marc = item.file(itemid + '_marc.xml')
try:
marc.download()
except Exception as e:
error_log.write('Could not download ' + itemid + ' because of error: %s\n' % e)
print "There was an error; writing to log."
else:
print "Downloading " + itemid + " ..."
time.sleep(1)
</pre>
<p>The main thing we’ve added here, after our module import statements, is a line that opens a text file called <code>bpl-marcs-errors.log</code> and prepares it to have text appended to it. We are going to use this file to log exceptions that the script raises. The <span class="reserved">try statement</span> that we have added to our <span class="reserved">for loop</span> will attempt to download the MARC record. If it can’t, it will write a descriptive statement about what went wrong to our log file. That way we can go back to the file later and identify which items we will need to try to download again. If the try clause succeeds and can download the record, then the script will execute the code in the <span class="reserved">else clause</span>.</p>
<p>One other thing we have added, upon successful download, is this line:</p>
<pre class="python">
time.sleep(1)
</pre>
<p>This line uses the <span class="var">time module</span> that we are now importing at the beginning to tell our script to pause for one second before proceeding, which is basically just a way for us to be nice to Internet Archive’s servers by not clobbering them every millisecond or so with a request.</p>
<p>Try updating your script to look like the above lines, and run it again in the directory where you want to store your MARC files. Don’t be surprised if you immediately encounter a string of error messages; that means the script is doing what it’s supposed to do! Calmly go into your text editor, while leaving the script running, and open the <code>bpl-marcs-errors.log</code> to see what exceptions have been recorded there. You’ll probably see that the script raised the exception “File already exists” for each of the files that you had already downloaded when running our earlier, shorter program.</p>
<p>If you leave the program running for a little while, the script will eventually get to items that you have not already downloaded and resume collecting your MARCs!</p>
<h2>Scraping Information from a MARC Record</h2>
<p>Once your download script has completed, you should find yourself in the possession of nearly 7,000 detailed MARC XML records about items in the Anti-Slavery Collection (or whichever other collection you may have downloaded instead; the methods above should work on any collection whose items have MARC files attached to them).</p>
<p>Now what?</p>
<p>The next step depends on what sort of questions about the collection you want to answer. The MARC formatting language captures a wealth of data about an item, as you can see if you return to <a href="http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://archive.org']);" target="_blank">the MARC XML record for the Frederick Douglass letter</a> mentioned at the outset.</p>
<p>Notice, for example, that the Douglass letter contains information about the place where the letter was written in the <span class="var">datafield</span> that is tagged <span class="var">260</span>, inside the <span class="var">subfield</span> coded <span class="var">a</span>. The person who prepared this MARC record knew to put place information in that specific field because of <a href="http://www.loc.gov/marc/bibliographic/bd260.html" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.loc.gov']);" target="_blank">rules specified for the 260 datafield</a> by the <a href="http://www.loc.gov/marc/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.loc.gov']);" target="_blank">MARC standards</a>.</p>
<p>That means that it should be possible for us to look inside all of the MARC records we have downloaded, grab the information inside of <span class="var">datafield “260,” subfield “a,”</span> and make a list of every place name where items in the collection were published.</p>
<p>To do this, we’ll use the other helpful Python module that we downloaded with <code>pip</code> at the beginning: <a href="https://github.com/edsu/pymarc" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://github.com']);" target="_blank">pymarc</a>.</p>
<p>That module makes it easy to get information out of subfields. Assuming that we have a MARC record prepared for parsing by the module assigned to the variable record, we could get the information about publication place names this way:</p>
<pre class="python">
place_of_pub = record['260']['a']
</pre>
<p>The documentation for <code>pymarc</code> is a little less complete than that for the Internet Archive, especially when it comes to parsing XML records. But a little rooting around in the source code for the module reveals some <a href="https://github.com/edsu/pymarc/blob/master/pymarc/marcxml.py" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://github.com']);" target="_blank">functions that it provides for working with MARC XML records</a>. One of these, called <span class="var">map_xml()</span> is described this way:</p>
<pre class="python">
def map_xml(function, *files):
    """
    map a function onto the file, so that for each record that is
    parsed the function will get called with the extracted record

    def do_it(r):
        print r

    map_xml(do_it, 'marc.xml')
    """
</pre>
<p>Translated into plain English, this function means that we can take an XML file containing MARC data (like the nearly 7,000 we now have on our computer), pass it to the <span class="var">map_xml</span> function in the <code>pymarc</code> module, and then specify another function (that we will write) telling our program what to do with the MARC data retrieved from the XML file. In rough outline, our code will look something like this:</p>
<pre class="python">
import pymarc
def get_place_of_pub(record):
place_of_pub = record['260']['a']
print place_of_pub
pymarc.map_xml(get_place_of_pub, 'lettertowilliaml00doug_marc.xml')
</pre>
<p>Try saving that code to a script and running it from a directory where you already have the Douglass letter XML saved. If all goes well, the script should spit out this:</p>
<pre class="python">
Belfast, [Northern Ireland],
</pre>
<p>Voila! Of course, this script would be much more useful if we scraped the place of publication from every letter in our collection of MARC records. Putting together what we’ve learned from earlier in the lesson, we can do that with a script that looks like this:</p>
<pre class="python">
#!/usr/bin/python
import os
import pymarc
path = '/path/to/dir/with/xmlfiles/'
def get_place_of_pub(record):
try:
place_of_pub = record['260']['a']
print place_of_pub
except Exception as e:
print e
for file in os.listdir(path):
if file.endswith('.xml'):
pymarc.map_xml(get_place_of_pub, path + file)
</pre>
<p>This script modifies our above code in several ways. First, it uses a <span class="reserved">for loop</span> to iterate over each file in our directory. In place of the <code>internetarchive</code> search results that we iterated over in our first part of this lesson, we iterate over the files returned by <span class="var">os.listdir(path)</span> which uses the built-in Python module <span class="var">os</span> to list the contents of the directory specified in the <span class="var">path</span> variable, which you will need to modify so that it matches the directory where you have downloaded all of your MARC files.</p>
<p>We have also added some error handling to our <span class="var">get_place_of_pub()</span> function to account for the fact that some records may (for whatever reason) not contain the information we are looking for. The function will <span class="reserved">try</span> to print the place of publication, but if this raises an Exception, it will print out the information returned by the Exception instead. In this case, if the <span class="reserved">try statement</span> failed, the exception will probably print a message that mentions <span class="var">NoneType</span>, because looking up a field that is missing from the record gives back <span class="var">None</span> rather than a field we can look inside. Understanding why is a subject for another lesson on Python type errors, but for now the printout is descriptive enough of what happened, so it could be useful to us.</p>
<p>Try running this script. If all goes well, your screen should fill with a list of the places where these letters were written. If that works, try modifying your script so that it saves the place names to a text file instead of printing them to your screen. You could then use the <a href="http://programminghistorian.org/lessons/counting-frequencies" target="_blank">Counting Frequencies</a> lesson to figure out which place names are most common in the collection. You could work with the place names to find coordinates that could be placed on a map using the <a href="http://programminghistorian.org/lessons/googlemaps-googleearth" target="_blank">Google Maps lesson</a>.</p>
<p>Or, to get a very rough visual sense of the places where letters were written, you could do what I’ve done below and simply make a <a href="http://www.wordle.net/" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.wordle.net']);" target="_blank">Wordle word cloud</a> of the text file.</p>
<div class="wp-caption alignnone" id="attachment_2419" style="width: 610px"><img alt="Wordle wordcloud" class="size-full wp-image-2419" height="450" src="http://programminghistorian.org/wp-content/uploads/2014/03/bpl-wordle.png" width="600"/><p class="wp-caption-text">Wordle wordcloud</p></div>
<p>Of course, to make such techniques useful would require more cleaning of your data. And other applications of this lesson may prove more useful. For example, working with the MARC data fields for personal names, you could create a network of correspondents. Or you could analyze which subjects are common in the MARC records. Now that you have the MARC records downloaded and can use <code>pymarc</code> to extract information from the fields, the possibilities can multiply rapidly!</p>
<!-- You can start editing here. -->
<!-- If comments are open, but there are no comments. -->
</div>
</article>
<!-- .navigation -->
</div>
<script src="http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/scripts/shCore.js?ver=3.0.83c" type="text/javascript"></script>
<script src="http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/scripts/shBrushBash.js?ver=3.0.83c" type="text/javascript"></script>
<script src="http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/scripts/shBrushPlain.js?ver=3.0.83c" type="text/javascript"></script>
<script src="http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/scripts/shBrushPython.js?ver=3.0.83c" type="text/javascript"></script>
<script type="text/javascript">
(function () {
  // Add the SyntaxHighlighter core and theme stylesheets to <head>, inserting
  // each one relative to the element with id "syntaxhighlighteranchor".
  var head = document.getElementsByTagName("head")[0];
  var anchor = document.getElementById("syntaxhighlighteranchor");
  var stylesheetUrls = [
    "http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/styles/shCore.css?ver=3.0.83c",
    "http://programminghistorian.org/wp-content/plugins/syntaxhighlighter/syntaxhighlighter3/styles/shThemeDefault.css?ver=3.0.83c"
  ];

  // Build a stylesheet <link> for the given URL. When setAttribute is not
  // available, fall back to assigning rel and href as plain properties.
  function buildStylesheetLink(url) {
    var link = document.createElement('link');
    if (link.setAttribute) {
      link.setAttribute("rel", "stylesheet");
      link.setAttribute("type", "text/css");
      link.setAttribute("href", url);
    } else {
      link.rel = "stylesheet";
      link.href = url;
    }
    return link;
  }

  // Core stylesheet first, then the theme, as listed above.
  for (var i = 0; i < stylesheetUrls.length; i++) {
    head.insertBefore(buildStylesheetLink(stylesheetUrls[i]), anchor);
  }
})();
(function (highlighter) {
  // Text used by SyntaxHighlighter's own UI messages.
  var strings = highlighter.config.strings;
  strings.expandSource = '+ expand source';
  strings.help = '?';
  strings.alert = 'SyntaxHighlighter\n\n';
  strings.noBrush = 'Can\'t find brush for: ';
  strings.brushNotHtmlScript = 'Brush wasn\'t configured for html-script option: ';

  // Default options applied to highlighted blocks.
  var defaults = highlighter.defaults;
  defaults['pad-line-numbers'] = false;
  defaults['toolbar'] = false;

  // Start SyntaxHighlighter with the settings above.
  highlighter.all();
})(SyntaxHighlighter);
</script>
</body></html>