"""waybacktrack.py
Use this to extract the Wayback Machine's
URL archives of any given domain!
A minimal usage sketch is included at the bottom of the file.

TODO: rework the entire design!
"""
import time
import os
import urllib2
import random
from math import ceil
try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from lxml import html
from lxml.html import clean

ARCHIVE_DOMAIN = "http://web.archive.org"
CURR_DIR = os.path.dirname(__file__)
DATASET_DIR = os.path.join(CURR_DIR, '../../dataset/')


def archive_domain(domain, year, dir_path=DATASET_DIR,
                   percent=0, debug=False, throttle=1):
    """
    Crawl the Wayback Machine snapshots of a domain for a given year and
    save the archived forward-link pages to disk.

    @type domain: string
    @param domain: the domain of the website, ie. www.nytimes.com
    @type year: int
    @param year: the year to extract archives from
    @type dir_path: string
    @param dir_path: the directory path in which to store the archive; if
                     left at the default, a per-domain directory is created
                     automatically
                     TODO: think of a better solution for storing
                           downloaded archives
    @type percent: int
    @param percent: the percentage of Wayback snapshots to crawl
    @type debug: bool
    @param debug: print progress information while crawling
    @type throttle: int
    @param throttle: seconds to sleep after each saved download
    @rtype: tuple
    @return: (archived_links, duds) - the forward links that were archived
             and the ones that were skipped or failed
    """
    # TODO: improve this for module portability
    # WARNING: the module will likely break if used outside of the
    #          crawl-to-the-future project
    # The storage directory is found (or, if necessary, created)
    # automatically, based on the domain name.
    # Found a way to check whether this file is being run inside
    # crawl-to-the-future -- super "hacky" though.
    # TODO: find a better way to check whether the module is being run
    #       inside the crawl-to-the-future project
    if not isinstance(dir_path, basestring):
        raise Exception("Directory - third arg. - path must be a string.")
    if os.path.split(
            os.path.abspath(os.path.join(__file__, os.pardir)))[1] != "Way-Back":
        raise Exception("Please manually specify the 'dir_path' value")
    if dir_path is DATASET_DIR:
        dir_path = os.path.join(dir_path, domain + '/')
    if not os.path.exists(dir_path):
        # raise IOError("[Errno 2] No such file or directory: '" + dir_path + "'")
        # this part is shady
        os.makedirs(dir_path)
    ia_year_url = ARCHIVE_DOMAIN + "/web/" + str(year) + \
        "*/http://" + domain + "/"
    ia_parsed = html.parse(ia_year_url)
    domain_snapshots = list(set(ia_parsed.xpath('//*[starts-with(@id,"' +
                                                str(year) + '-")]//a/@href')))
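    # For illustration (hypothetical values): with domain="www.cnn.com" and
    # year=2005, ia_year_url is http://web.archive.org/web/2005*/http://www.cnn.com/
    # and each entry in domain_snapshots is a path such as
    # /web/20050403021735/http://www.cnn.com/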
    # snapshot_age_span is the number of snapshots to process from the
    # given year, derived from the requested percentage;
    # ie. if percent is 100, and there are a total of 50 snapshots for
    # www.cnn.com, we will crawl (to a depth of 1 atm) all 50 snapshots
    snapshot_age_span = 1 if percent <= 0 \
        else len(domain_snapshots) - 1 if percent >= 100 \
        else int(percent * len(domain_snapshots) / 100)
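    # Worked example (illustrative numbers): with percent=40 and 50
    # snapshots available, snapshot_age_span = int(40 * 50 / 100) = 20.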
    if debug:
        print "Extracting links from: ", domain
        # http://margerytech.blogspot.com/2011/06/python-get-last-directory-name-in-path.html
        print "Current directory: ", os.path.split(
            os.path.abspath(os.path.join(__file__, os.pardir)))[1]
        print "Storing files in: ", os.path.abspath(dir_path)
        print "Number of domain snapshots: ", len(domain_snapshots)
        print "Number of domain snapshots to process: ", snapshot_age_span
    random.shuffle(domain_snapshots)
    forward_links = []
    for snapshot in domain_snapshots[:snapshot_age_span]:
        curr_snapshot_flinks = get_forwardlink_snapshots(snapshot)
        forward_links.extend(curr_snapshot_flinks)
        if debug:
            print "snapshot url: ", snapshot
            print "forward link count: ", len(curr_snapshot_flinks)
    random.shuffle(forward_links)
    if debug:
        print "total number of forward links to download: ", len(forward_links)
    # archive the forward links
    archived_links = []
    duds = []
    for forwardlink in forward_links:
        if archive(forwardlink, year, dir_path, debug, throttle):
            archived_links.append(forwardlink)
        else:
            duds.append(forwardlink)
    if debug:
        print "Number of archived forward links: ", len(archived_links)
        print "Number of duds: ", len(duds)
    return archived_links, duds


# I know I'm breaking so many rules by not separating concerns
def archive(page, year, dir_path, debug=False, throttle=1):
    """
    Download a forward link and save it under dir_path, but only if the
    snapshot satisfies the archival year specification,
    ie. (2000, 2005, 2010).
    """
    # files = [f for f in os.listdir(dir_path) if os.path.isfile(f)]
    if debug:
        print "requesting ", page
    # derive a filesystem-safe file name from the snapshot path
    page_file = page.rsplit('/web/')[1].replace('http://', '').replace('-', '_')
    page_file = page_file.replace('/', '_').replace(':', '_').replace('&', '_')
    page_file = page_file.replace('?', '_').replace('*', '_').replace('=', '_')
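    # Illustrative example (hypothetical snapshot path): a page such as
    # /web/20050403021735/http://www.cnn.com/WORLD/ becomes the file name
    # 20050403021735_www.cnn.com_WORLD_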
    file_path = dir_path + page_file
    if os.path.isfile(file_path):
        if debug:
            print "Already saved: ", page_file
            print
        return False
    try:
        html_file = urllib2.urlopen(ARCHIVE_DOMAIN + page)
    except IOError:
        if debug:
            print "Failed to open request for ", ARCHIVE_DOMAIN + page
            print
        return False
    if html_file.getcode() == 302:
        if debug:
            print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page
            print
        return False
    html_string = str(html_file.read())
    if html_string.find("HTTP 302 response") != -1:
        if debug:
            print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page
            print
        return False
    archival_year_spec = ARCHIVE_DOMAIN + '/web/' + str(year)
    page_url = html_file.geturl()
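    # Illustrative check (hypothetical URL): a final URL such as
    # http://web.archive.org/web/20050403021735/http://www.cnn.com/WORLD/
    # starts with http://web.archive.org/web/2005, so it passes for
    # year=2005 and would be rejected for any other year.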
    if page_url.startswith(archival_year_spec):
        if debug:
            print "saving ", page_url
            print
        try:
            with open(file_path, 'wb') as f:
                f.write(BytesIO(html_string).read())
            time.sleep(throttle)
        except IOError as e:
            if debug:
                print "Got error: ", e
            return False
        return True
    else:
        return False


def get_forwardlink_snapshots(parent_site):
    """
    Extract forward-link snapshot URLs from an archived page.

    @type parent_site: string
    @param parent_site: the archived page (e.g. an index.html snapshot)
                        from which to extract forward links
    @rtype: list
    @return: the forward-link snapshot paths found in the page
    """
    try:
        parsed_parent_site = html.parse(ARCHIVE_DOMAIN + parent_site)
    except IOError:
        print "Could not extract links from ", ARCHIVE_DOMAIN + parent_site
        return []
    # cleaner = html.clean.Cleaner(scripts=True, javascript=True, style=True, kill_tags=["img"])
    cleaner = clean.Cleaner(scripts=True, javascript=True, comments=True,
                            style=True, meta=True, processing_instructions=True,
                            embedded=True, frames=True, forms=True,
                            kill_tags=["noscript", "iframe", "img"])
    parsed_parent_site = cleaner.clean_html(parsed_parent_site)
    # keep only links whose archival year matches that of the parent
    # (ie. 2000|2005|2010); parent_site[:9] is the "/web/YYYY" prefix
    all_forwardlinks = parsed_parent_site.xpath('//a[starts-with(@href,"' +
                                                parent_site[:9] + '")]/@href')
    return all_forwardlinks
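

# Minimal usage sketch (an assumption, not part of the original interface):
# run this file directly from the Way-Back directory of the
# crawl-to-the-future project; the domain and year below are hypothetical.
if __name__ == "__main__":
    saved, skipped = archive_domain("www.example.com", 2005,
                                    percent=5, debug=True)
    print "Number archived: ", len(saved)
    print "Number skipped: ", len(skipped)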