#!/usr/bin/env python
"""
Crawl Amazon product data.

The script searches the given categories with specific keywords and saves each
item's URL (and, optionally, a JSON record of the item).
"""
__author__ = "Shoufu Luo"
__copyright__ = "Copyright 2015, All rights reserved."

import urllib2
import datetime
import json
import string
import re, os, sys
import cookielib
import hashlib

from bs4 import BeautifulSoup

host = 'http://www.amazon.com'
MaxNumReviews = 1500

##### It seems a cookie is required in order to get product details.
#FakeCookie='skin=noskin; x-wl-uid=1uHgA6QYtOf9VAskEst2YwASvctHz7iD/DW4sDnNew/GMyywt9FUDbwsRnzE39zseg1uFAnaoIOI=; session-token=ULgTXs9bV43xIhlut2kEZI/Le5ZL2aINFopjKFZtgrdqGxxcX/1GSZkYmvCc0+uktkYcJD657Tk9Dsi11JxPnPodmIOYJjBuc4tAAts0ZpR6lbzomtDBPlLh5LnmGAschVmi/T0BOD1Nr2+6qf/WyvMupgeEHH+ya5b4z+aYSY+5jD3LapfzmrqE3jF3ogvn1E+bbPmdR5rrwJkRej25mYbSkOrYqBNHyZNf9TCOrnCNEgOOU/g/JIjb10OaOkAB; __gads=ID=25a2624337c614a3:T=1428607844:S=ALNI_MZfckri_ZZ1-ydzF7K7bsrJYRUofA; __ar_v4=7CUFP6UIQZARTK57SDZKRU%3A20150409%3A3%7CVKZDA7NCVJAINNGOEQVAY3%3A20150409%3A3%7CIQS3HPYPHFHRHEYBEZAXCQ%3A20150409%3A3; ubid-main=188-5471422-6981114; session-id-time=2082787201l; session-id=184-9289097-6318545; csm-hit=1QC1CAANPA2T95A9NVZ4+s-1QC1CAANPA2T95A9NVZ4|1429326231448'
#FakeCookie='skin=noskin; x-wl-uid=1uHgA6QYtOf9VAskEst2YwASvctHz7iD/DW4sDnNew/GMyywt9FUDbwsRnzE39zseg1uFAnaoIOI=; session-token=ULgTXs9bV43xIhlut2kEZI/Le5ZL2aINFopjKFZtgrdqGxxcX/1GSZkYmvCc0+uktkYcJD657Tk9Dsi11JxPnPodmIOYJjBuc4tAAts0ZpR6lbzomtDBPlLh5LnmGAschVmi/T0BOD1Nr2+6qf/WyvMupgeEHH+ya5b4z+aYSY+5jD3LapfzmrqE3jF3ogvn1E+bbPmdR5rrwJkRej25mYbSkOrYqBNHyZNf9TCOrnCNEgOOU/g/JIjb10OaOkAB; __gads=ID=25a2624337c614a3:T=1428607844:S=ALNI_MZfckri_ZZ1-ydzF7K7bsrJYRUofA; __ar_v4=7CUFP6UIQZARTK57SDZKRU%3A20150409%3A3%7CVKZDA7NCVJAINNGOEQVAY3%3A20150409%3A3%7CIQS3HPYPHFHRHEYBEZAXCQ%3A20150409%3A3; skin=noskin; ubid-main=188-5471422-6981114; session-id-time=2082787201l; session-id=184-9289097-6318545; csm-hit=0K7BJKCF2ECQJKR3VDP3+s-0K7BJKCF2ECQJKR3VDP3|1432388134040'
# we want to mimic a web browser
#UserAgent= 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
#UserAgent= 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko)'

def getPage(url):
    '''
    Fetch a page, retrying up to three times. A customized HTTP header
    (cookie and user-agent) can be enabled below.
    '''
    tries = 0
    # give up after three failed attempts
    while tries < 3:
        try:
            req = urllib2.Request(url)  # pull the page
            #req.add_header('Cookie', FakeCookie)
            #req.add_header('User-Agent', UserAgent)
            response = urllib2.urlopen(req)
            page = response.read()
            return page
        except Exception:
            tries += 1
            continue
    print "Error: failed to get", url
    return None
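
# A minimal alternative sketch, not called anywhere in this script: instead of
# the hard-coded FakeCookie string above, the imported cookielib module can keep
# Amazon's session cookies across requests automatically. getPageWithCookieJar
# is a hypothetical helper added for illustration; the user-agent string is the
# one from the commented-out UserAgent constant above.
def getPageWithCookieJar(url):
    # build an opener whose cookie jar is filled and replayed automatically
    cookiejar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko)')]
    return opener.open(url).read()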

def amazoncrawl(startUrl, keywords, path):
    """
    Crawl the item listing pages returned by an Amazon search.

    The function parses each listing page to extract every item's URL and
    (optionally) calls fetchItem() on it. It stops once enough items have been
    collected or there are no more pages to crawl.

    keywords: a dictionary mapping a keyword to the number of items required,
              e.g. {'free physics books': 200}
    """
    # Magic strings for crawling on amazon.com
    itemlocator_class = 'a-link-normal s-access-detail-page a-text-normal'
    nextlocator_class = 'pagnNext'

    # iterate over all keywords
    for key in keywords:
        # construct the URL of the first listing page
        newkey = string.replace(key.strip(), ' ', '+')
        pageurl = host + startUrl + '&field-keywords=' + newkey
        required_items = keywords[key]
        collected = 0  # how many items we have collected so far

        urlfilename = string.replace(path.strip(), ' ', '-') + '_' + newkey + '.txt'
        urlfp = open(urlfilename, 'w')

        while collected < required_items and pageurl:
            print(pageurl)
            page = getPage(pageurl)
            if page is None:
                break
            soup = BeautifulSoup(page)  # put the listing page into soup

            # iterate over the items in this listing page
            for itemInPage in soup.find_all('a', class_=itemlocator_class):
                itemurl = itemInPage.get('href')
                print(itemurl)
                if itemurl is None:
                    print "No item URL"
                    continue

                ###############################
                ##### Save the URL only
                urlfp.write(itemurl + "\n")
                collected += 1
                continue

                ###############################
                ##### Capture the item data as soon as we have the URL
                ##### (unreachable while the 'continue' above is in place;
                #####  fetchItem() is expected to be defined elsewhere)
                record = {'itemurl': itemurl, 'keyword': key}
                valid = fetchItem(itemurl, record)  # pull the item
                if valid:
                    collected += 1
                    h = hashlib.md5()
                    h.update(itemurl)
                    print h.hexdigest()
                    with open(path + '/' + h.hexdigest() + '.json', 'w') as f:
                        json.dump(record, f)

            # not enough yet; we are greedy, so go to the next listing page
            nextPage = soup.find('a', class_=nextlocator_class)
            if nextPage is not None:
                pageurl = host + nextPage.attrs['href']
            else:
                pageurl = None

        urlfp.close()

def main(config):
    with open(config) as f:
        try:
            categories = json.load(f)
        except ValueError:
            print "Error: invalid JSON file"
            return

    for category in categories:
        print "Crawling [", category, "] ..."
        info = categories[category]

        # We could use a search to find the product-list URL for each category.
        # For now, skip categories that don't have one.
        if info['url'] == '':
            print "TODO: fetching url"
            continue

        if not os.path.exists(category):
            os.makedirs(category)
        #else:
        #    h = hashlib.md5()
        #    h.update(info['url'])
        #    # if it already exists, just skip it
        #    if os.path.exists(category + '/' + h.hexdigest() + '.json'):
        #        continue

        amazoncrawl(info['url'], info['keywords'], category)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Usage: python crawlurl.py <config.json>  (e.g. books.json)"
        sys.exit(1)
    if not os.path.exists(sys.argv[1]):
        print "Error: file [" + sys.argv[1] + "] does not exist"
        sys.exit(1)
    main(sys.argv[1])
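
# A hypothetical example of the config file this script expects, inferred from
# main() and amazoncrawl(): each top-level key is a category (also used as the
# output directory), 'url' is the Amazon search path appended to the host, and
# 'keywords' maps a search phrase to the number of items to collect for it.
# The path below is illustrative only, not a real Amazon search URL.
#
#   {
#       "books": {
#           "url": "/s/?search-alias=stripbooks",
#           "keywords": {
#               "free physics books": 200
#           }
#       }
#   }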