import scraperwiki
import lxml.html
import re
from functools import reduce  # a builtin on Python 2; imported for Python 3 compatibility
#url="http://www.ppbghana.org/contracts_results.asp?Ministry=%%25&Region=%%25&Agency=%%25&TNDType=%%25&ppb_date=356&Submit=Search&offset=%d"
url="http://www.ppaghana.org/contractdetail.asp?Con_ID=3110"
base="http://www.ppaghana.org/"
def get_lxml(url):
    """Fetch a URL and parse it into an lxml document tree."""
    html = scraperwiki.scrape(url)
    return lxml.html.fromstring(html)
def get_pages(url):
    """Read the total result count from the pager text and return the
    listing-page offsets (the site shows 10 results per page)."""
    root = get_lxml(url)
    pstr = root.cssselect("td[colspan=7] td.bodytext")[0].text_content()
    pages = int(re.search("of ([0-9]+)", pstr).group(1))
    return range(0, pages, 10)
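# Example (the exact pager wording on the site is an assumption): text like
# "Showing 1 to 10 of 345" gives pages = 345, so the offsets come out as
# range(0, 345, 10) -> 0, 10, 20, ..., 340, one per listing page.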
def get_contract_urls(url):
    """Collect the contract-detail links from one listing page."""
    root = get_lxml(url)
    return ["%s%s" % (base, i.get("href")) for i in root.cssselect("td[colspan=7] td a")]
def clean_name(strn):
    """Strip punctuation and line breaks from a field label and collapse runs of spaces."""
    remove = [":", "(", ")", "/", ".", "\n", "\r"]
    for r in remove:
        strn = strn.replace(r, "")
    return re.sub("[ ]+", " ", strn).strip()
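# Example: clean_name("Contract Award Price:\r\n") -> "Contract Award Price",
# which is why the key checks below use colon-free names.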
def get_contract_details(url):
    """Scrape one contract page into a dict and save it to the datastore."""
    data = {"url": url}
    root = get_lxml(url)
    data["name"] = root.cssselect("td.subhead font")[0].text_content()
    for row in root.cssselect("tr.bodymain"):
        rd = row.cssselect("td")
        key = clean_name(rd[0].text_content().strip())
        data[key] = rd[1].text_content().strip().replace("\n", " ").replace("\r", "")
    if "Contract Award Price" in data:
        # Keep only digits and the decimal point, e.g. "GHC 1,234.50" -> 1234.5.
        data["Contract Award Price"] = float(re.sub("[^0-9.]", "", data["Contract Award Price"]))
    # clean_name() strips colons, so the key is "Currency", not "Currency:" --
    # with the colon the save below would never have run.
    if "Currency" in data:
        scraperwiki.sqlite.save(unique_keys=["url"], data=data)
# Full crawl across every listing page (disabled while testing against a
# single contract page):
#contract_urls=set(reduce(lambda x,y: x+y,[get_contract_urls(url%p) for p in get_pages(url%0)]))
contract_urls = set(reduce(lambda x, y: x + y, [get_contract_urls(url)]))
#for cu in contract_urls:
#    try:
#        get_contract_details(cu)
#    except:
#        pass
get_contract_details(url)
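# --- Sketch: the full crawl restored --------------------------------------
# A minimal sketch, assuming the listing URL template commented out at the
# top of the file is reinstated (with %d as the page offset). `crawl_all` is
# a hypothetical helper, not part of the original scraper; it logs failures
# per contract instead of silently swallowing them.
def crawl_all(listing_url):
    contract_urls = set()
    for offset in get_pages(listing_url % 0):
        contract_urls.update(get_contract_urls(listing_url % offset))
    for cu in contract_urls:
        try:
            get_contract_details(cu)
        except Exception as e:
            print("failed to scrape %s: %s" % (cu, e))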