-
Notifications
You must be signed in to change notification settings - Fork 69
/
utils.py
85 lines (64 loc) · 2.14 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# utils.py
# David Jones, ScraperWiki Limited
# Thomas Levine, ScraperWiki Limited
'''
Local version of ScraperWiki Utils, documentation here:
https://scraperwiki.com/docs/python/python_help_documentation/
'''
from __future__ import absolute_import
import os
import sys
import warnings
import tempfile
import six.moves.urllib.parse
import six.moves.urllib.request
import requests
def scrape(url, params=None, user_agent=None):
    '''
    Scrape a URL, optionally with form parameters.

    A thin wrapper around urllib's urlopen (via six for Python 2/3
    compatibility).

    url -- the URL to fetch.
    params -- optional dict of form parameters; when given they are
        urlencoded and sent as the request body (making this a POST).
    user_agent -- optional User-Agent header value.

    Returns the raw response body as bytes.
    '''
    headers = {}
    if user_agent:
        headers['User-Agent'] = user_agent
    if params:
        # urlencode returns str; on Python 3 urlopen requires the request
        # body to be bytes, so encode explicitly (a no-op concern on
        # Python 2, where str is already bytes).
        data = six.moves.urllib.parse.urlencode(params).encode('utf-8')
    else:
        data = None
    req = six.moves.urllib.request.Request(url, data=data, headers=headers)
    f = six.moves.urllib.request.urlopen(req)
    try:
        return f.read()
    finally:
        # Release the connection even if read() raises.
        f.close()
def pdftoxml(pdfdata, options=""):
    """Convert PDF data to XML text using the external pdftohtml tool.

    pdfdata -- the raw contents of a PDF file, as bytes.
    options -- extra command-line options passed through to pdftohtml.

    Returns the XML output decoded from UTF-8 as a unicode string.

    Requires the `pdftohtml` executable to be on PATH.
    """
    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
    pdffout.write(pdfdata)
    pdffout.flush()
    # Open the output file in binary mode: the contents are decoded
    # explicitly as UTF-8 below.  (The previous text mode 'r' made read()
    # return str on Python 3, and str has no .decode() -- a crash.)
    xmlin = tempfile.NamedTemporaryFile(mode='rb', suffix='.xml')
    tmpxml = xmlin.name  # "temph.xml"
    # pdftohtml appends ".xml" itself, so strip our suffix from the output
    # path to make it write into xmlin's file.
    cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes %s "%s" "%s"' % (
        options, pdffout.name, os.path.splitext(tmpxml)[0])
    # can't turn off output, so throw away even stderr yeuch
    # NOTE(review): `options` is interpolated unquoted into a shell command
    # line -- do not pass untrusted input here; subprocess.run with an
    # argument list would be safer.
    cmd = cmd + " >/dev/null 2>&1"
    os.system(cmd)
    pdffout.close()
    xmldata = xmlin.read()
    xmlin.close()
    return xmldata.decode('utf-8')
def _in_box():
    '''Return True when running inside a ScraperWiki platform box.

    Detection heuristic: platform boxes set HOME to exactly '/home'.
    '''
    return os.environ.get('HOME', None) == '/home'


def status(type, message=None):
    '''Send a status update ('ok' or 'error') to the platform.

    type -- either 'ok' or 'error'.
    message -- optional human-readable detail sent with the status.

    Outside a platform box this is a no-op and returns "Not in box".
    With SW_STATUS_URL set to "OFF" (development mode) it returns None.
    Otherwise returns the HTTP response body; raises on a failed POST.
    '''
    # Validate explicitly rather than with `assert`, which is stripped
    # when Python runs with -O and would then silently accept bad input.
    if type not in ('ok', 'error'):
        raise ValueError("type must be 'ok' or 'error', got %r" % (type,))
    # if not running in a ScraperWiki platform box, silently do nothing
    if not _in_box():
        return "Not in box"
    url = os.environ.get("SW_STATUS_URL", "https://app.quickcode.io/api/status")
    if url == "OFF":
        # For development mode
        return
    # send status update to the box
    r = requests.post(url, data={'type': type, 'message': message})
    r.raise_for_status()
    return r.content
def swimport(scrapername):
    """Import the scraper named *scrapername* and return its module object."""
    scraper_module = __import__(scrapername)
    return scraper_module