Commit

switch to requests
kmike committed Apr 6, 2017
1 parent 1499ad0 commit dfe77c2
Showing 3 changed files with 14 additions and 5 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,3 +7,4 @@ scikit-learn >= 0.14
 python-crfsuite >= 0.8.4
 sklearn-crfsuite >= 0.3.3
 tldextract
+requests
2 changes: 1 addition & 1 deletion setup.py
@@ -28,5 +28,5 @@
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
     ],
-    install_requires=['six', 'lxml', 'scikit-learn', 'tldextract'],
+    install_requires=['six', 'lxml', 'scikit-learn', 'tldextract', 'requests'],
 )
16 changes: 12 additions & 4 deletions webstruct/model.py
@@ -3,7 +3,8 @@
 :mod:`webstruct.model` contains conventional wrappers for creating NER models.
 """
 from __future__ import absolute_import
-from six.moves.urllib.request import urlopen
+
+import requests
 from lxml.html import tostring
 
 from webstruct.loaders import HtmlLoader
@@ -23,6 +24,10 @@ class NER(object):
     sequences and returns lists of predicted IOB2 tags.
     :func:`~.create_wapiti_pipeline` function returns such model.
     """
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
+    }
+
     def __init__(self, model, loader=None, html_tokenizer=None,
                  entity_colors=None):
         self.model = model
@@ -49,7 +54,7 @@ def extract_from_url(self, url):
         A convenience wrapper for :meth:`extract` method that downloads
         input data from a remote URL.
         """
-        data = urlopen(url).read()
+        data = self._download(url)
         return self.extract(data)
 
     def extract_raw(self, bytes_data):
@@ -79,7 +84,7 @@ def extract_groups_from_url(self, url, dont_penalize=None):
         A convenience wrapper for :meth:`extract_groups` method that downloads
         input data from a remote URL.
         """
-        data = urlopen(url).read()
+        data = self._download(url)
         return self.extract_groups(data, dont_penalize=dont_penalize)
 
     def build_entity(self, html_tokens):
@@ -107,9 +112,12 @@ def annotate_url(self, url, pretty_print=False):
         Return annotated HTML data in WebAnnotator format; input is downloaded
         from ``url``.
         """
-        data = urlopen(url).read()
+        data = self._download(url)
         return self.annotate(data, pretty_print=pretty_print, url=url)
 
+    def _download(self, url):
+        return requests.get(url, headers=self.HEADERS).content
+
     def __getstate__(self):
         dct = self.__dict__.copy()
         dct['entity_colors'] = dict(self.entity_colors)
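
For context, the commit replaces urlopen(url).read() with a small _download() helper that calls requests.get() with a browser-like User-Agent header and returns the raw response body as bytes via .content. The standalone sketch below shows that download path outside the NER class; the fetch_html name and the example URL are illustrative assumptions, not part of the commit.

# Minimal sketch of the download path introduced in this commit.
# Assumptions: the fetch_html helper name and example URL are illustrative only.
import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
}

def fetch_html(url):
    # Mirrors NER._download(): send a browser-like User-Agent and return the
    # raw response body as bytes, the same type urlopen(url).read() returned.
    return requests.get(url, headers=HEADERS).content

html = fetch_html('http://example.com/')
print(len(html), 'bytes downloaded')

With this in place, extract_from_url, extract_groups_from_url and annotate_url all fetch pages through the same helper, presumably so that sites which reject the default python-requests User-Agent still respond.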
