Merge pull request #136 from stefanct/master
Various improvements
ipeirotis committed Jun 1, 2020
2 parents a31004f + 68b295b commit 5f794ac
Showing 3 changed files with 75 additions and 33 deletions.
77 changes: 49 additions & 28 deletions scholarly/_navigator.py
@@ -3,6 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals

+from typing import Callable
 from bs4 import BeautifulSoup

 import codecs
@@ -20,8 +21,6 @@
 from .author import Author
 from .publication import Publication

-_GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
-_COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}
 _HEADERS = {
     'accept-language': 'en-US,en',
     'accept': 'text/html,application/xhtml+xml,application/xml'
@@ -49,6 +48,7 @@ def __init__(self):
         super(Navigator, self).__init__()
         logging.basicConfig(filename='scholar.log', level=logging.INFO)
         self.logger = logging.getLogger('scholarly')
+        self._proxy_gen = None
         # If we use a proxy or Tor, we set this to True
         self._proxy_works = False
         # If we have a Tor server that we can refresh, we set this to True
@@ -59,7 +59,7 @@ def __init__(self):
         # Setting requests timeout to be reasonably long
         # to accomodate slowness of the Tor network
         self._TIMEOUT = 10
-        self._MAX_RETRIES = 5
+        self._max_retries = 5

     def __del__(self):
         if self._tor_process:
@@ -79,7 +79,7 @@ def _get_page(self, pagerequest: str) -> str:
         time.sleep(random.uniform(1,5))
         resp = None
         tries = 0
-        while tries < self._MAX_RETRIES:
+        while tries < self._max_retries:
             # If proxy/Tor was setup, use it.
             # Otherwise the local IP is used
             session = requests.Session()
@@ -88,36 +88,44 @@ def _get_page(self, pagerequest: str) -> str:

             try:
                 _HEADERS['User-Agent'] = UserAgent().random
+                _GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
+                _COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}

                 resp = session.get(pagerequest,
                                    headers=_HEADERS,
                                    cookies=_COOKIES,
                                    timeout=self._TIMEOUT)

                 if resp.status_code == 200:
-                    if self._has_captcha(resp.text):
-                        raise Exception("Got a CAPTCHA. Retrying.")
-                    else:
-                        session.close()
+                    if not self._has_captcha(resp.text):
                         return resp.text
+                    self.logger.info("Got a CAPTCHA. Retrying.")
                 else:
-                    self.logger.info(f"""Response code {resp.status_code}.
-                                     Retrying...""")
+                    raise Exception(f"Status code {resp.status_code}")

             except Exception as e:
                 err = f"Exception {e} while fetching page. Retrying."
                 self.logger.info(err)
-                # Check if Tor is running and refresh it
-                self.logger.info("Refreshing Tor ID...")
-                if self._can_refresh_tor:
-                    self._refresh_tor_id(self._tor_control_port, self._tor_password)
-                    time.sleep(5) # wait for the refresh to happen
-                else:
-                    # we only increase the tries when we cannot refresh id
-                    # to avod an infinite loop
-                    tries += 1
+            finally:
+                session.close()
+
+            # Check if Tor is running and refresh it
+            if self._can_refresh_tor:
+                self.logger.info("Refreshing Tor ID...")
+                self._refresh_tor_id(self._tor_control_port, self._tor_password)
+                time.sleep(5) # wait for the refresh to happen
+            elif self._proxy_gen:
+                tries += 1
+                self.logger.info(f"Try #{tries} failed. Switching proxy.")
+                # Try to get another proxy
+                new_proxy = self._proxy_gen()
+                while (not self._use_proxy(new_proxy)):
+                    new_proxy = self._proxy_gen()
+            else:
+                # we only increase the tries when we cannot refresh id
+                # to avoid an infinite loop
+                tries += 1

         raise Exception("Cannot fetch the page from Google Scholar.")

     def _check_proxy(self, proxies) -> bool:
@@ -132,11 +140,13 @@ def _check_proxy(self, proxies) -> bool:
         try:
             # Changed to twitter so we dont ping google twice every time
             resp = session.get("http://www.twitter.com", timeout=self._TIMEOUT)
-            self.logger.info("Proxy Works!")
-            return resp.status_code == 200
+            if resp.status_code == 200:
+                self.logger.info("Proxy works!")
+                return True
         except Exception as e:
-            self.logger.info(f"Proxy not working: Exception {e}")
-            return False
+            self.logger.info(f"Exception while testing proxy: {e}")
+
+        return False

     def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
         """Refreshes the id by using a new ToR node.
@@ -157,26 +167,37 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
             self.logger.info(err)
             return False

-    def _use_proxy(self, http: str, https: str) -> bool:
+    def _set_retries(self, num_retries: int) -> None:
+        if (num_retries < 0):
+            raise ValueError("num_retries must not be negative")
+        self._max_retries = num_retries
+
+    def _set_proxy_generator(self, gen: Callable[..., str]) -> bool:
+        self._proxy_gen = gen
+        return True
+
+    def _use_proxy(self, http: str, https: str = None) -> bool:
         """Allows user to set their own proxy for the connection session.
-        Sets the proxy, and checks if it woks,
+        Sets the proxy, and checks if it works.
         :param http: the http proxy
         :type http: str
-        :param https: the https proxy
+        :param https: the https proxy (default to the same as http)
         :type https: str
         :returns: if the proxy works
         :rtype: {bool}
         """
+        self.logger.info("Enabling proxies: http=%r https=%r", http, https)
+
+        if https is None:
+            https = http
+
         proxies = {'http': http, 'https': https}
         self._proxy_works = self._check_proxy(proxies)
         if self._proxy_works:
-            self.logger.info(f"Enabling proxies: http={http} https={https}")
             self.proxies = proxies
         else:
+            self.proxies = {'http': None, 'https': None}
             self.logger.info(f"Proxy {http} does not seem to work.")
         return self._proxy_works

     def _setup_tor(self, tor_sock_port: int, tor_control_port: int, tor_password: str):
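
The reworked `_get_page` loop is the heart of this change: the session is now always closed in a `finally`, a CAPTCHA or non-200 status funnels into the same retry path, and each failed try either refreshes Tor, pulls a fresh proxy from `_proxy_gen`, or burns one of the `_max_retries`. A minimal standalone sketch of that rotate-on-failure pattern (the `fetch_with_rotation` name and `proxy_pool` argument are illustrative, not part of scholarly):

```python
import itertools
import requests

def fetch_with_rotation(url, proxy_pool, max_retries=5, timeout=10):
    """Rotate-on-failure sketch: try the request, switch to the next
    proxy whenever it fails, give up after max_retries attempts."""
    proxies = itertools.cycle(proxy_pool)  # e.g. ["http://p1:8080", ...]
    for attempt in range(max_retries):
        proxy = next(proxies)
        session = requests.Session()
        session.proxies = {'http': proxy, 'https': proxy}
        try:
            resp = session.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass  # swallow and rotate, like _get_page's except path
        finally:
            session.close()  # always release the session, as the new finally: does
    raise Exception("All retries failed")
```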
18 changes: 18 additions & 0 deletions scholarly/_scholarly.py
@@ -1,5 +1,6 @@
 """scholarly.py"""
 import requests
+from typing import Callable
 from ._navigator import Navigator

 _AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
@@ -13,6 +14,15 @@ class _Scholarly:
     def __init__(self):
         self.__nav = Navigator()

+    def set_retries(self, num_retries: int):
+        """Sets the number of retries in case of errors
+        :param num_retries: the number of retries
+        :type num_retries: int
+        """
+
+        return self.__nav._set_retries(num_retries)
+
     def use_proxy(self, http: str, https: str):
         """Setups a proxy without refreshing capabilities.
@@ -24,6 +34,14 @@ def use_proxy(self, http: str, https: str):

         return self.__nav._use_proxy(http, https)

+    def set_proxy_generator(self, gen: Callable[..., str]):
+        """Setups a function that generates new proxies on demand.
+        :param gen: the function to call to obtain a new proxy
+        """
+
+        return self.__nav._set_proxy_generator(gen)
+
     def use_tor(self, tor_sock_port: int, tor_control_port: int, tor_pw: str):
         """[summary]
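
Together with the navigator changes, the two new public methods would be driven roughly like this (a usage sketch assuming the package's usual `from scholarly import scholarly` singleton; the proxy URLs are placeholders):

```python
import random
from scholarly import scholarly

# Placeholder proxies; any callable that returns a proxy URL works,
# e.g. one backed by a rotating-proxy service.
proxy_list = ['http://12.34.56.78:8080', 'http://98.76.54.32:3128']

scholarly.set_retries(3)  # give up after 3 failed tries
scholarly.set_proxy_generator(lambda: random.choice(proxy_list))

# _get_page will now switch proxies automatically when a request fails.
author = next(scholarly.search_author('A Einstein'))
```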
13 changes: 8 additions & 5 deletions scholarly/publication.py
@@ -88,8 +88,7 @@ def _citation_pub(self, __data):

     def _get_authorlist(self, authorinfo):
         authorlist = list()
-        text = authorinfo.text.replace(u'\xa0', u' ')
-        text = text.split(' - ')[0]
+        text = authorinfo.split(' - ')[0]
         for i in text.split(','):
             i = i.strip()
             if bool(re.search(r'\d', i)):
@@ -121,12 +120,16 @@ def _scholar_pub(self, __data):
         if title.find('a'):
             self.bib['url'] = title.find('a')['href']

-        authorinfo = databox.find('div', class_='gs_a')
+        authorinfo = databox.find('div', class_='gs_a').text
+        authorinfo = authorinfo.replace(u'\xa0', u' ')  # NBSP
+        authorinfo = authorinfo.replace(u'&amp;', u'&')  # Ampersand
         self.bib["author"] = self._get_authorlist(authorinfo)

         try:
-            self.bib['venue'], self.bib['year'] = authorinfo.text.split(
-                ' - ')[1].split(',')
+            venueyear = authorinfo.split(' - ')[1].split(',')
+            self.bib['venue'] = ''.join(venueyear[0:-1])
+            self.bib['year'] = venueyear[-1]
+            self.bib['year'] = self.bib['year'].strip()
         except Exception:
             self.bib['venue'], self.bib['year'] = 'NA', 'NA'

