Merge pull request #136 from stefanct/master
Various improvements
ipeirotis committed Jun 1, 2020
2 parents a31004f + 68b295b commit 5f794ac
Showing 3 changed files with 75 additions and 33 deletions.
77 changes: 49 additions & 28 deletions scholarly/_navigator.py
@@ -3,6 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals

+from typing import Callable
 from bs4 import BeautifulSoup

 import codecs
@@ -20,8 +21,6 @@
 from .author import Author
 from .publication import Publication

-_GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
-_COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}
 _HEADERS = {
     'accept-language': 'en-US,en',
     'accept': 'text/html,application/xhtml+xml,application/xml'
@@ -49,6 +48,7 @@ def __init__(self):
         super(Navigator, self).__init__()
         logging.basicConfig(filename='scholar.log', level=logging.INFO)
         self.logger = logging.getLogger('scholarly')
+        self._proxy_gen = None
         # If we use a proxy or Tor, we set this to True
         self._proxy_works = False
         # If we have a Tor server that we can refresh, we set this to True
@@ -59,7 +59,7 @@ def __init__(self):
         # Setting requests timeout to be reasonably long
         # to accomodate slowness of the Tor network
         self._TIMEOUT = 10
-        self._MAX_RETRIES = 5
+        self._max_retries = 5

     def __del__(self):
         if self._tor_process:
@@ -79,7 +79,7 @@ def _get_page(self, pagerequest: str) -> str:
         time.sleep(random.uniform(1,5))
         resp = None
         tries = 0
-        while tries < self._MAX_RETRIES:
+        while tries < self._max_retries:
             # If proxy/Tor was setup, use it.
             # Otherwise the local IP is used
             session = requests.Session()
@@ -88,36 +88,44 @@ def _get_page(self, pagerequest: str) -> str:

             try:
                 _HEADERS['User-Agent'] = UserAgent().random
+                _GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
+                _COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}

                 resp = session.get(pagerequest,
                                    headers=_HEADERS,
                                    cookies=_COOKIES,
                                    timeout=self._TIMEOUT)

                 if resp.status_code == 200:
-                    if self._has_captcha(resp.text):
-                        raise Exception("Got a CAPTCHA. Retrying.")
-                    else:
-                        session.close()
+                    if not self._has_captcha(resp.text):
                         return resp.text
+                    self.logger.info("Got a CAPTCHA. Retrying.")
                 else:
-                    self.logger.info(f"""Response code {resp.status_code}.
-                                     Retrying...""")
+                    raise Exception(f"Status code {resp.status_code}")

             except Exception as e:
                 err = f"Exception {e} while fetching page. Retrying."
                 self.logger.info(err)
-                # Check if Tor is running and refresh it
-                self.logger.info("Refreshing Tor ID...")
-                if self._can_refresh_tor:
-                    self._refresh_tor_id(self._tor_control_port, self._tor_password)
-                    time.sleep(5) # wait for the refresh to happen
-                else:
-                    # we only increase the tries when we cannot refresh id
-                    # to avod an infinite loop
-                    tries += 1
+            finally:
+                session.close()
+
+            # Check if Tor is running and refresh it
+            if self._can_refresh_tor:
+                self.logger.info("Refreshing Tor ID...")
+                self._refresh_tor_id(self._tor_control_port, self._tor_password)
+                time.sleep(5) # wait for the refresh to happen
+            elif self._proxy_gen:
+                tries += 1
+                self.logger.info(f"Try #{tries} failed. Switching proxy.")
+                # Try to get another proxy
+                new_proxy = self._proxy_gen()
+                while (not self._use_proxy(new_proxy)):
+                    new_proxy = self._proxy_gen()
+            else:
+                # we only increase the tries when we cannot refresh id
+                # to avoid an infinite loop
+                tries += 1

         raise Exception("Cannot fetch the page from Google Scholar.")

     def _check_proxy(self, proxies) -> bool:
@@ -132,11 +140,13 @@ def _check_proxy(self, proxies) -> bool:
         try:
             # Changed to twitter so we dont ping google twice every time
             resp = session.get("http://www.twitter.com", timeout=self._TIMEOUT)
-            self.logger.info("Proxy Works!")
-            return resp.status_code == 200
+            if resp.status_code == 200:
+                self.logger.info("Proxy works!")
+                return True
         except Exception as e:
-            self.logger.info(f"Proxy not working: Exception {e}")
-            return False
+            self.logger.info(f"Exception while testing proxy: {e}")
+
+        return False

     def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
         """Refreshes the id by using a new ToR node.
@@ -157,26 +167,37 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
             self.logger.info(err)
             return False

-    def _use_proxy(self, http: str, https: str) -> bool:
+    def _set_retries(self, num_retries: int) -> None:
+        if (num_retries < 0):
+            raise ValueError("num_retries must not be negative")
+        self._max_retries = num_retries
+
+    def _set_proxy_generator(self, gen: Callable[..., str]) -> bool:
+        self._proxy_gen = gen
+        return True
+
+    def _use_proxy(self, http: str, https: str = None) -> bool:
         """Allows user to set their own proxy for the connection session.
-        Sets the proxy, and checks if it woks,
+        Sets the proxy, and checks if it works.
         :param http: the http proxy
         :type http: str
-        :param https: the https proxy
+        :param https: the https proxy (default to the same as http)
         :type https: str
         :returns: if the proxy works
         :rtype: {bool}
         """
+        self.logger.info("Enabling proxies: http=%r https=%r", http, https)
+
+        if https is None:
+            https = http
+
         proxies = {'http': http, 'https': https}
         self._proxy_works = self._check_proxy(proxies)
         if self._proxy_works:
-            self.logger.info(f"Enabling proxies: http={http} https={https}")
             self.proxies = proxies
         else:
+            self.proxies = {'http': None, 'https': None}
             self.logger.info(f"Proxy {http} does not seem to work.")
         return self._proxy_works

     def _setup_tor(self, tor_sock_port: int, tor_control_port: int, tor_password: str):
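
The reworked `_get_page` loop is the heart of this change: the session is now always closed in a `finally`, a CAPTCHA or non-200 status funnels into the same retry path, and each failed try either refreshes Tor, pulls a fresh proxy from `_proxy_gen`, or burns one of the `_max_retries`. A minimal standalone sketch of that rotate-on-failure pattern (the `fetch_with_rotation` name and `proxy_pool` argument are illustrative, not part of scholarly):

```python
import itertools
import requests

def fetch_with_rotation(url, proxy_pool, max_retries=5, timeout=10):
    """Rotate-on-failure sketch: try the request, switch to the next
    proxy whenever it fails, give up after max_retries attempts."""
    proxies = itertools.cycle(proxy_pool)  # e.g. ["http://p1:8080", ...]
    for attempt in range(max_retries):
        proxy = next(proxies)
        session = requests.Session()
        session.proxies = {'http': proxy, 'https': proxy}
        try:
            resp = session.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass  # swallow and rotate, like _get_page's except path
        finally:
            session.close()  # always release the session, as the new finally: does
    raise Exception("All retries failed")
```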
18 changes: 18 additions & 0 deletions scholarly/_scholarly.py
@@ -1,5 +1,6 @@
 """scholarly.py"""
 import requests
+from typing import Callable
 from ._navigator import Navigator

 _AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
@@ -13,6 +14,15 @@ class _Scholarly:
     def __init__(self):
         self.__nav = Navigator()

+    def set_retries(self, num_retries: int):
+        """Sets the number of retries in case of errors
+        :param num_retries: the number of retries
+        :type num_retries: int
+        """
+
+        return self.__nav._set_retries(num_retries)
+
     def use_proxy(self, http: str, https: str):
         """Setups a proxy without refreshing capabilities.
@@ -24,6 +34,14 @@ def use_proxy(self, http: str, https: str):

         return self.__nav._use_proxy(http, https)

+    def set_proxy_generator(self, gen: Callable[..., str]):
+        """Setups a function that generates new proxies on demand.
+        :param gen: the function to call to obtain a new proxy
+        """
+
+        return self.__nav._set_proxy_generator(gen)
+
     def use_tor(self, tor_sock_port: int, tor_control_port: int, tor_pw: str):
         """[summary]
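
Together with the navigator changes, the two new public methods would be driven roughly like this (a usage sketch assuming the package's usual `from scholarly import scholarly` singleton; the proxy URLs are placeholders):

```python
import random
from scholarly import scholarly

# Placeholder proxies; any callable that returns a proxy URL works,
# e.g. one backed by a rotating-proxy service.
proxy_list = ['http://12.34.56.78:8080', 'http://98.76.54.32:3128']

scholarly.set_retries(3)  # give up after 3 failed tries
scholarly.set_proxy_generator(lambda: random.choice(proxy_list))

# _get_page will now switch proxies automatically when a request fails.
author = next(scholarly.search_author('A Einstein'))
```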
13 changes: 8 additions & 5 deletions scholarly/publication.py
@@ -88,8 +88,7 @@ def _citation_pub(self, __data):

     def _get_authorlist(self, authorinfo):
         authorlist = list()
-        text = authorinfo.text.replace(u'\xa0', u' ')
-        text = text.split(' - ')[0]
+        text = authorinfo.split(' - ')[0]
         for i in text.split(','):
             i = i.strip()
             if bool(re.search(r'\d', i)):
@@ -121,12 +120,16 @@ def _scholar_pub(self, __data):
         if title.find('a'):
             self.bib['url'] = title.find('a')['href']

-        authorinfo = databox.find('div', class_='gs_a')
+        authorinfo = databox.find('div', class_='gs_a').text
+        authorinfo = authorinfo.replace(u'\xa0', u' ')  # NBSP
+        authorinfo = authorinfo.replace(u'&amp;', u'&')  # Ampersand
         self.bib["author"] = self._get_authorlist(authorinfo)

         try:
-            self.bib['venue'], self.bib['year'] = authorinfo.text.split(
-                ' - ')[1].split(',')
+            venueyear = authorinfo.split(' - ')[1].split(',')
+            self.bib['venue'] = ''.join(venueyear[0:-1])
+            self.bib['year'] = venueyear[-1]
+            self.bib['year'] = self.bib['year'].strip()
         except Exception:
             self.bib['venue'], self.bib['year'] = 'NA', 'NA'

