Merge pull request #476 from scholarly-python-package/develop
Release v1.7.7
arunkannawadi committed Dec 28, 2022
2 parents f5b24dd + 7d2d028 commit 8630f8d
Showing 4 changed files with 28 additions and 16 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ bibtexparser
deprecated
fake_useragent
free-proxy
+ httpx
python-dotenv
requests[socks]
selenium
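The new httpx entry is exercised directly by the code changes below. A minimal sanity check of the imports it has to satisfy (illustrative only, not part of this diff):

```python
# Both imports are introduced by this release:
# _proxy_generator.py imports the package, _navigator.py imports its timeout error.
import httpx
from httpx import TimeoutException

print(httpx.__version__)
```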
7 changes: 4 additions & 3 deletions scholarly/_navigator.py
@@ -11,6 +11,7 @@
import random
import time
from requests.exceptions import Timeout
+ from httpx import TimeoutException
from selenium.webdriver.common.by import By
from .publication_parser import _SearchScholarIterator
from .author_parser import AuthorParser
@@ -111,7 +112,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
w = random.uniform(1,2)
time.sleep(w)
resp = session.get(pagerequest, timeout=timeout)
- self.logger.debug("Session proxy config is {}".format(session.proxies))
+ self.logger.debug("Session proxy config is {}".format(pm._proxies))

has_captcha = self._requests_has_captcha(resp.text)

@@ -149,7 +150,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
self.logger.info("Will retry after %.2f seconds (with the same session).", w)
time.sleep(w)
continue
- except Timeout as e:
+ except (Timeout, TimeoutException) as e:
err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args)
self.logger.info(err)
if timeout < 3*self._TIMEOUT:
@@ -164,7 +165,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:

tries += 1
try:
- session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None))
+ session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http', None))
except Exception:
self.logger.info("No other secondary connections possible. "
"Using the primary proxy for all requests.")
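Since the session handed out by the proxy generator may now be an httpx.Client rather than a requests.Session, the retry loop in _get_page has to catch both libraries' timeout errors. A minimal sketch of that pattern; the function name and max_timeout parameter are illustrative, not the module's actual code:

```python
from requests.exceptions import Timeout
from httpx import TimeoutException

def fetch(session, url, timeout, max_timeout):
    """Fetch a URL, retrying with a longer timeout on either library's timeout error."""
    while True:
        try:
            # Works for both requests.Session and httpx.Client instances.
            return session.get(url, timeout=timeout)
        except (Timeout, TimeoutException):
            if timeout >= max_timeout:
                raise  # give up; the caller can switch to another proxy session
            timeout *= 2  # retry the same session with a longer timeout
```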
34 changes: 22 additions & 12 deletions scholarly/_proxy_generator.py
@@ -4,6 +4,7 @@
import logging
import time
import requests
+ import httpx
import tempfile
import urllib3

@@ -43,6 +44,7 @@ def __init__(self):
# If we use a proxy or Tor, we set this to True
self._proxy_works = False
self.proxy_mode = None
+ self._proxies = {}
# If we have a Tor server that we can refresh, we set this to True
self._tor_process = None
self._can_refresh_tor = False
@@ -183,8 +185,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
"""
if https is None:
https = http
+ if http[:4] != "http":
+     http = "http://" + http
+ if https[:5] != "https":
+     https = "https://" + https

- proxies = {'http': http, 'https': https}
+ proxies = {'http://': http, 'https://': https}
if self.proxy_mode == ProxyMode.SCRAPERAPI:
r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json()
if "error" in r:
@@ -198,7 +204,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
self._proxy_works = self._check_proxy(proxies)

if self._proxy_works:
- self._session.proxies = proxies
+ self._proxies = proxies
self._new_session()

return self._proxy_works
@@ -353,8 +359,8 @@ def _get_webdriver(self):
def _get_chrome_webdriver(self):
if self._proxy_works:
webdriver.DesiredCapabilities.CHROME['proxy'] = {
- "httpProxy": self._session.proxies['http'],
- "sslProxy": self._session.proxies['https'],
+ "httpProxy": self._proxies['http'],
+ "sslProxy": self._proxies['https'],
"proxyType": "MANUAL"
}

@@ -369,8 +375,8 @@ def _get_firefox_webdriver(self):
if self._proxy_works:
# Redirect webdriver through proxy
webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
- "httpProxy": self._session.proxies['http'],
- "sslProxy": self._session.proxies['https'],
+ "httpProxy": self._proxies['http'],
+ "sslProxy": self._proxies['https'],
"proxyType": "MANUAL",
}

@@ -439,11 +445,12 @@ def _handle_captcha2(self, url):
return self._session

def _new_session(self):
+ init_kwargs = {}
proxies = {}
if self._session:
- proxies = self._session.proxies
+ proxies = self._proxies
self._close_session()
- self._session = requests.Session()
+ # self._session = httpx.Client()
self.got_403 = False

# Suppress the misleading traceback from UserAgent()
@@ -453,15 +460,18 @@
'accept': 'text/html,application/xhtml+xml,application/xml',
'User-Agent': UserAgent().random,
}
- self._session.headers.update(_HEADERS)
+ # self._session.headers.update(_HEADERS)
+ init_kwargs.update(headers=_HEADERS)

if self._proxy_works:
- self._session.proxies = proxies
+ init_kwargs["proxies"] = proxies #.get("http", None)
+ self._proxies = proxies
if self.proxy_mode is ProxyMode.SCRAPERAPI:
# SSL Certificate verification must be disabled for
# ScraperAPI requests to work.
# https://www.scraperapi.com/documentation/
- self._session.verify = False
+ init_kwargs["verify"] = False
+ self._session = httpx.Client(**init_kwargs)
self._webdriver = None

return self._session
@@ -496,7 +506,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
all_proxies = freeproxy.get_proxy_list()
if proxy in self._dirty_freeproxies:
continue
- proxies = {'http': proxy, 'https': proxy}
+ proxies = {'http://': proxy, 'https://': proxy}
proxy_works = self._check_proxy(proxies)
if proxy_works:
dirty_proxy = (yield proxy)
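The rebuilt session is an httpx.Client. In the httpx releases current when this was merged, the client accepts headers, a verify flag, and a proxies mapping keyed by scheme prefix ('http://', 'https://') rather than the plain 'http'/'https' keys that requests uses, which is why the proxy dictionaries above change shape. A standalone sketch of the construction done in _new_session, with a placeholder proxy URL:

```python
import httpx

# Placeholder proxy; in scholarly this comes from the ProxyGenerator
# (FreeProxy, ScraperAPI, Tor, ...).
proxy_url = "http://127.0.0.1:8080"

init_kwargs = {
    "headers": {"accept": "text/html,application/xhtml+xml,application/xml"},
    # httpx keys its proxy mapping by scheme prefix, hence 'http://' / 'https://'.
    "proxies": {"http://": proxy_url, "https://": proxy_url},
    # ScraperAPI proxies require SSL certificate verification to be disabled.
    "verify": False,
}

session = httpx.Client(**init_kwargs)
# session.get("https://scholar.google.com", timeout=5) would now be routed
# through the proxy configured above.
session.close()
```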
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name='scholarly',
- version='1.7.6',
+ version='1.7.7',
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
description='Simple access to Google Scholar authors and citations',
Expand Down
