From b2fe4fc58829b4654c943ed74e334ce0379c6e09 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Tue, 27 Dec 2022 18:26:19 -0500
Subject: [PATCH 1/2] Use httpx instead of requests
---
 requirements.txt              |  1 +
 scholarly/_navigator.py       |  7 ++++---
 scholarly/_proxy_generator.py | 34 ++++++++++++++++++++++------------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0d1d0213..5b142000 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ bibtexparser
 deprecated
 fake_useragent
 free-proxy
+httpx
 python-dotenv
 requests[socks]
 selenium
diff --git a/scholarly/_navigator.py b/scholarly/_navigator.py
index ef7c4d21..3d856428 100644
--- a/scholarly/_navigator.py
+++ b/scholarly/_navigator.py
@@ -11,6 +11,7 @@
 import random
 import time
 from requests.exceptions import Timeout
+from httpx import TimeoutException
 from selenium.webdriver.common.by import By
 from .publication_parser import _SearchScholarIterator
 from .author_parser import AuthorParser
@@ -111,7 +112,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
                 w = random.uniform(1,2)
                 time.sleep(w)
                 resp = session.get(pagerequest, timeout=timeout)
-                self.logger.debug("Session proxy config is {}".format(session.proxies))
+                self.logger.debug("Session proxy config is {}".format(pm._proxies))
 
                 has_captcha = self._requests_has_captcha(resp.text)
 
@@ -149,7 +150,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
                     self.logger.info("Will retry after %.2f seconds (with the same session).", w)
                     time.sleep(w)
                     continue
-            except Timeout as e:
+            except (Timeout, TimeoutException) as e:
                 err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args)
                 self.logger.info(err)
                 if timeout < 3*self._TIMEOUT:
@@ -164,7 +165,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
            tries += 1
 
            try:
-                session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None))
+                session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http://', None))
            except Exception:
                self.logger.info("No other secondary connections possible. "
                                 "Using the primary proxy for all requests.")
diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 508d740e..01f7d772 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -4,6 +4,7 @@
 import logging
 import time
 import requests
+import httpx
 import tempfile
 import urllib3
 
@@ -43,6 +44,7 @@ def __init__(self):
         # If we use a proxy or Tor, we set this to True
         self._proxy_works = False
         self.proxy_mode = None
+        self._proxies = {}
         # If we have a Tor server that we can refresh, we set this to True
         self._tor_process = None
         self._can_refresh_tor = False
@@ -183,8 +185,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
         """
         if https is None:
             https = http
+        if not http.startswith("http"):
+            http = "http://" + http
+        if not https.startswith("http"):
+            https = "https://" + https
 
-        proxies = {'http': http, 'https': https}
+        proxies = {'http://': http, 'https://': https}
         if self.proxy_mode == ProxyMode.SCRAPERAPI:
             r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json()
             if "error" in r:
@@ -198,7 +204,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
 
         self._proxy_works = self._check_proxy(proxies)
         if self._proxy_works:
-            self._session.proxies = proxies
+            self._proxies = proxies
             self._new_session()
 
         return self._proxy_works
@@ -353,8 +359,8 @@ def _get_webdriver(self):
 
     def _get_chrome_webdriver(self):
         if self._proxy_works:
             webdriver.DesiredCapabilities.CHROME['proxy'] = {
-                "httpProxy": self._session.proxies['http'],
-                "sslProxy": self._session.proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL"
             }
@@ -369,8 +375,8 @@ def _get_firefox_webdriver(self):
         if self._proxy_works:
             # Redirect webdriver through proxy
             webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
-                "httpProxy": self._session.proxies['http'],
-                "sslProxy": self._session.proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL",
             }
 
@@ -439,11 +445,12 @@ def _handle_captcha2(self, url):
         return self._session
 
     def _new_session(self):
+        init_kwargs = {}
         proxies = {}
         if self._session:
-            proxies = self._session.proxies
+            proxies = self._proxies
             self._close_session()
-        self._session = requests.Session()
+        # self._session = httpx.Client()
         self.got_403 = False
 
         # Suppress the misleading traceback from UserAgent()
@@ -453,15 +460,18 @@ def _new_session(self):
             'accept': 'text/html,application/xhtml+xml,application/xml',
             'User-Agent': UserAgent().random,
         }
-        self._session.headers.update(_HEADERS)
+        # self._session.headers.update(_HEADERS)
+        init_kwargs.update(headers=_HEADERS)
 
         if self._proxy_works:
-            self._session.proxies = proxies
+            init_kwargs["proxies"] = proxies
+            self._proxies = proxies
             if self.proxy_mode is ProxyMode.SCRAPERAPI:
                 # SSL Certificate verification must be disabled for
                 # ScraperAPI requests to work.
                 # https://www.scraperapi.com/documentation/
-                self._session.verify = False
+                init_kwargs["verify"] = False
+        self._session = httpx.Client(**init_kwargs)
 
         self._webdriver = None
         return self._session
@@ -496,7 +506,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
                 all_proxies = freeproxy.get_proxy_list()
             if proxy in self._dirty_freeproxies:
                 continue
-            proxies = {'http': proxy, 'https': proxy}
+            proxies = {'http://': proxy, 'https://': proxy}
             proxy_works = self._check_proxy(proxies)
             if proxy_works:
                 dirty_proxy = (yield proxy)

From 7d2d028ce5d06ce21a91a08e360f7dc2c0f2835c Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Tue, 27 Dec 2022 19:08:29 -0500
Subject: [PATCH 2/2] Bump version to 1.7.7

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c6beed86..b1a3164c 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='scholarly',
-    version='1.7.6',
+    version='1.7.7',
     author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
     author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
     description='Simple access to Google Scholar authors and citations',