From b2fe4fc58829b4654c943ed74e334ce0379c6e09 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Tue, 27 Dec 2022 18:26:19 -0500
Subject: [PATCH 1/2] Use httpx instead of requests
---
 requirements.txt              |  1 +
 scholarly/_navigator.py       |  7 ++++---
 scholarly/_proxy_generator.py | 34 ++++++++++++++++++++++------------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0d1d0213..5b142000 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ bibtexparser
 deprecated
 fake_useragent
 free-proxy
+httpx
 python-dotenv
 requests[socks]
 selenium
diff --git a/scholarly/_navigator.py b/scholarly/_navigator.py
index ef7c4d21..3d856428 100644
--- a/scholarly/_navigator.py
+++ b/scholarly/_navigator.py
@@ -11,6 +11,7 @@
 import random
 import time
 from requests.exceptions import Timeout
+from httpx import TimeoutException
 from selenium.webdriver.common.by import By
 from .publication_parser import _SearchScholarIterator
 from .author_parser import AuthorParser
@@ -111,7 +112,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
                 w = random.uniform(1,2)
                 time.sleep(w)
                 resp = session.get(pagerequest, timeout=timeout)
-                self.logger.debug("Session proxy config is {}".format(session.proxies))
+                self.logger.debug("Session proxy config is {}".format(pm._proxies))
 
                 has_captcha = self._requests_has_captcha(resp.text)
 
@@ -149,7 +150,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
                     self.logger.info("Will retry after %.2f seconds (with the same session).", w)
                     time.sleep(w)
                     continue
-            except Timeout as e:
+            except (Timeout, TimeoutException) as e:
                 err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args)
                 self.logger.info(err)
                 if timeout < 3*self._TIMEOUT:
@@ -164,7 +165,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
            tries += 1
 
            try:
-                session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None))
+                session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http://', None))
            except Exception:
                self.logger.info("No other secondary connections possible. "
                                 "Using the primary proxy for all requests.")
diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 508d740e..01f7d772 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -4,6 +4,7 @@
 import logging
 import time
 import requests
+import httpx
 import tempfile
 import urllib3
 
@@ -43,6 +44,7 @@ def __init__(self):
         # If we use a proxy or Tor, we set this to True
         self._proxy_works = False
         self.proxy_mode = None
+        self._proxies = {}
         # If we have a Tor server that we can refresh, we set this to True
         self._tor_process = None
         self._can_refresh_tor = False
@@ -183,8 +185,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
         """
         if https is None:
             https = http
+        if not http.startswith("http"):
+            http = "http://" + http
+        if not https.startswith("http"):
+            https = "https://" + https
 
-        proxies = {'http': http, 'https': https}
+        proxies = {'http://': http, 'https://': https}
         if self.proxy_mode == ProxyMode.SCRAPERAPI:
             r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json()
             if "error" in r:
@@ -198,7 +204,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
 
         self._proxy_works = self._check_proxy(proxies)
         if self._proxy_works:
-            self._session.proxies = proxies
+            self._proxies = proxies
             self._new_session()
 
         return self._proxy_works
@@ -353,8 +359,8 @@ def _get_webdriver(self):
 
     def _get_chrome_webdriver(self):
         if self._proxy_works:
             webdriver.DesiredCapabilities.CHROME['proxy'] = {
-                "httpProxy": self._session.proxies['http'],
-                "sslProxy": self._session.proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL"
             }
@@ -369,8 +375,8 @@ def _get_firefox_webdriver(self):
         if self._proxy_works:
             # Redirect webdriver through proxy
             webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
-                "httpProxy": self._session.proxies['http'],
-                "sslProxy": self._session.proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL",
             }
 
@@ -439,11 +445,12 @@ def _handle_captcha2(self, url):
         return self._session
 
     def _new_session(self):
+        init_kwargs = {}
         proxies = {}
         if self._session:
-            proxies = self._session.proxies
+            proxies = self._proxies
             self._close_session()
-        self._session = requests.Session()
+        # self._session = httpx.Client()
         self.got_403 = False
 
         # Suppress the misleading traceback from UserAgent()
@@ -453,15 +460,18 @@ def _new_session(self):
             'accept': 'text/html,application/xhtml+xml,application/xml',
             'User-Agent': UserAgent().random,
         }
-        self._session.headers.update(_HEADERS)
+        # self._session.headers.update(_HEADERS)
+        init_kwargs.update(headers=_HEADERS)
 
         if self._proxy_works:
-            self._session.proxies = proxies
+            init_kwargs["proxies"] = proxies
+            self._proxies = proxies
             if self.proxy_mode is ProxyMode.SCRAPERAPI:
                 # SSL Certificate verification must be disabled for
                 # ScraperAPI requests to work.
                 # https://www.scraperapi.com/documentation/
-                self._session.verify = False
+                init_kwargs["verify"] = False
+        self._session = httpx.Client(**init_kwargs)
 
         self._webdriver = None
         return self._session
@@ -496,7 +506,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
                 all_proxies = freeproxy.get_proxy_list()
             if proxy in self._dirty_freeproxies:
                 continue
-            proxies = {'http': proxy, 'https': proxy}
+            proxies = {'http://': proxy, 'https://': proxy}
             proxy_works = self._check_proxy(proxies)
             if proxy_works:
                 dirty_proxy = (yield proxy)

From 7d2d028ce5d06ce21a91a08e360f7dc2c0f2835c Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Tue, 27 Dec 2022 19:08:29 -0500
Subject: [PATCH 2/2] Bump version to 1.7.7

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c6beed86..b1a3164c 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='scholarly',
-    version='1.7.6',
+    version='1.7.7',
     author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
     author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
     description='Simple access to Google Scholar authors and citations',