Merge pull request #476 from scholarly-python-package/develop
Release v1.7.7
arunkannawadi committed Dec 28, 2022
2 parents f5b24dd + 7d2d028 commit 8630f8d
Showing 4 changed files with 28 additions and 16 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ bibtexparser
deprecated
fake_useragent
free-proxy
+ httpx
python-dotenv
requests[socks]
selenium
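The new httpx entry is exercised directly by the code changes below. A minimal sanity check of the imports it has to satisfy (illustrative only, not part of this diff):

```python
# Both imports are introduced by this release:
# _proxy_generator.py imports the package, _navigator.py imports its timeout error.
import httpx
from httpx import TimeoutException

print(httpx.__version__)
```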
7 changes: 4 additions & 3 deletions scholarly/_navigator.py
@@ -11,6 +11,7 @@
import random
import time
from requests.exceptions import Timeout
+ from httpx import TimeoutException
from selenium.webdriver.common.by import By
from .publication_parser import _SearchScholarIterator
from .author_parser import AuthorParser
@@ -111,7 +112,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
w = random.uniform(1,2)
time.sleep(w)
resp = session.get(pagerequest, timeout=timeout)
- self.logger.debug("Session proxy config is {}".format(session.proxies))
+ self.logger.debug("Session proxy config is {}".format(pm._proxies))

has_captcha = self._requests_has_captcha(resp.text)

@@ -149,7 +150,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
self.logger.info("Will retry after %.2f seconds (with the same session).", w)
time.sleep(w)
continue
- except Timeout as e:
+ except (Timeout, TimeoutException) as e:
err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args)
self.logger.info(err)
if timeout < 3*self._TIMEOUT:
@@ -164,7 +165,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:

tries += 1
try:
- session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None))
+ session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http', None))
except Exception:
self.logger.info("No other secondary connections possible. "
"Using the primary proxy for all requests.")
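Since the session handed out by the proxy generator may now be an httpx.Client rather than a requests.Session, the retry loop in _get_page has to catch both libraries' timeout errors. A minimal sketch of that pattern; the function name and max_timeout parameter are illustrative, not the module's actual code:

```python
from requests.exceptions import Timeout
from httpx import TimeoutException

def fetch(session, url, timeout, max_timeout):
    """Fetch a URL, retrying with a longer timeout on either library's timeout error."""
    while True:
        try:
            # Works for both requests.Session and httpx.Client instances.
            return session.get(url, timeout=timeout)
        except (Timeout, TimeoutException):
            if timeout >= max_timeout:
                raise  # give up; the caller can switch to another proxy session
            timeout *= 2  # retry the same session with a longer timeout
```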
34 changes: 22 additions & 12 deletions scholarly/_proxy_generator.py
@@ -4,6 +4,7 @@
import logging
import time
import requests
+ import httpx
import tempfile
import urllib3

@@ -43,6 +44,7 @@ def __init__(self):
# If we use a proxy or Tor, we set this to True
self._proxy_works = False
self.proxy_mode = None
+ self._proxies = {}
# If we have a Tor server that we can refresh, we set this to True
self._tor_process = None
self._can_refresh_tor = False
@@ -183,8 +185,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
"""
if https is None:
https = http
+ if http[:4] != "http":
+     http = "http://" + http
+ if https[:5] != "https":
+     https = "https://" + https

- proxies = {'http': http, 'https': https}
+ proxies = {'http://': http, 'https://': https}
if self.proxy_mode == ProxyMode.SCRAPERAPI:
r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json()
if "error" in r:
@@ -198,7 +204,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
self._proxy_works = self._check_proxy(proxies)

if self._proxy_works:
- self._session.proxies = proxies
+ self._proxies = proxies
self._new_session()

return self._proxy_works
@@ -353,8 +359,8 @@ def _get_webdriver(self):
def _get_chrome_webdriver(self):
if self._proxy_works:
webdriver.DesiredCapabilities.CHROME['proxy'] = {
- "httpProxy": self._session.proxies['http'],
- "sslProxy": self._session.proxies['https'],
+ "httpProxy": self._proxies['http'],
+ "sslProxy": self._proxies['https'],
"proxyType": "MANUAL"
}

@@ -369,8 +375,8 @@ def _get_firefox_webdriver(self):
if self._proxy_works:
# Redirect webdriver through proxy
webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
- "httpProxy": self._session.proxies['http'],
- "sslProxy": self._session.proxies['https'],
+ "httpProxy": self._proxies['http'],
+ "sslProxy": self._proxies['https'],
"proxyType": "MANUAL",
}

@@ -439,11 +445,12 @@ def _handle_captcha2(self, url):
return self._session

def _new_session(self):
+ init_kwargs = {}
proxies = {}
if self._session:
- proxies = self._session.proxies
+ proxies = self._proxies
self._close_session()
- self._session = requests.Session()
+ # self._session = httpx.Client()
self.got_403 = False

# Suppress the misleading traceback from UserAgent()
@@ -453,15 +460,18 @@
'accept': 'text/html,application/xhtml+xml,application/xml',
'User-Agent': UserAgent().random,
}
- self._session.headers.update(_HEADERS)
+ # self._session.headers.update(_HEADERS)
+ init_kwargs.update(headers=_HEADERS)

if self._proxy_works:
- self._session.proxies = proxies
+ init_kwargs["proxies"] = proxies #.get("http", None)
+ self._proxies = proxies
if self.proxy_mode is ProxyMode.SCRAPERAPI:
# SSL Certificate verification must be disabled for
# ScraperAPI requests to work.
# https://www.scraperapi.com/documentation/
- self._session.verify = False
+ init_kwargs["verify"] = False
+ self._session = httpx.Client(**init_kwargs)
self._webdriver = None

return self._session
@@ -496,7 +506,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
all_proxies = freeproxy.get_proxy_list()
if proxy in self._dirty_freeproxies:
continue
- proxies = {'http': proxy, 'https': proxy}
+ proxies = {'http://': proxy, 'https://': proxy}
proxy_works = self._check_proxy(proxies)
if proxy_works:
dirty_proxy = (yield proxy)
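The rebuilt session is an httpx.Client. In the httpx releases current when this was merged, the client accepts headers, a verify flag, and a proxies mapping keyed by scheme prefix ('http://', 'https://') rather than the plain 'http'/'https' keys that requests uses, which is why the proxy dictionaries above change shape. A standalone sketch of the construction done in _new_session, with a placeholder proxy URL:

```python
import httpx

# Placeholder proxy; in scholarly this comes from the ProxyGenerator
# (FreeProxy, ScraperAPI, Tor, ...).
proxy_url = "http://127.0.0.1:8080"

init_kwargs = {
    "headers": {"accept": "text/html,application/xhtml+xml,application/xml"},
    # httpx keys its proxy mapping by scheme prefix, hence 'http://' / 'https://'.
    "proxies": {"http://": proxy_url, "https://": proxy_url},
    # ScraperAPI proxies require SSL certificate verification to be disabled.
    "verify": False,
}

session = httpx.Client(**init_kwargs)
# session.get("https://scholar.google.com", timeout=5) would now be routed
# through the proxy configured above.
session.close()
```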
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name='scholarly',
- version='1.7.6',
+ version='1.7.7',
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
description='Simple access to Google Scholar authors and citations',
Expand Down
