Skip to content

Commit

Permalink
Merge pull request #217 from scholarly-python-package/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
programize-admin committed Nov 25, 2020
2 parents 13b203a + db357e0 commit 4d4b120
Show file tree
Hide file tree
Showing 19 changed files with 1,136 additions and 630 deletions.
335 changes: 181 additions & 154 deletions README.md

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions docs/AuthorParser.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
AuthorParser module
-----------------------

.. automodule:: scholarly.author_parser
:members:
:undoc-members:
4 changes: 2 additions & 2 deletions docs/Author.rst → docs/DataTypes.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Author module
DataTypes module
-----------------------

.. automodule:: scholarly.author
.. automodule:: scholarly.data_types
:members:
:undoc-members:
7 changes: 7 additions & 0 deletions docs/ProxyGenerator.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

ProxyGenerator module
-----------------------

.. automodule:: scholarly._proxy_generator
:members:
:undoc-members:
4 changes: 2 additions & 2 deletions docs/Publication.rst → docs/PublicationParser.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Publication module
PublicationParser module
----------------------------

.. automodule:: scholarly.publication
.. automodule:: scholarly.publication_parser
:members:
:undoc-members:
:show-inheritance:
6 changes: 4 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#
import os
import sys
import sphinx_rtd_theme
sys.path.insert(0, os.path.abspath('..'))


Expand All @@ -22,7 +23,7 @@
author = 'Steven A. Cholewiak, Panos Ipeirotis, Victor Silva'

# The full version, including alpha/beta/rc tags
release = '0.3.1'
release = '1.0b1'
master_doc = 'index'

# -- General configuration ---------------------------------------------------
Expand All @@ -36,6 +37,7 @@
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
'sphinx.ext.doctest',
'sphinx_rtd_theme',
#'sphinx.ext.napoleon'
]

Expand Down Expand Up @@ -65,7 +67,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_static_path = ['_static']


# -- Extension configuration -------------------------------------------------
Expand Down
9 changes: 5 additions & 4 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ then retrieve the titles of the papers that cite his most popular
print(author)
# Print the titles of the author's publications
print([pub.bib['title'] for pub in author.publications])
print([pub['bib']['title'] for pub in author['publications']])
# Take a closer look at the first publication
pub = author.publications[0].fill()
pub = scholarly.fill(author['publications'][0])
print(pub)
# Which papers cited that publication?
print([citation.bib['title'] for citation in pub.get_citedby()])
print([citation['bib']['title'] for citation in scholarly.citedby(pub)])
# What is the Bibtex of that publication?
print(pub.bibtex)
scholarly.bibtex(pub)
6 changes: 4 additions & 2 deletions docs/scholarly.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@ scholarly Package
:maxdepth: 2

scholarly_user
Author.rst
Publication.rst
AuthorParser.rst
PublicationParser.rst
ProxyGenerator.rst
DataTypes.rst
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ stem
fake_useragent
selenium
python-dotenv
free-proxy
free-proxy
sphinx_rtd_theme
1 change: 1 addition & 0 deletions scholarly/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ._scholarly import _Scholarly
from ._proxy_generator import ProxyGenerator
from .data_types import Author, Publication
scholarly = _Scholarly()
39 changes: 25 additions & 14 deletions scholarly/_navigator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
from stem import Signal
from stem.control import Controller
from fake_useragent import UserAgent
from .publication import _SearchScholarIterator
from .author import Author
from .publication import Publication
from .publication_parser import _SearchScholarIterator
from .author_parser import AuthorParser
from .publication_parser import PublicationParser
from .data_types import Author

class DOSException(Exception):
"""DOS attack was detected."""
Expand All @@ -49,7 +50,7 @@ class Navigator(object, metaclass=Singleton):

def __init__(self):
super(Navigator, self).__init__()
logging.basicConfig(filename='scholar.log', level=logging.INFO)
logging.basicConfig(filename='scholar.log', level=logging.CRITICAL)
self.logger = logging.getLogger('scholarly')
self._TIMEOUT = 5
self._max_retries = 5
Expand All @@ -59,6 +60,12 @@ def __init__(self):
self.got_403 = False


def set_logger(self, enable: bool):
"""Enable or disable informational logging on this navigator's logger.

:param enable: when true, the logger level is set to ``logging.INFO``;
otherwise it is set to ``logging.CRITICAL``
:type enable: bool
"""

self.logger.setLevel((logging.INFO if enable else logging.CRITICAL))


def use_proxy(self, pg: ProxyGenerator):
if pg is not None:
self.pm = pg
Expand Down Expand Up @@ -106,9 +113,10 @@ def _get_page(self, pagerequest: str) -> str:
if not self.got_403:
self.logger.info("Retrying immediately with another session.")
else:
w = random.uniform(60, 2*60)
self.logger.info("Will retry after {} seconds (with another session).".format(w))
time.sleep(w)
if not self.pm._use_luminati:
w = random.uniform(60, 2*60)
self.logger.info("Will retry after {} seconds (with another session).".format(w))
time.sleep(w)
self._new_session()
self.got_403 = True

Expand Down Expand Up @@ -198,15 +206,16 @@ def _get_soup(self, url: str) -> BeautifulSoup:
pass
return res

def search_authors(self, url: str):
def search_authors(self, url: str)->Author:
"""Generator that returns Author objects from the author search page"""
soup = self._get_soup(url)


author_parser = AuthorParser(self)
while True:
rows = soup.find_all('div', 'gsc_1usr')
self.logger.info("Found %d authors", len(rows))
for row in rows:
yield Author(self, row)
yield author_parser.get_author(row)
cls1 = 'gs_btnPR gs_in_ib gs_btn_half '
cls2 = 'gs_btn_lsb gs_btn_srt gsc_pgn_pnx'
next_button = soup.find(class_=cls1+cls2) # Can be improved
Expand All @@ -220,7 +229,7 @@ def search_authors(self, url: str):
break

def search_publication(self, url: str,
filled: bool = False) -> Publication:
filled: bool = False) -> PublicationParser:
"""Search by scholar query and return a single Publication object
:param url: the url to be searched at
Expand All @@ -231,7 +240,7 @@ def search_publication(self, url: str,
:rtype: {Publication}
"""
soup = self._get_soup(url)
res = Publication(self, soup.find_all('div', 'gs_or')[0], 'scholar')
res = PublicationParser(self, soup.find_all('div', 'gs_or')[0], 'scholar')
if filled:
res.fill()
return res
Expand All @@ -255,8 +264,10 @@ def search_author_id(self, id: str, filled: bool = False) -> Author:
:returns: an Author object
:rtype: {Author}
"""
author_parser = AuthorParser(self)
res = author_parser.get_author(id)
if filled:
res = Author(self, id).fill()
res = author_parser.fill(res)
else:
res = Author(self, id).fill(sections=['basics'])
res = author_parser.fill(res, sections=['basics'])
return res
1 change: 1 addition & 0 deletions scholarly/_proxy_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None):
If no arguments are passed for the tor_sock_port and the tor_control_port they are automatically generated in the following ranges
- tor_sock_port: (9000, 9500)
- tor_control_port: (9500, 9999)
:param tor_cmd: tor executable location (absolute path if it is not exported in PATH)
:type tor_cmd: string
:param tor_sock_port: tor socket port
Expand Down

0 comments on commit 4d4b120

Please sign in to comment.