Skip to content

Commit

Permalink
Added the ability to reset the ID in Tor
Browse files Browse the repository at this point in the history
  • Loading branch information
ipeirotis committed May 17, 2020
1 parent 9e1a873 commit 4e82ba6
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 50 deletions.
69 changes: 69 additions & 0 deletions scholarly/scholar.log
@@ -0,0 +1,69 @@
DEBUG:stem:GETCONF __owningcontrollerprocess (runtime: 0.0003)
INFO:stem:Error while receiving a control message (SocketClosed): empty socket content
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=
INFO:scholarly:Found 0 authors
INFO:scholarly:No more pages of authors
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:
INFO:scholarly:Found 6 authors
INFO:scholarly:No more pages of authors
INFO:scholarly:Reading search page
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&q=
INFO:scholarly:Got a response code 429. Retrying...
INFO:scholarly:Refreshing Tor ID...
DEBUG:stem:GETCONF __owningcontrollerprocess (runtime: 0.0003)
INFO:stem:Error while receiving a control message (SocketClosed): received exception "read of closed file"
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&q=
INFO:scholarly:Found 0 publications
INFO:scholarly:No more search pages
INFO:scholarly:Reading search page
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&q=frequency-domain%20analysis%20of%20haptic%20gratings%20cholewiak
INFO:scholarly:Found 1 publications
INFO:scholarly:No more search pages
INFO:scholarly:Getting https://scholar.googleusercontent.com/scholar.bib?q=info:MwL6-YApQfsJ:scholar.google.com/&output=citation&scisdr=CgXahJ8MGAA:AAGBfm0AAAAAXsChifalWEGXRnNkRnQSIc3O6JPbV9BO&scisig=AAGBfm0AAAAAXsChibs3jCHUaZeUBITnaSX3FzarBRtG&scisf=4&ct=citation&cd=0&hl=en
INFO:scholarly:Reading search page
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&oi=bibs&cites=18104797610932568627
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=10&hl=en&as_sdt=0,5&sciodt=0,5&cites=18104797610932568627&scipsc=
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=20&hl=en&as_sdt=0,5&sciodt=0,5&cites=18104797610932568627&scipsc=
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=30&hl=en&as_sdt=0,5&sciodt=0,5&cites=18104797610932568627&scipsc=
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=40&hl=en&as_sdt=0,5&sciodt=0,5&cites=18104797610932568627&scipsc=
INFO:scholarly:Found 9 publications
INFO:scholarly:No more search pages
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:3d_shape
INFO:scholarly:Found 2 authors
INFO:scholarly:No more pages of authors
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=cattani
INFO:scholarly:Found 10 authors
INFO:scholarly:Loading next page of authors
INFO:scholarly:Getting https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=cattani&after_author=986_AM_-__8J&astart=10
INFO:scholarly:Found 10 authors
INFO:scholarly:Loading next page of authors
INFO:scholarly:Getting https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=cattani&after_author=ourwAOP___8J&astart=20
INFO:scholarly:Found 4 authors
INFO:scholarly:No more pages of authors
INFO:scholarly:Reading search page
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&q=%22naive%20physics%22%20stability%20%223d%20shape%22
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=10&q=%22naive+physics%22+stability+%223d+shape%22&hl=en&as_sdt=0,5
INFO:scholarly:Found 10 publications
INFO:scholarly:Loading next search page
INFO:scholarly:Getting https://scholar.google.com/scholar?start=20&q=%22naive+physics%22+stability+%223d+shape%22&hl=en&as_sdt=0,5
INFO:scholarly:Found 9 publications
INFO:scholarly:No more search pages
INFO:scholarly:Reading search page
INFO:scholarly:Getting https://scholar.google.com/scholar?hl=en&q=Creating%20correct%20blur%20and%20its%20effect%20on%20accommodation
INFO:scholarly:Found 1 publications
INFO:scholarly:No more search pages
INFO:scholarly:Getting https://scholar.googleusercontent.com/scholar.bib?q=info:yaFMes1ZwnMJ:scholar.google.com/&output=citation&scisdr=CgXahJ8MGAA:AAGBfm0AAAAAXsChtI-iG2noTNG3wuJrNMgCtxCeU7A1&scisig=AAGBfm0AAAAAXsChtMO30vUXaExmGLRh2nYrBau27B13&scisf=4&ct=citation&cd=0&hl=en
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=Steven%20A.%20Cholewiak
INFO:scholarly:Found 1 authors
INFO:scholarly:No more pages of authors
INFO:scholarly:Getting https://scholar.google.com/citations?hl=en&user=4bahYMkAAAAJ&pagesize=100
103 changes: 82 additions & 21 deletions scholarly/scholarly.py
Expand Up @@ -16,6 +16,8 @@
import re
import requests
import time
from stem import Signal
from stem.control import Controller

_GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
_COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}
Expand All @@ -40,24 +42,71 @@
_SCHOLARPUBRE = r'cites=([\w-]*)'
_EMAILAUTHORRE = r'Verified email at '

# Module-level requests session.
# NOTE(review): the _get_page code visible in this change opens its own
# requests.Session per call -- confirm whether _SESSION is still used.
_SESSION = requests.Session()
# presumably the number of entries requested per page (compare the
# "pagesize=100" query parameter in profile URLs) -- TODO confirm
_PAGESIZE = 100
# Passed as the `timeout` argument to session.get() in _get_page.
_TIMEOUT = 2

# Proxy mapping assigned to session.proxies in _get_page when Tor was not
# detected. None for both schemes means no proxy is configured here.
_PROXIES = {
"http": None,
"https": None,
}

# NOTE(review): no use of these two names is visible in this change --
# confirm they are needed.
_HTTP_PROXY = None
_HTTPS_PROXY = None

# NOTE(review): this configures logging at import time and writes to
# 'scholar.log' relative to the current working directory.
logging.basicConfig(filename='scholar.log', level=logging.INFO)
logger = logging.getLogger('scholarly')


def _tor_works(timeout=10):
    """
    Checks if Tor is working

    Sends a GET request for https://www.google.com through the SOCKS proxy
    at socks5://127.0.0.1:9050.

    `timeout` is handed to requests as the request timeout. Without it a
    proxy that accepts the connection but never answers would block the
    caller forever, and this function is called at module level.

    Returns True if the response status code is 200. Returns False for any
    other status code or if the request raised an exception.
    """
    tor_proxy = 'socks5://127.0.0.1:9050'
    with requests.Session() as session:
        session.proxies = {'http': tor_proxy, 'https': tor_proxy}
        try:
            resp = session.get("https://www.google.com", timeout=timeout)
        except Exception as e:
            # Best-effort probe: any failure just means Tor is not usable,
            # but record why instead of discarding the exception.
            logger.debug("Tor check failed: %s", e)
            return False
        return resp.status_code == 200

_TOR_WORKS = _tor_works()


def _refresh_tor_id():
    """
    Requests a new Tor identity through the Tor control port

    Opens a stem Controller on port 9051, authenticates with the password
    "scholarly_password" and sends the NEWNYM signal. Nothing is caught
    here, so any exception raised along the way reaches the caller.
    """
    control_port = 9051
    with Controller.from_port(port=control_port) as tor_control:
        tor_control.authenticate(password="scholarly_password")
        tor_control.signal(Signal.NEWNYM)


def _can_refresh_tor():
    """
    Checks if the Tor identity can be refreshed

    Sleeps for between one and two seconds, then attempts an actual
    refresh by calling _refresh_tor_id().

    Returns True if that call completed, False if it raised an exception.
    """
    time.sleep(1+random.uniform(0, 1))
    try:
        _refresh_tor_id()
    except Exception as e:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        logger.debug("Cannot refresh Tor ID: %s", e)
        return False
    return True

_CAN_REFRESH_TOR = _can_refresh_tor()



def use_proxy(http='socks5://127.0.0.1:9050', https='socks5://127.0.0.1:9050'):
    """ Routes scholarly through a proxy (e.g. tor).
        Requires pysocks
        Proxy must be running.

        `http` and `https` are the proxy URLs stored under the "http" and
        "https" keys of the module-level _PROXIES mapping.
    """
    logger.info("Enabling proxies: http=%r https=%r", http, https)
    # Update the module-level mapping in place. Writing `_PROXIES = {...}`
    # here would only bind a new local name and leave the global untouched,
    # so the call would have no effect on later requests.
    _PROXIES.update({
        "http": http,
        "https": https,
    })



'''
def use_random_proxy():
logger.info("Picking a random proxy and waiting")
Expand All @@ -81,24 +130,36 @@ def _get_page(pagerequest):
logger.info("Getting %s", pagerequest)
# Delay for avoiding overloading scholar
time.sleep(1+random.uniform(0, 1))

try:
resp = _SESSION.get(pagerequest, headers=_HEADERS, cookies=_COOKIES, timeout=1)
if resp.status_code == 200:
if 'scholarly_captcha' in resp.text:
logger.info("Got a CAPTCHA. Retrying...")
# elif 'not a robot when JavaScript is turned off' in resp.text:
# logger.info("Got a cannot verify . Retrying...")
else:
return resp.text

# If Tor is running we use the proxy
with requests.Session() as session:
if _TOR_WORKS:
# Tor uses the 9050 port as the default socks port
session.proxies = {'http': 'socks5://127.0.0.1:9050',
'https': 'socks5://127.0.0.1:9050'}
else:
logger.info("Got a response code %s. Retrying...", resp.status_code)
except Exception as e:
logger.info("Exception %s while fetching page. Retrying...", str(e))

# TODO: This works fine when the underlying proxy is rotating (e.g., with Tor)
# In other scenarios, we want to rotate the random proxy or switch from
# direct querying to a proxy.
session.proxies = _PROXIES

try:
resp = session.get(pagerequest, headers=_HEADERS, cookies=_COOKIES, timeout=_TIMEOUT)
if resp.status_code == 200:
if 'scholarly_captcha' in resp.text:
logger.info("Got a CAPTCHA. Retrying...")
elif 'not a robot when JavaScript is turned off' in resp.text:
logger.info("Got a cannot verify . Retrying...")
else:
return resp.text
else:
logger.info("Got a response code %s. Retrying...", resp.status_code)
except Exception as e:
logger.info("Exception %s while fetching page. Retrying...", str(e))

# We only reach this part if there was an error. We refresh our Tor identity if Tor is running
if _TOR_WORKS and _CAN_REFRESH_TOR:
logger.info("Refreshing Tor ID...")
time.sleep(2+random.uniform(0, 2))
_refresh_tor_id()

return _get_page(pagerequest)


Expand Down
58 changes: 29 additions & 29 deletions scholarly/test.py
Expand Up @@ -8,48 +8,36 @@
# rely on dynamic external data.
class TestScholarly(unittest.TestCase):

def _tor_works(self, timeout=10):
    """
    Checks if Tor is working

    Sends a GET request for https://www.google.com through the SOCKS proxy
    at socks5://127.0.0.1:9050. `timeout` is handed to requests so that a
    proxy which never answers cannot hang the test run.

    Returns True if the response status code is 200. Returns False for any
    other status code or if the request raised an exception.
    """
    tor_proxy = 'socks5://127.0.0.1:9050'
    with requests.Session() as session:
        session.proxies = {'http': tor_proxy, 'https': tor_proxy}
        try:
            resp = session.get("https://www.google.com", timeout=timeout)
        except Exception:
            # Best-effort probe: any failure means Tor is not usable.
            return False
        return resp.status_code == 200

def setUp(self):
    """
    Switch scholarly to Tor before each test when the Tor probe succeeds
    """
    tor_available = self._tor_works()
    if tor_available:
        scholarly.use_tor()
    # else:
    #     scholarly.use_random_proxy()

def test_empty_author(self):
    """
    Test that scholarly.search_author('') returns no authors
    """
    authors = list(scholarly.search_author(''))
    # assertEqual rather than assertIs: assertIs compares object identity,
    # which only works for ints because of CPython's small-int cache.
    self.assertEqual(len(authors), 0)

def test_empty_keyword(self):
    """
    As of 2020-04-30, there are 6 individuals that match the name 'label'
    """
    # TODO this seems like undesirable functionality for
    # scholarly.search_keyword() with empty string. Surely, no authors
    # should be returned. Consider modifying the method itself.
    matches = list(scholarly.search_keyword(''))
    self.assertEqual(len(matches), 6)

def test_empty_publication(self):
    """
    Test that searching for an empty publication returns zero results
    """
    pubs = list(scholarly.search_pubs_query(''))
    # assertEqual rather than assertIs: assertIs compares object identity,
    # which only works for ints because of CPython's small-int cache.
    self.assertEqual(len(pubs), 0)

def test_get_cited_by(self):
"""
Testing that when we retrieve the list of publications that cite
a publication, the number of citing publication is the same as
the number of papers that are returned
"""
query = 'frequency-domain analysis of haptic gratings cholewiak'
pubs = [p for p in scholarly.search_pubs_query(query)]
self.assertGreaterEqual(len(pubs), 1)
Expand All @@ -58,18 +46,31 @@ def test_get_cited_by(self):
self.assertEqual(len(cites), filled.citedby)

def test_keyword(self):
    """
    When we search for the keyword "3d_shape" the author Steven A. Cholewiak
    should be among those listed
    """
    authors = [a.name for a in scholarly.search_keyword('3d_shape')]
    # Compare by value: assertIsNot on an int is an identity check and only
    # behaves as intended because of CPython's small-int cache.
    self.assertGreater(len(authors), 0)
    self.assertIn(u'Steven A. Cholewiak, PhD', authors)

def test_multiple_authors(self):
    """
    As of May 12, 2020 there are at least 24 'Cattanis's listed as authors
    and Giordano Cattani is one of them
    """
    names = []
    for author in scholarly.search_author('cattani'):
        names.append(author.name)
    self.assertGreaterEqual(len(names), 24)
    self.assertIn(u'Giordano Cattani', names)

def test_multiple_publications(self):
''' As of May 12, 2020 there are at least 29 pubs that fit the search term'''
"""
As of May 12, 2020 there are at least 29 pubs that fit the search term:
["naive physics" stability "3d shape"].
Check that the paper "Visual perception of the physical stability of asymmetric three-dimensional objects"
is among them
"""
pubs = [p.bib['title'] for p in scholarly.search_pubs_query('"naive physics" stability "3d shape"')]
self.assertGreaterEqual(len(pubs), 29)

Expand All @@ -91,7 +92,6 @@ def test_publication_contents(self):
self.assertTrue(filled.bib['year'] == u'2018')

def test_single_author(self):

query = 'Steven A. Cholewiak'
authors = [a for a in scholarly.search_author(query)]
self.assertGreaterEqual(len(authors), 1)
Expand Down
10 changes: 10 additions & 0 deletions setup_tor.sh
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Enables the Tor control port (9051) with a hashed password and restarts
# the Tor service.
#
# NOTE: the first `tee` below is not in append mode, so this script
# REPLACES the existing /etc/tor/torrc instead of adding to it.

# Must stay in sync with the password used by the Python code when it
# authenticates against the control port.
tor_password="scholarly_password"

# `tor --hash-password` can print log/warning lines before the hash;
# the hash itself is the last line of output, so keep only that line.
hashed_password=$(tor --hash-password "$tor_password" | tail -n 1)

# Do not write a broken torrc if hashing failed (e.g. tor is not installed).
if [ -z "$hashed_password" ]; then
    echo "Could not hash the Tor control password; is tor installed?" >&2
    exit 1
fi

echo "ControlPort 9051" | sudo tee /etc/tor/torrc
echo "HashedControlPassword $hashed_password" | sudo tee -a /etc/tor/torrc

sudo service tor stop

sudo service tor start

0 comments on commit 4e82ba6

Please sign in to comment.