<a href="https://colab.research.google.com/github/shllgtca/gscholar_downloads/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook downloads the PDF files of the papers returned by a Google Scholar keyword search (scraped with Scrapy through ScraperAPI). The scraped Google Scholar data is enriched with bibliographic metadata via title2bib (api.crossref.org), and the papers are downloaded from Sci-Hub (via scidownl).**

# **Setup**

In [1]:
!pip install scrapy > /dev/null
!pip install title2bib > /dev/null
!pip install scidownl > /dev/null # https://pypi.org/project/scidownl/

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.9.0 requires jedi>=0.10, which is not installed.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m


In [2]:
import datetime as dt
import json
from datetime import datetime
from multiprocessing import Pool
from os import environ
from os import listdir
from os import makedirs
from os.path import exists, isdir, join
from urllib.parse import urlencode
from urllib.parse import urlparse

import bibtexparser
import ipywidgets as widgets
import pandas as pd
import scrapy
from ipywidgets import Layout
from scidownl import scihub_download
from scrapy.crawler import CrawlerProcess
from title2bib.crossref import get_bib_from_title

In [3]:
from google.colab import drive
# Mount your Google Drive
drive.mount('/content/drive', force_remount=False)

################################################################################
# INSERT YOUR OWN ROOT PATH DIRECTORY
################################################################################

root_path = '/content/drive/My Drive/covid_XAI/results/review/XAI_img/'

################################################################################
################################################################################

review_status = 'new'# new update continue_last

################################################################################
################################################################################

if review_status == 'continue_last':
  
  dt_dirs = [name for name in listdir(root_path) if isdir(join(root_path, name))]
  dt_dirs.remove('final_selection')
  if len(dt_dirs)>0: 
    last_rev_dt = max([dt.datetime.strptime(date, "%Y-%m-%d").date() for date in dt_dirs])
  else:
    last_rev_dt = []  
  today_dt_str = str(last_rev_dt)  

else:#update or new
  
  today_dt_str = str(dt.date.today())
  makedirs(root_path+today_dt_str)
  if review_status == 'new':
    makedirs(root_path+'final_selection')

path_dt = root_path+today_dt_str+'/'
%cd $path_dt

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1--6j3l2-CYP97e1rjr39g36tKHusOLnh/covid_XAI/results/review/XAI_img/2022-11-03


# **Parameters**

In [4]:
################################################################################
# INSERT YOUR OWN KEYWORDS
# Each list entry is sent to Google Scholar as one search query, so an entry
# can be a plain term (e.g. 'XAI', 'deep learning') or a boolean expression
# with quoted phrases, as below.
################################################################################
keywords = ['("XAI" OR "explainable AI" OR "explainable artificial intelligence")'\
            ' AND '\
            '("image processing" OR "image recognition" OR "image classification")']

# Base name (without extension) of the scraped-results file of this review date
filename_dt = 'papers_'+today_dt_str
################################################################################
# ScraperAPI personal key, obtained in:
#                   https://dashboard.scraperapi.com/dashboard
# Set the SCRAPERAPI_KEY environment variable so the key is not saved inside
# the notebook; the placeholder below is only used when it is not set.
################################################################################
scrapy_API_PERSONAL_KEY = environ.get('SCRAPERAPI_KEY', '99999999999999999999999999999999')

# **Scrape unreviewed data from Google Scholar**

Run scrapy from script 

- Scrapy from script : https://docs.scrapy.org/en/latest/topics/practices.html?highlight=run%20scrapy%20as%20script#run-scrapy-from-a-script

- Iterate over pages : https://dev.to/iankerins/build-your-own-google-scholar-api-with-python-scrapy-4p73

- Google only provides 1000 results : https://www.quora.com/Google-scholar-search-results-seem-to-max-out-at-1000-100-pages-10-results-page-for-a-specific-search-Can-I-program-Google-to-go-beyond-1000

**Alternative** : https://stackoverflow.com/questions/44938888/construct-dataframe-from-scraped-data-using-scrapy

In [5]:
def get_url(url):
    """Wrap ``url`` in a ScraperAPI request URL.

    Args:
        url: The target page to fetch through the proxy.

    Returns:
        The ScraperAPI endpoint URL carrying the API key
        (``scrapy_API_PERSONAL_KEY``), the url-encoded target and the
        ``country_code`` 'us' as query parameters.
    """
    payload = {'api_key': scrapy_API_PERSONAL_KEY, 'url': url, 'country_code': 'us'}
    # HTTPS: the API key travels in the query string and must not go in clear text.
    proxy_url = 'https://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url

class scholar_spyder(scrapy.Spider):
    """Spider that sends every entry of ``keywords`` to Google Scholar.

    Requests are routed through ``get_url`` and one dict is yielded per
    search result; result pages are followed while a next-page link exists.
    """
    name = 'scholar_spyder'
    allowed_domains = ['api.scraperapi.com']
    # Both entries are meant for small test runs only.
    # NOTE(review): 'ITEM_CLOSESPIDER' does not show up in the logged
    # "Overridden settings" - confirm it is a recognised Scrapy setting.
    custom_settings = {
        'CLOSESPIDER_ITEMCOUNT': 2,
        'ITEM_CLOSESPIDER': 3,
    }

    def start_requests(self):
        """Yield the first results-page request of every keyword query."""
        browser_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for query in keywords:
            search_params = {'q': query, 'hl': 'en', 'as_sdt': '0,5', 'as_ylo': '2019', 'as_yhi': ''}
            scholar_url = 'https://scholar.google.com/scholar?' + urlencode(search_params)
            yield scrapy.Request(
                get_url(scholar_url),
                headers=browser_headers,
                callback=self.parse,
                meta={'position': 0},
            )

    def parse(self, response):
        """Yield one item per result of the page, then the next-page request."""
        # Running rank of the results, carried across pages through the meta dict.
        rank = response.meta['position']
        for hit in response.xpath('//*[@data-rp]'):
            rank += 1

            linked_title = hit.xpath('.//h3/a//text()').extract()
            if linked_title:
                title = "".join(linked_title)
            else:
                # No link inside the heading: take the span text and tag it "[C]".
                title = "[C] " + "".join(hit.xpath('.//h3/span[@id]//text()').extract())

            related_href = hit.xpath('.//a[starts-with(text(),"Related")]/@href').extract_first()

            yield {
                'title_GS': title,
                'link': hit.xpath('.//h3/a/@href').extract_first(),
                'cited': hit.xpath('.//a[starts-with(text(),"Cited")]/text()').extract_first(),
                'relatedLink': "https://scholar.google.com" + related_href if related_href else "",
                'position': rank,
                'numOfVersions': hit.xpath('.//a[contains(text(),"version")]/text()').extract_first(),
                'publishedData': "".join(hit.xpath('.//div[@class="gs_a"]//text()').extract()),
                'snippet': "".join(hit.xpath('.//*[@class="gs_rs"]//text()').extract()),
            }

        next_href = response.xpath('//td[@align="left"]/a/@href').extract_first()
        if not next_href:
            return
        yield scrapy.Request(
            get_url("https://scholar.google.com" + next_href),
            callback=self.parse,
            meta={'position': rank},
            dont_filter=True,
        )

In [6]:
# Scrape Google Scholar unless a previous run is being resumed, then load the
# scraped results (freshly written or saved by the earlier run).
if review_status != 'continue_last':
    process = CrawlerProcess(settings={
      "FEEDS": {
          # Relative name: the feed is written in the current working folder,
          # which the setup cell changed to path_dt.
          # overwrite=True: appending to an existing JSON feed on a re-run
          # would leave an invalid JSON file.
          filename_dt+'.json' : {"format": "json", "overwrite": True},
        },
    })
    process.crawl(scholar_spyder)
    process.start(stop_after_crawl=True) # the script will block here until the crawling is finished
papers_dt = pd.read_json(path_dt+filename_dt+'.json')

INFO:scrapy.utils.log:Scrapy 2.7.1 started (bot: scrapybot)
2022-11-03 13:33:45 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-03 13:33:45 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
INFO:scrapy.crawler:Overridden settings:
{'CLOSESPIDER_ITEMCOUNT': 2}
2022-11-03 13:33:45 [scrapy.crawler] INFO: Overridden settings:
{'CLOSESPIDER_ITEMCOUNT': 2}


See the documentation of the 'REQUEST_FINGERPRINTER_I

## Exclude papers already reviewed

In [7]:
# Exclude papers already reviewd: keep only the scraped papers whose Google
# Scholar title is not in the cumulative list of reviewed papers.
papers_reviewd_path = root_path+'final_selection/papers_reviewd.csv'

if review_status=='new':
  papers_unreviewd = papers_dt
else:
  try:
    papers_reviewd = pd.read_csv(papers_reviewd_path)
  except (FileNotFoundError, pd.errors.EmptyDataError):
    # No finished review yet: use an empty list so that nothing is excluded
    # and papers_reviewd is still defined for the cells that use it later.
    print('1st reviewd not finished.')
    papers_reviewd = pd.DataFrame(columns=['title_GS'])
  papers_unreviewd = papers_dt [~papers_dt.title_GS.isin(papers_reviewd.title_GS)]

In [8]:
# Tell the researcher when there is nothing left to review.
if not len(papers_unreviewd):
  print('Dear Researcher, no new papers of your interest.')

In [9]:
# Start the per-step tally with the number of papers found by the keyword search.
counts = pd.Series({'Keywords': len(papers_unreviewd)})

## **Enrich database**

In [10]:
def get_bibs(title):
  """Return the first BibTeX entry found for ``title`` as a Series.

  The title is looked up with ``get_bib_from_title`` and the returned BibTeX
  string is parsed with ``bibtexparser``.

  Args:
      title: Paper title to look up.

  Returns:
      pd.Series holding the fields of the first parsed entry, or an empty
      object-dtype Series when the lookup or the parsing fails.
  """
  try:
    _found, bib_string = get_bib_from_title(title,get_first=True)
    bib = bibtexparser.loads(bib_string).entries[0]
  except Exception:
    # Best effort: a failed lookup must not abort the whole batch. Catching
    # Exception (not a bare except) still lets KeyboardInterrupt through.
    bib = {}
  # Explicit dtype keeps empty results consistent with the (string) bib fields.
  return pd.Series(bib, dtype=object)

In [11]:
# Look up the bibliographic data of every unreviewed paper in parallel and
# append it, column-wise, to the scraped data.
if papers_unreviewd.shape[0]>0:
  # The context manager releases the worker processes once the map is done.
  with Pool() as pool:
    bibs_data = pool.map(get_bibs,list(papers_unreviewd.title_GS))
  # One column per paper; fields missing for a paper get the 'Not founded' marker.
  bibs_data = pd.concat(bibs_data,axis=1).fillna('Not founded')
  # Align the row labels with the positional columns of bibs_data before joining.
  papers_unreviewd = papers_unreviewd.reset_index(drop=True)
  papers_unreviewd = pd.concat([papers_unreviewd,bibs_data.T],axis=1)
  papers_unreviewd.to_csv(path_dt+'papers_unreviewd_'+today_dt_str+'.csv')

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
2022-11-03 13:33:47 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): api.crossref.org:80
2022-11-03 13:33:47 [urllib3.connectionpool] DEBUG: Starting new

In [12]:
# Keep only the papers whose publication year is later than (current year - 3).
today_dt = dt.date.today()
# errors='coerce' turns the 'Not founded' marker and any other non-numeric
# year into NaN; NaN never passes the comparison, so those papers are dropped
# (the marker was dropped before as well) instead of crashing the conversion.
idx_bool = pd.to_numeric(papers_unreviewd.year, errors='coerce')>int(today_dt.year)-3
papers_unreviewd = papers_unreviewd[idx_bool]
last3 = pd.Series(papers_unreviewd.shape[0],index=['Published in the last 3 years'])
counts = pd.concat([counts,last3],axis=0)

In [13]:
# Number of remaining papers per BibTeX entry type.
papers_unreviewd['ENTRYTYPE'].value_counts()

article          7
inproceedings    1
Name: ENTRYTYPE, dtype: int64

In [14]:
# Drop the papers whose BibTeX entry type is one of the unwanted formats,
# then record how many papers are left.
unwanted_types = ('book', 'inproceedings', 'misc', 'incollection', 'techreport')
idx_entrytype = papers_unreviewd['ENTRYTYPE'].isin(unwanted_types)
papers_unreviewd = papers_unreviewd.loc[~idx_entrytype]
c_paper = pd.Series({'Published as Paper': len(papers_unreviewd)})
counts = pd.concat([counts, c_paper])

In [15]:
# Keep the first occurrence of every Google Scholar title and record the count.
papers_unreviewd = papers_unreviewd.drop_duplicates(subset=['title_GS'])
c_dropdup = pd.Series({'Not duplicated': len(papers_unreviewd)})
counts = pd.concat([counts, c_dropdup])

# **Prisma 1st step - Title analysis**

In [16]:
def get_it_df(data,values):
  """Build a vertical box with one checkbox per entry of ``data``.

  Args:
      data: Iterable with the checkbox labels (e.g. a Series of titles).
      values: Iterable with the initial checked state of each checkbox,
          paired with ``data`` by position.

  Returns:
      widgets.VBox whose ``children`` are the checkboxes, in the order of
      ``data``.
  """
  layout_label = Layout(width='auto',height='auto',
                    flex_flow='column',
                    description_width='auto',
                    description_height='auto')
  # The unused name -> checkbox mapping was removed; with it gone ``data``
  # no longer needs a ``tolist`` method, so any iterable is accepted.
  checkbox_objs = [widgets.Checkbox(value=val,
                                    description=items,
                                    layout=layout_label) for items,val in zip(data,values)]
  ui = widgets.VBox(children=checkbox_objs,layout=layout_label)
  return ui

In [17]:
# Initial state of the title checkboxes: everything ticked on a fresh run,
# or the saved first selection when a previous run is being resumed.
if review_status != 'continue_last':
  idx_sel_val = pd.Series(True, index=papers_unreviewd.title_GS)
else:
  papers_fst_sel = pd.read_csv(path_dt+'papers_fst_sel_'+today_dt_str+'.csv')
  idx_sel_val = papers_unreviewd.title_GS.isin(papers_fst_sel.title_GS)

ui_fst = get_it_df(papers_unreviewd.title_GS, idx_sel_val)
display(ui_fst)

VBox(children=(Checkbox(value=True, description='Neuroscope: An explainable ai toolbox for semantic segmentati…

In [18]:
# Read the state of the title checkboxes, keep the ticked papers, save them
# and add the step to the tally.
n_unreviewd = papers_unreviewd.shape[0]
fst_sel = [checkbox.value for checkbox in ui_fst.children[:n_unreviewd]]
papers_fst_sel = papers_unreviewd[fst_sel]
papers_fst_sel.to_csv(path_dt+'papers_fst_sel_'+today_dt_str+'.csv')
c_fst_sel = pd.Series({'Prisma 1st step': len(papers_fst_sel)})
counts = pd.concat([counts, c_fst_sel])

# **Download papers by DOI**

In [19]:
# Download the PDF of every paper of the first selection (by DOI, then by
# title as a fallback) and collect the titles that could not be downloaded.
# https://pypi.org/project/scidownl/
papers_not_downl = []

if papers_fst_sel.shape[0]>0:
  for doi,title_GS in zip(papers_fst_sel.doi,papers_fst_sel.title_GS):
    # '/' in a title would be read as a folder separator in the output path.
    pdf_path = path_dt+title_GS.replace('/','_')+'.pdf'

    # Attempts in order of preference. A DOI is only usable when it is a real
    # string: 'Not founded' is the marker written for missing bib fields.
    attempts = []
    if isinstance(doi, str) and doi not in ('no_doi','Not founded'):
      attempts.append((doi,"doi"))
    attempts.append((title_GS,"title"))

    for keyword,paper_type in attempts:
      try:
        scihub_download(keyword, paper_type=paper_type, out=pdf_path)
      except Exception as err:
        print('Download by '+paper_type+' failed: '+str(err))
      # NOTE(review): scihub_download appears to log failures rather than
      # raise - confirm; success is therefore judged by the file being there.
      if exists(pdf_path):
        break
    else:
      print('Error on paper '+title_GS)
      papers_not_downl.append(title_GS)

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): tool.yovisun.com:80
2022-11-03 14:09:09 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): tool.yovisun.com:80
DEBUG:urllib3.connectionpool:http://tool.yovisun.com:80 "GET /scihub HTTP/1.1" 301 169
2022-11-03 14:09:09 [urllib3.connectionpool] DEBUG: http://tool.yovisun.com:80 "GET /scihub HTTP/1.1" 301 169
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): tool.yovisun.com:443
2022-11-03 14:09:09 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): tool.yovisun.com:443
DEBUG:urllib3.connectionpool:https://tool.yovisun.com:443 "GET /scihub HTTP/1.1" 301 169
2022-11-03 14:09:10 [urllib3.connectionpool] DEBUG: https://tool.yovisun.com:443 "GET /scihub HTTP/1.1" 301 169
DEBUG:urllib3.connectionpool:https://tool.yovisun.com:443 "GET /scihub/ HTTP/1.1" 200 24161
2022-11-03 14:09:10 [urllib3.connectionpool] DEBUG: https://tool.yovisun.com:443 "GET /scihub/ HTTP/1.1" 200 24161
[INFO] | 2022




DEBUG:urllib3.connectionpool:https://sci-hub.se:443 "POST / HTTP/1.1" 302 None
2022-11-03 14:09:14 [urllib3.connectionpool] DEBUG: https://sci-hub.se:443 "POST / HTTP/1.1" 302 None
DEBUG:urllib3.connectionpool:https://sci-hub.se:443 "GET /10.36227/techrxiv.19310489.v1 HTTP/1.1" 200 None
2022-11-03 14:09:14 [urllib3.connectionpool] DEBUG: https://sci-hub.se:443 "GET /10.36227/techrxiv.19310489.v1 HTTP/1.1" 200 None
[INFO] | 2022/11/03 14:09:14 | <- Request: scihub_url=https://sci-hub.se, source=DoiSource[type=doi, id=10.36227/techrxiv.19310489.v1]
[INFO] | 2022/11/03 14:09:14 | -> Response: status_code=200, content_length=5838
[INFO] | 2022/11/03 14:09:14 | Choose scihub url [1]: http://sci-hub.se
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): sci-hub.se:80
2022-11-03 14:09:14 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): sci-hub.se:80
DEBUG:urllib3.connectionpool:http://sci-hub.se:80 "POST / HTTP/1.1" 308 569
2022-11-03 14:09:14 [urllib3.connectionpo




DEBUG:urllib3.connectionpool:https://sci-hub.se:443 "POST / HTTP/1.1" 302 None
2022-11-03 14:09:30 [urllib3.connectionpool] DEBUG: https://sci-hub.se:443 "POST / HTTP/1.1" 302 None
DEBUG:urllib3.connectionpool:https://sci-hub.se:443 "GET /10.3390/s21165657 HTTP/1.1" 200 None
2022-11-03 14:09:30 [urllib3.connectionpool] DEBUG: https://sci-hub.se:443 "GET /10.3390/s21165657 HTTP/1.1" 200 None
[INFO] | 2022/11/03 14:09:30 | <- Request: scihub_url=https://sci-hub.se, source=DoiSource[type=doi, id=10.3390/s21165657]
[INFO] | 2022/11/03 14:09:30 | -> Response: status_code=200, content_length=5826
[INFO] | 2022/11/03 14:09:30 | Choose scihub url [1]: http://sci-hub.se
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): sci-hub.se:80
2022-11-03 14:09:30 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): sci-hub.se:80
DEBUG:urllib3.connectionpool:http://sci-hub.se:80 "POST / HTTP/1.1" 308 569
2022-11-03 14:09:31 [urllib3.connectionpool] DEBUG: http://sci-hub.se:80 "POS

Alternative : https://scihub.copernicus.eu/userguide/BatchScripting

# **Prisma 2nd step - Snippet analysis**

In [23]:
# Second screening: show one ticked checkbox per paper of the first selection
# (reloaded from disk when a previous run is being resumed).
if len(papers_unreviewd) > 0:
    resume_last = review_status == 'continue_last'
    if resume_last:
        papers_fst_sel = pd.read_csv(path_dt+'papers_fst_sel_'+today_dt_str+'.csv')
    idx_sel_val = pd.Series(True, index=papers_fst_sel.title_GS)
    ui_snd = get_it_df(papers_fst_sel.title_GS, idx_sel_val)
    display(ui_snd)

VBox(children=(Checkbox(value=True, description='Neuroscope: An explainable ai toolbox for semantic segmentati…

In [24]:
# Read the state of the second-step checkboxes, keep the ticked papers and save them.
if len(papers_unreviewd) > 0:
  n_fst_sel = papers_fst_sel.shape[0]
  snd_sel = [checkbox.value for checkbox in ui_snd.children[:n_fst_sel]]
  papers_snd_sel = papers_fst_sel[snd_sel]
  papers_snd_sel.to_csv(path_dt+'papers_snd_sel_'+today_dt_str+'.csv')

In [25]:
# Append papers_unreviewd to already reviewd papers and save the cumulative list.
reviewd_csv = root_path+'final_selection/papers_reviewd.csv'
if review_status != 'new' and exists(reviewd_csv):
  # Re-read the saved list instead of relying on a variable of an earlier
  # cell, which is undefined when no reviewed file existed at that point.
  papers_reviewd = pd.concat([papers_unreviewd,pd.read_csv(reviewd_csv)],axis=0)
  # Re-running this cell must not add the same papers twice.
  papers_reviewd = papers_reviewd.drop_duplicates(subset='title_GS')
else:
  papers_reviewd = papers_unreviewd
# index=False: a written index comes back as an extra 'Unnamed: 0' column
# on every later read of the file.
papers_reviewd.to_csv(reviewd_csv, index=False)
# papers_snd_sel only exists when there were papers to review.
if papers_unreviewd.shape[0]>0:
  c_snd_sel = pd.Series(papers_snd_sel.shape[0],index=['Prisma 2nd step'])
  counts = pd.concat([counts,c_snd_sel],axis=0)