[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/restrepo/medicion/blob/master/tools/GoogleScholar.ipynb)

# Google scholar

Search by similarity in title and one author. Returns a Python dictionary.

__WARNING__: Does not work from Colaboratory

In [15]:
#%%writefile ../cienciometria/googlescholar.py
import Levenshtein
import re
from bs4 import BeautifulSoup
import requests

def firefox_get(url):
    '''
    Load `url` in Firefox inside a virtual display and return the
    page source as a string.

    Returns the string 'FAILED' when the browser cannot be started or
    the page cannot be loaded (page-load timeout: 30 seconds).
    The browser and the virtual display are always released, also on
    failure.
    '''
    from pyvirtualdisplay import Display
    from selenium import webdriver

    display = Display(visible=0, size=(800, 600))
    display.start()

    browser = None
    try:
        try:
            browser = webdriver.Firefox()
            #https://stackoverflow.com/a/17536547/2268280
            browser.set_page_load_timeout(30)
            browser.get(url)
        except Exception:
            # Best effort: callers test for this sentinel instead of
            # handling driver-specific exceptions.
            return 'FAILED'
        return browser.page_source
    finally:
        # Runs on every exit path, including the early 'FAILED' returns.
        try:
            if browser is not None:
                # quit() also ends the driver session, not only the window
                browser.quit()
        finally:
            display.stop()

def _gs_title(record):
    '''Return the text of the first `data-clk` anchor of `record` ('' if none).'''
    try:
        pieces = record.find_all('a', {"data-clk": re.compile(".*")})[0].contents
    except (IndexError, AttributeError):
        return ''

    title = ''
    for piece in pieces:
        if isinstance(piece, str):
            # Plain text node (BeautifulSoup strings subclass `str`)
            title += str(piece)
            continue
        # Element node: prefer the accessibility label (used for rendered
        # symbols), otherwise keep the visible text of the element.
        label = piece.get('aria-label')
        title += label if isinstance(label, str) else piece.get_text()
    return title


def _gs_year(field):
    '''Parse a year from `field`; return -1 when it is not an integer.'''
    try:
        return int(field.strip())
    except ValueError:
        return -1


def get_google_scholar(record):
    '''
    Analyse the BeautifulSoup record for an article
    in Google Scholar.

    Output is a Python dictionary. Keys always present:
    'title', 'authors', 'abstract' (empty string when not found).
    Keys present only when the corresponding markup is found:
    'cites', 'cites_link' : from the first link matching /scholar?cites
                            ('cites' is 0, without 'cites_link', when the
                            count cannot be parsed)
    'Journal', 'Year'     : from the `gs_a` line ('Year' is -1 when it
                            cannot be parsed; 'Journal' is '' and 'Year'
                            is absent when the line has a single part)
    'profiles'            : {author name: user id} from /citations?user
                            links inside the `gs_a` line

    The function sleeps between 1 and 3 seconds before returning.
    '''
    import random
    import time
    gsr={}

    # Citations: last token of the link text is taken as the count
    try:
        cites=record.find_all('a',{"href":re.compile( r"/scholar\?cites*" )})[0]
    except (IndexError, AttributeError):
        cites=None
    if cites is not None:
        try:
            gsr['cites']=int( cites.text.split()[-1] )
            gsr['cites_link']=cites.attrs.get('href')
        except (IndexError, ValueError):
            gsr['cites']=0

    # Title
    gsr['title']=_gs_title(record)

    # Explore authors, Journal and Year
    try:
        gpa=record.find_all('div',{"class":"gs_a"})[0]
    except (IndexError, AttributeError):
        gpa=None

    if gpa is None:
        gsr['authors']=''
    else:
        # Split on dashes surrounded by whitespace first, so that hyphens
        # inside names (e.g. "J-P") are kept; fall back to a bare '-'.
        parts=re.split(r'\s+-\s+',gpa.text)
        if len(parts)<2:
            parts=gpa.text.split('-')
        gsr['authors']=parts[0].strip()
        if len(parts)>1:
            fields=parts[1].strip().split(',')
            gsr['Journal']=fields[0]
            # The year is read from the last comma-separated field and
            # parsed with int(): scraped text must never reach eval().
            gsr['Year']=_gs_year(fields[-1]) if len(fields)>1 else -1
        else:
            gsr['Journal']=''

    #Abstract:
    try:
        gsr['abstract']=record.find_all('div',{'class':'gs_rs'})[0].text.replace('\xa0…','')
    except (IndexError, AttributeError):
        gsr['abstract']=''

    # Google Scholar profiles of the authors: the id is the value of the
    # `user` parameter in the link
    if gpa is not None:
        gsr['profiles']={
            pr.text: pr.attrs.get('href').split('?')[-1].split('&')[0].split('user=')[-1]
            for pr in gpa.find_all("a",{ "href":re.compile(r"/citations\?user=*") })
        }

    # NOTE(review): this pause happens once per parsed record, not once
    # per HTTP request -- confirm that is the intended rate limiting.
    time.sleep( random.randint(1,3)  ) # avoid robots
    return gsr

def google_scholar_url(url):
    '''
    Convert a Google Scholar page into a list
    of dictionaries with metadata info.

    The page is downloaded with `requests`; every <div class="gs_ri">
    element found is passed to `get_google_scholar`. A page without
    such elements gives an empty list.
    '''
    # Without a timeout `requests` may wait forever for the server
    r=requests.get(url,timeout=30)

    soup = BeautifulSoup(r.text, "html.parser")

    return [ get_google_scholar(record)
             for record in soup.find_all('div', {'class':'gs_ri' }) ]

def google_scholar_query(title='relativity theory',author="A Einstein",
                         DOI=None,DEBUG=False):
    '''
    Search Google Scholar for similarity in title and author.
    Only the first result is analyzed. The output includes
    a quality measurement between the query and the result.

    If `DOI` is given, the search uses only the quoted DOI and no
    quality keys are added.

    Output is a Python dictionary with the keys produced by
    `get_google_scholar` plus, when `DOI` is not given,
    'quality_title' and 'quality_author' (Levenshtein ratios rounded
    to two decimals; for the author the best ratio over the
    comma-separated author list is used).
    An empty dictionary is returned when the page cannot be loaded or
    has no results.
    With DEBUG=True the URL is printed and, when a result exists, the
    tuple (dictionary, BeautifulSoup record) is returned.
    '''
    from urllib.parse import urlencode

    # Query text: "<title>"+<author>  or  "<DOI>". urlencode escapes it,
    # so characters such as '&', '#' or '+' cannot break the URL
    # (the literal + becomes %2B).
    if DOI:
        query='"{}"'.format(DOI)
    else:
        query='"{}"+{}'.format(title,author)
    url='https://scholar.google.com/scholar?'+urlencode({'q':query})
    if DEBUG:
        print(url)

    rtext=firefox_get(url)
    if rtext=='FAILED':
        # Sentinel returned by firefox_get: nothing to parse
        return {}

    soup = BeautifulSoup(rtext, "html.parser")

    rgs=soup.find_all('div', {'class':'gs_ri' })
    if not rgs:
        return {}

    record=rgs[0]
    gs={}
    gs.update(get_google_scholar(record))

    if not DOI:
        # Best match of the requested author against each listed author
        sau=max( ( Levenshtein.ratio(author.lower(),a.lower().strip())
                   for a in gs['authors'].split(',') ), default=0 )
        gs['quality_author']=round(sau,2)

        gs['quality_title']=round( Levenshtein.ratio(
                   title.lower(),gs['title'].lower() ),2 )

    if DEBUG:
        return gs,record
    return gs

In [12]:
# Example: query by title and author; the resulting dictionary is kept in `gs`
gs=google_scholar_query(title='Fermionic triplet dark matter in an SO(10)-inspired left right model',
              author='D Restrepo',DEBUG=False)

In [13]:
gs

{'Journal': 'Physical Review D',
 'Year': 2017,
 'abstract': 'We study a left right (LR) extension of the Standard Model (SM) where the Dark Matter (DM) candidate is composed of a set of fermionic Majorana triplets. The DM is stabilized by a remnant Z 2 symmetry from the breaking of the LR group to the SM. Two simple scenarios where the DM particles plus a certain set of extra fields lead to gauge coupling unification with a low LR scale are explored. The constraints from relic density and predictions for direct detection are discussed for both scenarios. The first scenario with a SU (2) R vectorlike',
 'authors': 'C Arbeláez, M Hirsch, D Restrepo',
 'cites': 7,
 'cites_link': '/scholar?cites=10539747452972188249&as_sdt=400005&sciodt=0,14&hl=en',
 'profiles': {'D Restrepo': '1sKULCoAAAAJ'},
 'quality_author': 1.0,
 'quality_title': 0.99,
 'title': 'Fermionic triplet dark matter in an SO(10)-inspired left-right model'}

In [6]:
1+1

2

## Get citations

In [10]:
# Absolute URL of the citing-articles page, built from the relative link in `gs`.
# NOTE: if `gs` has no 'cites_link' key, .get() returns None and the URL ends in "None".
url='https://scholar.google.com{}'.format(gs.get('cites_link'))

In [11]:
import pandas as pd
# Parse the citing-articles page and show one row per record
pd.DataFrame(google_scholar_url( url))

Unnamed: 0,Journal,Year,abstract,authors,cites,cites_link,profiles,title
0,Journal of High Energy …,2017,A bstract We consider a simple extension of th...,"D Borah, A Dasgupta, UK Dey, S Patra…",16.0,/scholar?cites=11724525136304649044&as_sdt=200...,"{'A Dasgupta': '62NjhqsAAAAJ', 'S Patra': '06C...",Multi-component fermionic dark matter and IceC...
1,arXiv preprint arXiv …,2017,We perform a detailed study of the grand unifi...,"J Chakrabortty, R Maji, S Mohanty, SK Patra…",2.0,/scholar?cites=10083320134459830263&as_sdt=200...,"{'J Chakrabortty': 'eg1qeFoAAAAJ', 'S Mohanty'...",Roadmap of left-right models rooted in GUT
2,Physics Letters B,2018,Abstract An extra SU (2) D gauge factor is add...,E Ma,1.0,/scholar?cites=1113676709270999285&as_sdt=2005...,{},[SU (2)] 3 dark matter
3,Journal of High Energy …,2018,A bstract In the framework of the (B− L) Super...,"L Delle Rose, S Khalil, SJD King, S Kulkarni…",1.0,/scholar?cites=12710934755284528743&as_sdt=200...,{'S Khalil': '7T5M4SkAAAAJ'},Sneutrino Dark Matter in the BLSSM
4,Physical Review D,2018,We perform a detailed study of the grand unifi...,"J Chakrabortty, R Maji, SK Patra, T Srivastava…",2.0,/scholar?cites=7825805590019213068&as_sdt=2005...,"{'J Chakrabortty': 'eg1qeFoAAAAJ', 'SK Patra':...",Roadmap of left-right models based on GUTs
5,arXiv preprint arXiv:1803.03891,2018,In any gauge extension of the standard model (...,E Ma,,,{},Predestined Dark Matter in Gauge Extensions of...
6,Journal of Physics G: Nuclear …,2018,We demonstrate how a class of non-supersymmetr...,"M Sruthilaya, R Mohanta, S Patra",,,"{'S Patra': '06CJvKUAAAAJ', 'R Mohanta': 'VTe7...",Neutrino mass and neutrinoless double beta dec...


#  Search by DOI

In [17]:
# Query by DOI: title and author are not used and no quality_* keys are returned
google_scholar_query(DOI='10.1186/1556-276x-7-586')

{'Journal': 'Nanoscale research\xa0…',
 'Year': 2012,
 'abstract': '… Keywords. Impurities Quantum well Dilute nitride. Electronic supplementary material. The online\nversion of this article (doi:10.1186/1556-276X-7-586) contains supplementary material, which\nis available to authorized users. Download fulltext PDF. Review. Background \n',
 'authors': 'U Yesilgul, F Ungan, S Şakiroğlu, C Duque…',
 'cites': 11,
 'cites_link': '/scholar?cites=759647463462126119&as_sdt=400005&sciodt=0,14&hl=en',
 'profiles': {'C Duque': 'emE-TscAAAAJ', 'F Ungan': '-3R6EmMAAAAJ'},
 'title': 'The effect of magnetic field on the impurity binding energy of shallow donor impurities in a GaInNAs\xa0…'}

In [14]:
# Another DOI query; the output recorded below has no 'cites'/'cites_link' keys
google_scholar_query(DOI='10.1103/physrevd.98.055016')

{'Journal': 'Physical Review D',
 'Year': 2018,
 'abstract': 'Page 1. Probing a simplified W0 model of RðDðÃÞÞ anomalies using b tags, τ\nleptons, and missing energy Mohammad Abdullah,1 Julián Calle,3 Bhaskar\nDutta,1 Andrés Flórez,2 and Diego Restrepo3 1Mitchell Institute for \n',
 'authors': 'M Abdullah, J Calle, B Dutta, A Flórez, D Restrepo',
 'profiles': {'A Flórez': 'SUG6ga0AAAAJ', 'D Restrepo': '1sKULCoAAAAJ'},
 'title': 'Probing a simplified W′ model of R(D(*)) anomalies using b tags, τ leptons, and missing energy'}

In [22]:
# Manual check of a DOI query using plain `requests` instead of firefox_get
url='https://scholar.google.com/scholar?q="{}"'.format('10.1186/s40104-017-0225-x')
r=requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
# Result blocks of the page: <div class="gs_ri"> elements
rgs=soup.find_all('div', {'class':'gs_ri' })

In [40]:
# DOI query whose recorded output is an empty dictionary (no record was parsed)
google_scholar_query(DOI='10.1038/s41598-017-12316-4')

{}

In [31]:
import unidecode

In [34]:
# Transliterate the non-breaking space and the ellipsis character to plain ASCII
unidecode.unidecode( 'Journal of animal\xa0…' )

'Journal of animal ...'

In [58]:
import pandas as pd

In [62]:
# Read the spreadsheet and write it to a new file without the index column
pd.read_excel('../cienciometria/gs.xlsx').to_excel('gs3.xlsx',index=False)

In [7]:
1+1

2

In [57]:
0%40

0