[View in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/tools/GoogleScholar.ipynb)

# Google scholar

Search for similarity in title and one author. Returns a Python dictionary.

__WARNING__: Does not work from Colaboratory

In [1]:
import Levenshtein
import re
from bs4 import BeautifulSoup
import requests

In [26]:
def get_google_scholar(record):
    '''
    Analyse the BeautifulSoup record for one article in Google Scholar.

    record: BeautifulSoup element holding a single search result
        (anything offering find/find_all over the result's HTML).

    Output is a Python dictionary that always has the keys:
    'title'      -> str, '' when no title link is found
    'authors'    -> str, '' when the 'gs_a' line is missing
    'profiles'   -> dict {author name: Google Scholar user id}
    'Journal'    -> str, '' when it cannot be extracted
    'Year'       -> int, -1 when it cannot be parsed
    'abstract'   -> str, '' when the 'gs_rs' snippet is missing
    'cites'      -> int, 0 when there is no parsable citation count
    'cites_link' -> str, '' when there is no citations link

    The function sleeps 1 to 3 seconds before returning.
    '''
    import random
    import time

    # Start from defaults so every documented key is present even when
    # part of the record is missing.
    gsr = {'title': '', 'authors': '', 'profiles': {}, 'Journal': '',
           'Year': -1, 'abstract': '', 'cites': 0, 'cites_link': ''}

    # Citations link: the count is the last word of the link text
    cites = record.find('a', {"href": re.compile(r"/scholar\?cites*")})
    if cites is not None:
        gsr['cites_link'] = cites.attrs.get('href')
        try:
            gsr['cites'] = int(cites.text.split()[-1])
        except (IndexError, ValueError):
            gsr['cites'] = 0

    # Title: first link carrying a data-clk attribute
    title_link = record.find('a', {"data-clk": re.compile(".*")})
    lstt = title_link.contents if title_link is not None else []

    # Find title interpreting weird characters
    pieces = []
    for tip in lstt:
        if isinstance(tip, str):
            # Plain text node (BeautifulSoup text nodes subclass str)
            pieces.append(str(tip))
        else:
            # Element node: prefer its aria-label (Accessibility text),
            # otherwise fall back to the visible text so that elements
            # without the attribute do not break the concatenation.
            label = tip.get('aria-label')
            pieces.append(label if label is not None else tip.get_text())
    gsr['title'] = ''.join(pieces)

    # Explore authors, Journal and Year from the 'gs_a' line, which is
    # read as "authors - journal, year - ..."
    gpa = record.find('div', {"class": "gs_a"})
    if gpa is not None:
        fields = gpa.text.split('-')
        gsr['authors'] = fields[0].strip()
        if len(fields) > 1:
            jy = fields[1].strip().split(',')
            gsr['Journal'] = jy[0]
            if len(jy) > 1:
                # int() instead of eval(): never execute scraped text
                try:
                    gsr['Year'] = int(jy[1])
                except ValueError:
                    gsr['Year'] = -1

    # Abstract:
    snippet = record.find('div', {'class': 'gs_rs'})
    if snippet is not None:
        gsr['abstract'] = snippet.text.replace('\xa0…', '')

    # Google Scholar profiles: map link text to the value of the
    # 'user=' parameter of each profile link
    if gpa is not None:
        profile_links = gpa.find_all(
            "a", {"href": re.compile(r"/citations\?user=*")})
        for pr in profile_links:
            query = pr.attrs.get('href').split('?')[-1]
            gsr['profiles'][pr.text] = query.split('&')[0].split('user=')[-1]

    time.sleep(random.randint(1, 3))  # avoid robots
    return gsr

In [27]:
def google_scholar_query(title='relativity theory',author="A Einstein",DEBUG=False):
    '''
    Search Google Scholar for similarity in title and author.
    Only the first result is analyzed. The output includes
    quality measurements between the query and the result.

    title:  article title, searched as an exact (quoted) phrase
    author: one author name, appended to the query with '+'
    DEBUG:  if True, print the query URL and also return the raw
            BeautifulSoup record of the first result

    Output is a Python dictionary with the keys produced by
    get_google_scholar plus:
    'quality_title'  -> Levenshtein ratio (2 decimals) between the
                        lower-cased query title and result title
    'quality_author' -> best Levenshtein ratio (2 decimals) between
                        the lower-cased query author and each
                        comma-separated author of the result
    With DEBUG=True the output is the tuple (dictionary, record).

    Raises IndexError when the response holds no search result.
    '''
    from urllib.parse import quote

    # Percent-encode the whole query ('"' → %22, '+' → %2B, ...) so that
    # characters such as '&' or '#' in the title or author cannot
    # truncate or alter it.
    query = '"{}"+{}'.format(title, author)
    url = 'https://scholar.google.com/scholar?q={}'.format(quote(query, safe=''))
    if DEBUG:
        print(url)
    # Timeout so an unresponsive server cannot block the call forever
    r = requests.get(url, timeout=30)

    soup = BeautifulSoup(r.text, "html.parser")

    rgs = soup.find_all('div', {'class': 'gs_ri'})
    if not rgs:
        # Same exception type as indexing the empty list, with a
        # message that explains what happened.
        raise IndexError(
            'no Google Scholar result found in the response for: {}'.format(url))

    gs = {}

    record = rgs[0]
    gs.update(get_google_scholar(record))

    # Best match of the requested author against each listed author
    wanted = author.lower()
    sau = max(
        (Levenshtein.ratio(wanted, a.lower().strip())
         for a in gs['authors'].split(',')),
        default=0,
    )
    gs['quality_author'] = round(sau, 2)

    gs['quality_title'] = round(
        Levenshtein.ratio(title.lower(), gs['title'].lower()), 2)

    if DEBUG:
        return gs, record
    return gs

In [28]:
# Example: look up one article by its title and one of its authors
r = google_scholar_query(
    title='Fermionic triplet dark matter in an SO(10)-inspired left right model',
    author='D Restrepo',
    DEBUG=False,
)

In [29]:
# Display the dictionary stored in r
r

{'Journal': 'Physical Review D',
 'Year': 2017,
 'abstract': 'We study a left right (LR) extension of the Standard Model (SM) where the Dark Matter (DM) candidate is composed of a set of fermionic Majorana triplets. The DM is stabilized by a remnant Z 2 symmetry from the breaking of the LR group to the SM. Two simple scenarios where the DM particles plus a certain set of extra fields lead to gauge coupling unification with a low LR scale are explored. The constraints from relic density and predictions for direct detection are discussed for both scenarios. The first scenario with a SU (2) R vectorlike',
 'authors': 'C Arbeláez, M Hirsch, D Restrepo',
 'cites': 7,
 'cites_link': '/scholar?cites=10539747452972188249&as_sdt=2005&sciodt=0,5&hl=en&oe=ASCII',
 'profiles': {'D Restrepo': '1sKULCoAAAAJ'},
 'quality_author': 1.0,
 'quality_title': 0.99,
 'title': 'Fermionic triplet dark matter in an SO(10)-inspired left-right model'}