# Google Scholar 
Try to avoid robot detection by explicitly clicking on the previously loaded
page.

See: https://academia.stackexchange.com/q/2567

## Choose between selenium or requests

In [1]:
# Entry URL; with CITES=True it points at a "cited by" listing instead of the home page.
url='https://scholar.google.com'
CITES=False
if CITES:
    url=url+'/scholar?cites=16729892078990709043&as_sdt=2005&sciodt=0,5&hl=es'
SELENIUM=True
if SELENIUM:
    #pip3 install webdriver_manager
    from webdriver_manager.firefox import GeckoDriverManager
    from selenium import webdriver
    from pathlib import Path
    home = str(Path.home())
    # geckodriver binary previously downloaded by webdriver_manager, located
    # relative to the current user's home instead of a hard-coded account.
    cached_geckodriver = str(
        Path(home)/'.wdm/geckodriver/v0.23.0/linux64/geckodriver')
    try:
        browser = webdriver.Firefox(executable_path=cached_geckodriver)
    except Exception:
        # Cached driver missing or unusable: let webdriver_manager (re)install
        # it. `Exception` (not a bare except) keeps Ctrl-C working.
        browser = webdriver.Firefox(
            executable_path= GeckoDriverManager().install())

    browser.get(url)

In [2]:
if not SELENIUM:
    import requests
    # Browser-like headers sent with every plain-HTTP request.
    headers_Get = dict([
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-US,en;q=0.5'),
        ('Accept-Encoding', 'gzip, deflate'),
        ('DNT', '1'),
        ('Connection', 'keep-alive'),
        ('Upgrade-Insecure-Requests', '1'),
    ])
    # One shared session for all queries; `browser=None` makes the helper
    # functions take the requests branch.
    r = requests.Session()
    browser = None

In [2]:
#import googlescholar as gs
#%%writefile ../cienciometria/googlescholar.py
import Levenshtein
import re
from bs4 import BeautifulSoup
import requests

def firefox_get(url,browser=browser):
    '''
    Run a Google Scholar query and return the HTML of the result page.

    With a selenium `browser`, the query text is typed into the search box
    of the page that is already loaded and the search button is clicked;
    otherwise `url` is fetched with the module-level requests session `r`
    and the headers `headers_Get`.

    Parameters
    ----------
    url : str
        Query URL of the form '...scholar?q=<query>'.
    browser : selenium webdriver or None
        The default is the value the global `browser` had when this
        function was defined.

    Returns
    -------
    str
        Page HTML, or the literal string 'FAILED' if the interaction or
        the request raised. If the page contains the captcha form marker,
        the function blocks on `input()` so the captcha can be solved
        by hand before continuing.
    '''
    try:
        if browser:
            # Everything after the first 'q=' is the query. Splitting on a
            # bare '=' would truncate queries that themselves contain '='
            # (e.g. a title with "N=2").
            q=url.split('q=',1)[-1]
            search_box=browser.find_element_by_id("gs_hdr_tsi")
            search_box.clear()
            search_box.send_keys(q)
            browser.find_element_by_id('gs_hdr_tsb').click()
        else:
            rget=r.get(url,headers=headers_Get)
    except Exception:
        # Best effort: callers parse the returned text and simply find no
        # records. `Exception` (not a bare except) keeps Ctrl-C working.
        return 'FAILED'

    html=browser.page_source if browser else rget.text
    if 'gs_captcha_f' in html.lower():
        input('check robots')
    #if html.find('Sorry')>-1:
    #    input('check sorry')

    return html

def get_google_scholar(record):
    '''
    Analyse the BeautifulSoup record of one article in a Google Scholar
    result page.

    Parameters
    ----------
    record : object exposing ``find_all(name, attrs)``
        The callers in this notebook pass the ``div`` tags of class
        ``gs_ri``.

    Returns
    -------
    dict
        Best-effort metadata. 'title', 'authors', 'ref' and 'abstract' are
        always present (possibly ''). 'cites' and 'cites_link' appear when
        a "cited by" link is found; 'Journal', 'Year' (-1 when it cannot
        be parsed) and 'profiles' (name -> Scholar user id) appear when
        the author line is found.

    The function sleeps 1-3 s before returning, to throttle scraping.
    '''
    import random
    import time
    gsr={}

    # "Cited by N" link
    try:
        cites=record.find_all('a',{"href":re.compile( r"/scholar\?cites*" )})[0]
    except Exception:
        cites=None
    if cites is not None:
        try:
            gsr['cites']=int( cites.text.split()[-1] )
            gsr['cites_link']=cites.attrs.get('href')
        except Exception:
            gsr['cites']=0

    # Title
    try:
        lstt=record.find_all('a',{"data-clk":re.compile( ".*")})[0].contents
    except Exception:
        lstt=[]

    # Rebuild the title: plain text nodes are copied, embedded tags
    # contribute their accessibility label (aria-label) when they have one.
    # Text nodes are recognised by type, so a title that merely contains
    # the letters "svg" is no longer mistaken for a tag.
    pieces=[]
    for tip in lstt:
        if isinstance(tip,str):
            pieces.append(str(tip))
        else:
            label=tip.get('aria-label')
            if isinstance(label,str):
                pieces.append(label)
    gsr['title']=''.join(pieces)

    # Explore authors, google scholar profile, Journal and Year
    # NOTE(review): fields are split on every '-', so hyphenated names or
    # journal titles are cut short -- confirm whether that matters upstream.
    gpa=None
    try:
        gpa=record.find_all('div',{"class":"gs_a"})[0]
        gsr['ref']=gpa.text.strip()
        gsr['authors']=gpa.text.split('-')[0].strip()
        try:
            jy=gpa.text.split('-')[1].strip()
            gsr['Journal']=jy.split(',')[0]
            try:
                # int() instead of eval(): this is scraped web text and
                # must never be executed as code.
                gsr['Year']=int(jy.split(',')[1])
            except (IndexError,ValueError):
                gsr['Year']=-1
        except Exception:
            gsr['Journal']=''
    except Exception:
        gsr['authors']=''
        gsr['ref']=''

    # Abstract
    try:
        gsr['abstract']=record.find_all('div',{'class':'gs_rs'})[0].text.replace('\xa0…','')
    except Exception:
        gsr['abstract']=''

    # Google Scholar profiles linked from the author line
    if gpa:
        lpr=gpa.find_all("a",{ "href":re.compile(r"/citations\?user=*")   } )
        prf={}
        for pr in lpr:
            prf[ pr.text ]=pr.attrs.get('href').split('?')[-1].split('&')[0].split('user=')[-1]
        gsr['profiles']=prf

    time.sleep( random.randint(1,3)  ) # avoid robots
    return gsr

def request_google_scholar_url(url):
    '''Issue a plain ``requests.get`` for `url` and hand back the response.'''
    response = requests.get(url)
    return response

def google_scholar_page(html):
    '''
    Parse the HTML of a Google Scholar result page.

    Blocks on ``input()`` first if the page carries the captcha form
    marker. Returns a list with one metadata dictionary (as built by
    `get_google_scholar`) per ``div`` of class ``gs_ri``.
    '''
    captcha_found = 'gs_captcha_f' in html.lower()
    if captcha_found:
        input('check robots')

    parsed = BeautifulSoup(html, "html.parser")
    return [get_google_scholar(entry)
            for entry in parsed.find_all('div', {'class':'gs_ri' })]

def google_scholar_query(title='relativity theory',author="A Einstein",
                         DOI=None,DEBUG=False):
    '''
    Query Google Scholar by DOI, or by quoted title plus author, and
    analyse the first hit only.

    Returns the metadata dictionary of that hit ({} when there is none).
    For title/author queries two similarity scores (Levenshtein ratio,
    rounded to 2 decimals) are added under 'quality_author' (best match
    over the comma-separated author list) and 'quality_title'.
    With DEBUG=True the query URL is printed and, when a hit exists, the
    tuple (dictionary, raw record) is returned instead.
    '''
    # "+" is written as %2B inside the query string.
    url=('https://scholar.google.com/scholar?q="{}"'.format(DOI)
         if DOI else
         'https://scholar.google.com/scholar?q="{}"%2B{}'.format(title,author))
    if DEBUG:
        print(url)

    page = BeautifulSoup(firefox_get(url), "html.parser")
    hits = page.find_all('div', {'class':'gs_ri' })
    if not hits:
        return {}

    record = hits[0]
    gs = dict(get_google_scholar(record))

    # Best similarity between the requested author and any listed author.
    wanted = author.lower()
    best = 0
    for candidate in gs['authors'].split(','):
        best = max(best, Levenshtein.ratio(wanted, candidate.lower().strip()))

    if not DOI:
        gs['quality_author'] = round(best,2)
        gs['quality_title'] = round(
            Levenshtein.ratio(title.lower(), gs['title'].lower()), 2)

    return (gs,record) if DEBUG else gs

def _format_page_refs(records):
    '''Render one page of parsed records as newline-terminated "title; ref" lines.'''
    if not records:
        return ''
    return '\n'.join(rec.get('title','')+'; '+rec.get('ref','')
                     for rec in records)+'\n'

def get_cites_refs(browser,url,maxcites=65,t=60):
    '''
    Collect the references listed in a Google Scholar "cited by" listing.

    Parameters
    ----------
    browser : selenium webdriver used to load and page through the listing.
    url : str
        Path of the listing; it is appended to 'https://scholar.google.com'.
    maxcites : int
        Bounds the paging: at most int(maxcites/10)+1 clicks on "next".
    t : number
        Nominal pause in seconds; the function sleeps a random time in
        [0.9*t, 1.1*t] before returning.

    Returns
    -------
    str
        One "title; ref" line per record found, over all visited pages.
        A page that yields no records adds nothing and no longer wipes
        the lines already collected.
    '''
    import random
    import time
    url='https://scholar.google.com'+url
    browser.get(url)

    endpage=int(maxcites/10)+1

    # First page
    refs=_format_page_refs( google_scholar_page( browser.page_source ) )

    # Following pages: stop as soon as "next" cannot be clicked or parsed.
    for _ in range(endpage):
        try:
            browser.find_element_by_class_name('gs_ico_nav_next').click()
            kk=google_scholar_page( browser.page_source )
        except Exception:
            break
        refs=refs+_format_page_refs(kk)

    time.sleep(random.uniform(0.9*t,1.1*t))
    return refs

In [3]:
import wosplus as wp

In [4]:
%%writefile drive.cfg
[FILES]
DOIS.xlsx               = 1bikNT7Gmp4G7dfeMuGsF-az7D8lskK0O
UDEA_WOS_SCI_SCP.xlsx   = 1o9otmklgh-0w18Avv2ZTKOXr3vZbjwvj
oaudea.xlsx             = 1CcwobiEFACIbffNzNdLxpdxQukr8cZ5x
datos1.csv              = 11CyLRZZwVbgw6YAC-igRJ3mrkIwk0aaiXnd-EOofYTI

Overwriting drive.cfg


In [5]:
oa=wp.wosplus('drive.cfg')

In [6]:
oaudea=oa.read_drive_excel('oaudea.xlsx')

In [8]:
oaudea=wp.fill_NaN(oaudea)

In [6]:
import time
import random
import pandas as pd

In [14]:
google_scholar_query(DOI='10.1155/2015/978379')

{'Journal': 'Advances in\xa0…',
 'Year': 2015,
 'abstract': '… Sciences Volume 2015 (2015), Article ID 978379, 8 pages http://dx.doi.org/10.1155/2015/978379\nResearch Article. Antileishmanial Effect of 5,3′-Hydroxy-7,4′-dimethoxyflavanone of Picramnia\ngracilis Tul. (Picramniaceae) Fruit: In Vitro and In Vivo Studies \n',
 'authors': 'SM Robledo, W Cardona, K Ligardo, J Henao…',
 'cites': 4,
 'cites_link': '/scholar?cites=11789187670417306170&as_sdt=2005&sciodt=0,5&hl=es',
 'profiles': {},
 'title': 'Antileishmanial Effect of 5, 3′-Hydroxy-7, 4′-dimethoxyflavanone of Picramnia gracilis Tul.(Picramniaceae) Fruit: In Vitro and In Vivo Studies'}

In [29]:
oaudea[oaudea['OA_Green']=='10.1590/s0103-84782009000100018']

Unnamed: 0,DI,Open_Access,OA_Yellow,OA_Green,ISSNs,OA_Green_Article,ISSNWOS,APC,OAGISSN,APC_USD
606,10.1590/s0103-84782009000100018,10.1590/s0103-84782009000100018,,10.1590/s0103-84782009000100018,0103-8478,,"0103-8478,1678-4596",700BRL,0103-8478,188


In [22]:
oaudea.shape

(7999, 10)

In [14]:
oaudea[oaudea['OA_Yellow']!=''].shape

(1505, 10)

In [7]:
REDALYC=True

In [8]:
# Throttling parameters for the Google Scholar query loops.
# Start the DOI loop only after the REDALYC loop has finished.
nini=409# index of the first DOI to query
n=1506 # total number of DOIs
T=12 # hours budgeted for the search
t=T/n*3600 # [s] time per query implied by the budget above
t=60 # [s] fixed query time; overrides the value computed on the previous line
day=24*3600 # [s] seconds in one day
mintime=0.9*t*n # [s] minimal total search time (0.9*t per query)
wait=30# [s] maximum wait; formerly day-mintime

In [9]:
if REDALYC:
    nini=0

In [24]:
oaudea[oaudea['OA_Yellow']!=''].reset_index(drop=True)[1502:1505]

Unnamed: 0,DI,Open_Access,OA_Yellow,OA_Green,ISSNs,OA_Green_Article,ISSNWOS,APC,OAGISSN,APC_USD
1502,10.17533/udea.ikala.v19n3a05,10.17533/udea.ikala.v19n3a05,10.17533/udea.ikala.v19n3a05,,,,0123-3432,,,0
1503,10.11144/javeriana.mavae9-2.eacv,10.11144/javeriana.mavae9-2.eacv,10.11144/javeriana.mavae9-2.eacv,,,,1794-6670,,,0
1504,10.11144/javeriana.rgyps13-26.eaht,10.11144/javeriana.rgyps13-26.eaht,10.11144/javeriana.rgyps13-26.eaht,,,,1657-7027,,,0


In [30]:
dfgsn=pd.read_json('yellowno.json')

In [31]:
# Load the REDALYC records and derive simplified title/author columns.
if REDALYC:
    import unidecode
    datos=pd.read_json('redaylicmiising.json')
    # title_simple: lower-cased title with newlines replaced by spaces,
    # transliterated with unidecode and stripped.
    # NOTE(review): the second pattern ('\\\\n') targets a literal
    # backslash + 'n' only if str.replace treats it as a regex, which
    # depends on the pandas default for `regex` -- confirm.
    datos['title_simple']=datos.title.str.lower().str.replace(
    '\n',' ').str.replace('\\\\n',' ').map(unidecode.unidecode).str.strip()
    # name_simple: first word of `name` + ' ' + first word of `lastname`,
    # both lower-cased and taken from the text between the first pair of
    # apostrophes (presumably the fields hold quoted, list-like strings --
    # TODO confirm).
    datos['name_simple']=datos.name.str.split("'").str[1].str.strip().str.lower().str.split().str[0].str.strip()+' '+datos.lastname.str.split("'").str[1].str.replace('\\\\n',' '
        ).str.strip().str.lower().str.split(' ').str[0]
    print('SKYP to next cell')

SKYP to next cell


In [None]:
# Query Google Scholar once per DOI and checkpoint the results to
# 'gas.json' after every query, so an interrupted run loses nothing.
# Rows are collected in a list and the frame is rebuilt from it:
# DataFrame.append no longer exists in pandas >= 2.0.
gs_records=[]
dfgs=pd.DataFrame()
i=1
#oaudea[oaudea['OA_Yellow']!='']['OA_Yellow']
for doi in dfgsn.DOI.values[nini:nini+n]:
    print(i,doi)
    gsd=google_scholar_query(DOI=doi)
    if not gsd:
        print('WARNING: Not found')
    gsd['DOI']=doi
    gs_records.append(gsd)
    dfgs=pd.DataFrame(gs_records)
    dfgs.to_json('gas.json')
    time.sleep(random.uniform(0.9*t,1.1*t))  # throttle between queries
    if i%n==0:
        time.sleep(wait)  # extra pause after every n queries
    i=i+1

In [None]:
#REDALYC
# Query Google Scholar by DOI for every REDALYC record from `nini` on,
# keeping the original title next to the result and checkpointing to
# 'gsta.json' after every query.
# Rows are collected in a list and the frame is rebuilt from it:
# DataFrame.append no longer exists in pandas >= 2.0.
redalyc_records=[]
dfgs=pd.DataFrame()
for ii in datos[nini:].index:
    print(ii,datos.loc[ii,'name_simple'])
    # Alternative query by title/author instead of DOI:
    #   google_scholar_query(title =datos.loc[ii,'title_simple'],
    #                        author=datos.loc[ii,'name_simple'])
    gsd=google_scholar_query(DOI=datos.loc[ii,'doi'])
    gsd['old_title']=datos.loc[ii,'title']
    redalyc_records.append(gsd)
    dfgs=pd.DataFrame(redalyc_records)
    dfgs.to_json('gsta.json')
    time.sleep(random.uniform(0.9*t,1.1*t))  # throttle between queries

In [21]:
rdlyc=pd.read_json('cites_refs_missing.json').sort_index().fillna('').reset_index(drop=True)

In [19]:
refs=get_cites_refs(browser,url,maxcites=65)

In [22]:
refs

"A longitudinal study assessing the maintenance condition of cadres of four types of wheelchairs provided in low-resource areas; K Rispin, K Riseling, J Wee\xa0- Disability and Rehabilitation\xa0…, 2018 - Taylor & Francis\nInnovations for Evaluation Research: Multiform Protocols, Visual Analog Scaling, and the Retrospective Pretest–Posttest Design; R Chang, TD Little\xa0- Evaluation & the health professions, 2018 - journals.sagepub.com\nWinning the war on state-sponsored propaganda; E Murrock, J Amulya, M Druckman, T Liubyva - irex.org\nVisual analogue mood scale scores in healthy young versus older adults; L Machado, LM Thompson, CHR Brett\xa0- International psychogeriatrics, 2018 - cambridge.org\nFOAM EM RSS; S Sick - foamem.com\nFOAM EM RSS; C McLeod - foamem.com\nToward a better measure: Integrating noncognitive constructs using Action-Control Theory; R Chang - 2016 - ttu-ir.tdl.org\nSelf-reported pain scales; C McLeod\xa0- Don't Forget The Bubbles, 2018 - dontforgetthebubbles.com\

In [23]:
rdlyc[rdlyc.GS_cites_link!=''].GS_cites_link[0]

'/scholar?cites=11746986714791086750&as_sdt=2005&sciodt=0,5&hl=en'

In [None]:
# Fetch the citing references for every record that has a "cited by"
# link, checkpointing to 'cites_refs.json' after each one.
# Rows are collected in a list and the frame is rebuilt from it:
# DataFrame.append no longer exists in pandas >= 2.0.
ini=0
maxcites=65
cites_records=[]
df=pd.DataFrame()
i=1
for url in rdlyc[rdlyc.GS_cites_link!=''].GS_cites_link[ini:]:
    print(i)
    i=i+1
    refs=get_cites_refs(browser,url,maxcites=maxcites)
    cites_records.append({'GS_cites_link':url,'cites_refs':refs})
    df=pd.DataFrame(cites_records)
    df.to_json('cites_refs.json')
    time.sleep(random.uniform(0.9*t,1.1*t))  # throttle between records

1
check robots
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [37]:
rdlyc[rdlyc.GS_cites_link!=''].GS_cites_link[ini:].shape

(148,)

In [None]:
refs=get_cites_refs(browser,url,maxcites=65)

In [33]:
cp gsta.json gsta_missingdois.json

In [29]:
rd=pd.read_json('gas.json')

In [68]:
pd.DataFrame().append({},ignore_index=True)

0


See: https://stackoverflow.com/a/43413418/2268280

In [54]:
# Guarded one-off export: set SAVERUN=True to split `dfgs` with
# wp.df_split and write the two parts to JSON.
SAVERUN=False
if SAVERUN:
    # presumably dfgsy holds the rows whose title is not '' and dfgsn the
    # remaining ones -- confirm against wosplus.df_split
    dfgsy,dfgsn=wp.df_split( dfgs,on='title',Operator='!=',condition='')
    dfgsy.to_json('yellow1.json')
    dfgsn.to_json('yellowno.json')

array(['10.7705/biomedica.v37i3.4058', '10.17533/udea.esde.v73n162a06',
       '10.24201/hm.v64i4.3123', ..., '10.17533/udea.ikala.v19n3a05',
       '10.11144/javeriana.mavae9-2.eacv',
       '10.11144/javeriana.rgyps13-26.eaht'], dtype=object)

In [6]:
import pandas as pd

In [7]:
kk=pd.read_excel('gs_516_560.xlsx').append(pd.read_excel('gs_560_844'))

In [11]:
516+284

800

In [13]:
kk[1:284].to_excel('gs_516_800.xlsx',index=False)

In [24]:
adfgs=wp.fill_NaN(dfgs)

In [25]:
dfgs[:41].to_excel('gs1.xlsx',index=False)

In [8]:
dfgs[:50]

NameError: name 'dfgs' is not defined

# Consultar  DOAJ API con DOI
Ver: https://doaj.org/api/v1/docs#!/Search/get_api_v1_search_articles_search_query

## Carga navegador JSON

In [0]:
import pandas as pd

## Consulta

In [0]:
# DOI of the article to look up in DOAJ.
DOI='10.1016/j.physletb.2018.01.009'
# Read the JSON answer of the DOAJ article-search endpoint straight into a
# DataFrame; the DOI is inserted into the URL path as-is.
df=pd.read_json('https://doaj.org/api/v1/search/articles/{}'.format(DOI))

## Análisis de resultado:
Nota: Las celdas con estructuras anidadas (que comienzan con `[` o `{`) se analizan secuencialmente

In [0]:
df

Unnamed: 0,last,page,pageSize,query,results,timestamp,total
0,https://doaj.org/api/v1/search/articles/10.101...,1,10,10.1016/j.physletb.2018.01.009,"{u'last_updated': u'2018-03-05T12:09:16Z', u'i...",2018-1010T04:23:08Z,1


Desplegar columna __results__:

In [0]:
p=pd.DataFrame(  df['results'].values[0] )
p

Unnamed: 0,bibjson,created_date,id,last_updated
abstract,The observation of neutrino oscillations estab...,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
author,"[{u'affiliation': u'Physics Department, Univer...",2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
end_page,100,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
identifier,"[{u'type': u'pissn', u'id': u'0370-2693'}, {u'...",2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
journal,"{u'publisher': u'Elsevier', u'language': [u'EN...",2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
link,[{u'url': u'http://www.sciencedirect.com/scien...,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
month,3,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
start_page,94,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
subject,"[{u'code': u'QC1-999', u'term': u'Physics', u'...",2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z
title,Expanding the reach of heavy neutrino searches...,2018-03-05T12:09:16Z,03ba3f288d8341d380a286a6d0f64de4,2018-03-05T12:09:16Z


Desplegar cada una de las celdas anidadas de la columna __bibjson__

In [0]:
pd.DataFrame(  p.loc['author','bibjson'] )

Unnamed: 0,affiliation,name
0,"Physics Department, Universidad de los Andes, ...",Andrés Flórez
1,"Department of Physics and Astronomy, Vanderbil...",Kaiwen Gui
2,"Department of Physics and Astronomy, Vanderbil...",Alfredo Gurrola
3,"Physics Department, Universidad de los Andes, ...",Carlos Patiño
4,"Department of Physics, Universidad de Antioqui...",Diego Restrepo


In [0]:
pd.DataFrame(  p.loc['identifier','bibjson'] )

Unnamed: 0,id,type
0,0370-2693,pissn
1,1873-2445,eissn
2,10.1016/j.physletb.2018.01.009,doi


In [0]:
 pd.DataFrame( [p.loc['journal','bibjson'] ] )


Unnamed: 0,country,issns,language,license,number,publisher,title,volume
0,NL,"[0370-2693, 1873-2445]",[EN],[{u'url': u'http://www.elsevier.com/journals/p...,C,Elsevier,Physics Letters B,778


In [0]:
pd.DataFrame( p.loc['journal','bibjson']['license'] )

Unnamed: 0,open_access,title,type,url
0,True,CC BY,CC BY,http://www.elsevier.com/journals/physics-lette...


In [0]:
pd.DataFrame(  p.loc['link','bibjson']  )

Unnamed: 0,type,url
0,fulltext,http://www.sciencedirect.com/science/article/p...


# Dask

In [0]:
! pip install dask cloudpickle > /dev/null

In [0]:
1+1

2

In [0]:
import dask as dk

In [0]:
from dask import dataframe

In [0]:
%%writefile kk.json
{"A":1,"B":3}
{"A":5,"B":6}

Writing kk.json


In [0]:
df=dataframe.read_json('kk.json')

In [0]:
df.compute()

Unnamed: 0,A,B
0,1,3
1,5,6


In [0]:
import pandas as pd

In [0]:
pd.read_json('kk.json',orient='records',lines=True)

Unnamed: 0,A,B
0,1,3
1,5,6


In [0]:
import os
os.chdir('drive/My Drive/Colab Notebooks/Data')

In [0]:
!head -n10 oagreenarticle.json > tmp.json

In [0]:
import pandas as pd

In [0]:
pd.read_json('tmp.json',orient='records',lines=True)[:3]

Unnamed: 0,best_oa_location,data_standard,doi,doi_url,genre,is_oa,journal_is_in_doaj,journal_is_oa,journal_issns,journal_name,oa_locations,published_date,publisher,title,updated,year,z_authors
0,{'url': 'https://doi.org/10.1016/j.ahj.2015.02...,2,10.1016/j.ahj.2015.02.019,https://doi.org/10.1016/j.ahj.2015.02.019,journal-article,True,False,False,0002-8703,American Heart Journal,[{'url': 'https://doi.org/10.1016/j.ahj.2015.0...,2015-06-01,Elsevier BV,Biomarkers for risk stratification of patients...,2018-06-18T19:10:12.692979,2015.0,"[{'given': 'Matthijs A.', 'family': 'Velders'}..."
1,{'url': 'https://doi.org/10.1021/acsnano.7b049...,2,10.1021/acsnano.7b04994,https://doi.org/10.1021/acsnano.7b04994,journal-article,True,False,False,"1936-0851,1936-086X",ACS Nano,[{'url': 'https://doi.org/10.1021/acsnano.7b04...,2017-09-08,American Chemical Society (ACS),Hybrid Surface Patterns Mimicking the Design o...,2018-06-14T11:39:08.680219,2017.0,[{'ORCID': 'http://orcid.org/0000-0001-5512-74...
2,{'url': 'https://doi.org/10.1016/s0925-4773(00...,2,10.1016/s0925-4773(00)00472-x,https://doi.org/10.1016/s0925-4773(00)00472-x,journal-article,True,False,False,0925-4773,Mechanisms of Development,[{'url': 'https://doi.org/10.1016/s0925-4773(0...,2000-12-01,Elsevier BV,Embryonic expression of a P2X 3 receptor encod...,2018-06-18T04:15:29.411710,2000.0,"[{'given': 'William H.J', 'family': 'Norton'},..."


In [0]:
tmp=dataframe.read_json('tmp.json')

In [0]:
tmp.compute()

Unnamed: 0,best_oa_location,data_standard,doi,doi_url,genre,is_oa,journal_is_in_doaj,journal_is_oa,journal_issns,journal_name,oa_locations,published_date,publisher,title,updated,year,z_authors
0,{'url': 'https://doi.org/10.1016/j.ahj.2015.02...,2,10.1016/j.ahj.2015.02.019,https://doi.org/10.1016/j.ahj.2015.02.019,journal-article,True,False,False,0002-8703,American Heart Journal,[{'url': 'https://doi.org/10.1016/j.ahj.2015.0...,2015-06-01,Elsevier BV,Biomarkers for risk stratification of patients...,2018-06-18T19:10:12.692979,2015.0,"[{'given': 'Matthijs A.', 'family': 'Velders'}..."
1,{'url': 'https://doi.org/10.1021/acsnano.7b049...,2,10.1021/acsnano.7b04994,https://doi.org/10.1021/acsnano.7b04994,journal-article,True,False,False,"1936-0851,1936-086X",ACS Nano,[{'url': 'https://doi.org/10.1021/acsnano.7b04...,2017-09-08,American Chemical Society (ACS),Hybrid Surface Patterns Mimicking the Design o...,2018-06-14T11:39:08.680219,2017.0,[{'ORCID': 'http://orcid.org/0000-0001-5512-74...
2,{'url': 'https://doi.org/10.1016/s0925-4773(00...,2,10.1016/s0925-4773(00)00472-x,https://doi.org/10.1016/s0925-4773(00)00472-x,journal-article,True,False,False,0925-4773,Mechanisms of Development,[{'url': 'https://doi.org/10.1016/s0925-4773(0...,2000-12-01,Elsevier BV,Embryonic expression of a P2X 3 receptor encod...,2018-06-18T04:15:29.411710,2000.0,"[{'given': 'William H.J', 'family': 'Norton'},..."
3,{'url': 'https://doi.org/10.1016/0021-8693(88)...,1,10.1016/0021-8693(88)90058-0,https://doi.org/10.1016/0021-8693(88)90058-0,journal-article,True,False,False,0021-8693,Journal of Algebra,[{'url': 'https://doi.org/10.1016/0021-8693(88...,1988-10-01,Elsevier BV,Corrigendum,2018-06-18T04:18:43.222135,1988.0,
4,{'url': 'https://doi.org/10.1016/j.mod.2006.03...,2,10.1016/j.mod.2006.03.003,https://doi.org/10.1016/j.mod.2006.03.003,journal-article,True,False,False,0925-4773,Mechanisms of Development,[{'url': 'https://doi.org/10.1016/j.mod.2006.0...,2006-05-01,Elsevier BV,Impaired development of the Harderian gland in...,2018-06-18T04:20:45.498171,2006.0,"[{'given': 'Andreas', 'family': 'Schild'}, {'g..."
5,{'url': 'https://doi.org/10.1103/physrevc.96.0...,2,10.1103/physrevc.96.014310,https://doi.org/10.1103/physrevc.96.014310,journal-article,True,False,False,"2469-9985,2469-9993",Physical Review C,[{'url': 'https://doi.org/10.1103/physrevc.96....,2017-07-14,American Physical Society (APS),Nuclear deformation in the A≈100 region: Compa...,2018-06-16T19:57:39.403083,2017.0,"[{'given': 'A.', 'family': 'de Roubin'}, {'giv..."
6,{'url': 'https://doi.org/10.1016/s0960-9822(07...,2,10.1016/s0960-9822(07)00493-9,https://doi.org/10.1016/s0960-9822(07)00493-9,journal-article,True,False,False,0960-9822,Current Biology,[{'url': 'https://doi.org/10.1016/s0960-9822(0...,1998-10-01,Elsevier BV,Protein kinase B (PKB/Akt) activity is elevate...,2018-06-18T04:21:43.465040,1998.0,"[{'given': 'Daphne', 'family': 'Haas-Kogan'}, ..."
7,{'url': 'https://doi.org/10.1016/s1470-2045(13...,2,10.1016/s1470-2045(13)70163-3,https://doi.org/10.1016/s1470-2045(13)70163-3,journal-article,True,False,False,1470-2045,The Lancet Oncology,[{'url': 'https://doi.org/10.1016/s1470-2045(1...,2013-07-01,Elsevier BV,Panitumumab and irinotecan versus irinotecan a...,2018-06-18T19:11:38.984437,2013.0,"[{'given': 'Matthew T', 'family': 'Seymour'}, ..."
8,{'url': 'https://doi.org/10.24159/joec.2014.20...,2,10.24159/joec.2014.20.4.281,https://doi.org/10.24159/joec.2014.20.4.281,journal-article,True,False,False,1598-9283,Journal of Education & Culture,[{'url': 'https://doi.org/10.24159/joec.2014.2...,2014-12-01,Education Research Institute,The Research on the Values and Lifestyles of K...,2018-06-18T05:26:34.731357,2014.0,"[{'family': 'Kim,Heung-Kyu'}, {'family': '이상란'}]"
9,{'url': 'https://doi.org/10.5194/acp-2016-560-...,2,10.5194/acp-2016-560-ac1,https://doi.org/10.5194/acp-2016-560-ac1,component,True,False,False,,,[{'url': 'https://doi.org/10.5194/acp-2016-560...,,Copernicus GmbH,,2018-06-15T17:12:08.805398,,


In [0]:
dataframe.read_json?

In [0]:
df=dataframe.read_json('oagreenarticle.json')

NameError: ignored

In [0]:
import dask.bag as db

In [0]:
import json
b = db.read_text('tmp.json').map(json.loads)

In [0]:
bb=b.to_dataframe()

In [0]:
bbb=bb['doi']

In [2]:
from webdriver_manager.firefox import GeckoDriverManager

from selenium import webdriver

ls -l /home/restrepo/.wdm/geckodriver/v0.23.0/linux64/geckodriver

driver = webdriver.Firefox(executable_path=
                '/home/restrepo/.wdm/geckodriver/v0.23.0/linux64/geckodriver')
#reinstall path: GeckoDriverManager().install())

In [16]:
#driver.set_page_load_timeout(30)

In [17]:
url='https://scholar.google.com.co/scholar?hl=en&as_sdt=0%2C5&q=10.1590%2Fs0103-84782009000100018'

In [22]:
driver.get('http://google.com')

In [20]:
driver.page_source

'<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"><meta name="viewport" content="initial-scale=1"><title>https://scholar.google.com.co/scholar?hl=en&amp;as_sdt=0%2C5&amp;q=10.1590%2Fs0103-84782009000100018</title></head>\n<body style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px;" onload="e=document.getElementById(\'captcha\');if(e){e.focus();}">\n<div style="max-width:400px;">\n<hr style="color:#ccc; background-color:#ccc;" size="1" noshade=""><br>\n<form id="captcha-form" action="index" method="post">\n<script type="text/javascript" async="" src="https://www.gstatic.com/recaptcha/api2/v1540189908068/recaptcha__en.js"></script><script src="https://www.google.com/recaptcha/api.js" async="" defer=""></script>\n<script>var submitCallback = function(response) {document.getElementById(\'captcha-form\').submit();};</script>\n<div id="recaptcha" class="g-recaptcha" data-sitekey="6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH

In [8]:
browser=driver

In [20]:
browser.get('https://github.com/mozilla/geckodriver')

InvalidSessionIdException: Message: Tried to run command without establishing a connection


In [1]:
import pandas as pd

In [4]:
gsta=pd.read_excel('gsta_full.xlsx')