Here we assume that there exists a database in which the multiple names used by each author are associated with the author's real name, in the format:
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Author_Names</th>
      <th>Control</th>
      <th>Full_Name</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td> Zapata, Oscar;Zapata, O.</td>
      <td> 0</td>
      <td> Óscar Alberto Zapata Noreña</td>
    </tr>
  </tbody>
</table>
where Control is some identification number associated with the author.


We also assume that there is a database with the group to which each author (identified by real name) belongs, in the format:
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Full_Name</th>
      <th>Institution_Group</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td> Óscar Alberto Zapata Noreña</td>
      <td> GFIF</td>
    </tr>
  </tbody>
</table>

In [1]:
%load_ext autoreload

In [2]:
%autoreload

In [5]:
%%writefile publications.py
#!/usr/bin/env python
#DEBUG: Update institutional authors and groups even when the columns are already filled
import numpy as np
import pandas as pd
import re
import requests
import warnings
import time
import sys
import os
from IPython.display import display, HTML
import tabulate #sudo pip3 install tabulate
import utilities as ut
from bs4 import BeautifulSoup
from cmdlike import *
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',200)
def update_column(ds,column,update=False):
    '''
    Tell whether the entry `column` of `ds` may be (re)written.

    The answer is False only when `column` is present in `ds`, already holds
    a truthy value and `update` is False. A missing column, an empty value
    or update=True all give True.
    '''
    already_filled=column in ds and bool(ds[column])
    if already_filled and not update:
        return False
    return True

def _get_doi(surname='Florez',\
        title=r'Baryonic violation of R-parity from anomalous $U(1)_H$',other=''):
    '''
    Search doi from http://search.crossref.org/

    surname, title, other: pieces of the search query; the non-empty ones
        are joined with ', '.

    Returns the "message" dictionary of the api.crossref.org record of the
    first DOI found in the search page, but only when every word of `title`
    (math expressions between $...$ removed) appears in the title of that
    record. Returns an empty dictionary in any other case.
    '''
    title=title or ''
    # Use every non-empty piece: previously the title and `other` were
    # silently dropped when `surname` was empty.
    search=', '.join([piece for piece in (surname,title,other) if piece])
    if not search:
        return {}

    # `params` lets requests URL-encode the query (titles may contain &, #, +, ...)
    r=requests.get('http://search.crossref.org/',params={'q':search})
    urldoi='http://dx.doi.org/'
    pieces=r.text.split(urldoi)
    if len(pieces)<2:
        return {}
    # The DOI is whatever follows the first DOI link up to the closing quote of the tag
    doitmp=pieces[1].split("\'>")[0].replace('&lt;','<').replace('&gt;','>')
    if not doitmp:
        return {}

    #check doi is right by searching for all words in doi -> output title
    rr=requests.get('https://api.crossref.org/v1/works/'+urldoi+doitmp)
    if rr.status_code!=200:
        return {}
    record=rr.json() # parse the answer only once
    if 'message' not in record:
        return {}
    message=record['message']

    chktitle=re.sub(r"\$.*?\$","",title) # better remove all math expressions
    words=[w for w in re.sub(r"[^a-zA-Z0-9 ]"," ",chktitle).split(' ') if w]
    # A record without title cannot be checked: treat its title as empty
    found_titles=message.get('title') or ['']
    if all(found_titles[0].find(w)>-1 for w in words):
        return message
    return {}
    
def _get_impact_factor_from_journal_name(journal_name='Physical Review D'):
    '''
      Download the www.journal-database.com page of the journal and pass its
      content to ut.html_to_DataFrame, whose result is returned.
      The page name is the lower-case journal name with every space
      replaced by a dash.
      According to the original note the result is a pandas DataFrame
      with Years and IF as columns (not checked here).
    '''
    slug='-'.join(journal_name.lower().split(' '))
    page=requests.get('http://www.journal-database.com/journal/%s.html' %slug)
    return ut.html_to_DataFrame(page.content)
    #UPDATE in repo
    
def _get_quartil(issn='1550-7998',journal_hindex=False):
    '''
    Search the journal with the given ISSN in www.scimagojr.com

    issn: string used as search query
    journal_hindex: if True return also the h-index of the journal

    Returns the last non-empty item of the "var dataquartiles" line of the
    journal page (double quotes removed), or the tuple
    (quartil, hindex_journal) when journal_hindex is True.
    Values which cannot be obtained are returned as empty strings.
    '''
    URL='http://www.scimagojr.com/journalsearch.php?q='
    quartil='';hindex_journal=''
    r=requests.get('%s%s' %(URL,issn))
    result=r.text.split('href="journalsearch.php?q=')
    # Three pieces means exactly two journal links in the search page;
    # the text after the last one starts with the journal id.
    if len(result)==3:
        id_jour=result[-1].split('">')[0]
        if id_jour:
            rr=requests.get('%s%s' %(URL,id_jour))
            # NOTE(review): grep comes from cmdlike; presumably it returns the
            # matching line(s) as a single string -- confirm.
            # Dropping every empty item avoids the ValueError of
            # list.remove('') when there is none, and guarantees a string result.
            items=[q for q in grep(r'^\s+var dataquartiles = ',rr.text).split(';') if q]
            if items:
                quartil=items[-1].replace('"','')
            hindex=rr.text.split('<div class="hindexnumber">')
            if len(hindex)==2:
                hindex_journal=hindex[1].split('</div>')[0]
    if journal_hindex:
        return quartil,hindex_journal
    else:
        return quartil
    
    
def _gs_profile_to_dataframes(user='-6mndWkAAAAJ',number_of_articles=100,sleep=10):
    '''
    Convert the first number_of_articles of a google scholar profile for user id
    to two dataframes
    1) Citations indices
    2) Article, Cites, Year columns

    user: Google Scholar user id
    number_of_articles: pages of 100 articles are requested until this
        number is covered
    sleep: seconds to wait between two consecutive page requests

    Pages which do not answer with status 200, or which do not contain
    exactly two tables, are ignored. Both DataFrames are empty when no
    page could be used.
    '''
    page_size=100
    Citations_indices=pd.DataFrame()
    pages=[]
    start=0
    while True:
        if start>0:
            print('waiting %d sec. to avoid robot detection...' %sleep)
            time.sleep(sleep)
        # The page size is kept fixed: it used to grow with each request (100, 200, ...)
        r=requests.get('https://scholar.google.com/citations?sortby=pubdate&hl=en&user=%s' %user\
                       +'&view_op=list_works&cstart=%d&pagesize=%d' %(start,page_size) )

        if r.status_code==200:
            soup = BeautifulSoup(r.text,"lxml")
            s=soup.find_all("table")
            if len(s)==2: #proper formated profile output
                # The indices table is the same in every page: keep it only once
                if Citations_indices.shape[0]==0:
                    indices=ut.html_to_DataFrame(s[0].decode(),\
                        headings=['Citations indices','All','Since %d' %(time.localtime().tm_year-5)])
                    if indices is not None:
                        Citations_indices=indices
                page=ut.html_to_DataFrame(s[1].decode(),headings=['Article','Cites','Year'])
                if page is not None and page.shape[0]>0:
                    pages.append(page)
        start=start+page_size
        # >= : a profile of exactly 100 articles needs only one page
        if start>=number_of_articles:
            break

    cited_articles=pd.DataFrame()
    if pages:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        cited_articles=pd.concat(pages)
        cited_articles=cited_articles[\
                ~cited_articles.Article.isnull()].reset_index(drop=True)
        # Rows without Year have only two cells, so the year was stored in
        # the Cites column: move it to Year and set the cites to 0.
        an=cited_articles[cited_articles.Year.isnull()]
        for i in an.index:
            cited_articles.loc[i,'Year']=an.loc[i,'Cites']
            cited_articles.loc[i,'Cites']=0

    return Citations_indices,cited_articles

class publications:
    '''Base class for generic publication data; it defines no behaviour of its own'''
class articles(publications):
    '''Read csv file exported by Google Scholar Citations profile and
       automatically add informations about:
       DOI: from title and author
       Journal title: from DOI
       ISSN of Journal
       Impact factor of journal
       Institution_Authors: requires Data Base

       Main attributes:
       articles: DataFrame with one row per article
       institution_authors: DataFrame with the columns Full_Name,
           Author_Names and Control
       institution_group: DataFrame with the columns Full_Name and
           Institution_Group
       Citations_indices, cited_articles: tables of the Google Scholar
           profile (set by get_citations)
       fulldoi: crossref records collected by articles_update
       '''
    journal=pd.Series(dtype=object)
    columns=pd.Series({'Full_Name':'Full_Name','Author_Names':'Author_Names','Control':'Control',\
                      'Institution_Authors':'Institution_Authors','Institution_Group':'Institution_Group'})
    institution_authors=pd.DataFrame()
    institution_group=pd.DataFrame()
    cited_articles_hash=pd.Series(dtype=object)
    articles_hash=pd.Series(dtype=object)
    # Defined here so that to_csv also works before articles_update is called
    fulldoi=pd.DataFrame()
    _journal_columns=['Impact_Factor','Quartil','Journal_Hindex']

    def __init__(self,csv_file='citations.csv',excel_file='',user='',citations_file=None,authors_file=None,\
                 group_file=None):
        '''
        csv_file: csv file with the articles (read with pandas.read_csv)
        excel_file: if given, the articles are read from it instead
        user: Google Scholar user id; if given the profile is downloaded
              with get_citations
        citations_file: csv file loaded into self.citations
        authors_file: csv file loaded into self.institution_authors
        group_file: csv file loaded into self.institution_group
        '''
        #DEBUG: check file
        # Always defined, even when neither csv_file nor excel_file is given
        self.articles=pd.DataFrame()
        self.fulldoi=pd.DataFrame()
        if csv_file:
            self.articles=pd.read_csv(csv_file).fillna('')
        if excel_file:
            self.articles=pd.read_excel(excel_file)
        #Fix problem with column Authors: the name of the first column may
        #contain extra characters around 'Authors'
        if self.articles.shape[0]>0 and str(self.articles.columns[0]).find('Authors')>-1:
            self.articles=self.articles.rename(columns={self.articles.columns[0]:'Authors'})
        if citations_file:
            #DEBUG: check file
            self.citations=pd.read_csv(citations_file).fillna('')

        if authors_file:
            #DEBUG: check if file exists
            self.authors_file=authors_file
            self.institution_authors=pd.read_csv(authors_file).fillna('')

        if group_file:
            #DEBUG: check if file exists
            self.group_file=group_file
            self.institution_group=pd.read_csv(group_file).fillna('')
        if user:
            self.get_citations(user=user,number_of_articles=self.articles.shape[0],sleep=10)

    @staticmethod
    def _simplify(text):
        '''Lower case version of each string of the Series `text` keeping only letters and digits'''
        # regex=True is explicit: since pandas 2.0 str.replace is literal by default
        return text.astype(str).str.replace(r"[^a-zA-Z0-9 ]"," ",regex=True)\
                   .str.lower().str.replace(r'\s+','',regex=True)

    def add_institution_author(self):
        '''Ask for the data of a new author, add it to self.institution_authors
           and save the table in authors.csv'''
        #DEBUG: Check and load file
        full_name=input('Full name: ')
        #DEBUG: Ask for more author names and update author names separated by semicolons
        author_names=input('Author names\n(Example: Perez, Juan;Perez, J.;Pérez, J.):\n')
        #DEBUG: default to 0
        control=input('Additional identitication number: ')

        new_author=pd.DataFrame([{self.columns.Full_Name:full_name,\
                                  self.columns.Author_Names:author_names,\
                                  self.columns.Control:control}])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        self.institution_authors=pd.concat([self.institution_authors,new_author],ignore_index=True)
        self.institution_authors.to_csv('authors.csv',index=False)

    def add_institution_group(self):
        '''Ask for an author of self.institution_authors and for the name of
           its group, add the pair to self.institution_group and save the
           table in groups.csv.
           A warning is issued, and nothing is added, if the author is not found.'''
        wrn='Author not found'
        search_query=input('Find author by first name and first surname\n(Example: Juan Pérez);\n')
        sql=search_query.split()
        #DEBUG: low_case
        if len(sql)==0 or 'Full_Name' not in self.institution_authors:
            warnings.warn(wrn)
            return
        sql_name=sql[0]
        sql_surname=sql[1] if len(sql)>1 else ''

        # regex=False: the names typed by the user are searched literally
        names=self.institution_authors.Full_Name
        has_name=names.str.contains(sql_name,regex=False)
        has_surname=names.str.contains(sql_surname,regex=False)
        author_match=self.institution_authors[has_name & has_surname]
        # An empty surname is contained in every name, so the looser search
        # is only meaningful when a surname was given
        if not author_match.shape[0] and sql_surname:
            author_match=self.institution_authors[has_name | has_surname]
        if not author_match.shape[0]:
            warnings.warn(wrn)
            return

        author_match=author_match.reset_index(drop=True)
        author_match['Number']=author_match.index
        if author_match.shape[0]==1:
            author_Series=author_match.loc[0]
        else:
            print( tabulate.tabulate(author_match,headers='keys', tablefmt='psql'))
            ai=input('Give Number of line:')
            # input() returns a string while the lines are labeled by integers
            try:
                author_Series=author_match.loc[int(ai)]
            except (ValueError,KeyError):
                warnings.warn(wrn)
                return
        Group=input('Group name for\n%s\n(Example: IA: Inteligencia Artificial)' %author_Series.Full_Name)

        new_group=pd.DataFrame([{self.columns.Full_Name:author_Series.Full_Name,\
                                 self.columns.Institution_Group:Group}])
        self.institution_group=pd.concat([self.institution_group,new_group],ignore_index=True)
        self.institution_group.to_csv('groups.csv',index=False)

    def get_doi(self,surname='Florez',\
                     title=r'Baryonic violation of R-parity from anomalous $U(1)_H$',other=''):
        '''Method version of _get_doi (the arguments are passed as given)'''
        return _get_doi(surname=surname,title=title,other=other)

    def get_IF(self,journal_name='Physical Review D'):
        '''Method version of _get_impact_factor_from_journal_name'''
        return _get_impact_factor_from_journal_name(journal_name=journal_name)

    def get_citations(self,user='-6mndWkAAAAJ',number_of_articles=100,sleep=10):
        '''Download the Google Scholar profile of `user` with
           _gs_profile_to_dataframes, store the result in
           self.Citations_indices and self.cited_articles, and build the
           simplified titles used by article_index_cites.
           Returns (self.Citations_indices, self.cited_articles)'''
        self.Citations_indices,self.cited_articles=_gs_profile_to_dataframes(\
                        user=user,number_of_articles=number_of_articles,sleep=sleep)
        if self.cited_articles.shape[0]>0:
            self.cited_articles_hash=self._simplify(self.cited_articles.Article)
            if 'Title' in self.articles:
                self.articles_hash=self._simplify(self.articles.Title)

        return self.Citations_indices,self.cited_articles

    def article_index_cites(self,index=0):
        '''Cites of the article in row `index` of self.articles, taken from
           the profile entries whose simplified text starts with the
           simplified title of the article. When several entries match, the
           ones containing the volume of the article are preferred and the
           maximum of their cites is returned.
           Returns '' when there is no profile data or no matching entry.'''
        if self.cited_articles_hash.shape[0]==0 or index not in self.articles_hash.index:
            return ''
        title_hash=self.articles_hash[index]
        if not title_hash:
            return ''
        # The simplified titles contain only letters and digits, so
        # startswith is the same as the anchored regular expression match
        mt=self.cited_articles[self.cited_articles_hash.str.startswith(title_hash)]
        if len(mt)==0:
            return ''
        if len(mt)>1 and 'Volume' in self.articles: #multiple matches. Refine search with, e.g, volume
            volume=str(self.articles.Volume[index])
            mtv=mt[self.cited_articles_hash[mt.index].str.contains(volume,regex=False)]
            if len(mtv)>0:
                mt=mtv

        # Remove '*' marks inside the values; empty or non numeric cites count as 0
        cites=mt.Cites.astype(str).str.replace('*','',regex=False).str.strip()
        return int(pd.to_numeric(cites,errors='coerce').fillna(0).astype(int).max())

    def _update_institution_authors(self,i,force=False):
        '''Fill Institution_Authors of row i with the full names of the authors
           found in exactly one row of self.institution_authors'''
        row=self.articles.loc[i]
        authors=row.get('Authors','')
        if not authors or 'Author_Names' not in self.institution_authors:
            return
        if not update_column(row,'Institution_Authors',update=force):
            return
        known=self.institution_authors
        full_names=[]
        for a in [x.strip() for x in str(authors).split(';') if x.strip()]:
            af=known[known.Author_Names.str.contains(a,regex=False)]
            if af.shape[0]==1:
                full_names.append(af.Full_Name.values[0])
            elif af.shape[0]>1:
                print('Improve real name matching')
        # Written once, after all the authors have been checked: writing
        # inside the loop kept only the first institutional author
        self.articles.loc[i,'Institution_Authors']=';'.join(full_names)

    def _update_institution_groups(self,i,force=False):
        '''Fill Institution_Groups of row i with the group of each institutional
           author found in exactly one row of self.institution_group'''
        row=self.articles.loc[i]
        inst_authors=row.Institution_Authors
        if not inst_authors or 'Full_Name' not in self.institution_group:
            return
        if not update_column(row,'Institution_Groups',update=force):
            return
        groups=[]
        for fa in [x for x in str(inst_authors).split(';') if x]:
            qry=self.institution_group[self.institution_group.Full_Name.str.contains(fa,regex=False)]
            if qry.shape[0]==1:
                groups.append(qry.Institution_Group.values[0])
            else:
                # authors without a unique group are skipped
                print('DEBUG improve search group for %s' %fa)
        self.articles.loc[i,'Institution_Groups']=';'.join(groups)

    def _update_doi(self,i):
        '''Search the DOI of row i, if empty, and fill DOI, ISSN and DOI_Journal.
           Returns the crossref record (empty dictionary if nothing was found)'''
        row=self.articles.loc[i]
        if row.DOI:
            return {}
        rr=_get_doi(surname=str(row.get('Authors','')).split(';')[0].split(',')[0],\
                    title=row.get('Title',''))
        if not rr:
            return {}
        if update_column(row,'DOI') and 'URL' in rr:
            self.articles.loc[i,'DOI']=rr['URL']
        if update_column(row,'ISSN') and 'ISSN' in rr:
            issn=''
            if isinstance(rr['ISSN'],list):
                issn=';'.join(rr['ISSN'])
            else:
                print('DEBUG: Improve DOI -> ISSN not a list')
            self.articles.loc[i,'ISSN']=issn
        if update_column(row,'DOI_Journal') and 'container-title' in rr:
            # names with dots are taken as abbreviated journal names
            journal=[ j for j in rr['container-title'] if j.find('.')<0]
            if len(journal)==1:
                self.articles.loc[i,'DOI_Journal']=journal[0]
            else:
                print('DEBUG: Improve DOI_journal have dots')
        return rr

    def _update_journal_info(self,i):
        '''Fill Impact_Factor, Quartil and Journal_Hindex of row i, copying them
           from a previous row of the same journal when possible'''
        issn=str(self.articles.loc[i,'ISSN']).split(';')[0]
        # An empty ISSN is contained in any string: it must not be searched
        if i>0 and issn:
            previous=self.articles.iloc[:i]
            dbj=previous[previous.ISSN.astype(str).str.contains(issn,regex=False)]
            if dbj.shape[0]>0:
                for k in self._journal_columns:
                    self.articles.loc[i,k]=dbj.iloc[0][k]
                return

        #Update IF
        row=self.articles.loc[i]
        q=row.DOI_Journal or row.get('Publication','')
        if q:
            IFdf=_get_impact_factor_from_journal_name(q)
            if IFdf is not None and IFdf.shape[0]>0 and 'IF' in IFdf \
                    and update_column(row,'Impact_Factor'):
                # float() instead of eval(): the text comes from a web page
                # and must not be executed as python code
                try:
                    self.articles.loc[i,'Impact_Factor']=float(IFdf.iloc[0]['IF'])
                except (TypeError,ValueError):
                    print('DEBUG: Impact factor is not a number: %s' %IFdf.iloc[0]['IF'])

        #Update quartil
        if issn:
            quartil,hindex_journal=_get_quartil(issn,journal_hindex=True)
            if update_column(row,'Quartil') and quartil:
                self.articles.loc[i,'Quartil']=quartil
            if update_column(row,'Journal_Hindex') and hindex_journal:
                self.articles.loc[i,'Journal_Hindex']=hindex_journal

    def articles_update(self,cites=True,institution_authors=False,institution_groups=False,DOI=False,\
                        impact_factor=False):
        '''
        Add to self.articles, and fill for each row, the columns
        Institution_Authors, Institution_Groups, DOI, ISSN, DOI_Journal,
        Cites, Impact_Factor, Quartil and Journal_Hindex.

        institution_authors, institution_groups: if True the column is
            recalculated even when it is already filled; otherwise only
            empty entries are filled.
        cites, DOI, impact_factor: currently not used.

        Returns self.fulldoi: DataFrame with the crossref records found.
        '''
        newcolumns=['Institution_Authors','Institution_Groups','DOI','ISSN','DOI_Journal','Cites']\
                   +self._journal_columns
        for newcolumn in newcolumns:
            if newcolumn not in self.articles.columns:
                self.articles[newcolumn]=''

        self.articles=self.articles.reset_index(drop=True).fillna('')
        self.fulldoi=pd.DataFrame()
        doi_records=[]
        print('Updating entry:',end="")
        for i in range(self.articles.shape[0]):
            time.sleep(0.3) #avoid robot detection
            print('%d.' %i,end="")
            #Update Full names: TODO: Unify algorithm
            self._update_institution_authors(i,force=institution_authors)
            #Update Groups:
            self._update_institution_groups(i,force=institution_groups)
            #Update DOI:
            rr=self._update_doi(i)
            if rr:
                doi_records.append(rr)
            #Update Cites:
            if not self.articles.loc[i,'Cites'] and self.cited_articles_hash.shape[0]>0:
                self.articles.loc[i,'Cites']=self.article_index_cites(i)
            #JOURNAL info:
            self._update_journal_info(i)

        if doi_records:
            self.fulldoi=pd.DataFrame(doi_records).fillna('')
        return self.fulldoi

    def to_csv(self,csvfile):
        '''Save self.articles in csvfile and, if there are crossref
           records, save them in fulldoi.json'''
        self.articles.to_csv(csvfile,index=False)
        if self.fulldoi.shape[0]>0:
            self.fulldoi.to_json('fulldoi.json')
        
                        
#TODO: add a method to reset the databases, e.g.:
#pd.DataFrame().to_csv('authors.csv',index=False)

Overwriting publications.py


In [37]:
# Build the articles object from the exported csv file.
# NOTE: authors_file and group_file are read with pd.read_csv by
# articles.__init__ whatever the extension of the file name is, and a
# non-empty `user` makes the constructor download the Google Scholar profile.
#a=articles('citations_udea.csv',authors_file='authors.json',group_file='groups.json')
a=articles('citations_bak.csv',user='-6mndWkAAAAJ',authors_file='authors.json',group_file='groups.json')

In [38]:
a.articles.shape

(13, 8)

In [39]:
fulldoi=a.articles_update()

Updating entry:0.1.2.3.4.5.6.7.8.9.10.11.12.

In [40]:
a.articles

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,Institution_Authors,Institution_Groups,DOI,ISSN,DOI_Journal,Cites,Impact_Factor,Quartil,Journal_Hindex
0,"Sierra, D Aristizabal; Kubo, Jisuke; Suematsu, Daijiro; Restrepo, D; Zapata, Oscar;","Radiative seesaw model: Warm dark matter, collider signatures, and lepton flavor violating signals",Physical Review D,79,1,13011,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.79.013011,1550-7998;1550-2368,Physical Review D,83,4.643,Q1,259.0
1,"Restrepo, Diego; Taoso, Marco; Valle, JWF; Zapata, Oscar;",Gravitino dark matter and neutrino masses with bilinear R-parity violation,Physical Review D,85,2,23523,2012,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.85.023523,1550-7998;1550-2368,Physical Review D,40,4.643,Q1,259.0
2,"Choi, Ki-Young; Restrepo, Diego; Yaguna, Carlos E; Zapata, Oscar;",Indirect detection of gravitino dark matter including its three-body decays,Journal of Cosmology and Astroparticle Physics,2010,10,33,2010,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2010/10/033,1475-7516,Journal of Cosmology and Astroparticle Physics,33,5.81,Q3,70.0
3,"Sierra, D Aristizabal; Restrepo, Diego; Zapata, Oscar;",Decaying neutralino dark matter in anomalous U (1) H models,Physical Review D,80,5,55010,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.80.055010,1550-7998;1550-2368,Physical Review D,24,4.643,Q1,259.0
4,"Klasen, Michael; Yaguna, Carlos E; Ruiz-Álvarez, José D; Restrepo, Diego; Zapata, Oscar;",Scalar dark matter and fermion coannihilations in the radiative seesaw model,Journal of Cosmology and Astroparticle Physics,2013,4,44,2013,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2013/04/044,1475-7516,Journal of Cosmology and Astroparticle Physics,18,5.81,Q3,70.0
5,"Florez, Andres; Restrepo, Diego; Velasquez, Mauricio; Zapata, Oscar;",Baryonic violation of R parity from anomalous U (1) H,Physical Review D,87,9,95010,2013,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.87.095010,1550-7998;1550-2368,Physical Review D,16,4.643,Q1,259.0
6,"Molinaro, Emiliano; Yaguna, Carlos E; Zapata, Oscar;",FIMP realization of the scotogenic model,Journal of Cosmology and Astroparticle Physics,2014,7,15,2014,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2014/07/015,1475-7516,Journal of Cosmology and Astroparticle Physics,16,5.81,Q3,70.0
7,"Restrepo, Diego; Rivera, Andrés; Sánchez-Peláez, Marta; Zapata, Oscar; Tangarife, Walter;",Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singlets,Physical Review D,92,1,13005,2015,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.92.013005,1550-7998;1550-2368,Physical Review D,12,4.643,Q1,259.0
8,"Arbeláez, Carolina; Longas, Robinson; Restrepo, Diego; Zapata, Oscar;",Fermion dark matter from SO (10) GUTs,Physical Review D,93,1,13012,2016,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.93.013012,2470-0010;2470-0029,Physical Review D,10,4.643,,
9,"Ponce, William A; Zapata, Oscar;",Lepton masses and mixing without Yukawa hierarchies,Physical Review D,74,9,93007,2006,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.74.093007,1550-7998;1550-2368,Physical Review D,10,4.643,Q1,259.0


Design:
When the Google Scholar user id of the profile is given, the citation data is obtained with the method

self.get_citations()

In [16]:
C,c=_gs_profile_to_dataframes(number_of_articles=13)

In [17]:
C

Unnamed: 0,Citations indices,All,Since 2011
0,Citations,270,238
1,h-index,10,9
2,i10-index,10,9


In [18]:
c

Unnamed: 0,Article,Cites,Year
0,"The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter modelS Horiuchi, O Macias, D Restrepo, A Rivera, O Zapata, H SilverwoodJournal of Cosmology and Ast...",2,2016
1,"The inert Zee modelD Portillo, D Restrepo, O ZapataJournal of High Energy Physics 2016 (03), 162, 2016",0,2016
2,"Direct detection of fermion dark matter in the radiative seesaw modelA Ibarra, CE Yaguna, O ZapataPhysical Review D 93 (3), 035012, 2016",6,2016
3,"Fermion dark matter from SO (10) GUTsC Arbeláez, R Longas, D Restrepo, O ZapataPhysical Review D 93 (1), 013012, 2016",10,2016
4,"Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singletsD Restrepo, A Rivera, M Sánchez-Peláez, O Zapata, W TangarifePhysical Review D 92 (1), 013005, 2015",12,2015
5,"FIMP realization of the scotogenic modelE Molinaro, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2014 (07), 015, 2014",16,2014
6,"Baryonic violation of R parity from anomalous U (1) HA Florez, D Restrepo, M Velasquez, O ZapataPhysical Review D 87 (9), 095010, 2013",16,2013
7,"Scalar dark matter and fermion coannihilations in the radiative seesaw modelM Klasen, CE Yaguna, JD Ruiz-Álvarez, D Restrepo, O ZapataJournal of Cosmology and Astroparticle Physics 2013 (04), 044,...",18,2013
8,"Gravitino dark matter and neutrino masses with bilinear R-parity violationD Restrepo, M Taoso, JWF Valle, O ZapataPhysical Review D 85 (2), 023523, 2012",40,2012
9,"Indirect detection of gravitino dark matter including its three-body decaysKY Choi, D Restrepo, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2010 (10), 033, 2010",33,2010


In [21]:
c,C=a.get_citations(user='-6mndWkAAAAJ',number_of_articles=a.articles.shape[0])

In [26]:
a.article_index_cites(3)

24

In [194]:
%%writefile utilities.py
#!/usr/bin/env python
from bs4 import BeautifulSoup
import pandas as pd
def html_to_DataFrame(html_page_with_table,attrs=None,headings=None):
    '''
    Extract the first matching table of a web page and convert it to a
    pandas DataFrame.

    Parameters
    ----------
    html_page_with_table: HTML markup handed to BeautifulSoup.
    attrs: optional dict of attribute filters passed to
        soup.find("table", attrs). Defaults to no filter.
    headings: optional list of column names. When empty or omitted, the
        names are read from the td cells of the first tr of the table.

    Returns
    -------
    A pandas DataFrame with one row per tr after the first one. An empty
    DataFrame is returned when no table matches. When no headings are
    given and none can be read from the table, the rows are returned
    with default column labels.
    '''
    # None sentinels avoid sharing one mutable default between calls.
    attrs = {} if attrs is None else attrs
    headings = [] if headings is None else headings

    soup = BeautifulSoup(html_page_with_table,"lxml")
    table = soup.find("table",attrs)
    if table is None:
        return pd.DataFrame()

    rows = table.find_all("tr")

    # The first tr contains the field names.
    if not headings and rows:
        headings = [td.get_text().strip() for td in rows[0].find_all("td")]

    # NOTE(review): blank cells are dropped, so in a row with an empty
    # cell the remaining values shift one column to the left.
    datasets = [[td.get_text() for td in row.find_all("td") if td.get_text().strip()]
                for row in rows[1:]]

    if headings:
        return pd.DataFrame(datasets,columns=headings)
    # Always hand back a DataFrame, never an implicit None.
    return pd.DataFrame(datasets)

Overwriting utilities.py


In [196]:
self.articles

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,Institution_Authors,Institution_Groups,DOI,ISSN,DOI_Journal,Impact_Factor,Quartil,Journal_Hindex,Cites,Simple
0,"Sierra, D Aristizabal; Kubo, Jisuke; Suematsu, Daijiro; Restrepo, D; Zapata, Oscar;","Radiative seesaw model: Warm dark matter, collider signatures, and lepton flavor violating signals",Physical Review D,79,1,13011,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.79.013011,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,83,radiativeseesawmodelwarmdarkmattercollidersignaturesandleptonflavorviolatingsignals
1,"Restrepo, Diego; Taoso, Marco; Valle, JWF; Zapata, Oscar;",Gravitino dark matter and neutrino masses with bilinear R-parity violation,Physical Review D,85,2,23523,2012,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.85.023523,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,40,gravitinodarkmatterandneutrinomasseswithbilinearrparityviolation
2,"Choi, Ki-Young; Restrepo, Diego; Yaguna, Carlos E; Zapata, Oscar;",Indirect detection of gravitino dark matter including its three-body decays,Journal of Cosmology and Astroparticle Physics,2010,10,33,2010,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2010/10/033,1475-7516,Journal of Cosmology and Astroparticle Physics,5.81,Q3,70.0,33,indirectdetectionofgravitinodarkmatterincludingitsthreebodydecays
3,"Sierra, D Aristizabal; Restrepo, Diego; Zapata, Oscar;",Decaying neutralino dark matter in anomalous U (1) H models,Physical Review D,80,5,55010,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.80.055010,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,24,decayingneutralinodarkmatterinanomalousu1hmodels
4,"Klasen, Michael; Yaguna, Carlos E; Ruiz-Álvarez, José D; Restrepo, Diego; Zapata, Oscar;",Scalar dark matter and fermion coannihilations in the radiative seesaw model,Journal of Cosmology and Astroparticle Physics,2013,4,44,2013,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2013/04/044,1475-7516,Journal of Cosmology and Astroparticle Physics,5.81,Q3,70.0,18,scalardarkmatterandfermioncoannihilationsintheradiativeseesawmodel
5,"Florez, Andres; Restrepo, Diego; Velasquez, Mauricio; Zapata, Oscar;",Baryonic violation of R parity from anomalous U (1) H,Physical Review D,87,9,95010,2013,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.87.095010,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,16,baryonicviolationofrparityfromanomalousu1h
6,"Molinaro, Emiliano; Yaguna, Carlos E; Zapata, Oscar;",FIMP realization of the scotogenic model,Journal of Cosmology and Astroparticle Physics,2014,7,15,2014,IOP Publishing,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1088/1475-7516/2014/07/015,1475-7516,Journal of Cosmology and Astroparticle Physics,5.81,Q3,70.0,16,fimprealizationofthescotogenicmodel
7,"Restrepo, Diego; Rivera, Andrés; Sánchez-Peláez, Marta; Zapata, Oscar; Tangarife, Walter;",Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singlets,Physical Review D,92,1,13005,2015,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.92.013005,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,12,radiativeneutrinomassesinthesingletdoubletfermiondarkmattermodelwithscalarsinglets
8,"Arbeláez, Carolina; Longas, Robinson; Restrepo, Diego; Zapata, Oscar;",Fermion dark matter from SO (10) GUTs,Physical Review D,93,1,13012,2016,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.93.013012,2470-0010;2470-0029,Physical Review D,4.643,,,10,fermiondarkmatterfromso10guts
9,"Ponce, William A; Zapata, Oscar;",Lepton masses and mixing without Yukawa hierarchies,Physical Review D,74,9,93007,2006,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.74.093007,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0,10,leptonmassesandmixingwithoutyukawahierarchies


In [96]:
an.index

Int64Index([1], dtype='int64')

In [105]:
kk=pd.DataFrame()
kk.index

Index([], dtype='object')

1


In [103]:
self.cited_articles

Unnamed: 0,Article,Cites,Year
0,"The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter modelS Horiuchi, O Macias, D Restrepo, A Rivera, O Zapata, H SilverwoodJournal of Cosmology and Ast...",2,2016
1,"The inert Zee modelD Portillo, D Restrepo, O ZapataJournal of High Energy Physics 2016 (03), 162, 2016",0,2016
2,"Direct detection of fermion dark matter in the radiative seesaw modelA Ibarra, CE Yaguna, O ZapataPhysical Review D 93 (3), 035012, 2016",6,2016
3,"Fermion dark matter from SO (10) GUTsC Arbeláez, R Longas, D Restrepo, O ZapataPhysical Review D 93 (1), 013012, 2016",10,2016
4,"Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singletsD Restrepo, A Rivera, M Sánchez-Peláez, O Zapata, W TangarifePhysical Review D 92 (1), 013005, 2015",12,2015
5,"FIMP realization of the scotogenic modelE Molinaro, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2014 (07), 015, 2014",16,2014
6,"Baryonic violation of R parity from anomalous U (1) HA Florez, D Restrepo, M Velasquez, O ZapataPhysical Review D 87 (9), 095010, 2013",16,2013
7,"Scalar dark matter and fermion coannihilations in the radiative seesaw modelM Klasen, CE Yaguna, JD Ruiz-Álvarez, D Restrepo, O ZapataJournal of Cosmology and Astroparticle Physics 2013 (04), 044,...",18,2013
8,"Gravitino dark matter and neutrino masses with bilinear R-parity violationD Restrepo, M Taoso, JWF Valle, O ZapataPhysical Review D 85 (2), 023523, 2012",40,2012
9,"Indirect detection of gravitino dark matter including its three-body decaysKY Choi, D Restrepo, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2010 (10), 033, 2010",33,2010


In [48]:
s[0]
html_to_DataFrame(s[0].decode(),headings=['Citations indices','All','Since %d' %(now.year-5)])

Unnamed: 0,A,B,C
0,Citations,270,238
1,h-index,10,9
2,i10-index,10,9


In [36]:
ss=s[1]

In [44]:
html_to_DataFrame(ss.decode(),headings=['A','B','C'])

Unnamed: 0,A,B,C
0,,,
1,"The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter modelS Horiuchi, O Macias, D Restrepo, A Rivera, O Zapata, H SilverwoodJournal of Cosmology and Ast...",2.0,2016.0
2,"The inert Zee modelD Portillo, D Restrepo, O ZapataJournal of High Energy Physics 2016 (03), 162, 2016",2016.0,
3,"Direct detection of fermion dark matter in the radiative seesaw modelA Ibarra, CE Yaguna, O ZapataPhysical Review D 93 (3), 035012, 2016",6.0,2016.0
4,"Fermion dark matter from SO (10) GUTsC Arbeláez, R Longas, D Restrepo, O ZapataPhysical Review D 93 (1), 013012, 2016",10.0,2016.0
5,"Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singletsD Restrepo, A Rivera, M Sánchez-Peláez, O Zapata, W TangarifePhysical Review D 92 (1), 013005, 2015",12.0,2015.0
6,"FIMP realization of the scotogenic modelE Molinaro, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2014 (07), 015, 2014",16.0,2014.0
7,"Baryonic violation of R parity from anomalous U (1) HA Florez, D Restrepo, M Velasquez, O ZapataPhysical Review D 87 (9), 095010, 2013",16.0,2013.0
8,"Scalar dark matter and fermion coannihilations in the radiative seesaw modelM Klasen, CE Yaguna, JD Ruiz-Álvarez, D Restrepo, O ZapataJournal of Cosmology and Astroparticle Physics 2013 (04), 044,...",18.0,2013.0
9,"Gravitino dark matter and neutrino masses with bilinear R-parity violationD Restrepo, M Taoso, JWF Valle, O ZapataPhysical Review D 85 (2), 023523, 2012",40.0,2012.0


TypeError: html_to_DataFrame() got an unexpected keyword argument 'headings'

In [45]:
tables=table.find_all("tr")

NameError: name 'table' is not defined

In [33]:
self.articles.ISSN.ix[i].split(';')[0]

'1550-7998'

In [36]:
self=a
i=1
dbj=self.articles[:i][self.articles[:i].ISSN.str.contains(\
                              self.articles.ISSN.ix[i].split(';')[0])].reset_index(drop=True)

In [29]:
a.articles

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,Institution_Authors,Institution_Groups,DOI,ISSN,DOI_Journal,Impact_Factor,Quartil,Journal_Hindex
0,"Sierra, D Aristizabal; Kubo, Jisuke; Suematsu, Daijiro; Restrepo, D; Zapata, Oscar;","Radiative seesaw model: Warm dark matter, collider signatures, and lepton flavor violating signals",Physical Review D,79,1,13011,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.79.013011,1550-7998;1550-2368,Physical Review D,4.643,Q1,259.0
1,"Restrepo, Diego; Taoso, Marco; Valle, JWF; Zapata, Oscar;",Gravitino dark matter and neutrino masses with bilinear R-parity violation,Physical Review D,85,2,23523,2012,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.85.023523,1550-7998;1550-2368,Physical Review D,,,
2,"Choi, Ki-Young; Restrepo, Diego; Yaguna, Carlos E; Zapata, Oscar;",Indirect detection of gravitino dark matter including its three-body decays,Journal of Cosmology and Astroparticle Physics,2010,10,33,2010,IOP Publishing,,,,,,,,


In [23]:
for i in range(10):
    print(i)
    if i>2:
        continue
    print('va')

0
va
1
va
2
va
3
4
5
6
7
8
9


In [16]:
i=2
a.articles[:i]

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,Institution_Authors,Institution_Groups,DOI,ISSN,DOI_Journal,Impact_Factor,Quartil,Journal_Hindex
0,"Sierra, D Aristizabal; Kubo, Jisuke; Suematsu, Daijiro; Restrepo, D; Zapata, Oscar;","Radiative seesaw model: Warm dark matter, collider signatures, and lepton flavor violating signals",Physical Review D,79,1,13011,2009,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.79.013011,1550-7998;1550-2368,Physical Review D,,,
1,"Restrepo, Diego; Taoso, Marco; Valle, JWF; Zapata, Oscar;",Gravitino dark matter and neutrino masses with bilinear R-parity violation,Physical Review D,85,2,23523,2012,APS,Óscar Alberto Zapata Noreña,GFIF,http://dx.doi.org/10.1103/physrevd.85.023523,1550-7998;1550-2368,Physical Review D,,,



Authors	Title	Publication	Volume	Number	Pages	Year	Publisher	Institution_Authors	Institution_Groups	DOI	ISSN	DOI_Journal	Impact_Factor
0	Sierra, D Aristizabal; Kubo, Jisuke; Suematsu, Daijiro; Restrepo, D; Zapata, Oscar;	Radiative seesaw model: Warm dark matter, collider signatures, and lepton flavor violating signals	Physical Review D	79	1	13011	2009	APS	Óscar Alberto Zapata Noreña	GFIF	http://dx.doi.org/10.1103/physrevd.79.013011	1550-7998;1550-2368	Physical Review D	4.643
1	Restrepo, Diego; Taoso, Marco; Valle, JWF; Zapata, Oscar;	Gravitino dark matter and neutrino masses with bilinear R-parity violation	Physical Review D	85	2	23523	2012	APS	Óscar Alberto Zapata Noreña	GFIF	http://dx.doi.org/10.1103/physrevd.85.023523	1550-7998;1550-2368	Physical Review D	4.643
2	Choi, Ki-Young; Restrepo, Diego; Yaguna, Carlos E; Zapata, Oscar;	Indirect detection of gravitino dark matter including its three-body decays	Journal of Cosmology and Astroparticle Physics	2010	10	33	2010	IOP Publishing	Óscar Alberto Zapata Noreña	GFIF	http://dx.doi.org/10.1088/1475-7516/2010/10/033	1475-7516	Journal of Cosmology and Astroparticle Physic
fulldoi[:1]
fulldoi[:1]

## Save the updated database

In [9]:
a.to_csv('newcitations.csv')

Restore saved data base

In [10]:
arts=articles('newcitations.csv',authors_file='authors.json',group_file='groups.json')

In [11]:
arts.articles_update()

Updating entry:0.1.2.

In [61]:
IFdf=_get_impact_factor_from_journal_name()


4.643

In [9]:
i=0
selfarticles=a.articles
if selfarticles.ix[i].DOI_Journal:
    q=selfarticles.ix[i].DOI_Journal.lower().replace(' ','-')
elif selfarticles.ix[i].Publication:
     q=selfarticles.ix[i].Publication


'http://www.journal-database.com/journal/physical-review-d.html'

In [10]:
import lxml.html as LH
import requests
import pandas as pd
def text(elt):
    '''
    Return the text content of elt with every non-breaking space
    replaced by a plain space.
    '''
    content = elt.text_content()
    return content.replace(u'\xa0', u' ')

url = URL #'http://www.fdmbenzinpriser.dk/searchprices/5/'
r = requests.get(url)
root = LH.fromstring(r.content)

# Build and print one DataFrame per <table> element of the page.
for table in root.xpath('//table'):
    # A leading '//' searches the whole document even when called on a
    # sub-element; './/' keeps the query inside the current table.
    header = [text(th) for th in table.xpath('.//th')]        # 1
    data = [[text(td) for td in tr.xpath('td')]
            for tr in table.xpath('.//tr')]                   # 2
    # Keep only the rows whose cell count matches the header.
    data = [row for row in data if len(row)==len(header)]    # 3
    data = pd.DataFrame(data, columns=header)                # 4
    print(data)

Empty DataFrame
Columns: []
Index: []


In [40]:
%%writefile utilities.py
#/usr/bin/env python
from bs4 import BeautifulSoup
import pandas as pd
def html_to_DataFrame(html_page_with_table,attrs=None,headings=None):
    '''
    Extract the first matching table of a web page and convert it to a
    pandas DataFrame.

    Parameters
    ----------
    html_page_with_table: HTML markup handed to BeautifulSoup.
    attrs: optional dict of attribute filters passed to
        soup.find("table", attrs). Defaults to no filter.
    headings: optional list of column names. When empty or omitted, the
        names are read from the td cells of the first tr of the table.

    Returns
    -------
    A pandas DataFrame with one row per tr after the first one. An empty
    DataFrame is returned when no table matches. When no headings are
    given and none can be read from the table, the rows are returned
    with default column labels.
    '''
    # None sentinels avoid sharing one mutable default between calls.
    attrs = {} if attrs is None else attrs
    headings = [] if headings is None else headings

    soup = BeautifulSoup(html_page_with_table,"lxml")
    table = soup.find("table",attrs)
    if table is None:
        return pd.DataFrame()

    rows = table.find_all("tr")

    # The first tr contains the field names.
    if not headings and rows:
        headings = [td.get_text().strip() for td in rows[0].find_all("td")]

    # NOTE(review): blank cells are dropped, so in a row with an empty
    # cell the remaining values shift one column to the left.
    datasets = [[td.get_text() for td in row.find_all("td") if td.get_text().strip()]
                for row in rows[1:]]

    if headings:
        return pd.DataFrame(datasets,columns=headings)
    # Always hand back a DataFrame, never an implicit None.
    return pd.DataFrame(datasets)

Overwriting utilities.py


In [41]:
from utilities import *

In [42]:
html_to_DataFrame??

In [73]:
import utilities as ut
ut.html_to_DataFrame(r.content)

In [12]:
issn='1550-7998;1550-2368'.split(';')[0]
g=_get_impact_factor_from_issn(issn)

In [32]:
import urllib
urllib.un

# Obtain citations
Citations can be obtained in groups of 100:
0-100

In [26]:
r=requests.get('https://scholar.google.com/citations?sortby=pubdate&hl=en&user=noRnsu8AAAAJ&view_op=list_works&cstart=0&pagesize=100')

In [39]:
HTML(r.text)

Citation indices,All,Since 2011
Citations,270,238
h-index,10,9
i10-index,10,9

Title1–13,Cited by,Year
"The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter modelS Horiuchi, O Macias, D Restrepo, A Rivera, O Zapata, H SilverwoodJournal of Cosmology and Astroparticle Physics 2016 (03), 048, 2016",2.0,2016
"The inert Zee modelD Portillo, D Restrepo, O ZapataJournal of High Energy Physics 2016 (03), 162, 2016",,2016
"Direct detection of fermion dark matter in the radiative seesaw modelA Ibarra, CE Yaguna, O ZapataPhysical Review D 93 (3), 035012, 2016",6.0,2016
"Fermion dark matter from SO (10) GUTsC Arbeláez, R Longas, D Restrepo, O ZapataPhysical Review D 93 (1), 013012, 2016",10.0,2016
"Radiative neutrino masses in the singlet-doublet fermion dark matter model with scalar singletsD Restrepo, A Rivera, M Sánchez-Peláez, O Zapata, W TangarifePhysical Review D 92 (1), 013005, 2015",12.0,2015
"FIMP realization of the scotogenic modelE Molinaro, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2014 (07), 015, 2014",16.0,2014
"Baryonic violation of R parity from anomalous U (1) HA Florez, D Restrepo, M Velasquez, O ZapataPhysical Review D 87 (9), 095010, 2013",16.0,2013
"Scalar dark matter and fermion coannihilations in the radiative seesaw modelM Klasen, CE Yaguna, JD Ruiz-Álvarez, D Restrepo, O ZapataJournal of Cosmology and Astroparticle Physics 2013 (04), 044, 2013",18.0,2013
"Gravitino dark matter and neutrino masses with bilinear R-parity violationD Restrepo, M Taoso, JWF Valle, O ZapataPhysical Review D 85 (2), 023523, 2012",40.0,2012
"Indirect detection of gravitino dark matter including its three-body decaysKY Choi, D Restrepo, CE Yaguna, O ZapataJournal of Cosmology and Astroparticle Physics 2010 (10), 033, 2010",33.0,2010


101-200

In [108]:
# Parse the HTML held in r.text so its tables can be queried.
soup = BeautifulSoup(r.text,"lxml")

In [109]:
s=soup.find_all("table")

In [110]:
datasets = []
for row in table.find_all("tr")[2:]:
    dataset = [td.get_text() for td in row.find_all("td") if td.get_text().strip()]
    datasets.append(dataset)

ba=pd.DataFrame(datasets,columns=['Article','Cites','Year'])


In [111]:
ba[~ba.Year.isnull()].reset_index(drop=True)

Unnamed: 0,Article,Cites,Year
0,"Process-based species pools reveal the hidden signature of biotic interactions amid the influence of temperature filteringJP Lessard, BG Weinstein, MK Borregaard, KA Marske, DR Martin, ...The Amer...",7,2016
1,"CAP [mdash] advancing the evaluation of preclinical Alzheimer disease treatmentsEM Reiman, JB Langbaum, PN Tariot, F Lopera, RJ Bateman, JC Morris, ...Nature Reviews Neurology, 2015",9,2015
2,"Brain imaging and blood biomarker abnormalities in children with autosomal dominant Alzheimer disease: a cross-sectional studyYT Quiroz, AP Schultz, K Chen, HD Protas, M Brickhouse, AS Fleisher, ....",9,2015
3,"Memory binding and white matter integrity in familial Alzheimer’s diseaseMA Parra, H Saarimäki, ME Bastin, AC Londoño, L Pettit, F Lopera, ...Brain, awv048, 2015",8,2015
4,"Associations between biomarkers and age in the presenilin 1 E280A autosomal dominant Alzheimer disease kindred: a cross-sectional studyAS Fleisher, K Chen, YT Quiroz, LJ Jakimovich, MG Gomez, CM L...",20,2015
5,"Node‐based analysis of species distributionsMK Borregaard, C Rahbek, J Fjeldså, JL Parra, RJ Whittaker, CH GrahamMethods in Ecology and Evolution 5 (11), 1225-1235, 2014",2,2014
6,"Origin of the PSEN1 E280A mutation causing early-onset Alzheimer's diseaseMA Lalli, HC Cox, ML Arcila, L Cadavid, S Moreno, G Garcia, L Madrigal, ...Alzheimer's & Dementia 10 (5), S277-S283. e10, ...",9,2014
7,"An 1 H-MRS framework predicts the onset of Alzheimer's disease symptoms in PSEN1 mutation carriersAC Londono, FX Castellanos, A Arbelaez, A Ruiz, DC Aguirre-Acevedo, ...Alzheimer's & Dementia 10 (...",13,2014
8,"Taxonomic, phylogenetic, and trait beta diversity in South American hummingbirdsBG Weinstein, B Tinoco, JL Parra, LM Brown, JA McGuire, FG Stiles, ...The American Naturalist 184 (2), 211-224, 2014",8,2014
9,"The origin and maintenance of montane diversity: integrating evolutionary and ecological processesCH Graham, AC Carnaval, CD Cadena, KR Zamudio, TE Roberts, ...Ecography 37 (8), 711-719, 2014",24,2014


In [66]:
i=5
for j in range(3):
    print(table.find_all("tr")[2:][i].find_all("td")[j].get_text())

Process-based species pools reveal the hidden signature of biotic interactions amid the influence of temperature filteringJP Lessard, BG Weinstein, MK Borregaard, KA Marske, DR Martin, ...The American Naturalist 187 (1), 75-88, 2016
7
2016


<table id="gsc_a_t"><thead id="gsc_a_hd"><tr aria-hidden="true" id="gsc_a_tr0"><th class="gsc_a_t" id="gsc_a_tr0_t"></th><th class="gsc_a_c"></th><th class="gsc_a_y"></th></tr><tr id="gsc_a_trh"><th class="gsc_a_t" id="gsc_a_trh_t" scope="col"><span id="gsc_a_ta"><a class="gsc_a_a" href="/citations?hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;view_op=list_works&amp;sortby=title">Title</a></span><span id="gsc_a_nn">1–100</span></th><th class="gsc_a_c" scope="col"><span id="gsc_a_ca"><a class="gsc_a_a" href="/citations?hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;view_op=list_works">Cited by</a></span></th><th class="gsc_a_y" scope="col"><span class="gsc_a_h">Year</span></th></tr></thead><tbody id="gsc_a_b"><tr class="gsc_a_tr"><td class="gsc_a_t"><a class="gsc_a_at" href="/citations?view_op=view_citation&amp;hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;sortby=pubdate&amp;citation_for_view=noRnsu8AAAAJ:xtoqd-5pKcoC">The role of OH… O and

In [32]:
r.text.split('<table')[3]

' id="gsc_a_t"><thead id="gsc_a_hd"><tr id="gsc_a_tr0" aria-hidden="true"><th class="gsc_a_t" id="gsc_a_tr0_t"></th><th class="gsc_a_c"></th><th class="gsc_a_y"></th></tr><tr id="gsc_a_trh"><th class="gsc_a_t" id="gsc_a_trh_t" scope="col"><span id="gsc_a_ta"><a href="/citations?hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;view_op=list_works&amp;sortby=title" class="gsc_a_a">Title</a></span><span id="gsc_a_nn">1&ndash;100</span></th><th class="gsc_a_c" scope="col"><span id="gsc_a_ca"><a href="/citations?hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;view_op=list_works" class="gsc_a_a">Cited by</a></span></th><th class="gsc_a_y" scope="col"><span class="gsc_a_h">Year</span></th></tr></thead><tbody id="gsc_a_b"><tr class="gsc_a_tr"><td class="gsc_a_t"><a href="/citations?view_op=view_citation&amp;hl=en&amp;oe=ASCII&amp;user=noRnsu8AAAAJ&amp;pagesize=100&amp;sortby=pubdate&amp;citation_for_view=noRnsu8AAAAJ:xtoqd-5pKcoC" class="gsc_a_at">The role of OH&#8230

In [50]:
r2=requests.get('https://scholar.google.com/citations?sortby=pubdate&hl=en&user=noRnsu8AAAAJ&view_op=list_works&cstart=101&pagesize=200')

In [51]:
r2.text

'<!doctype html><head><meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="referrer" content="always"><meta name="viewport" content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2"><style>@viewport{width:device-width;min-zoom:1;max-zoom:2;}</style><meta name="format-detection" content="telephone=no"><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}#gs_top{position:relative;min-width:964px;-webkit-tap-highlight-color:rgba(0,0,0,0);}#gs_top>*:not(#x){-webkit-tap-highlight-color:rgba(204,204,204,.5);}.gs_el_ph #gs_top,.gs_el_ta #gs_top{min-width:300px;}#gs_top.gs_nscl{position:fixed;width:100%;}body,td,input{font-size:13px;font-family:Arial,sans-serif;line-height:1.24}body{background:#fff;color:#222;-webkit-text-size-adjust:100%;-moz-text-size-adjust:none;}.gs_gray{color:#777

In [57]:
#f=open('perfil_udea_101_200.html','w')
#f.write(r2.text)
#f.close()

'259'

In [71]:
r.status_code

200

Python module for Web of Science
https://github.com/enricobacis/wos

In [18]:
ws=requests.get('http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcApp=PARTNER_APP&SrcAuth=LinksAMR&KeyRecord=1040-2446&PointOfEntry=Impact&DestApp=JCR')

In [19]:
from IPython.core.display import HTML
HTML(ws.text)

In [21]:
from wos import WosClient
import wos.utils

with WosClient() as client:
    print(wos.utils.query(client, 'AU=Knuth Donald'))

Authenticated (SID: 4WbstudfwmMGwmjI4O5)


WebFault: b"Server raised fault: 'Not authorized for product: WWS'"

In [51]:
session = requests.Session()
response = session.get('https://www.webofknowledge.com/?utm_source=false&utm_medium=false&utm_campaign=false&_ga=1.266380558.1021499308.1471640029')

In [52]:
session.cookies.get_dict()

{'CUSTOMER': '"Colciencias Consortium"',
 'E_GROUP_NAME': '"Universidad de Antioquia"',
 'JSESSIONID': '4E59C91115C9E346C3BADC57F367798A',
 'SID': '"4CuaACDXlJfwufIOfLI"'}

In [53]:
HTML(response.text)

0
"Welcome to Web of Science!To continue the registration process, please verify your email address by copying and pasting the following code in the text box on the registration page:{0}If you have received this email in error, you do not need to take any action to cancel the registration process. The email account will not be verified and you will not receive any further emails.Thank you, The Thomson Reuters Team© 2015 Thomson Reuters | Terms Of Use | Privacy Policy"

0
"Welcome to Web of Science!To continue the registration process, please verify your email address by copying and pasting the following code in the text box on the registration page:{0}If you have received this email in error, you do not need to take any action to cancel the registration process. The email account will not be verified and you will not receive any further emails.Thank you, The Thomson Reuters Team"
© 2015 Thomson Reuters | Terms Of Use | Privacy Policy

0
"Welcome to Web of Science!To continue the registration process, please verify your email address by copying and pasting the following code in the text box on the registration page:{0}If you have received this email in error, you do not need to take any action to cancel the registration process. The email account will not be verified and you will not receive any further emails.Thank you, The Thomson Reuters Team"

0,1
E-mail Address: Password: Cancel Keep me signed in Forgot Password Register,

0,1
,
E-mail Address:,
Password:,
,Cancel
,Keep me signed in Forgot Password Register

0,1,2
,,Cancel

0
"Save to a Local Drive Save your history to a local drive. Once saved, close this window."

0
Open from a Local Drive Use Browse to open a locally saved history file. Please select a valid saved history file.

0,1
,
E-mail: Password: Forgot Your Password?,
E-mail:,
Password:,
,Forgot Your Password?

0,1
E-mail:,
Password:,
,Forgot Your Password?

0,1
,Cancel

0,1
E-mail Address: Please enter an E-mail Address Please enter a valid E-mail Address Retype E-mail Address: Please enter an E-mail Address Cancel,"Note: If you are already registered for a Thomson Reuters product or service, please sign in.  Why register with the Web of Science? Automatic sign in  Access saved searches and search history  Create alerts  Add references to your EndNote Library  Select a preferred starting database or product  Update your personal information"

0,1
,
E-mail Address:,
,Please enter an E-mail Address Please enter a valid E-mail Address
Retype E-mail Address:,
,Please enter an E-mail Address
Cancel,Cancel

0,1,2
,,Cancel

0,1,2,3,4
+ Add Another Field | Reset Form,+ Add Another Field | Reset Form,Topic Title Author Author Identifiers Group Author Editor Publication Name DOI Year Published Address Organization-Enhanced Conference Language Document Type Funding Agency Grant Number Accession Number PubMed ID,,


http://stackoverflow.com/questions/11322430/python-how-to-send-post-request

In [11]:
# Send a POST to the Web of Science search URL through a fresh session.
session = requests.Session()
# NOTE(review): verify=False turns off TLS certificate verification for
# this request; confirm this is intentional.
r=session.post('https://apps.webofknowledge.com/Search.do?product=WOS&SID=1BBmTCu9raQm48yBVnH&search_mode=GeneralSearch&prID=cd5039a6-6615-439d-b26b-e4ae5c62f0c7',verify=False)

* https://github.com/ckreibich/scholar.py
* https://github.com/venthur/gscholar

Udea Publications in Wos
https://apps.webofknowledge.com/Search.do?product=WOS&SID=4DH2XC6towYAGbMjoOA&search_mode=GeneralSearch&prID=4e106e04-7b30-4365-800e-c490843205e9

Save into raw text and read as a series:
http://stackoverflow.com/questions/15760856/how-to-read-a-pandas-series-from-a-csv-file

In [109]:
%%bash
echo "AB 5
CD 2
EF 4" >series.csv

In [114]:
import re
# The context manager closes the file handle once the lines are read.
with open('series.csv') as f:
    l=f.readlines()
#use re to change sep to :::

In [184]:
%%bash
cat series.csv | sed -r 's/([A-Z][A-Z]) /\1:::/' > kk.csv

In [194]:
df=pd.read_csv('kk.csv',sep=':::',names=['k','v'],engine='python')
pd.Series(df.v.values,index=df.k.values)

AB    5
CD    2
EF    4
dtype: int64

Or with numpy

In [163]:
import numpy as np
lf=np.loadtxt('kk.csv',dtype=bytes,delimiter=':::').astype(str)

In [165]:
lf

array([['AB', '5'],
       ['CD', '2'],
       ['EF', '4']], 
      dtype='<U2')

In [170]:
lf[:,1]

array(['5', '2', '4'], 
      dtype='<U2')

In [169]:
lf[:,0]

array(['AB', 'CD', 'EF'], 
      dtype='<U2')

In [171]:
pd.Series(lf[:,1],index=lf[:,0])

AB    5
CD    2
EF    4
dtype: object