<a href="https://colab.research.google.com/github/restrepo/GSProfile/blob/master/GSProfile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Scholar Profile

In [0]:
import pandas as pd
import time
pd.set_option('display.max_colwidth',200)

## Methodology
By using the  Proper credentials in the [Google Scholar Profile](https://scholar.google.com/citations?sortby=pubdate&hl=en&us)
* Export the data into a CSV file. 
* Copy and paste the web version of the publication list into a spreadsheet

The two resulting files are to be loaded below from Google Drive by usiing the `File → Publish to the web` menú into CSV of Google spreadsheets

In [0]:
import re
import unidecode

In [0]:
sp='https://docs.google.com/spreadsheets/d/e'
gs_id='2PACX-1vQ8oE7rkkqQ07WzSEq8qCciqwJu8lOx8cnH0YTn2e6HDL3DJHYF2KahjkuVv6wVSMJEPeJLfBLxRRub'
cp_id='2PACX-1vRKmxmnaxySZ7OMUNbH-BpaGB1RILM55CMfJtlJUd6XUwnrEAzUh5YkzQI7R8_fl-kRHSBpGKfXwC4O'
gs=pd.read_csv('{}/{}/pub?output=csv'.format(sp,gs_id)).reset_index(drop=True)
time.sleep(1)
cp=pd.read_csv('{}/{}/pub?output=csv'.format(sp,cp_id)).reset_index(drop=True)

Normalize data

In [0]:
gs['TitleU']=gs['Title'].apply(lambda s: unidecode.unidecode(s) if s else s)
cp['Reference']=cp['Reference'].apply(lambda s: unidecode.unidecode(s) if s else s)

In [0]:
author='[A-Z]{1,3}\s[A-Z][a-z\-]+'
AUTHOR='[A-Z]{1,3}\s[A-Z\-]+'
cp['TitleU']=cp['Reference'].apply(lambda s: re.sub(
    '([a-z0-9…\?\.\'\"\]\)])%s,{0,1}' %author,r'\1::' ,s,re.UNICODE)).str.split('::').str[0]
#Special cases
#...
#cp['TitleU']=cp['TitleU'].apply(lambda s: re.sub(
#                  '(\.\.\.)%s,{0,1}' %author,r'\1::' ,s,re.UNICODE)
#                   if re.search('\.\.\.%s[,\n]' %author,s,re.UNICODE) else s
#                  ).str.split('::').str[0]
#Full upper-case letter: NOT POSSIBLE TO DISCRIMINATE FINAL 1 or two \w
cp['TitleU']=cp['TitleU'].apply(lambda s: re.sub('(\w+)[A-Z]\s[A-Z][a-z\-]+',r'\1::',s,re.UNICODE) 
                  if re.search('\s%s[,\n]' %author,s,re.UNICODE) else s
                 ).str.split('::').str[0]
# Upper case last name, only if end in , or \n
cp['TitleU']=cp['TitleU'].apply(lambda s: re.sub('(\w+)[A-Z]\s[A-Z][A-Z\-]+',r'\1::',s,re.UNICODE) 
                  if re.search('[\s\w]%s[,\n]' %AUTHOR,s,re.UNICODE) else s
                 ).str.split('::').str[0]
# Separated by space
cp['TitleU']=cp['TitleU'].apply(lambda s: re.sub(
                '(\w)\s%s,{0,1}' %author,r'\1::' ,s,re.UNICODE      )                
                   if re.search('\s%s[,\n]' %author,s,re.UNICODE) else s
                  ).str.split('::').str[0]
#Long authors: L de la Torre
longauthor='[A-Z]{1,3}(\s[a-z]+)+\s[A-Z][a-z\-]+'
cp['TitleU']=cp['TitleU'].apply(lambda s: re.sub(
            '([a-z0-9…\?\.\'\"\]\)])%s,{0,1}' %longauthor,r'\1::' ,s,re.UNICODE)
                   if re.search('%s[,\n]' %longauthor,s,re.UNICODE) else s
                  ).str.split('::').str[0]



#cp['TitleU']=cp['Reference'].apply(lambda s: re.sub(
#    '(\.\.\.)[A-Z]+\s[A-Z][a-z\-]+,{0,1}',r'\1::',s,re.UNICODE)).str.split('::').str[0]
#Title in Caps
#Totle end with space
#Fix obvious problems
cp['TitleU']=cp['TitleU'].str.replace('([0-9\s][TGM]e)$',r'\1V')

### Prepare merge

In [0]:
gs=gs.drop_duplicates('TitleU').reset_index(drop=True)
gs.shape

(1935, 9)

In [0]:
GS=gs.merge(cp.drop('Year',axis='columns'),on='TitleU',how='left')

In [0]:
GS=GS.drop_duplicates('TitleU').reset_index(drop=True)
GS.shape

(1935, 11)

In [0]:
GS[~GS.Reference.isna()].shape

(1640, 11)

TODO: Make a similarity search for each of the GF.TitleU  `NaN`
with the list of cp.TitleU

In [0]:
GS['Cites']=GS['Cites'].apply(lambda s: int(s.replace('*','')) if isinstance(s,str) else 0)

In [0]:
GS[:2]

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,TitleU,Reference,Cites
0,"Samia, A; Feddi, E; Duque, CA; Mora-Ramos, ME; Akimov, V; Correa, JD;",Optoelectronic properties of phosphorene quantum dots functionalized with free base porphyrins,Computational Materials Science,171.0,,109278,2020.0,Elsevier,Optoelectronic properties of phosphorene quantum dots functionalized with free base porphyrins,"Optoelectronic properties of phosphorene quantum dots functionalized with free base porphyrinsA Samia, E Feddi, CA Duque, ME Mora-Ramos, V Akimov, JD Correa\nComputational Materials Science 171, 1...",0
1,"Calle, Julián; Restrepo, Diego; Zapata, Óscar;",Dirac neutrino mass generation from a Majorana messenger,Physical Review D,101.0,3.0,35004,2020.0,American Physical Society,Dirac neutrino mass generation from a Majorana messenger,"Dirac neutrino mass generation from a Majorana messengerJ Calle, D Restrepo, O Zapata\nPhysical Review D 101 (3), 035004",1


In [0]:
GS.to_json('GS.json')

### Search for similar Titles
For failed merge entries (with `Reference` `NaN`), search for a similar `TitleU` on `cp`

In [0]:
from fuzzywuzzy import process

In [0]:
GS[GS.Reference.isna()].reset_index(drop=True).shape

(295, 11)

In [0]:
def extractOne(row,df=cp,rmin=90):
    
    cites=0
    try:
        kk=process.extractOne(row['TitleU'],cp['TitleU'])
    except:
        kk=['',0,0]
    #print('**',kk,kk[2])
    if kk[1]>rmin:
        cites=cp.loc[kk[2],'Cites']
        #Be sure cites is not string
        if isinstance(cites,str):
            cites=cites.replace('*','')
            try:
                cites=float(cites)
            except:
                print('WARNING: weird cites {} at {} of df'.format(cites,kk[2]))
                cites=0
        #nan>0 → False
        if cites>0:
             cites=cites
        else:
            cites=0
    #cites either int or 0
    return int(cites)

WARNING: similarity search is rather slow...

In [0]:
#kk=GS[GS.Reference.isna()].reset_index(drop=True)
#nan is a float 
GS['New_Cites']=GS.apply(lambda row: extractOne(row,cp) 
              if isinstance(row['Reference'],float) 
              else row['Cites'],axis=1 
             )

In [0]:
GS.shape

(1935, 12)

Test only NaN References changed

In [0]:
import sys
if GS[GS['New_Cites']!=GS['Cites'] ].shape[0]==GS[np.logical_and( 
      GS['New_Cites']!=GS['Cites'],
              GS['Reference'].isna()) ].shape[0]:
    GS['Cites']=GS['New_Cites']
    GS=GS.drop(['New_Cites','TitleU'],axis='columns')
else:
    sys.exit('ERROR in similarity search!')

Save to JSON

In [0]:
GS.to_json('GS.json')

In [0]:
new_cites=GS.Cites.sum()
total_cites=cp.Cites.apply(lambda s: s.replace('*','') if isinstance(s,str) else s
              ).astype(float
        ).apply(lambda f: 0 if pd.np.isnan(f) else f).sum()

In [0]:
print('{} recovered cites from TOTAL: {}'.format(new_cites,int(total_cites)))

24028 recovered cites from TOTAL: 24282


In [0]:
GS[:2]

Unnamed: 0,Authors,Title,Publication,Volume,Number,Pages,Year,Publisher,Reference,Cites
0,"Samia, A; Feddi, E; Duque, CA; Mora-Ramos, ME; Akimov, V; Correa, JD;",Optoelectronic properties of phosphorene quantum dots functionalized with free base porphyrins,Computational Materials Science,171.0,,109278,2020.0,Elsevier,"Optoelectronic properties of phosphorene quantum dots functionalized with free base porphyrinsA Samia, E Feddi, CA Duque, ME Mora-Ramos, V Akimov, JD Correa\nComputational Materials Science 171, 1...",0
1,"Calle, Julián; Restrepo, Diego; Zapata, Óscar;",Dirac neutrino mass generation from a Majorana messenger,Physical Review D,101.0,3.0,35004,2020.0,American Physical Society,"Dirac neutrino mass generation from a Majorana messengerJ Calle, D Restrepo, O Zapata\nPhysical Review D 101 (3), 035004",1


# Make analysis

In [0]:
GS=pd.read_json('GS.json').reset_index(drop=True)

### Articles per Year:
Articles/Citations for:

In [0]:
for Y in [2020,2019,2018,2017,2016,2015,2014,2013,2012,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000]:
    print('{}: {}/{}'.format(Y,GS[GS.Year==Y].shape[0],GS[GS.Year==Y]['Cites'].sum()) )

2020: 20/1
2019: 121/184
2018: 97/336
2017: 95/732
2016: 95/896
2015: 109/1139
2014: 97/3177
2013: 105/1909
2012: 95/1232
2010: 89/1670
2009: 121/1532
2008: 83/1713
2007: 73/1001
2006: 96/1812
2005: 71/642
2004: 55/483
2003: 66/604
2002: 42/567
2001: 42/489
2000: 44/700
