<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/Query_CTR-FCEN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Búsquedas WOS+SCI+SCP+PTJ+CTR para FCEN

Búsquedas en bases bibliográficas  
* Web of Science (WOS), 
* Scielo (SCI)
* Scopus  (SCP)
* Puntaje UdeA (PTJ)
* Center (CTR)
de los artículos científicos de la UdeA

La base de datos se creó con:

[WOS_SCI_SCP_PTJ_CTR.ipynb](./WOS_SCI_SCP_PTJ_CTR.ipynb)

In [1]:
import os
# NOTE(review): VERSION is not read anywhere in the visible part of the notebook — confirm it is still needed.
VERSION='NEW'
# A working directory of '/content' is used as the signal to install the dependencies
# (presumably a Google Colab session — confirm); pip's standard output is discarded.
if os.getcwd()=='/content':
    !pip install openpyxl xlrd wosplus fuzzywuzzy[speedup] > /dev/null

## functions

In [1]:
import pandas as pd
import wosplus as wp

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet, the corresponding file is downloaded as CSV
* If it is an Excel, JSON, or text file, the file is downloaded directly

To define your own labeled IDs for public Google Drive files, edit the next cell:

In [2]:
%%writefile drive.cfg
[FILES]
WOS_SCI_SCP_PTJ_CTR.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j

Overwriting drive.cfg


##  Load data bases

In [3]:
# Build the wosplus helper from the labeled Google Drive file IDs stored in drive.cfg.
drive_files=wp.wosplus('drive.cfg')

In [53]:
# Label of the gzip-compressed bibliographic database, as declared in drive.cfg.
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'
# load_biblio's return value is kept in tmp but is not used in the visible notebook;
# the data of interest is read back from drive_files.biblio['WOS'] on the next line.
tmp=drive_files.load_biblio(UDEAjsonfile,compression='gzip')
# Work on an independent copy with a fresh 0..n-1 index.
UDEA=drive_files.biblio['WOS'].copy().reset_index(drop=True)



In [59]:
# Read the gzip-compressed JSON directly with pandas from the working directory;
# this replaces the UDEA frame assigned in the previous cell.
# NOTE(review): presumably relies on the file already being on disk after the previous load — confirm.
UDEA=pd.read_json('WOS_SCI_SCP_PTJ_CTR.json.gz',compression='gzip')

In [60]:
# Partition the records by whether the UDEA_authors cell is the empty string,
# then show how many rows fall on each side as (with authors, without authors).
json_column='UDEA_authors'
UDEA_NOT=UDEA.loc[UDEA[json_column]==''].reset_index(drop=True)
UDEA_YES=UDEA.loc[UDEA[json_column]!=''].reset_index(drop=True)
len(UDEA_YES),len(UDEA_NOT)

(15700, 0)

# Búsquedas

In [61]:
import pandas as pd
import numpy as np
import fuzzywuzzy.process as fwp
from fuzzywuzzy import fuzz
def flatten_if_nested(l):
    '''
    Flatten one level of nesting when `l` contains at least one list.

    Parameters
    ----------
    l : iterable
        Sequence whose items may themselves be lists.

    Returns
    -------
    `l` itself, untouched, when none of its items is a list; otherwise a
    numpy array holding the items of every sub-sequence, in order.
    '''
    # Same trigger as before: only an item of exact type `list` requests flattening.
    if not any(type(i)==list for i in l):
        return l
    flat=[]
    for sub in l:
        if isinstance(sub,(str,bytes)):
            # A bare string next to nested lists is one value, not a sequence
            # of characters to be spread out.
            flat.append(sub)
        else:
            flat.extend(sub)
    # `pd.np` was removed in pandas 2.0; use the numpy alias imported by this cell.
    return np.array(flat)
def extract_key(df,key,json_column='UDEA_authors'):
    '''
    Count, over all rows of `df`, how often each value of `key` appears in the
    list of dictionaries stored in `json_column`.

    A value is counted at most once per row. Values may be strings or lists;
    when the first value of some row is a list, every row is passed through
    flatten_if_nested before counting. Cells that are not lists are left as
    they are.

    Returns a Series indexed by value, sorted by descending count.
    '''
    def unique_values(cell):
        # Only list cells hold dictionaries to inspect.
        if type(cell)!=list:
            return cell
        return np.unique([ entry.get(key) for entry in cell if entry.get(key) ])

    per_row=df[json_column].apply(unique_values)
    # Is the first value of any row itself a list?
    first_items=per_row.str[0].apply(lambda v: v if type(v)==list else None)
    if first_items.dropna().shape[0]:
        per_row=per_row.apply(flatten_if_nested)
    # One entry per (row, value) pair.
    flat_values=per_row.apply(pd.Series).stack().values
    counts=pd.DataFrame( {key:list(flat_values)} ).groupby(key)[key].count()
    return counts.sort_values(ascending=False)
def extract_key_unique(*args,**kwargs):
    '''
    Return the distinct, non-empty values reported by extract_key as a list,
    in the order extract_key yields them (descending count).

    All arguments are forwarded unchanged to extract_key.
    '''
    counts=extract_key(*args,**kwargs)
    return [ value for value in counts.keys() if value]



# Search index of research groups ('GRUPO'), in order of first appearance.
# The normalization is written inline because get_groups is only defined in a
# later cell: calling it from here raises NameError when the notebook is run
# top to bottom on a fresh kernel.
g=[]
_seen_groups=set()   # O(1) membership test instead of scanning g for every name
for _authors in UDEA.UDEA_authors:
    if type(_authors)!=list:
        # Cells without an author list contribute nothing.
        continue
    for _author in _authors:
        _raw=_author.get('GRUPO')
        if _raw and type(_raw)==str:
            # Groups may be separated by '; ' or by ', Grupo'; unify before splitting.
            for _name in _raw.replace(', Grupo','; Grupo').split('; '):
                if _name not in _seen_groups:
                    _seen_groups.add(_name)
                    g.append(_name)
grupos={'key':'GRUPO',
            'values' :g}


In [62]:
def query_json_column(q,df=UDEA,json_column='UDEA_authors',
                        choices=None,scorer=fuzz.partial_token_sort_ratio,score_cutoff=0):
    '''
    Select the rows of `df` whose `json_column` mentions the index value that
    best matches the query `q`.

    Parameters
    ----------
    q : str
        Free-text query.
    df : DataFrame
        Frame to search.
    json_column : str
        Column holding a list of dictionaries per row.
    choices : dict, optional
        Search index of the form {'key': <dictionary key>, 'values': <list>}.
        When omitted, the module-level `nombre_completo` index is used.
    scorer, score_cutoff
        Passed to fuzzywuzzy's `process.extractOne` to pick the best value.

    Returns
    -------
    DataFrame with a fresh index containing the matching rows, or an empty
    DataFrame when `extractOne` returns no match.
    '''
    if choices is None:
        # Resolved at call time: nombre_completo is created in a later cell, so
        # naming it in the signature raises NameError when this cell is run on
        # a fresh kernel.
        choices=nombre_completo
    # Step 1: fuzzy search of the query among the indexed values.
    best=fwp.extractOne(q,choices['values'],scorer=scorer,score_cutoff=score_cutoff)
    if not best:
        return pd.DataFrame()
    label=best[0]
    key=choices['key']

    def mentions_label(l):
        # Step 2: exact substring search; values are converted to strings first
        # (e.g. list → string) so list-valued keys can be searched too.
        if type(l)!=list:
            return False
        return any( label in str(d.get(key)) for d in l if d.get(key) )

    return df[df[json_column].apply(mentions_label)].reset_index(drop=True)

## Cree índices para búsquedas

In [65]:
def get_groups(l,g):
    '''
    Add to `g` the normalized group names found under the 'GRUPO' key.

    `l` is an iterable of dictionaries. Each non-empty string stored under
    'GRUPO' has ', Grupo' rewritten as '; Grupo' and is then split on '; ';
    every piece not already in `g` is appended. `g` is modified in place and
    also returned.
    '''
    for author in l:
        raw=author.get('GRUPO')
        if not raw or type(raw)!=str:
            # Missing, empty or non-string entries carry no group names.
            continue
        normalized=raw.replace(', Grupo','; Grupo')
        for name in normalized.split('; '):
            if name in g:
                continue
            g.append(name)
    return g

# Search indices. Each one pairs the dictionary key to match on ('key') with
# the distinct values found for a key in a JSON column ('values').
facultades=dict(key='FACULTAD',
                values=extract_key_unique(UDEA,'FACULTAD',json_column='UDEA_authors'))
departamentos=dict(key='DEPARTAMENTO',
                values=extract_key_unique(UDEA,'DEPARTAMENTO',json_column='UDEA_authors'))
nombre_completo=dict(key='NOMBRE COMPLETO',
                values=extract_key_unique(UDEA,'NOMBRE COMPLETO',json_column='UDEA_authors'))
full_name=dict(key='full_name',
                values=extract_key_unique(UDEA,'full_name',json_column='UDEA_authors'))
udea_affiliations=dict(key='WOS_affiliation',
                values=extract_key_unique(UDEA.fillna(''),'WOS_affiliation',json_column='UDEA_authors'))
# NOTE(review): 'key' is 'affiliation' while the values are read from
# 'WOS_affiliation' — confirm which key the authors_WOS dictionaries use.
wos_affiliations=dict(key='affiliation',
                values=extract_key_unique(UDEA,'WOS_affiliation',json_column='authors_WOS'))
udea_author=dict(key='WOS_author',
                values=extract_key_unique(UDEA.fillna(''),'WOS_author',json_column='UDEA_authors'))
wos_author=dict(key='WOS_author',
                values=extract_key_unique(UDEA,'WOS_author',json_column='authors_WOS'))

## Centro

In [66]:
# Records with at least one UDEA_authors entry whose FACULTAD contains the
# `facultades` value that best fuzzy-matches this name; score_cutoff=79 is the
# score threshold handed to the fuzzy matcher.
cen=query_json_column('Facultad de Ciencias Exactas y Naturales',df=UDEA,json_column='UDEA_authors',
                        choices=facultades,scorer=fuzz.partial_token_sort_ratio,score_cutoff=79)

In [67]:
cen.shape

(2327, 181)

## Departamento

In [68]:
extract_key(cen,'DEPARTAMENTO')[:4]

DEPARTAMENTO
Instituto de Física            884
Instituto de Biología          671
Instituto de Química           667
Departamento de Matemáticas    142
Name: DEPARTAMENTO, dtype: int64

In [69]:
# Load the per-author table through wosplus and give it a fresh 0..n-1 index.
# NOTE(review): 'UDEA_authors_with_WOS_info.json' is not among the labels written to
# drive.cfg in this notebook — confirm where its file ID is configured.
aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json').reset_index(drop=True)

In [70]:
# Copy DEPARTAMENTO out of each row's UDEA_authors cell (the lambda calls .get on the cell itself).
# NOTE(review): the trailing dropna() presumably has no lasting effect, because assigning
# a column aligns on the index again and the dropped rows come back as missing — confirm.
aunly['DEPARTAMENTO']=aunly['UDEA_authors'].apply( lambda d: d.get('DEPARTAMENTO')   ).dropna()

In [71]:
dpto=['Instituto de Física',
        'Instituto de Biología',
        'Instituto de Química',
        'Departamento de Matemáticas' ]

In [72]:
# Number of rows of aunly assigned to each department of interest.
print('DEPARTAMENTO')
for k in dpto:
    n_authors=len(aunly[aunly['DEPARTAMENTO']==k])
    print('{}:  {}'.format(k,n_authors))

DEPARTAMENTO
Instituto de Física:  33
Instituto de Biología:  33
Instituto de Química:  37
Departamento de Matemáticas:  22


### CITACIONES

In [73]:
# Citation totals per department.
# dfinst keeps, for every department in dpto, the records selected by
# query_json_column (score_cutoff=100 is the threshold handed to the fuzzy matcher);
# df summarizes their WOS (column Z9) and Scopus ('SCP_Cited by') citation sums.
dfinst={}
records=[]
for k in dpto:
    dfinst[k]=query_json_column(k,df=UDEA,json_column='UDEA_authors',
                        choices=departamentos,scorer=fuzz.partial_token_sort_ratio,score_cutoff=100)
    records.append({'DEPARTAMENTO':k,
                    'SCP Cited by':dfinst[k]['SCP_Cited by'].sum(),
                    'WOS Cited by':dfinst[k].Z9.sum()})
# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every iteration.
df=pd.DataFrame(records,columns=['DEPARTAMENTO','SCP Cited by','WOS Cited by'])

# Citation sums may come back as floats; show them as integers.
ct=[c for c in df.columns if c.find('Cited')>-1]
for c in ct:
    df[c]=df[c].astype(int)
# Manually recorded Google Scholar figure (source printed below). It is attached
# by department label instead of by row position, and stored as a number so the
# column is not a mix of strings and missing values.
df.loc[df['DEPARTAMENTO']=='Instituto de Física','GS Cited by*']=22500
print('* https://scholar.google.com/citations?user=mxSOjTYAAAAJ')
df

* https://scholar.google.com/citations?user=mxSOjTYAAAAJ


Unnamed: 0,DEPARTAMENTO,SCP Cited by,WOS Cited by,GS Cited by*
0,Instituto de Física,8207,8067,22500.0
1,Instituto de Biología,6848,11562,
2,Instituto de Química,7089,7078,
3,Departamento de Matemáticas,387,306,


In [76]:
pd.set_option('display.max_rows', 500)

In [77]:
extract_key(dfinst['Instituto de Física'],'full_name')

full_name
DUQUE ECHEVERRI CARLOS ALBERTO         261
MORALES ARAMBURO ALVARO LUIS            86
BARRERA RAMIREZ JOHN FREDY              62
BARRERO MENESES CESAR AUGUSTO           59
RESTREPO CARDENAS JOHANS                57
MORA RAMOS MIGUEL EDUARDO               49
REYES GOMEZ ERNESTO AMADOR              49
SANZ VICARIO JOSE LUIS                  46
ARNACHE OLMOS OSCAR LUIS                40
RESTREPO QUINTERO DIEGO ALEJANDRO       37
PONCE GUTIERREZ WILLIAM ANTONIO         34
HENAO HENAO RODRIGO DE JESUS            32
MAZO ZULUAGA JOHANN                     31
OSORIO GUILLEN JORGE MARIO              29
RODRIGUEZ REY BORIS ANGHELO             25
OSORIO VELEZ JAIME ALBERTO              23
ZULUAGA CALLEJAS JORGE IVAN             23
RAIGOZA BOHORQUEZ NICOLAS FERNANDO      22
ZAPATA NOREÑA OSCAR ALBERTO             22
GARCIA TELLEZ KAREN EDILMA              20
LOPEZ RIOS SONIA YANETH                 19
PACHON CONTRERAS LEONARDO AUGUSTO       18
RUEDA MUNOZ EDGAR ALBERTO               17
M

In [99]:
# Show the tmp_author values of the aunly rows belonging to department number i
# of dpto, sorted by that column.
i=0
print(dpto[i])
aunly[aunly['DEPARTAMENTO']==dpto[i]].sort_values('tmp_author').reset_index(drop=True).tmp_author

Instituto de Física


0              ARNACHE OLMOS OSCAR LUIS
1            BARRERA RAMIREZ JOHN FREDY
2         BARRERO MENESES CESAR AUGUSTO
3         CUARTAS RESTREPO PABLO ANDRES
4        DUQUE ECHEVERRI CARLOS ALBERTO
5          FERRIN VAZQUEZ IGNACIO RAMON
6         GIRALDO CADAVID MARCO ANTONIO
7          HENAO HENAO RODRIGO DE JESUS
8       JARAMILLO ARANGO DANIEL ESTEBAN
9     JARAMILLO GALLEGO JOHNY ALEXANDER
10      LONDOÑO BADILLO FERNANDO ANDRES
11          MAHECHA GOMEZ JORGE EDUARDO
12                  MAZO ZULUAGA JOHANN
13               MIRA AGUDELO ALEJANDRO
14         MORALES ARAMBURO ALVARO LUIS
15            MUÑOZ CUARTAS JUAN CARLOS
16           OSORIO GUILLEN JORGE MARIO
17           OSORIO VELEZ JAIME ALBERTO
18    PACHON CONTRERAS LEONARDO AUGUSTO
19      PONCE GUTIERREZ WILLIAM ANTONIO
20             RESTREPO CARDENAS JOHANS
21    RESTREPO QUINTERO DIEGO ALEJANDRO
22           REYES GOMEZ ERNESTO AMADOR
23               RICAURTE AVELLA GERMAN
24          RODRIGUEZ REY BORIS ANGHELO


In [105]:
pd.set_option('display.max_colwidth',2000)

In [113]:
# Records whose AU field contains 'Ponce, W', restricted to the columns needed for the check.
# NOTE(review): this rebinds `wp`, which until here is the alias of the wosplus module
# (`import wosplus as wp`); re-running the cell that calls wp.wosplus(...) after this one fails.
wp=UDEA[UDEA.AU.str.contains('Ponce, W')][['TI','SO','PY','AU','authors_WOS','UDEA_authors']].reset_index(drop=True)

In [129]:
# Sorted array of the distinct 'WOS_author' values found across all rows of aunly:
# each row's value is spread into columns with pd.Series, stacked, then de-duplicated.
# NOTE(review): presumably 'WOS_author' holds a list of name variants per author — confirm.
aus=np.unique( aunly['UDEA_authors'].str['WOS_author'].apply(
    pd.Series).stack().values)

In [134]:
list( aus )

['AGUDELO OCHOA, GLORIA MARIA',
 'AGUDELO SANTAMARIA, JOHN',
 'AGUDELO, JOHN',
 'AGUIRRE, BEATRIZ',
 'ALVAREZ SANCHEZ, GONZALO',
 'AMARILES, PEDRO',
 'AMELL, ANDRES',
 'ARANGO VIANA, JUAN CARLOS',
 'ARISTIZABAL RIVERA, JUAN CARLOS',
 'ARREDONDO HOLGUIN, EDITH',
 'ATEHORTUA, LUCIA',
 'ATEHORTUA, Lucia',
 'Aceituno Bocanegra, Francisco Javier',
 'Aceituno, F. J.',
 'Aceituno, Francisco',
 'Aceituno, Francisco J.',
 'Aceituno, Francisco Javier',
 'Acevedo, Sandra P.',
 'Acevedo-Toro, Paola',
 'Acosta Cardenas, Alejandro',
 'Acosta, Alejandro',
 'Acosta, Carlos Alberto Palacio',
 'Adriana, Pabon',
 'Aedo, J. E.',
 'Aedo, Jose E.',
 'Afuwape, Anthony Uyi',
 'Agudelo Escobar, Lina Maria',
 'Agudelo Florez, Sergio',
 'Agudelo Jaramillo, Bernardo',
 'Agudelo Ochoa, A. M.',
 'Agudelo Ochoa, Ana Maria',
 'Agudelo Ochoa, G. M.',
 'Agudelo Ochoa, Gloria Maria',
 'Agudelo Rendon, Pedro Antonio',
 'Agudelo Santamaria, Andres Felipe',
 'Agudelo Suarez, Andres A.',
 'Agudelo Suarez, Andres Alonso',
 '

In [148]:
# For every selected record, take the first piece of AU (split on newlines) that contains
# 'Ponce, W' and fuzzy-match it against the known author-name variants in aus.
# NOTE(review): the [0:-1] slice discards the last piece of the split — presumably an empty
# string after a trailing newline; confirm that it never drops a real author.
# NOTE(review): the [0] raises IndexError if no remaining piece contains 'Ponce, W'.
wp.apply(lambda r: fwp.extractOne(  [ a for a in r['AU'].split('\n')[0:-1] 
                       if a.find('Ponce, W')>-1][0], list( aus),
                        scorer=fuzz.partial_token_sort_ratio) ,axis=1 )                                  

0     (Ponce, William A., 88)
1     (Ponce, William A., 88)
2     (Ponce, William A., 88)
3     (Ponce, William A., 88)
4     (Ponce, William A., 88)
5     (Ponce, William A., 88)
6     (Ponce, William A., 88)
7     (Ponce, William A., 88)
8     (Ponce, William A., 88)
9     (Ponce, William A., 88)
10    (Ponce, William A., 88)
11    (Ponce, William A., 88)
12    (Ponce, William A., 88)
13    (Ponce, William A., 88)
14    (Ponce, William A., 88)
15    (Ponce, William A., 88)
16    (Ponce, William A., 88)
17    (Ponce, William A., 88)
18    (Ponce, William A., 88)
19    (Ponce, William A., 88)
20    (Ponce, William A., 88)
21    (Ponce, William A., 88)
22    (Ponce, William A., 88)
23    (Ponce, William A., 88)
24    (Ponce, William A., 88)
25    (Ponce, William A., 88)
26    (Ponce, William A., 88)
27    (Ponce, William A., 88)
28    (Ponce, William A., 88)
29    (Ponce, William A., 88)
30    (Ponce, William A., 88)
31    (Ponce, William A., 88)
32    (Ponce, William A., 88)
33    (Pon

In [153]:
# Check against PTJ: records whose AU field contains 'Romano, A', restricted to the
# title, source, year and author columns used for the comparison.
ra=UDEA[UDEA.AU.str.contains('Romano, A')][['TI','SO','PY','AU','authors_WOS','UDEA_authors']].reset_index(drop=True)