<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+GS+LNS
Merge the bibliographic datasets for 
* Web of Science, 
* Scielo 
* Scopus 
* Google Scholar
* Puntaje
* Lens
of the scientific articles of Universidad de Antioquia

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

## functions

In [1]:
import unidecode
def get_author_info(x):
    sep='; '
    authors=[{'WOS_author':x[0].split(sep)[0],'affiliation':[x[0].split(sep)[-1]],'i':0}]
    iau=1
    for y  in x:
        y2=y.replace('[','').replace('] ',sep).split(sep)
        for z in y2[:-1]:
            aulist=[ d.get('WOS_author') for d in authors]
            if z not in aulist:
                authors.append({'WOS_author':z,'affiliation':[y2[-1]],'i':iau})
                iau=iau+1
            else:
                if y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]:
                    index_author=[ d.get('i') for d in authors if d.get('WOS_author')==z][0]
                    authors[index_author]['affiliation'].append(y2[-1])
    return authors

def dictionary_list_add_columns(df,df_dl,df_dl_key,df_dl_i,df_columns):
    '''
    For a
     df: Pandas DataFrame 
    with a:
     df_dl: column of list of dictionaries, with
     df_dl_key: dictionary key: e.g x=[{df_dl_key:1},{df_dl_key:2}]
    for the element df_dl_i of the list:
    Update the dictionary with:
        df_dl_key==x[df_dl_i][df_dl_key]
    with the dictionaries { df_columns[i]: df_columns[i].values }
    '''
    dff=df.copy()
    for key in df_columns:
        tmp=dff[df_dl].combine(dff[key],
                func=lambda x,y: y if pd.isna(y) 
                                   else 
                                     [z.update({key:y}) 
                                     if z.get(df_dl_key)==x[df_dl_i][df_dl_key] 
                                     else z 
                                 for z in x  ] )
    return dff

def split_full_names(y,full_name='full_name'):
    """
    From an input dictionary with {full_name:'APPELLIDO1 APPELLIDO2 NOMBRES'}
    Obtain a dictionary with the several name parts.
    """    
    yy=y.get(full_name).title()
    lfn=len(y[full_name].split())
    aps=0
    d={ 'PRIMER APELLIDO':yy.split()[aps] }
    aps=aps+1
    if lfn>=4:
        names=-2
        if lfn==5: # Extra name or last name
            yyy=yy.split()
            ll=pd.np.array( [len(n) for n in yyy ] )
            if ll[3:][ ll[3:]  <= 3 ].shape[0]:
                # last_names first_first_name de(l) second_first_name
                yy=' '.join( [ y for y in yyy if len(y)>=3] )
            else: 
                # first_last name de(l) second_last_name first_names
                tmpll=yyy.pop() # internal memory
                yy=' '.join( yyy )  
        if len( d['PRIMER APELLIDO'] )<=3:
            d['PRIMER APELLIDO']=d['PRIMER APELLIDO']+' '+yy.split()[aps]
            aps=aps+1
            d.update(  {'SEGUNDO APELLIDO':yy.split()[aps]} )
            names=names+1
            
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        aps=aps+1
        if len( d['SEGUNDO APELLIDO'] )<=3:
            d['SEGUNDO APELLIDO']=d['SEGUNDO APELLIDO']+' '+yy.split()[aps]
            if names==-2:
                names=names+1
    elif lfn>=3:        
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        names=-1
    else: #Colombian interpretation (TODO: Includes Brazilian interpretation)    
        names=-1
    d.update({'NOMBRES':' '.join( yy.split()[names:]),
              'INICIALES':' '.join( [z[0]+'.' for z in yy.split()[names:]] ),
              })
    if not d.get('SEGUNDO APELLIDO'):
        d['SEGUNDO APELLIDO']=''
    #if not d.get('NOMBRE COMPLETO'):
    #    d['NOMBRE COMPLETO']=''        
    return d

# Creates mask Search key in a list of dictionay
# First apply convert null values to string
# Second apply: implement a mask
def find_key_in_list_of_dictionaries(df,column,key,pattern):
    return df[column].apply(lambda x: 
                [ '' if pd.isnull( y.get(key)) else y for y in x ]  ).apply(
                            lambda x: 
                [ True if y.get(key).find(pattern)>-1 else False for y in x  ][0]  )

def key_contains_in_list_of_dictionaries(df,pattern,column='authors_WOS',key='WOS_author'):
    #TODO: loop in column len
    i=0
    r=df[ df[column].str[i].apply(lambda x: {} if pd.isnull(x) else x).apply(
                       lambda x: x.get(key) if x else '').str.contains(
        pattern) ][column].reset_index(drop=True)
    return r

In [2]:
import wosplus as wp
import numpy as np
import pandas as pd
import unidecode
import os
from IPython.display import clear_output
pd.set_option('display.max_colwidth',200)

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet the corresponding file is downloaded as CSV
* If it is in excel or text file the file is downloaded  directly

To define your  own labeled IDs for public google drive files edit the next cell:

In [3]:
%%writefile drive.cfg
[FILES]
UDEA_WOS.xlsx       = 1px2IcrjCrkyu7t78Q7PAE5nzV_yuPt9t
UDEA_SCI.xlsx       = 1pWMY5P72j0Ca6D-cm7dn7Q4TBGTs4PWV
UDEA_SCP.xlsx       = 1ulCsFHzDiTmuL9TH8F58ulh0u8Z2ylKh
UDEA_WOS_SCI_SCP.xlsx   = 1o9otmklgh-0w18Avv2ZTKOXr3vZbjwvj
UDEA_WOS_SCI_SCP.json=1RTDCh5pl0vapjJT_e9ZwadHPGBKGGv6Y
UDEA_WOS_SCI_SCP.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs
UDEA_authors_with_WOS_info.json=1o1eVT4JD0FMMICq_oxrTJOzWh47veBMw

Writing drive.cfg


##  Load data bases

In [4]:
affil='Univ Antioquia'
drive_files=wp.wosplus('drive.cfg')
RECOVER=False
if RECOVER==True:
    print('Go to: http://fisica.udea.edu.co:4443/user/restrepo/notebooks/prog/medicion/medicion/cienciometria/WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Load-output-restuls-of-previous-Cell-runs')

In [5]:
if os.path.exists('UDEA_WOS_SCI_SCP.json'):
    UDEA=pd.read_json('UDEA_WOS_SCI_SCP.json')
else:    
    UDEA=drive_files.read_drive_json('UDEA_WOS_SCI_SCP.json')

In [6]:
UDEA.Tipo.unique()

array(['WOS', 'SCI_SCP', 'WOS_SCP', 'WOS_SCI_SCP', 'SCP', 'WOS_SCI',
       'SCI'], dtype=object)

In [7]:
for t in UDEA.Tipo.unique():
    print( '{}:{}'.format( t, UDEA[ UDEA.Tipo==t].shape[0] ) )

WOS:1884
SCI_SCP:1622
WOS_SCP:5824
WOS_SCI_SCP:773
SCP:2584
WOS_SCI:147
SCI:2892


## Extract  affiliation from C1

In [8]:
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

# Improve normalization: remove C1s with only affiliation (from Scielo)
UDEA['authors_WOS']=UDEA['authors_WOS'].apply( 
    lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x )

## Load trained old data 

In [9]:
if os.path.exists('WOS_SCP_UDEA_SJR_SIU.xlsx'):
    SIU=pd.read_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
else:    
    SIU=drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')

In [10]:
SIU.Tipo.unique()

array(['WOS+SCP', 'WOS+SCP+UDEA', 'WOS+SCI+SCP', 'WOS+SCI+SCP+UDEA',
       'SCI+SCP', 'SCI+SCP+UDEA', 'SCP', 'SCP+UDEA', 'WOS', 'WOS+UDEA',
       'WOS+SCI', 'WOS+SCI+UDEA', 'SCI', 'SCI+UDEA', 'UDEA'], dtype=object)

In [11]:
SIU=SIU[SIU.Tipo.str.contains('\+UDEA')].reset_index(drop=True)

In [12]:
SIU.columns.values

array(['AB', 'AF', 'AR', 'AU', 'BA', 'BE', 'BF', 'BN', 'BP', 'C1', 'CA',
       'CL', 'CR', 'CT', 'CTR_CATEGORÍA G', 'CTR_CATEGORÍA I',
       'CTR_Centro', 'CTR_CÉDULA', 'CTR_GRUPO', 'CTR_LINK CVLAC',
       'CTR_NOMBRE', 'CTR_Nivel de Formación', 'CTR_Tipo de Vinculación',
       'CTR_vinculación GrupLAC', 'CY', 'Clasificación 2016', 'D2', 'DE',
       'DI', 'DT', 'EI', 'EM', 'EP', 'FU', 'FX', 'GA', 'GP', 'HO', 'ID',
       'IS', 'ISSN', 'J9', 'JI', 'LA', 'MA', 'NR', 'OI', 'PA', 'PD', 'PG',
       'PI', 'PM', 'PN', 'PT', 'PU', 'PY', 'RI', 'RP', 'SC', 'SCI_AB',
       'SCI_AU', 'SCI_BP', 'SCI_C1', 'SCI_C2', 'SCI_CR', 'SCI_DE',
       'SCI_DI', 'SCI_DT', 'SCI_EC', 'SCI_EM', 'SCI_EP', 'SCI_IS',
       'SCI_LA', 'SCI_NR', 'SCI_OI', 'SCI_PA', 'SCI_PD', 'SCI_PI',
       'SCI_PT', 'SCI_PU', 'SCI_PY', 'SCI_RI', 'SCI_SC', 'SCI_SN',
       'SCI_SO', 'SCI_TC', 'SCI_TI', 'SCI_U1', 'SCI_U2', 'SCI_UT',
       'SCI_VL', 'SCI_X1', 'SCI_X4', 'SCI_X5', 'SCI_Y1', 'SCI_Y4',
       'SCI_Y5', 'SCI_Z1', 'S

TODO: def: Convert value string list into a list of dictionaries

In [13]:
SIU['UDEA_autores']=SIU['UDEA_autores'].str.replace('  ',' ')

In [14]:
SIU['UDEA_authors']=SIU.UDEA_autores.str.split(';').apply(lambda x: [{'full_name':y} for y in x ])

## Merge with official researcher list

In [15]:
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [16]:
AU_columns=list( AU.columns.values )

In [17]:
AU['name_tmp']=(AU['PRIMER APELLIDO']+' '+AU['SEGUNDO APELLIDO']+' '+AU['NOMBRES']).str.lower().str.strip().apply( 
    unidecode.unidecode )

In [18]:
maxau=SIU['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]).apply(len).max()

In [19]:
kkn=SIU.copy()

In [20]:
newcolumns=['name_tmp']+AU_columns
for i in range(maxau):
    print(i)
    kkn['name_tmp']=kkn['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]
                            ).str[i].apply( lambda x: unidecode.unidecode( x.lower().strip()) 
                                                      if not pd.isna(x) else x)
    if not kkn[~kkn['name_tmp'].isna()].empty:
        kkn=kkn.merge(AU[newcolumns],on='name_tmp',how='left').reset_index(drop=True)
        kkn=dictionary_list_add_columns(kkn,'UDEA_authors','full_name',i,AU_columns)
        kkn=kkn.drop(newcolumns,axis='columns')

0
1
2
3
4
5
6
7
8
9


https://stackoverflow.com/a/29530601/2268280

BUG: Cells filled with nans

In [21]:
kkn.shape,SIU.shape

((7916, 205), (7916, 205))

In [22]:
SIU=kkn.copy()

In [23]:
AU[ AU['NOMBRE COMPLETO'].str.contains('Oscar Alberto Zapata Noreña') ]

Unnamed: 0,CÉDULA,NOMBRE COMPLETO,NOMBRES,PRIMER APELLIDO,SEGUNDO APELLIDO,FACULTAD,DEPARTAMENTO,GRUPO,name_tmp
234,15386534,Oscar Alberto Zapata Noreña,Oscar Alberto,Zapata,Noreña,Facultad de Ciencias Exactas y Naturales,Instituto de Física,Grupo de Fenomenologia de Interacciones Fundamentales,zapata norena oscar alberto


Quality check

In [24]:
SIU[ find_key_in_list_of_dictionaries(SIU,'UDEA_authors','full_name','ZAPATA') ].UDEA_authors.loc[241]

[{'CÉDULA': 15386534.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'Oscar Alberto Zapata Noreña',
  'NOMBRES': 'Oscar Alberto ',
  'PRIMER APELLIDO': 'Zapata',
  'SEGUNDO APELLIDO': 'Noreña',
  'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO'},
 {'CÉDULA': 8287417.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'William Antonio Ponce Gutierrez',
  'NOMBRES': 'William Antonio ',
  'PRIMER APELLIDO': 'Ponce',
  'SEGUNDO APELLIDO': 'Gutierrez',
  'full_name': 'PONCE GUTIERREZ WILLIAM ANTONIO'}]

In [25]:
SIU[ find_key_in_list_of_dictionaries(SIU,'UDEA_authors','full_name','PONCE')
   ].UDEA_authors.reset_index(drop=True).loc[0]

[{'CÉDULA': 8287417.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'William Antonio Ponce Gutierrez',
  'NOMBRES': 'William Antonio ',
  'PRIMER APELLIDO': 'Ponce',
  'SEGUNDO APELLIDO': 'Gutierrez',
  'full_name': 'PONCE GUTIERREZ WILLIAM ANTONIO'}]

## Merge with trained data set

Merge requires split in DI and TI

In [26]:
SIUDI=SIU[~SIU.DI.isna()].drop_duplicates('DI').reset_index(drop=True)
SIUTI=SIU[ SIU.DI.isna()].drop_duplicates('TI').reset_index(drop=True)
SIUTI=SIUTI[SIUTI!=''].reset_index(drop=True)
SIUTI=SIUTI[~SIUTI.TI.isnull()].reset_index(drop=True)
SIUTI=SIUTI[ SIUTI.TI.apply(len)>20 ].reset_index(drop=True)

  result = method(y)


In [27]:
udea_columns=[       'UDEA_autores',
       'UDEA_año realiz', 'UDEA_doi', 'UDEA_fecha aplicación',
       'UDEA_idioma', 'UDEA_item adic', 'UDEA_material', 'UDEA_nombre',
       'UDEA_nombre revista o premio', 'UDEA_nro autores', 'UDEA_país',
       'UDEA_procodigo', 'UDEA_ptos', 'UDEA_simple_doi', 'UDEA_título',
       'UDEA_valor item','UDEA_authors']

In [28]:
UDEADI=UDEA[UDEA.DI!=''].drop_duplicates('DI').reset_index(drop=True)
UDEATI=UDEA[UDEA.DI==''].drop_duplicates('TI').reset_index(drop=True)

In [29]:
UDEA_mergeDI=UDEADI.merge( SIUDI[ ['DI']+udea_columns ],on='DI',how='left' )

In [30]:
UDEADI.shape,UDEA_mergeDI.shape

((8035, 152), (8035, 169))

In [31]:
UDEA_PTJ=UDEA_mergeDI[~UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_mergeDI[UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)

In [32]:
UDEATI['tmptitle']=UDEATI.TI.str.strip()
SIUTI['tmptitle']=SIUTI.TI.str.strip()

In [33]:
kk=UDEATI.merge( SIUTI[ ['tmptitle']+udea_columns ],on='tmptitle',how='left' ).drop('tmptitle',axis='columns')

In [34]:
UDEA_PTJ=UDEA_PTJ.append( kk[ ~kk.UDEA_autores.isna() ] ).reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_PTJ_NOT.append( kk[ kk.UDEA_autores.isna() ] ).reset_index(drop=True)

In [35]:
UDEA_PTJ.shape[0]+UDEA_PTJ_NOT.shape[0],UDEA.shape

(15722, (15726, 152))

In [36]:
UDEA_PTJ.shape,UDEA_PTJ_NOT.shape

((7074, 169), (8648, 169))

In [37]:
UDEA_PTJ.loc[[0,1]]

Unnamed: 0,AB,AF,AR,AU,BA,BE,BF,BN,BP,C1,...,UDEA_nombre,UDEA_nombre revista o premio,UDEA_nro autores,UDEA_país,UDEA_procodigo,UDEA_ptos,UDEA_simple_doi,UDEA_título,UDEA_valor item,UDEA_authors
0,,,,"Ramírez, Juan David\nGiraldo, Santiago Patiño\nArango, Marcos\n",,,,,,,...,PATIÑO GIRALDO SANTIAGO,ver UDEA_material,3.0,46.0,53961.0,3.6,0.0,"LINFOMA PRIMARIO DEL CORAZON, CAUSA POCO COMUN DEL SINDROME DE FALLA CARDIACA. (PRIMARY CARDIAC LYMPHOMA, AN UNCOMMON CAUSE OF HEART FAILURE).",SOCIEDAD COLOMBIANA DE CARDIOLOGIA Y CIRUGIA CARDIOVASCULAR.,"[{'full_name': 'PATIÑO GIRALDO SANTIAGO', 'FACULTAD': 'Facultad de Medicina', 'PRIMER APELLIDO': 'Patiño', 'NOMBRE COMPLETO': 'Santiago Patiño Giraldo', 'SEGUNDO APELLIDO': 'Giraldo', 'DEPARTAMENT..."
1,,,,"Molina, Marcela\nPalacio, Juan David\nVargas, Cristian\nDíaz-Zuluaga, Ana María\nAgudelo Berruecos, Yuli\nOspina, Sigifredo\nLópez-Jaramillo, Carlos\n",,,,,,,...,LOPEZ JARAMILLO CARLOS ALBERTO,REVISTA COLOMBIANA DE PSIQUIATRIA,7.0,46.0,55920.0,4.3,0.0,DESEMPENO NEUROCOGNITIVO DE PACIENTES CON TRASTORNO AFECTIVO BIPOLAR TIPO I EN EUTIMIA CON Y SIN ANTECEDENTE DE PSICOSIS DE UN PROGRAMA DE INTERVENCION MULTIMODAL: PRISMA,,"[{'full_name': 'VARGAS UPEGUI CRISTIAN DAVID', 'FACULTAD': 'Facultad de Medicina', 'PRIMER APELLIDO': 'Vargas', 'NOMBRE COMPLETO': 'Cristian David Vargas Upegui', 'SEGUNDO APELLIDO': 'Upegui', 'DE..."


In [38]:
UDEA=UDEA_PTJ.append(
    UDEA_PTJ_NOT).reset_index(
    drop=True)

In [39]:
key_contains_in_list_of_dictionaries(UDEA,'Restrepo, D',column='authors_WOS',key='WOS_author').loc[1:2]

1                                                                        [{'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.'], 'i': 0, 'WOS_author': 'Restrepo, D.'}]
2    [{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'i': 0, 'WOS_author': 'Restrepo, Diego'}, {'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'i': 1, '...
Name: authors_WOS, dtype: object

In [40]:
UDEA.to_json('UDEAtmp.json')

In [41]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Add `UDEA.authors_WOS` info* within `UDEA.UDEA_authors` data**
(\*) obtained from `UDEA.C1`

(\*\*) Obtained from [puntaje trained old UDEA data](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-trained-data-set) and the [official researcher list](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-official-researcher-list)

Obtain name parts and initials from full name in `UDEA_authors` dictionary and update `UDEA_authors` with them

In [42]:
# tmp is for update dict
dictupdatetmp=UDEA['UDEA_authors'].apply(lambda x: [y.update( 
                split_full_names(y,full_name='full_name')  ) if not pd.isnull(
                y.get('full_name')) else y for y in x] 
                                   if type(x)==list 
                                   else x)

In [43]:
x,y=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=lambda x,y:(x,y) ).loc[92]

In [44]:
x

[{'WOS_author': 'Echeverri Alvarez, Jonathan',
  'affiliation': ['Univ Antioquia, Antioquia, Colombia.'],
  'i': 0},
 {'WOS_author': 'Chaves Castano, Liliana',
  'affiliation': ['Univ Antioquia, Antioquia, Colombia.'],
  'i': 1}]

In [45]:
y

[{'CÉDULA': 43724493.0,
  'DEPARTAMENTO': 'Departamento de Sicología',
  'FACULTAD': 'Facultad de Ciencias Sociales y Humanas',
  'GRUPO': 'Grupo de investigación en psicologia cognitiva',
  'INICIALES': 'L.',
  'NOMBRE COMPLETO': 'Liliana Chaves Castaño',
  'NOMBRES': 'Liliana',
  'PRIMER APELLIDO': 'Chaves',
  'SEGUNDO APELLIDO': 'Castaño',
  'full_name': 'CHAVES CASTAÑO LILIANA'}]

In [46]:
def wos_names_append(wos_names,last_name,first_names,initials):
    wos_names=wos_names+[           last_name+', '+first_names]
    if len( first_names.split())>1:
        wos_names=wos_names+[ last_name+', '+first_names.split()[0] ]
    if len( first_names.split())==2 and  len(first_names.split()[-1]):
        wos_names=wos_names+[ last_name+', '+first_names.split()[-1] ]
    wos_names=wos_names+[ last_name+', '+initials]
    if len( initials.split())>1:
        wos_names=wos_names+[ last_name+', '+initials.split()[0]]
    if len(initials.split())==2:
        wos_names=wos_names+[
              last_name+', '+first_names.split()[0]+' '+initials.split()[-1] ]
        wos_names=wos_names+[last_name+', '+initials.split()[-1] ]
    return wos_names
    
def wos_names_list(dy ,y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name']   ):
    last_name= unidecode.unidecode( dy[y_keys[0]]   )
    first_names=unidecode.unidecode( dy[y_keys[1]]  )
    initials=unidecode.unidecode( dy[y_keys[2]]  )

    wos_names=[]
    wos_names=wos_names_append(wos_names,last_name,first_names,initials)
    
    if dy.get( y_keys[3] ):
        last_names= unidecode.unidecode( dy[y_keys[0]]+'-'+dy[y_keys[3]]   )
        wos_names=wos_names_append(wos_names,last_names,first_names,initials)
    return wos_names
    
def combinewos(x,y,x_keys=['WOS_author','affiliation'],
                   y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name'],
                   xy_keys=['WOS_author','WOS_affiliation']):
    if type(x)==list and type(y)==list:
        for dx in x:
            wos_name=unidecode.unidecode( dx[ x_keys[0] ] )
            WOS_affiliation= dx[x_keys[1]]
            # Try by buildinng spanish-like names list                                
            for i in range( len(y) ):
                if wos_name.title() in wos_names_list(y[i] ,y_keys):
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            for i in range( len(y) ):
                yi_to_list=unidecode.unidecode( y[i][y_keys[4]].title() )
                if yi_to_list:
                    yi_to_list=yi_to_list.split()
                else:
                    yi_to_list=[]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with initials
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]] ]+y[i][y_keys[2]].split()
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with first first name and initial
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[0],
                              y[i][y_keys[2]].split()[-1]]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
            #Try again but comparing full lists with second first name and initial
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[-1],
                              y[i][y_keys[2]].split()[0]]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
                    
                    
    return y

In [47]:
wos_names_list( split_full_names( y[0] ) )

['Chaves, Liliana',
 'Chaves, L.',
 'Chaves-Castano, Liliana',
 'Chaves-Castano, L.']

In [48]:
UDEA[UDEA.UDEA_autores.fillna('').str.contains('ZARRAZOLA RIVERA EDWIN DE JESUS')].UDEA_authors.loc[154]

[{'CÉDULA': 15513744.0,
  'DEPARTAMENTO': 'Departamento de Matemáticas',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Análisis Multivariado',
  'INICIALES': 'E. J.',
  'NOMBRE COMPLETO': 'Edwin De Jesus Zarrazola Rivera',
  'NOMBRES': 'Edwin Jesus',
  'PRIMER APELLIDO': 'Zarrazola',
  'SEGUNDO APELLIDO': 'Rivera',
  'full_name': 'ZARRAZOLA RIVERA EDWIN DE JESUS'},
 {'INICIALES': 'K.',
  'NOMBRES': 'Krishna',
  'PRIMER APELLIDO': 'Nagar',
  'SEGUNDO APELLIDO': 'Daya',
  'full_name': 'NAGAR DAYA KRISHNA'},
 {'INICIALES': 'D. S.',
  'NOMBRES': 'Del Socor',
  'PRIMER APELLIDO': 'Sanchez',
  'SEGUNDO APELLIDO': 'Herrera',
  'full_name': 'SANCHEZ HERRERA LUZ ESTELA DEL SOCOR'}]

In [49]:
#kk=UDEA['UDEA_authors'].str[0].apply( lambda d: d if type(d)==dict else pd.np.nan ).dropna()
#kk[kk.apply( lambda d: d.get('full_name') if type(d)==dict else pd.np.nan ).isna()]

In [50]:
kk=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=combinewos )

Combines the two list into a single one in  UDEA_authors

check

Full query of auhor_WOS in UDEA_authors:

In [51]:
UDEA['UDEA_authors'].apply(lambda x: [y  for y in x ][0] if type(x)==list else x).loc[92]

{'CÉDULA': 43724493.0,
 'DEPARTAMENTO': 'Departamento de Sicología',
 'FACULTAD': 'Facultad de Ciencias Sociales y Humanas',
 'GRUPO': 'Grupo de investigación en psicologia cognitiva',
 'INICIALES': 'L.',
 'NOMBRE COMPLETO': 'Liliana Chaves Castaño',
 'NOMBRES': 'Liliana',
 'PRIMER APELLIDO': 'Chaves',
 'SEGUNDO APELLIDO': 'Castaño',
 'WOS_affiliation': ['Univ Antioquia, Antioquia, Colombia.'],
 'WOS_author': ['Chaves Castano, Liliana'],
 'full_name': 'CHAVES CASTAÑO LILIANA'}

In [52]:
UDEA.to_json('UDEAtmp.json')

### Load output restuls of previous Cell runs

In [53]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [54]:
def extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return df[list_of_dictionaries].apply(lambda x: [y.get(dictionary_key) 
                                                        if y.get(dictionary_key) else []   
                                                        for y in x  ]
                           if type(x)==list else [])

def extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal list as value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
                    list_of_dictionaries,dictionary_key).apply(lambda x: 
                           [item for sublist  in x for item in sublist] 
                            if type(x)==list else x)

def mask_on_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    pattern='RESTREPO QUINTERO DIEGO ALEJANDRO',
    list_of_dictionaries='UDEA_authors',dictionary_key='full_name'):
    """
    Build a mask for a Pandas Series of list of dictionaries of label:
      list_of_dictionaries. 
    The:
      dictionary_key must be a single value like string or float
    """
    return extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
            list_of_dictionaries,dictionary_key).apply( 
            lambda x: pattern in x)

In [55]:
pattern='Restrepo, Diego'
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin 050010, Colombia.'], 'INICIALES': 'D...
1     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.'], 'INICIALES': 'D. A.', ...
2     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.'], 'INICIALES': 'D. A.', ...
3     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Oscar Alberto', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.'], 'INICIALES': 'O. A.', 'GRUPO': 'Grupo de...
4     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Oscar Alberto', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIALE

In [56]:
pattern='Restrepo, D.'
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.'], 'INICIALES': ...
1     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'Grupo ...
2     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'G...
3     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'G...
4     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIA

In [57]:
pattern

'Restrepo, D.'

In [58]:
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.'], 'INICIALES': ...
1     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'Grupo ...
2     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'G...
3     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIALES': 'D. A.', 'GRUPO': 'G...
4     [{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Diego Alejandro', 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'INICIA

In [59]:
def extract_dictionary_from_list(df,list_name,i=0):
    return df[list_name].apply( lambda l: [ d for d in l][i] if type(l)==list else l )
def extract_key_value_from_series(ds,key):
    return ds.apply( lambda d: d.get(key)if type(d)==dict else d  )

In [60]:
mask=extract_key_value_from_series( 
    extract_dictionary_from_list( UDEA,'UDEA_authors',i=0),'WOS_author').str[0].str.contains('\-').fillna(False)
UDEA[mask]['UDEA_authors'].reset_index(drop=True).str[0].loc[1]

{'CÉDULA': 87470374.0,
 'DEPARTAMENTO': 'Departamento de Microbiología y Parasitología',
 'FACULTAD': 'Facultad de Medicina',
 'GRUPO': 'Inmunovirología',
 'INICIALES': 'S.',
 'NOMBRE COMPLETO': 'Silvio Urcuqui Inchima',
 'NOMBRES': 'Silvio',
 'PRIMER APELLIDO': 'Urcuqui',
 'SEGUNDO APELLIDO': 'Inchima',
 'WOS_affiliation': ['Univ Antioquia, Sede Invest Univ, Grp Inmunoviol, Medellin, Colombia.'],
 'WOS_author': ['Urcuqui-Inchima, Silvio'],
 'full_name': 'URCUQUI INCHIMA SILVIO'}

TODO: Extract dict or value

## TODO: Build a single profile for all

flatten a list of lists:

In [61]:
ua=pd.DataFrame()

In [62]:
aumax=UDEA['UDEA_authors'].dropna().apply(len).max() 
ua=pd.DataFrame()
for i in range(aumax):
    kkk=pd.DataFrame()
    kkk['UDEA_authors']= UDEA['UDEA_authors'].str[i].dropna()
    kkk['authors_WOS']= UDEA['authors_WOS']
    kkk['SCP_Authors']=UDEA['SCP_Authors']
    kkk['tmp_str']=kkk['UDEA_authors'].astype(str)
    kkk=kkk.drop_duplicates('tmp_str')
    ua=ua.append(kkk).reset_index(drop=True)
    #ua['UDEA_authors']= 

In [63]:
ua['tmp_author']=ua['UDEA_authors'].apply( lambda d: d.get('full_name') )

BUG: `UDEA_authors` without `WOS_author key`

### Filter identified WOS articles

In [64]:
ua['authors_WOS']=ua['authors_WOS'].apply(lambda l: l if l else pd.np.nan)
ua=ua[~ua['authors_WOS'].isna()].reset_index(drop=True)

In [65]:
full_names=ua['tmp_author'].unique()

In [66]:
full_names[:3]

array(['DOVER ROBERT VAN HORN', 'RADA RINCON JUAN PABLO',
       'MACHADO RODRIGUEZ GLORIA'], dtype=object)

In [67]:
ua.shape,full_names.shape

((4135, 5), (850,))

TODO: implement also SCP authors

Example

In [68]:
aunly=pd.DataFrame()
for f in full_names:
    clear_output(wait=True)
    print(f)    
    kk=pd.DataFrame( { 'tmp_author':[f]  } ).merge(
          ua[['tmp_author','UDEA_authors']],on='tmp_author',how='left')

    kk['tmp_str']=kk.UDEA_authors.astype(str)

    kk=kk.drop_duplicates('tmp_str').dropna()#[['tmp_author','UDEA_authors']]

    try:
        laff=list( kk['UDEA_authors'].apply(lambda d: d.get( 'WOS_affiliation' )
                                     ).dropna().apply(pd.Series).stack().unique() )
        lau=list( kk['UDEA_authors'].apply(lambda d: d.get( 'WOS_author' )
                                     ).dropna().apply(pd.Series).stack().unique() )
    except AttributeError:
        laff=[];lau=[]

    if len(laff)>0 and len(lau)>0:
        tmpupdate=kk['UDEA_authors'].apply(lambda d: d.update({'WOS_author':lau,'WOS_affiliation':laff}) )

        kk['tmp_str']=kk['UDEA_authors'].astype(str)

        kk=kk.drop_duplicates('tmp_str')

        kk['tmp_len']=kk['tmp_str'].apply(len)#.astype(str)

        aunly=aunly.append( kk.sort_values('tmp_len',ascending=False).drop(index=kk.index[1:]).drop(
               ['tmp_str','tmp_len'],axis='columns') ).reset_index(drop=True)

MESA HERRERA NATALIA REGINA


In [69]:
#DEBUG
aunly[aunly.tmp_author.fillna('').str.contains('DUQUE ECHEVERRI CARLOS ALBERTO')
      ].reset_index(drop=True).UDEA_authors.loc[0]

{'CÉDULA': 71617883.0,
 'DEPARTAMENTO': 'Instituto de Física',
 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
 'GRUPO': 'Grupo de Materia Condensada-UdeA',
 'INICIALES': 'C. A.',
 'NOMBRE COMPLETO': 'Carlos Alberto Duque Echeverri',
 'NOMBRES': 'Carlos Alberto',
 'PRIMER APELLIDO': 'Duque',
 'SEGUNDO APELLIDO': 'Echeverri',
 'WOS_affiliation': ['Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Medellin, Colombia.',
  'Univ Antioquia UdeA, Grp Materia Condensada UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Medellin, Colombia.',
  'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Medellin, Colombia.',
  'Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia UdeA, Grp Mat Condensada UdeA, Inst Fis, Fac C

In [70]:
ua['UDEA_authors'].loc[0]

{'INICIALES': 'V. H.',
 'NOMBRES': 'Van Horn',
 'PRIMER APELLIDO': 'Dover',
 'SEGUNDO APELLIDO': 'Robert',
 'WOS_affiliation': ['Univ Antioquia, Dept Antropol, Antioquia, Colombia.',
  'Univ Antioquia, Dept Antropol, Medellin, Colombia.',
  'Univ Antioquia, Fac Nacl Salad Publ, Medellin, Antioquia, Colombia.'],
 'WOS_author': ['Dover, Robert V. H.'],
 'full_name': 'DOVER ROBERT VAN HORN'}

In [71]:
ua['authors_WOS'].loc[0]

[{'WOS_author': 'Dover, Robert V. H.',
  'affiliation': ['Univ Antioquia, Dept Antropol, Antioquia, Colombia.'],
  'i': 0}]

In [72]:
aunly.shape

(800, 2)

In [73]:
aunly.to_json('UDEA_authors_with_WOS_info.json')

In [74]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [75]:
UDEA.shape

(15722, 169)

In [76]:
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [77]:
aunly.shape

(800, 2)

(800, 2)

## Merge UDEA with authors

In [78]:
# loop upon i and j to merge aunly in UDEA such that the full authors and affiliations are registered
#UDEA.merge

In [79]:
l=UDEA['UDEA_authors'].loc[4]
def fill_full_wos_author_info(l,WOS_df=aunly,full_name='full_name',full_name_column='tmp_author',
                               WOS_column='UDEA_authors',WOS_author='WOS_author',
                               WOS_affiliation='WOS_affiliation'):
    newl=[]
    if type(l)==list:
        for d in l:
            if d.get('WOS_author'):
                #find in aunly
                mtch=WOS_df[WOS_df[full_name_column]==d.get(full_name)].reset_index(drop=True)
                if mtch.shape[0]==1:
                    #update d
                    if mtch[WOS_column].loc[0].get(WOS_author):
                        d[WOS_author]=mtch[WOS_column].loc[0].get(WOS_author)
                        d[WOS_affiliation]=mtch[WOS_column].loc[0].get(WOS_affiliation)
            newl.append(d)
    else:
        newl=l
    return newl        

In [80]:
fill_full_wos_author_info(l,aunly)[0].get('WOS_author')

In [81]:
UDEA['UDEA_authors']=UDEA['UDEA_authors'].apply(fill_full_wos_author_info )

In [82]:
UDEA.to_json('UDEAtmp.json')

In [124]:
RECOVER=True
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## TODO: Try to merge new WOS articles from old `'UDEA_authors'`

TODO: whithout split or merge

Split DataFrame into  "new" with UDEA authors not yet identified and "old" with full UDEA author info 

In [84]:
UDEAnew=UDEA[UDEA.UDEA_authors.isna()].reset_index(drop=True)
UDEAold=UDEA[~UDEA.UDEA_authors.isna()].reset_index(drop=True)

## Merge old with news

Split "new" into the "Y" with WOS author info

In [85]:
UDEAnewY=UDEAnew[ UDEAnew.authors_WOS.apply(lambda x: type(x)==list and len(x)>0) ].reset_index(drop=True)
UDEAnewN=UDEAnew[~UDEAnew.authors_WOS.apply(lambda x: type(x)==list and len(x)>0) ].reset_index(drop=True)

In [86]:
(UDEAnewY.shape,'+',UDEAnewN.shape,'+',UDEAold.shape,'=',
   UDEAnewY.shape[0]+UDEAnewN.shape[0]+UDEAold.shape[0],'=',UDEA.shape[0])

((3432, 169), '+', (5216, 169), '+', (7074, 169), '=', 15722, '=', 15722)

In [87]:
CHECK=False
if CHECK:
    pattern='Restrepo, D.'
    mask=extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(
                UDEAnewY,list_of_dictionaries='authors_WOS',dictionary_key='WOS_author').apply( 
                lambda x: pattern in x)
    UDEAnewY[mask].authors_WOS.reset_index(drop=True).values

In [88]:
if CHECK:
    mask=mask_on_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(UDEAnewY,
            pattern='Restrepo, D.',list_of_dictionaries='authors_WOS',dictionary_key='WOS_author')                                                                       
    UDEAnewY[mask]['authors_WOS'].reset_index(drop=True).values

In [89]:
#Extract a specific author with specific affiliation from the WOS author full info into a temporal column
i=0
aff_i=0
UDEAnewY['news_i_j_str']=UDEAnewY.authors_WOS.apply(lambda x: [ {'WOS_author':y.get('WOS_author'),
                                                   'affiliation':y.get('affiliation')[aff_i]} for y in x]
                                                   ).str[i].astype(str)
UDEAnewY['news_i_j_str'].loc[0]

"{'affiliation': 'Univ Antioquia, Fac Agr Sci, Biogenesis Res Grp, Calle 70 52-21, Medellin 050010, Colombia.', 'WOS_author': 'Ortiz, T.'}"

Prepare trained data set with the j-th author: first UDEA author info into a temporal column

In [90]:
j=0
auth_j=0
aff_j=0
UDEAold['UDEA_authors_j']=UDEAold['UDEA_authors'].str[j]

In [91]:
#Extract a specific author with specific affiliation from the WOS author full info into a temporal column
UDEAold['news_i_j']=UDEAold['UDEA_authors'].apply(lambda l: [ {'WOS_author':d.get('WOS_author')[auth_j], 
                                           'affiliation':d.get('WOS_affiliation')[aff_j] } if d.get('WOS_author') 
                                         else pd.np.nan for d in l] ).str[j]#.astype(str)


In [92]:
ma=UDEAold.dropna(subset=['news_i_j']).reset_index(drop=True)

### Quality critera search:
* Affiliation split(', ') > 3
* if Author find('.') → 'Author split(' ')  > 2 

In [93]:
ma['news_i_j'].loc[2]

{'WOS_author': 'Restrepo, J.',
 'affiliation': 'Univ Antioquia, Inst Fis, Grp Magnetismo & Simulac, Medellin 1226, Colombia.'}

In [94]:
#*** May be filter Homonymous
Homonymous=False
if Homonymous:
    ma=ma[~np.logical_and( ma['news_i_j'].apply(lambda d: d.get('affiliation') 
                                              if not pd.isna(d) else '' ).str.split(', ').apply(len)<=3,
                            ma['news_i_j'].apply(lambda d: d.get('WOS_author') 
                                              if not pd.isna(d) else '' ).str.split(' ').apply(len)<=2)]
#****

In [95]:
ma['news_i_j_str']=ma['news_i_j'].astype(str)

In [96]:
#Extract all the mathches for the author
kk=UDEAnewY.merge( ma[['news_i_j_str','news_i_j']],on='news_i_j_str',how='left')#.dropna().reset_index(drop=True)#.loc[5].values

In [97]:
ma.UDEA_authors.str[i].loc[0]

{'CÉDULA': 71617883.0,
 'DEPARTAMENTO': 'Instituto de Física',
 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
 'GRUPO': 'Grupo de Materia Condensada-UdeA',
 'INICIALES': 'C. A.',
 'NOMBRE COMPLETO': 'Carlos Alberto Duque Echeverri',
 'NOMBRES': 'Carlos Alberto',
 'PRIMER APELLIDO': 'Duque',
 'SEGUNDO APELLIDO': 'Echeverri',
 'WOS_affiliation': ['Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Medellin, Colombia.',
  'Univ Antioquia UdeA, Grp Materia Condensada UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Medellin, Colombia.',
  'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Medellin, Colombia.',
  'Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia UdeA, Grp Mat Condensada UdeA, Inst Fis, Fac C

In [98]:
kk.shape,UDEAnewY.shape

((3769, 171), (3432, 170))

In [99]:
kkk=kk.dropna(subset=['news_i_j'])#.shape
kkk.shape

(377, 171)

Implementation without merge

In [None]:
#Extract a specific author with specific affiliation from the WOS author full info into a temporal column
UDEAold['news_i_j']=UDEAold['UDEA_authors'].apply(lambda l: [ {'WOS_author':d.get('WOS_author')[auth_j], 
                                           'affiliation':d.get('WOS_affiliation')[aff_j] } if d.get('WOS_author') 
                                         else pd.np.nan for d in l] ).str[j]#.astype(str)
ma=UDEAold.dropna(subset=['news_i_j']).reset_index(drop=True)
ma['news_i_j_str']=ma['news_i_j'].astype(str)

In [None]:
#Extract a specific author with specific affiliation from the WOS author full info into a temporal column
i=0
aff_i=0
UDEAnewY['news_i_j_str']=UDEAnewY['authors_WOS'].apply(lambda x: [ {'WOS_author':y.get('WOS_author'),
                                                   'affiliation':y.get('affiliation')[aff_i]} for y in x]
                                                   ).str[i].astype(str)

In [None]:
#Extract all the mathches for the author
kk=UDEAnewY.merge( ma[['news_i_j_str','news_i_j']],
                  on='news_i_j_str',how='left')#.dropna().reset_index(drop=True)#.loc[5].values

In [356]:
#x=UDEA['authors_WOS']
#y=UDEA['UDEA_authors']
i=8
x=UDEA['authors_WOS'].loc[i]
y=UDEA['UDEA_authors'].loc[i]
if ( type(x)==list and x) and type(y)!=list:
    print(x)
    print("="*20)
    print(y)

[{'i': 0, 'WOS_author': 'Monsalve-Calderon, K.', 'affiliation': ['Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas Natur, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.']}, {'i': 1, 'WOS_author': 'Gil-Corrales, A.', 'affiliation': ['Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas Natur, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.']}, {'i': 2, 'WOS_author': 'Morales, A. L.', 'affiliation': ['Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas Natur, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.']}, {'i': 3, 'WOS_author': 'Duque, C. A.', 'affiliation': ['Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas Natur, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.']}]
None


In [349]:
j=2
x[j].get('WOS_author')

'Bedoya, Gabriel'

In [350]:
import Levenshtein as lv

In [423]:
def find_author_affiliation(author,affiliation,aunly,column='UDEA_authors',
                            author_key='WOS_author',affiliation_key='WOS_affiliation',ratio=0):
    au=aunly[ aunly[column].apply(
                lambda d: d.get(author_key) if type(d)==dict else '').apply(
                  lambda l: author in l)]#.reset_index(drop=True).loc[0,column]
    if au.shape[0]>0:
        #Fast
        auf=au[au[column].apply(
                 lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                 lambda l: affiliation in l)]
        

        if auf.shape[0]>0:
              return auf.reset_index(drop=True).loc[0,column]          
        #slow
        else:
            aus=au[au[column].apply(
                 lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                 lambda l: len( [af for af in l if lv.ratio(af,affiliation) > ratio ] )>0 )]

            if aus.shape[0]>0:
                dold=aus.reset_index(drop=True).loc[0,column]
                dold[affiliation_key]=dold[affiliation_key]+[affiliation]
                #TODO: Update affiliation in aunly
                #aunly.apply(...d.update...) # d.update does not requiere 
                return dold
    else:
        return None

In [424]:
column='UDEA_authors';author_key='WOS_author';affiliation_key='WOS_affiliation'
author='Morales, A. L.'
affiliation= 'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas Natur, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.'
#author=dw.get('WOS_author');affiliation=dw.get('affiliation')[0]
au=find_author_affiliation(author,affiliation ,aunly )

In [425]:
au

{'CÉDULA': 8287418.0,
 'DEPARTAMENTO': 'Instituto de Física',
 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
 'GRUPO': 'Grupo de Estado Sólido, Instrumentación Científica y Microelectrónica, Grupo de Materia Condensada-UdeA',
 'INICIALES': 'A. L.',
 'NOMBRE COMPLETO': 'Alvaro Luis Morales Aramburo',
 'NOMBRES': 'Alvaro Luis',
 'PRIMER APELLIDO': 'Morales',
 'SEGUNDO APELLIDO': 'Aramburo',
 'WOS_affiliation': ['Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Medellin, Colombia.',
  'Univ Antioquia UdeA, Grp Mat Condensada UdeA, Inst Fis, Fac Ciencias Exactas & Nat, Medellin, Colombia.',
  'Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Grp Mat Condensada UdeA, Inst Fis, Medellin, Colombia.',
  'Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensade UdeA, Medellin, Colombia.',
  'Univ Antioquia,

In [400]:
au[ au[column].apply(
                 lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                 lambda l: len( [af for af in l if lv.ratio(af,affiliation)>ratio] )>0  ) ]

KeyError: 'UDEA_authors'

In [226]:
author='Restrepo, D.'
#dw.get('WOS_author');
affiliation=dw.get('affiliation')[0]
column='UDEA_authors';author_key='WOS_author';affiliation_key='WOS_affiliation'
aunly[ pd.np.logical_and( aunly['UDEA_authors'].apply(
                            lambda d: d.get(author_key) if type(d)==dict else '').apply(
                            lambda l: author in l),True
                        )].reset_index(drop=True)['UDEA_authors'].loc[0]

{'CÉDULA': 98554575.0,
 'DEPARTAMENTO': 'Instituto de Física',
 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
 'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
 'INICIALES': 'D. A.',
 'NOMBRE COMPLETO': 'Diego Alejandro Restrepo Quintero',
 'NOMBRES': 'Diego Alejandro',
 'PRIMER APELLIDO': 'Restrepo',
 'SEGUNDO APELLIDO': 'Quintero',
 'WOS_affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin 050010, Colombia.',
  'Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.',
  'Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.',
  'Univ Antioquia, Inst Fis, Medellin, Colombia.',
  'Univ Antioquia, Inst Fis, Medellin 1226, Colombia.',
  'Univ Antioquia, Inst Fis, Bogota, Colombia.',
  'Univ Antioquia, Inst Fis, Medellin, Antioquia, Colombia.',
  'Univ Antioquia, Inst Fis, Antioquia, Colombia.'],
 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'],
 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO'}

In [222]:
aunly[ pd.np.logical_and( True,
                        aunly['UDEA_authors'].apply(
                            lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                            lambda l: affiliation in l)
                        )].reset_index(drop=True)['UDEA_authors']

Series([], Name: UDEA_authors, dtype: object)

In [198]:
aunly[ pd.np.logical_and( aunly['UDEA_authors'].apply(
                            lambda d: d.get('WOS_affiliation') if type(d)==dict else '').apply(
                            lambda l: dw.get('affiliation')[0] in l),
                        True
                        )]

Unnamed: 0,UDEA_authors,tmp_author


In [None]:
aunly[ pd.np.logical_and( aunly['UDEA_authors'].apply(
                            lambda d: d.get('WOS_author') if type(d)==dict else '').apply(
                            lambda l: dw.get('WOS_author') in l),
                        True
                        )]

In [None]:
aunly['UDEA_authors'].apply(
                            lambda d: d.get('WOS_affiliation')[0] if type(d)==dict else '').apply(
                            lambda l: dw.get('affiliation') in l)

In [165]:
if type(x)==list and x:
    print(1)
    

1


In [142]:
x=UDEAnewY['authors_WOS'].loc[0]
y=UDEAnewY.loc[0]
y['UDEA_authors']

In [147]:
y

In [145]:
UDEA[UDEA.DI=='10.1016/j.tvjl.2016.12.011']

Unnamed: 0,AB,AF,AR,AU,BA,BE,BF,BN,BP,C1,...,UDEA_procodigo,UDEA_ptos,UDEA_simple_doi,UDEA_título,UDEA_valor item,UT,VL,WC,Z9,authors_WOS
5,"The aims of this study were to evaluate the effects of milking method, disinfection practices and other management factors on the bulk tank milk somatic cell count (BTSCC) in tropical dairy herds ...","Reyes, J.\nSanchez, J.\nStryhn, H.\nOrtiz, T.\nOlivera, M.\nKeefe, G. P.\n",,"Reyes, J\nSanchez, J\nStryhn, H\nOrtiz, T\nOlivera, M\nKeefe, GP\n",,,,,34\n,"[Reyes, J.; Sanchez, J.; Stryhn, H.; Keefe, G. P.] Univ Prince Edward Isl, Atlant Vet Coll, Dept Hlth Management, Charlottetown, PE C1A 4P, Canada.\n[Ortiz, T.; Olivera, M.] Univ Antioquia, Fac Ag...",...,,,,,,WOS:000395218400006\n,220\n,Veterinary Sciences\n,0,"[{'affiliation': ['Univ Antioquia, Fac Agr Sci, Biogenesis Res Grp, Calle 70 52-21, Medellin 050010, Colombia.'], 'i': 0, 'WOS_author': 'Ortiz, T.'}, {'affiliation': ['Univ Antioquia, Fac Agr Sci,..."


# TMP

The entries left are the ones where there are normalization problemas
* Bad written names
* Wrong affiliation in WOS
* Author appears in AU but not in C1

In [100]:
i=0
tmpua=UDEA[UDEA.UDEA_authors.str[i].apply(lambda d: True if type(d)==dict else False)]

In [101]:
tmpkk=tmpua[ tmpua.UDEA_authors.str[i].apply(lambda d: d.get('WOS_author')).str[0].apply( pd.isna) ]

In [102]:
tmp=tmpkk[ tmpkk.authors_WOS.apply( lambda l: l if l and type(l)==list else pd.np.nan ).apply( 
    lambda l: True if type(l)==list else False) ][['authors_WOS','UDEA_authors','UDEA_autores','TI','DI','AU','C1']].reset_index(drop=True)#.loc[4].values

In [103]:
tmp.shape

(221, 7)

In [104]:
i=204
tmp[i:i+1]#.UDEA_authors#.loc[0]

Unnamed: 0,authors_WOS,UDEA_authors,UDEA_autores,TI,DI,AU,C1
204,"[{'affiliation': ['Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.'], 'i': 0, 'WOS_author': 'Salamanca M., Constain'}]","[{'full_name': 'SALAMANCA MEJIA CONSTAIN HUGO', 'NOMBRES': 'Constain Hugo', 'SEGUNDO APELLIDO': 'Mejia', 'INICIALES': 'C. H.', 'PRIMER APELLIDO': 'Salamanca'}]",SALAMANCA MEJIA CONSTAIN HUGO,EXPERIMENTAL AND THEORETICAL STUDY OF SHYNTESIS OF N-ALKYL-NITROIMIDAZOLES.,,"Salamanca, C\nTiznado, W\nJaramillo, P\n","[Salamanca M., Constain] Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.\n[Tiznado V., William] Univ Nacl Andres Bello, Fac Ecol & Recursos Nat, Dept Ciencias Quim, Santiag..."


In [105]:
tmp.UDEA_authors.loc[i]

[{'INICIALES': 'C. H.',
  'NOMBRES': 'Constain Hugo',
  'PRIMER APELLIDO': 'Salamanca',
  'SEGUNDO APELLIDO': 'Mejia',
  'full_name': 'SALAMANCA MEJIA CONSTAIN HUGO'}]

In [106]:
tmp.authors_WOS.loc[i]

[{'WOS_author': 'Salamanca M., Constain',
  'affiliation': ['Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.'],
  'i': 0}]

In [107]:
tmp.UDEA_autores.loc[i]

'SALAMANCA MEJIA CONSTAIN HUGO'

In [108]:
tmp.TI.loc[i]

'EXPERIMENTAL AND THEORETICAL STUDY OF SHYNTESIS OF N-ALKYL-NITROIMIDAZOLES.'

In [109]:
print(tmp.AU.loc[i])

Salamanca, C
Tiznado, W
Jaramillo, P



In [110]:
print(tmp.C1.loc[i])

[Salamanca M., Constain] Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.
[Tiznado V., William] Univ Nacl Andres Bello, Fac Ecol & Recursos Nat, Dept Ciencias Quim, Santiago, Chile.
[Jaramillo G, Paula] Univ Sao Paulo, Inst Fis, Fac Ciencias, BR-01498 Sao Paulo, Brazil.



In [111]:
x=tmp['authors_WOS'].reset_index(drop=True).loc[i]
y=tmp['UDEA_authors'].reset_index(drop=True).loc[i]

In [112]:
x

[{'WOS_author': 'Salamanca M., Constain',
  'affiliation': ['Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.'],
  'i': 0}]

In [113]:
y

[{'INICIALES': 'C. H.',
  'NOMBRES': 'Constain Hugo',
  'PRIMER APELLIDO': 'Salamanca',
  'SEGUNDO APELLIDO': 'Mejia',
  'full_name': 'SALAMANCA MEJIA CONSTAIN HUGO'}]

In [114]:
i=0
y#[i]['NOMBRE COMPLETO'].split()

[{'INICIALES': 'C. H.',
  'NOMBRES': 'Constain Hugo',
  'PRIMER APELLIDO': 'Salamanca',
  'SEGUNDO APELLIDO': 'Mejia',
  'full_name': 'SALAMANCA MEJIA CONSTAIN HUGO'}]

In [115]:
y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name']

In [116]:
i=0
wos_names_list(y[i] ,y_keys)

['Salamanca, Constain Hugo',
 'Salamanca, Constain',
 'Salamanca, Hugo',
 'Salamanca, C. H.',
 'Salamanca, C.',
 'Salamanca, Constain H.',
 'Salamanca, H.',
 'Salamanca-Mejia, Constain Hugo',
 'Salamanca-Mejia, Constain',
 'Salamanca-Mejia, Hugo',
 'Salamanca-Mejia, C. H.',
 'Salamanca-Mejia, C.',
 'Salamanca-Mejia, Constain H.',
 'Salamanca-Mejia, H.']

In [117]:
x

[{'WOS_author': 'Salamanca M., Constain',
  'affiliation': ['Univ Antioquia, Dept Farm, Fac Quim Farmaceut, Medellin 1226, Colombia.'],
  'i': 0}]

In [118]:
[1,
  2]

[1, 2]

In [119]:
def combinewos(x,y,x_keys=['WOS_author','affiliation'],
                   y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name'],
                   xy_keys=['WOS_author','WOS_affiliation']):
    if type(x)==list and type(y)==list:
        for dx in x:
            print( dx[ x_keys[0] ] )
            wos_name=unidecode.unidecode( dx[ x_keys[0] ] )
            WOS_affiliation= dx[x_keys[1]]
            print('Name True')
            for i in range( len(y) ):
                print('+'*20)
                print( wos_names_list(y[i] ,y_keys))
                print('+'*20)
                print(i,wos_name in wos_names_list(y[i] ,y_keys) )
                print('='*20)
                if wos_name.title() in wos_names_list(y[i] ,y_keys):
                    print('*'*20)
                    print(wos_name in wos_names_list(y[i] ,y_keys),':',wos_name)
                    print('*'*20)
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            # Try by buildong spanish-like names list                    
            #Try again but comparing full lists
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            print("-"*20)
            for i in range( len(y) ):
                print('--',i)
                yi_to_list=unidecode.unidecode( y[i][y_keys[4]].title() )
                if yi_to_list:
                    yi_to_list=yi_to_list.split()
                else:
                    yi_to_list=[]
                print(i,wos_name_to_list,yi_to_list,  )    
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with initials
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]] ]+y[i][y_keys[2]].split()
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with first first name and initial
            print(":"*20)
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[0],
                              y[i][y_keys[2]].split()[-1]]
                print("::::",yi_to_list)
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
            #Try again but comparing full lists with second first name and initial
            print(":"*20)
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[-1],
                              y[i][y_keys[2]].split()[0]]
                print("::::",yi_to_list)
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
                    
                    
                    
    return y

In [120]:
combinewos(x,y)

Salamanca M., Constain
Name True
++++++++++++++++++++
['Salamanca, Constain Hugo', 'Salamanca, Constain', 'Salamanca, Hugo', 'Salamanca, C. H.', 'Salamanca, C.', 'Salamanca, Constain H.', 'Salamanca, H.', 'Salamanca-Mejia, Constain Hugo', 'Salamanca-Mejia, Constain', 'Salamanca-Mejia, Hugo', 'Salamanca-Mejia, C. H.', 'Salamanca-Mejia, C.', 'Salamanca-Mejia, Constain H.', 'Salamanca-Mejia, H.']
++++++++++++++++++++
0 False
--------------------
-- 0
0 ['Salamanca', 'M.', 'Constain'] ['Salamanca', 'Mejia', 'Constain', 'Hugo']
::::::::::::::::::::
:::: ['Salamanca', 'Mejia', 'Constain', 'H.']
::::::::::::::::::::
:::: ['Salamanca', 'Mejia', 'Hugo', 'C.']


[{'INICIALES': 'C. H.',
  'NOMBRES': 'Constain Hugo',
  'PRIMER APELLIDO': 'Salamanca',
  'SEGUNDO APELLIDO': 'Mejia',
  'full_name': 'SALAMANCA MEJIA CONSTAIN HUGO'}]

In [121]:
pd.np.setdiff1d(['Bastidas', 'Myriam'] ,['Bastidas', 'Acevedo', 'Miryam', 'Del', 'Socorro']).shape[0]

1

In [122]:
800*3*3

7200

In [123]:
aunly

Unnamed: 0,tmp_author,UDEA_authors
0,DOVER ROBERT VAN HORN,"{'full_name': 'DOVER ROBERT VAN HORN', 'INICIALES': 'V. H.', 'WOS_author': ['Dover, Robert V. H.'], 'WOS_affiliation': ['Univ Antioquia, Dept Antropol, Antioquia, Colombia.', 'Univ Antioquia, Dept..."
1,RADA RINCON JUAN PABLO,"{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Juan Pablo', 'WOS_affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.', 'Univ Antioquia, Inst Matemat, M..."
2,MACHADO RODRIGUEZ GLORIA,"{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Gloria', 'WOS_affiliation': ['Univ Antioquia, Grp Invest Identificac Genet IdentiGE, Medellin, Colombia.', 'Univ Antioquia, Grp..."
3,DUQUE ECHEVERRI CARLOS ALBERTO,"{'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'NOMBRES': 'Carlos Alberto', 'WOS_affiliation': ['Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Mat Condensada UdeA, Medel..."
4,ARDILA MEDINA CARLOS MARTIN,"{'full_name': 'ARDILA MEDINA CARLOS MARTIN', 'INICIALES': 'C. M.', 'WOS_author': ['Ardila, Carlos M.', 'Martin Ardila, Carlos', 'Ardila, C. M.', 'Ardila, Carlos Martin'], 'WOS_affiliation': ['Univ..."
5,BEDOYA CARO IVAN DARIO,"{'FACULTAD': 'Facultad de Ingeniería', 'NOMBRES': 'Ivan Dario', 'WOS_affiliation': ['Univ Antioquia, Grp Ciencia & Tecnol Gas & Uso Rac Energia, Medellin 63108, Colombia.', 'Univ Antioquia, Dept M..."
6,RAMIREZ SANCHEZ ISABEL CRISTINA,"{'FACULTAD': 'Facultad de Medicina', 'NOMBRES': 'Isabel Cristina', 'WOS_affiliation': ['Univ Antioquia, Sch Med, Dept Internal Med, Div Infect Dis, Medellin, Colombia.', 'Univ Antioquia, Hosp Pabl..."
7,AGUDELO SUAREZ ANDRES ALONSO,"{'full_name': 'AGUDELO SUAREZ ANDRES ALONSO', 'INICIALES': 'A. A.', 'WOS_author': ['Agudelo-Suarez, Andres A.', 'Agudelo-Suarez, A. A.', 'Agudelo-Suarez, Andres', 'Alonso Agudelo-Suarez, Andres', ..."
8,ROJAS CAMARGO JHON JAIRO,"{'full_name': 'ROJAS CAMARGO JHON JAIRO', 'INICIALES': 'J. J.', 'WOS_author': ['Rojas, J.'], 'WOS_affiliation': ['Univ Antioquia, Sch Pharmaceut Chem, Dept Pharm, Medellin 094, Colombia.'], 'PRIME..."
9,CASTRILLON GERMAN ALBERTO,"{'FACULTAD': 'Facultad de Medicina', 'NOMBRES': 'Alberto', 'WOS_affiliation': ['Univ Antioquia, Dept Radiol, Medellin, Colombia.'], 'INICIALES': 'A.', 'GRUPO': 'Grupo de Gastrohepatología', 'SEGUN..."
