<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+GS+LNS
Merge the bibliographic datasets for 
* Web of Science, 
* Scielo 
* Scopus 
* Google Scholar
* Puntaje
* Lens
of the scientific articles of Universidad de Antioquia

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

## functions

In [2]:
import unidecode
def get_author_info(x):
    sep='; '
    authors=[{'WOS_author':x[0].split(sep)[0],'affiliation':[x[0].split(sep)[-1]],'i':0}]
    iau=1
    for y  in x:
        y2=y.replace('[','').replace('] ',sep).split(sep)
        for z in y2[:-1]:
            aulist=[ d.get('WOS_author') for d in authors]
            if z not in aulist:
                authors.append({'WOS_author':z,'affiliation':[y2[-1]],'i':iau})
                iau=iau+1
            else:
                if y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]:
                    index_author=[ d.get('i') for d in authors if d.get('WOS_author')==z][0]
                    authors[index_author]['affiliation'].append(y2[-1])
    return authors

def dictionary_list_add_columns(df,df_dl,df_dl_key,df_dl_i,df_columns):
    '''
    For a
     df: Pandas DataFrame 
    with a:
     df_dl: column of list of dictionaries, with
     df_dl_key: dictionary key: e.g x=[{df_dl_key:1},{df_dl_key:2}]
    for the element df_dl_i of the list:
    Update the dictionary with:
        df_dl_key==x[df_dl_i][df_dl_key]
    with the dictionaries { df_columns[i]: df_columns[i].values }
    '''
    dff=df.copy()
    for key in df_columns:
        tmp=dff[df_dl].combine(dff[key],
                func=lambda x,y: y if pd.isna(y) 
                                   else 
                                     [z.update({key:y}) 
                                     if z.get(df_dl_key)==x[df_dl_i][df_dl_key] 
                                     else z 
                                 for z in x  ] )
    return dff

def split_full_names(y,full_name='full_name'):
    yy=y.get(full_name).title()
    lfn=len(y[full_name].split())
    aps=0
    d={ 'PRIMER APELLIDO':yy.split()[aps] }
    aps=aps+1
    if lfn>=4:
        names=-2
        if len( d['PRIMER APELLIDO'] )<=3:
            d['PRIMER APELLIDO']=d['PRIMER APELLIDO']+' '+yy.split()[aps]
            aps=aps+1
            d.update(  {'SEGUNDO APELLIDO':yy.split()[aps]} )
            names=names+1
            
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        aps=aps+1
        if len( d['SEGUNDO APELLIDO'] )<=3:
            d['SEGUNDO APELLIDO']=d['SEGUNDO APELLIDO']+' '+yy.split()[aps]
            if names==-2:
                names=names+1
    elif lfn>=3:        
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        names=-1
    else: #Colombian interpretation (TODO: Includes Brazilian interpretation)    
        names=-1
    d.update({'NOMBRES':' '.join( yy.split()[names:]),
              'INICIALES':' '.join( [z[0]+'.' for z in yy.split()[names:]] ),
              })
    return d

# Creates mask Search key in a list of dictionay
# First apply convert null values to string
# Second apply: implement a mask
def find_key_in_list_of_dictionaries(df,column,key,pattern):
    return df[column].apply(lambda x: 
                [ '' if pd.isnull( y.get(key)) else y for y in x ]  ).apply(
                            lambda x: 
                [ True if y.get(key).find(pattern)>-1 else False for y in x  ][0]  )

def key_contains_in_list_of_dictionaries(df,pattern,column='authors_WOS',key='WOS_author'):
    #TODO: loop in column len
    i=0
    r=df[ df[column].str[i].apply(lambda x: {} if pd.isnull(x) else x).apply(
                       lambda x: x.get(key) if x else '').str.contains(
        pattern) ][column].reset_index(drop=True)
    return r

In [3]:
import wosplus as wp
import pandas as pd

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet the corresponding file is downloaded as CSV
* If it is in excel or text file the file is downloaded  directly

To define your  own labeled IDs for public google drive files edit the next cell:

In [4]:
%%writefile drive.cfg
[FILES]
UDEA_WOS.xlsx       = 1px2IcrjCrkyu7t78Q7PAE5nzV_yuPt9t
UDEA_SCI.xlsx       = 1pWMY5P72j0Ca6D-cm7dn7Q4TBGTs4PWV
UDEA_SCP.xlsx       = 1ulCsFHzDiTmuL9TH8F58ulh0u8Z2ylKh
UDEA_WOS_SCI_SCP.xlsx   = 1o9otmklgh-0w18Avv2ZTKOXr3vZbjwvj
UDEA_WOS_SCI_SCP.json=1RTDCh5pl0vapjJT_e9ZwadHPGBKGGv6Y
UDEA_WOS_SCI_SCP.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs

Overwriting drive.cfg


##  Load data bases

In [5]:
drive_files=wp.wosplus('drive.cfg')

In [6]:
UDEA=drive_files.read_drive_json('UDEA_WOS_SCI_SCP.json')

In [7]:
UDEA.Tipo.unique()

array(['WOS', 'SCI_SCP', 'WOS_SCP', 'WOS_SCI_SCP', 'SCP', 'WOS_SCI',
       'SCI'], dtype=object)

In [8]:
for t in UDEA.Tipo.unique():
    print( '{}:{}'.format( t, UDEA[ UDEA.Tipo==t].shape[0] ) )

WOS:1884
SCI_SCP:1622
WOS_SCP:5824
WOS_SCI_SCP:773
SCP:2584
WOS_SCI:147
SCI:2892


## Extract  affiliation from C1

In [9]:
affil='Univ Antioquia'
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

# Improve normalization: remove C1s with only affiliation (from Scielo)
UDEA['authors_WOS']=UDEA['authors_WOS'].apply( 
    lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x )

## load trained 

In [10]:
SIU=drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')

In [11]:
SIU.Tipo.unique()

array(['WOS+SCP', 'WOS+SCP+UDEA', 'WOS+SCI+SCP', 'WOS+SCI+SCP+UDEA',
       'SCI+SCP', 'SCI+SCP+UDEA', 'SCP', 'SCP+UDEA', 'WOS', 'WOS+UDEA',
       'WOS+SCI', 'WOS+SCI+UDEA', 'SCI', 'SCI+UDEA', 'UDEA'], dtype=object)

In [12]:
SIU=SIU[SIU.Tipo.str.contains('\+UDEA')].reset_index(drop=True)

In [13]:
SIU.columns.values

array(['AB', 'AF', 'AR', 'AU', 'BA', 'BE', 'BF', 'BN', 'BP', 'C1', 'CA',
       'CL', 'CR', 'CT', 'CTR_CATEGORÍA G', 'CTR_CATEGORÍA I',
       'CTR_Centro', 'CTR_CÉDULA', 'CTR_GRUPO', 'CTR_LINK CVLAC',
       'CTR_NOMBRE', 'CTR_Nivel de Formación', 'CTR_Tipo de Vinculación',
       'CTR_vinculación GrupLAC', 'CY', 'Clasificación 2016', 'D2', 'DE',
       'DI', 'DT', 'EI', 'EM', 'EP', 'FU', 'FX', 'GA', 'GP', 'HO', 'ID',
       'IS', 'ISSN', 'J9', 'JI', 'LA', 'MA', 'NR', 'OI', 'PA', 'PD', 'PG',
       'PI', 'PM', 'PN', 'PT', 'PU', 'PY', 'RI', 'RP', 'SC', 'SCI_AB',
       'SCI_AU', 'SCI_BP', 'SCI_C1', 'SCI_C2', 'SCI_CR', 'SCI_DE',
       'SCI_DI', 'SCI_DT', 'SCI_EC', 'SCI_EM', 'SCI_EP', 'SCI_IS',
       'SCI_LA', 'SCI_NR', 'SCI_OI', 'SCI_PA', 'SCI_PD', 'SCI_PI',
       'SCI_PT', 'SCI_PU', 'SCI_PY', 'SCI_RI', 'SCI_SC', 'SCI_SN',
       'SCI_SO', 'SCI_TC', 'SCI_TI', 'SCI_U1', 'SCI_U2', 'SCI_UT',
       'SCI_VL', 'SCI_X1', 'SCI_X4', 'SCI_X5', 'SCI_Y1', 'SCI_Y4',
       'SCI_Y5', 'SCI_Z1', 'S

TODO: def: Convert value string list into a list of dictionaries

In [14]:
SIU['UDEA_authors']=SIU.UDEA_autores.str.split(';').apply(lambda x: [{'full_name':y} for y in x ])

### Merge with official researcher list

BUG: Only first author information added

In [15]:
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [16]:
AU_columns=list( AU.columns.values )

In [17]:
AU['name_tmp']=(AU['PRIMER APELLIDO']+' '+AU['SEGUNDO APELLIDO']+' '+AU['NOMBRES']).str.lower().str.strip().apply( 
    unidecode.unidecode )

In [18]:
import unidecode

In [19]:
maxau=SIU['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]).apply(len).max()

In [20]:
kkn=SIU.copy()

In [21]:
newcolumns=['name_tmp']+AU_columns
for i in range(maxau):
    print(i)
    kkn['name_tmp']=kkn['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]
                            ).str[i].apply( lambda x: unidecode.unidecode( x.lower().strip()) 
                                                      if not pd.isna(x) else x)
    if not kkn[~kkn['name_tmp'].isna()].empty:
        kkn=kkn.merge(AU[newcolumns],on='name_tmp',how='left').reset_index(drop=True)
        kkn=dictionary_list_add_columns(kkn,'UDEA_authors','full_name',i,AU_columns)
        kkn=kkn.drop(newcolumns,axis='columns')

0
1
2
3
4
5
6
7
8
9


https://stackoverflow.com/a/29530601/2268280

BUG: Cells filled with nans

In [22]:
kkn.shape,SIU.shape

((7916, 205), (7916, 205))

In [23]:
SIU=kkn.copy()

In [24]:
AU[ AU['NOMBRE COMPLETO'].str.contains('Oscar Alberto Zapata Noreña') ]

Unnamed: 0,CÉDULA,NOMBRE COMPLETO,NOMBRES,PRIMER APELLIDO,SEGUNDO APELLIDO,FACULTAD,DEPARTAMENTO,GRUPO,name_tmp
234,15386534,Oscar Alberto Zapata Noreña,Oscar Alberto,Zapata,Noreña,Facultad de Ciencias Exactas y Naturales,Instituto de Física,Grupo de Fenomenologia de Interacciones Fundam...,zapata norena oscar alberto


Quality check

In [25]:
SIU[ find_key_in_list_of_dictionaries(SIU,'UDEA_authors','full_name','ZAPATA') ].UDEA_authors.loc[241]

[{'CÉDULA': 15386534.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'Oscar Alberto Zapata Noreña',
  'NOMBRES': 'Oscar Alberto ',
  'PRIMER APELLIDO': 'Zapata',
  'SEGUNDO APELLIDO': 'Noreña',
  'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO'},
 {'CÉDULA': 8287417.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'William Antonio Ponce Gutierrez',
  'NOMBRES': 'William Antonio ',
  'PRIMER APELLIDO': 'Ponce',
  'SEGUNDO APELLIDO': 'Gutierrez',
  'full_name': 'PONCE GUTIERREZ WILLIAM ANTONIO'}]

In [26]:
SIU[ find_key_in_list_of_dictionaries(SIU,'UDEA_authors','full_name','PONCE')
   ].UDEA_authors.reset_index(drop=True).loc[0]

[{'CÉDULA': 8287417.0,
  'DEPARTAMENTO': 'Instituto de Física',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales',
  'NOMBRE COMPLETO': 'William Antonio Ponce Gutierrez',
  'NOMBRES': 'William Antonio ',
  'PRIMER APELLIDO': 'Ponce',
  'SEGUNDO APELLIDO': 'Gutierrez',
  'full_name': 'PONCE GUTIERREZ WILLIAM ANTONIO'}]

### Merge with trained data set

Merge requires split in DI and TI

In [27]:
SIUDI=SIU[~SIU.DI.isna()].drop_duplicates('DI').reset_index(drop=True)
SIUTI=SIU[ SIU.DI.isna()].drop_duplicates('TI').reset_index(drop=True)
SIUTI=SIUTI[SIUTI!=''].reset_index(drop=True)
SIUTI=SIUTI[~SIUTI.TI.isnull()].reset_index(drop=True)
SIUTI=SIUTI[ SIUTI.TI.apply(len)>20 ].reset_index(drop=True)

In [28]:
udea_columns=[       'UDEA_autores',
       'UDEA_año realiz', 'UDEA_doi', 'UDEA_fecha aplicación',
       'UDEA_idioma', 'UDEA_item adic', 'UDEA_material', 'UDEA_nombre',
       'UDEA_nombre revista o premio', 'UDEA_nro autores', 'UDEA_país',
       'UDEA_procodigo', 'UDEA_ptos', 'UDEA_simple_doi', 'UDEA_título',
       'UDEA_valor item','UDEA_authors']

In [29]:
UDEADI=UDEA[UDEA.DI!=''].drop_duplicates('DI').reset_index(drop=True)
UDEATI=UDEA[UDEA.DI==''].drop_duplicates('TI').reset_index(drop=True)

In [30]:
UDEA_mergeDI=UDEADI.merge( SIUDI[ ['DI']+udea_columns ],on='DI',how='left' )

In [31]:
UDEADI.shape,UDEA_mergeDI.shape

((8035, 152), (8035, 169))

In [32]:
UDEA_PTJ=UDEA_mergeDI[~UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_mergeDI[UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)

In [33]:
UDEATI['tmptitle']=UDEATI.TI.str.strip()
SIUTI['tmptitle']=SIUTI.TI.str.strip()

In [34]:
kk=UDEATI.merge( SIUTI[ ['tmptitle']+udea_columns ],on='tmptitle',how='left' ).drop('tmptitle',axis='columns')

In [35]:
UDEA_PTJ=UDEA_PTJ.append( kk[ ~kk.UDEA_autores.isna() ] ).reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_PTJ_NOT.append( kk[ kk.UDEA_autores.isna() ] ).reset_index(drop=True)

In [36]:
UDEA_PTJ.shape[0]+UDEA_PTJ_NOT.shape[0],UDEA.shape

(15722, (15726, 152))

In [37]:
UDEA_PTJ.shape,UDEA_PTJ_NOT.shape

((7074, 169), (8648, 169))

In [38]:
UDEA_PTJ.loc[[0,1]]

Unnamed: 0,AB,AF,AR,AU,BA,BE,BF,BN,BP,C1,...,UDEA_nombre,UDEA_nombre revista o premio,UDEA_nro autores,UDEA_país,UDEA_procodigo,UDEA_ptos,UDEA_simple_doi,UDEA_título,UDEA_valor item,UDEA_authors
0,,,,"Ramírez, Juan David\nGiraldo, Santiago Patiño\...",,,,,,,...,PATIÑO GIRALDO SANTIAGO,ver UDEA_material,3.0,46.0,53961.0,3.6,0.0,"LINFOMA PRIMARIO DEL CORAZON, CAUSA POCO COMUN...",SOCIEDAD COLOMBIANA DE CARDIOLOGIA Y CIRUGIA C...,"[{'full_name': 'PATIÑO GIRALDO SANTIAGO', 'GRU..."
1,,,,"Molina, Marcela\nPalacio, Juan David\nVargas, ...",,,,,,,...,LOPEZ JARAMILLO CARLOS ALBERTO,REVISTA COLOMBIANA DE PSIQUIATRIA,7.0,46.0,55920.0,4.3,0.0,DESEMPENO NEUROCOGNITIVO DE PACIENTES CON TRAS...,,"[{'full_name': 'VARGAS UPEGUI CRISTIAN DAVID',..."


In [39]:
UDEA=UDEA_PTJ.append(
    UDEA_PTJ_NOT).reset_index(
    drop=True)

In [40]:
key_contains_in_list_of_dictionaries(UDEA,'Restrepo, D',column='authors_WOS',key='WOS_author').loc[1:2]

1    [{'affiliation': ['Univ Antioquia, Inst Fis, C...
2    [{'affiliation': ['Univ Antioquia, Inst Fis, M...
Name: authors_WOS, dtype: object

In [41]:
UDEA.to_json('UDEAtmp.json')

In [42]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Extrapolates to UDEA_PTJ_NOT

### merge separated names of UDEA.authors_WOS with UDEA.UDEA_authors data 

Obtain name parts and initial from full name in `UDEA_authors` dictionary and update `UDEA_authors`

In [43]:
# tmp is for update dict
dictupdatetmp=UDEA['UDEA_authors'].apply(lambda x: [y.update( 
                split_full_names(y,full_name='full_name')  ) if not pd.isnull(
                y.get('full_name')) else y for y in x] 
                                   if type(x)==list 
                                   else x)

In [49]:
x,y=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=lambda x,y:(x,y) ).loc[12]

In [50]:
x

[{'WOS_author': 'Monsalve, Juan',
  'affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.'],
  'i': 0},
 {'WOS_author': 'Rada, Juan',
  'affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.'],
  'i': 1}]

In [51]:
y

[{'CÉDULA': 413218.0,
  'DEPARTAMENTO': 'Departamento de Matemáticas',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Álgebra U de A',
  'INICIALES': 'J. P.',
  'NOMBRE COMPLETO': 'Juan Pablo Rada Rincon',
  'NOMBRES': 'Juan Pablo',
  'PRIMER APELLIDO': 'Rada',
  'SEGUNDO APELLIDO': 'Rincon',
  'full_name': 'RADA RINCON JUAN PABLO'}]

In [53]:
def wos_names_append(wos_names,last_name,first_names,initials):
    wos_names=wos_names+[           last_name+', '+first_names]
    if len( first_names.split())>1:
        wos_names=wos_names+[ last_name+', '+first_names.split()[0] ]
    wos_names=wos_names+[ last_name+', '+initials]
    if len( initials.split())>1:
        wos_names=wos_names+[ last_name+', '+initials.split()[0]]
    if len(initials.split())==2:
        wos_names=wos_names+[
              last_name+', '+first_names.split()[0]+' '+initials.split()[-1] ]
    return wos_names
    
def wos_names_list(dy ,y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO']   ):
    last_name= unidecode.unidecode( dy[y_keys[0]]   )
    first_names=unidecode.unidecode( dy[y_keys[1]]  )
    initials=unidecode.unidecode( dy[y_keys[2]]  )

    wos_names=[]
    wos_names=wos_names_append(wos_names,last_name,first_names,initials)
    
    if dy.get( y_keys[3] ):
        last_names= unidecode.unidecode( dy[y_keys[0]]+'-'+dy[y_keys[3]]   )
        wos_names=wos_names_append(wos_names,last_names,first_names,initials)
    return wos_names
    
def combinewos(x,y,x_keys=['WOS_author','affiliation'],
                   y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO'],
                   xy_keys=['WOS_author','WOS_affiliation']):
    if type(x)==list and type(y)==list:
        for dx in x:
            wos_name=unidecode.unidecode( dx[ x_keys[0] ] )
            WOS_affiliation= dx[x_keys[1]]
            for i in range( len(y) ):
                if wos_name in wos_names_list(y[i] ,y_keys):
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation 
    return y

In [54]:
combinewos(x,y)

[{'CÉDULA': 413218.0,
  'DEPARTAMENTO': 'Departamento de Matemáticas',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Álgebra U de A',
  'INICIALES': 'J. P.',
  'NOMBRE COMPLETO': 'Juan Pablo Rada Rincon',
  'NOMBRES': 'Juan Pablo',
  'PRIMER APELLIDO': 'Rada',
  'SEGUNDO APELLIDO': 'Rincon',
  'WOS_affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.'],
  'WOS_author': ['Rada, Juan'],
  'full_name': 'RADA RINCON JUAN PABLO'}]

In [55]:
kk=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=combinewos )

In [56]:
kk.loc[12]

[{'CÉDULA': 413218.0,
  'DEPARTAMENTO': 'Departamento de Matemáticas',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Álgebra U de A',
  'INICIALES': 'J. P.',
  'NOMBRE COMPLETO': 'Juan Pablo Rada Rincon',
  'NOMBRES': 'Juan Pablo',
  'PRIMER APELLIDO': 'Rada',
  'SEGUNDO APELLIDO': 'Rincon',
  'WOS_affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.'],
  'WOS_author': ['Rada, Juan'],
  'full_name': 'RADA RINCON JUAN PABLO'}]

Combines the two list into a single one in  UDEA_authors

check

Full query of auhor_WOS in UDEA_authors:

In [57]:
UDEA['UDEA_authors'].apply(lambda x: [y  for y in x ][0] if type(x)==list else x).loc[0]

{'CÉDULA': 8358251.0,
 'DEPARTAMENTO': 'Departamento de Medicina Interna',
 'FACULTAD': 'Facultad de Medicina',
 'GRUPO': 'Grupo de Investigación EDUSALUD, Informed',
 'INICIALES': 'S.',
 'NOMBRE COMPLETO': 'Santiago Patiño Giraldo',
 'NOMBRES': 'Santiago',
 'PRIMER APELLIDO': 'Patiño',
 'SEGUNDO APELLIDO': 'Giraldo',
 'full_name': 'PATIÑO GIRALDO SANTIAGO'}

In [58]:
UDEA.to_json('UDEAtmp.json')

In [59]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [60]:
def extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return df[list_of_dictionaries].apply(lambda x: [y.get(dictionary_key) 
                                                        if y.get(dictionary_key) else []   
                                                        for y in x  ]
                           if type(x)==list else [])

def extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal list as value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
                    list_of_dictionaries,dictionary_key).apply(lambda x: 
                           [item for sublist  in x for item in sublist] 
                            if type(x)==list else x)

In [61]:
pattern='Restrepo, Diego'
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
1     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
2     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
3     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
4     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
5     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
6     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
7     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
8     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
9     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
10    [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
Name: UDEA_authors, dtype: object

In [62]:
pattern='Restrepo, D.'
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
1     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
2     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
3     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
4     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
5     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
6     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
7     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
8     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
9     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
10    [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
11    [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
Name: UDEA_authors, dtype: object

## TODO: Try to merge new WOS articles from old `'UDEA_authors'`

In [63]:
UDEAnew=UDEA[UDEA.UDEA_authors.isna()].reset_index(drop=True)
UDEAold=UDEA[~UDEA.UDEA_authors.isna()].reset_index(drop=True)

In [64]:
mask=extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEA,list_of_dictionaries='UDEA_authors',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEA[mask]['UDEA_authors'].reset_index(drop=True)

0     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
1     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
2     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
3     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
4     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
5     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
6     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
7     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
8     [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
9     [{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', ...
10    [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
11    [{'full_name': 'RESTREPO QUINTERO DIEGO ALEJAN...
Name: UDEA_authors, dtype: object

TODO: Extract dict or value

In [126]:
def extract_dictionary_from_list(df,list_name,i=0):
    return df[list_name].apply( lambda l: [ d for d in l][i] if type(l)==list else l )
def extract_key_value_from_series(ds,key):
    return ds.apply( lambda d: d.get(key)if type(d)==dict else d  )

In [142]:
mask=extract_key_value_from_series( 
    extract_dictionary_from_list( UDEA,'UDEA_authors',i=0),'WOS_author').str[0].str.contains('\-').fillna(False)
UDEA[mask]['UDEA_authors'].reset_index(drop=True).str[0].loc[1]

{'CÉDULA': 87470374.0,
 'DEPARTAMENTO': 'Departamento de Microbiología y Parasitología',
 'FACULTAD': 'Facultad de Medicina',
 'GRUPO': 'Inmunovirología',
 'INICIALES': 'S.',
 'NOMBRE COMPLETO': 'Silvio Urcuqui Inchima',
 'NOMBRES': 'Silvio',
 'PRIMER APELLIDO': 'Urcuqui',
 'SEGUNDO APELLIDO': 'Inchima',
 'WOS_affiliation': ['Univ Antioquia, Sede Invest Univ, Grp Inmunoviol, Medellin, Colombia.'],
 'WOS_author': ['Urcuqui-Inchima, Silvio'],
 'full_name': 'URCUQUI INCHIMA SILVIO'}

## Merge old with news

In [66]:
UDEAnewY=UDEAnew[ UDEAnew.authors_WOS.apply(lambda x: type(x)==list and len(x)>0) ].reset_index(drop=True)
UDEAnewN=UDEAnew[~UDEAnew.authors_WOS.apply(lambda x: type(x)==list and len(x)>0) ].reset_index(drop=True)

In [67]:
pattern='Restrepo, D.'
mask=extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(
            UDEAnewY,list_of_dictionaries='authors_WOS',dictionary_key='WOS_author').apply( 
            lambda x: pattern in x)
UDEAnewY[mask].authors_WOS.reset_index(drop=True).values

array([list([{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Restrepo, D.', 'i': 0}]),
       list([{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Restrepo, D.', 'i': 0}, {'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Rivera, A.', 'i': 1}, {'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Sanchez, M.', 'i': 2}, {'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Zapata, O.', 'i': 3}]),
       list([{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'WOS_author': 'Restrepo, D.', 'i': 0}])],
      dtype=object)

In [189]:
i=0
UDEAnewY['news_i_0']=UDEAnewY.authors_WOS.apply(lambda x: [ {'WOS_author':y.get('WOS_author'),
                                                   'affiliation':y.get('affiliation')} for y in x]).str[0].astype(str)
UDEAnewY['news_i_0'].loc[0]

"{'affiliation': ['Univ Antioquia UdeA, Fac Ciencias Exactas & Nat, Inst Fis, Grp Opt & Foton, Calle 70 52-21, Medellin, Colombia.'], 'WOS_author': 'Fredy Barrera, John'}"

In [191]:
j=0
UDEAold['news_i_0']=UDEAold['UDEA_authors'].apply(lambda l: [ {'WOS_author':d.get('WOS_author')[0], 
                                           'affiliation':d.get('WOS_affiliation') } if d.get('WOS_author') 
                                         else pd.np.nan for d in l] ).str[0].dropna().astype(str).reset_index(drop=True)
UDEAold['news_i_0'].loc[0]

"{'affiliation': ['Univ Antioquia Medellin, Inst Matemat, Medellin, Colombia.'], 'WOS_author': 'Rada, Juan'}"

In [200]:
UDEAnewY[['news_i_0']].merge( UDEAold[['news_i_0','UDEA_authors']],on='news_i_0',how='left'
                            ).dropna().reset_index(drop=True).loc[4].values

array(["{'affiliation': ['Univ Antioquia, PECET, Medellin, Colombia.'], 'WOS_author': 'Muskus, Carlos'}",
       list([{'full_name': 'GALVAN DIAZ ANA LUZ', 'NOMBRES': 'Ana Luz', 'SEGUNDO APELLIDO': 'Diaz', 'INICIALES': 'A. L.', 'PRIMER APELLIDO': 'Galvan'}])],
      dtype=object)

In [None]:
UDEAnewY['authors_WOS'].loc[4085]#.apply( lambda x: [d for d in x] if type(x)==list else x ).loc[4085]

In [None]:
affil

In [None]:
UDEA['authors_WOS'].apply( lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x ).loc[4085]

# TMP

In [None]:
UDEA[ UDEA['authors_WOS'].apply(lambda x: [d for d in  x if d.get('WOS_author')=='Velasquez, Jesus A.']
                                if x else x).apply(len)>0 ]['authors_WOS'].str[0].apply( 
    lambda x: x.get('affiliation')).values

In [None]:
y2[-1],z

In [None]:
[ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]

In [None]:
y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]

In [None]:
authors[0]['affiliation'].append('kk')

In [None]:
authors

In [None]:
affs=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x )

In [None]:
affsf=affs.apply(lambda x: [  y.split('; ') for y in x])

In [None]:
affsf.apply( lambda x: [ y.append(x) for y in x]).loc[i]

In [None]:
import numpy as np

In [None]:
np.array(x).flatten()

In [None]:
affs

In [None]:
i=120
i=1
affil='Univ Antioquia'
affs=UDEA.C1.str.split('\n').apply(
    lambda x:   [ [z,y.replace('[','').split('] ')[-1]
                  for z in y.replace('[','').split('] ')[0].split('; ')
                  ] for y in x if y.find(affil)>-1 ]if x else x).loc[i]

In [None]:
{ x:'A' for x in [1,2]} 

In [None]:
affs