<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+GS+LNS
Merge the bibliographic datasets for 
* Web of Science, 
* Scielo 
* Scopus 
* Google Scholar
* Puntaje
* Lens
of the scientific articles of Universidad de Antioquia

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

## functions

In [None]:
def get_author_info(x):
    sep='; '
    authors=[{'WOS_author':x[0].split(sep)[0],'affiliation':[x[0].split(sep)[-1]],'i':0}]
    iau=1
    for y  in x:
        y2=y.replace('[','').replace('] ',sep).split(sep)
        for z in y2[:-1]:
            aulist=[ d.get('WOS_author') for d in authors]
            if z not in aulist:
                authors.append({'WOS_author':z,'affiliation':[y2[-1]],'i':iau})
                iau=iau+1
            else:
                if y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]:
                    index_author=[ d.get('i') for d in authors if d.get('WOS_author')==z][0]
                    authors[index_author]['affiliation'].append(y2[-1])
    return authors

def dictionary_list_add_columns(df,df_dl,df_dl_key,df_dl_i,df_columns):
    '''
    For a
     df: Pandas DataFrame 
    with a:
     df_dl: column of list of dictionaries, with
     df_dl_key: dictionary key: e.g x=[{df_dl_key:1},{df_dl_key:2}]
    for the element df_dl_i of the list:
    Update the dictionary with:
        df_dl_key==x[df_dl_i][df_dl_key]
    with the dictionaries { df_columns[i]: df_columns[i].values }
    '''
    dff=df.copy()
    for key in df_columns:
        tmp=dff[df_dl].combine(dff[key],
                func=lambda x,y: [z.update({key:y}) if z.get(df_dl_key)==x[df_dl_i][df_dl_key] else z for z in x  ] )
    return dff

In [4]:
import wosplus as wp
import pandas as pd

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet the corresponding file is downloaded as CSV
* If it is in excel or text file the file is downloaded  directly

To define your  own labeled IDs for public google drive files edit the next cell:

In [627]:
%%writefile drive.cfg
[FILES]
UDEA_WOS.xlsx       = 1px2IcrjCrkyu7t78Q7PAE5nzV_yuPt9t
UDEA_SCI.xlsx       = 1pWMY5P72j0Ca6D-cm7dn7Q4TBGTs4PWV
UDEA_SCP.xlsx       = 1ulCsFHzDiTmuL9TH8F58ulh0u8Z2ylKh
UDEA_WOS_SCI_SCP.xlsx   = 1o9otmklgh-0w18Avv2ZTKOXr3vZbjwvj
UDEA_WOS_SCI_SCP.json=1RTDCh5pl0vapjJT_e9ZwadHPGBKGGv6Y
UDEA_WOS_SCI_SCP.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs

Overwriting drive.cfg


##  Load data bases

In [628]:
drive_files=wp.wosplus('drive.cfg')

In [8]:
UDEA=drive_files.read_drive_json('UDEA_WOS_SCI_SCP.json')

In [9]:
UDEA.Tipo.unique()

array(['WOS', 'SCI_SCP', 'WOS_SCP', 'WOS_SCI_SCP', 'SCP', 'WOS_SCI',
       'SCI'], dtype=object)

In [10]:
for t in UDEA.Tipo.unique():
    print( '{}:{}'.format( t, UDEA[ UDEA.Tipo==t].shape[0] ) )

WOS:1884
SCI_SCP:1622
WOS_SCP:5824
WOS_SCI_SCP:773
SCP:2584
WOS_SCI:147
SCI:2892


## Extract  affiliation from C1

In [611]:
affil='Univ Antioquia'
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

## load trained 

In [1086]:
SIU=drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')

In [1087]:
SIU.Tipo.unique()

array(['WOS+SCP', 'WOS+SCP+UDEA', 'WOS+SCI+SCP', 'WOS+SCI+SCP+UDEA',
       'SCI+SCP', 'SCI+SCP+UDEA', 'SCP', 'SCP+UDEA', 'WOS', 'WOS+UDEA',
       'WOS+SCI', 'WOS+SCI+UDEA', 'SCI', 'SCI+UDEA', 'UDEA'], dtype=object)

In [1088]:
SIU=SIU[SIU.Tipo.str.contains('\+UDEA')].reset_index(drop=True)

In [1089]:
SIU.columns.values

array(['AB', 'AF', 'AR', 'AU', 'BA', 'BE', 'BF', 'BN', 'BP', 'C1', 'CA',
       'CL', 'CR', 'CT', 'CTR_CATEGORÍA G', 'CTR_CATEGORÍA I',
       'CTR_Centro', 'CTR_CÉDULA', 'CTR_GRUPO', 'CTR_LINK CVLAC',
       'CTR_NOMBRE', 'CTR_Nivel de Formación', 'CTR_Tipo de Vinculación',
       'CTR_vinculación GrupLAC', 'CY', 'Clasificación 2016', 'D2', 'DE',
       'DI', 'DT', 'EI', 'EM', 'EP', 'FU', 'FX', 'GA', 'GP', 'HO', 'ID',
       'IS', 'ISSN', 'J9', 'JI', 'LA', 'MA', 'NR', 'OI', 'PA', 'PD', 'PG',
       'PI', 'PM', 'PN', 'PT', 'PU', 'PY', 'RI', 'RP', 'SC', 'SCI_AB',
       'SCI_AU', 'SCI_BP', 'SCI_C1', 'SCI_C2', 'SCI_CR', 'SCI_DE',
       'SCI_DI', 'SCI_DT', 'SCI_EC', 'SCI_EM', 'SCI_EP', 'SCI_IS',
       'SCI_LA', 'SCI_NR', 'SCI_OI', 'SCI_PA', 'SCI_PD', 'SCI_PI',
       'SCI_PT', 'SCI_PU', 'SCI_PY', 'SCI_RI', 'SCI_SC', 'SCI_SN',
       'SCI_SO', 'SCI_TC', 'SCI_TI', 'SCI_U1', 'SCI_U2', 'SCI_UT',
       'SCI_VL', 'SCI_X1', 'SCI_X4', 'SCI_X5', 'SCI_Y1', 'SCI_Y4',
       'SCI_Y5', 'SCI_Z1', 'S

In [1090]:
SIU['UDEA_authors']=SIU.UDEA_autores.str.split(';').apply(lambda x: [{'full_name':y} for y in x ])

### Merge with official researcher list

In [1091]:
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [1092]:
AU_columns=list( AU.columns.values )

In [1093]:
AU['name_tmp']=(AU['PRIMER APELLIDO']+' '+AU['SEGUNDO APELLIDO']+' '+AU['NOMBRES']).str.lower().str.strip().apply( 
    unidecode.unidecode )

In [1094]:
import unidecode

In [1097]:
maxau=SIU['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]).apply(len).max()
SIUnew=pd.DataFrame()
kkn=SIU.copy()

In [1098]:
for i in range(maxau):
    print(i)
    kkn['name_tmp']=kkn['UDEA_authors'].apply(lambda x: [y.get('full_name') for y in x ]
                            ).str[i].apply( lambda x: unidecode.unidecode( x.lower().strip()) 
                                                      if not pd.isna(x) else x)

    newcolumns=['name_tmp']+AU_columns
    if not kkn[~kkn['name_tmp'].isna()].empty:
        kk=kkn.merge(AU[newcolumns],on='name_tmp',how='left')
        kky=kk[~kk['NOMBRE COMPLETO'].isna()].reset_index(drop=True)
        kkn=kk[ kk['NOMBRE COMPLETO'].isna()].reset_index(drop=True).drop(
               newcolumns,axis='columns')
        kky=dictionary_list_add_columns(kky,'UDEA_authors','full_name',i,AU_columns)
        
        SIUnew=SIUnew.append( kky ).reset_index(drop=True)
        #update str[i]
    else:
        SIUnew=SIUnew.append( kkn, sort=False ).reset_index(drop=True)
        break

0
1
2
3
4
5
6
7


In [1103]:
SIU=SIUnew.drop(newcolumns,axis='columns')

TODO: Falta añadir datos a nombres que faltan

### Merge with trained data set

In [1108]:
SIUDI=SIU[~SIU.DI.isna()].drop_duplicates('DI').reset_index(drop=True)
SIUTI=SIU[ SIU.DI.isna()].drop_duplicates('TI').reset_index(drop=True)
SIUTI=SIUTI[SIUTI!=''].reset_index(drop=True)
SIUTI=SIUTI[~SIUTI.TI.isnull()].reset_index(drop=True)
SIUTI=SIUTI[ SIUTI.TI.apply(len)>20 ].reset_index(drop=True)

In [1109]:
udea_columns=[       'UDEA_autores',
       'UDEA_año realiz', 'UDEA_doi', 'UDEA_fecha aplicación',
       'UDEA_idioma', 'UDEA_item adic', 'UDEA_material', 'UDEA_nombre',
       'UDEA_nombre revista o premio', 'UDEA_nro autores', 'UDEA_país',
       'UDEA_procodigo', 'UDEA_ptos', 'UDEA_simple_doi', 'UDEA_título',
       'UDEA_valor item','UDEA_authors']

In [1110]:
UDEADI=UDEA[UDEA.DI!=''].drop_duplicates('DI').reset_index(drop=True)
UDEATI=UDEA[UDEA.DI==''].drop_duplicates('TI').reset_index(drop=True)

In [1111]:
UDEA_mergeDI=UDEADI.merge( SIUDI[ ['DI']+udea_columns ],on='DI',how='left' )

In [1112]:
UDEADI.shape,UDEA_mergeDI.shape

((8035, 152), (8035, 169))

In [1113]:
UDEA_PTJ=UDEA_mergeDI[~UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_mergeDI[UDEA_mergeDI.UDEA_autores.isna()].reset_index(drop=True)

In [1114]:
UDEATI['tmptitle']=UDEATI.TI.str.strip()
SIUTI['tmptitle']=SIUTI.TI.str.strip()

In [1115]:
kk=UDEATI.merge( SIUTI[ ['tmptitle']+udea_columns ],on='tmptitle',how='left' ).drop('tmptitle',axis='columns')

In [1116]:
UDEA_PTJ=UDEA_PTJ.append( kk[ ~kk.UDEA_autores.isna() ] ).reset_index(drop=True)
UDEA_PTJ_NOT=UDEA_PTJ_NOT.append( kk[ kk.UDEA_autores.isna() ] ).reset_index(drop=True)

In [1117]:
UDEA_PTJ.shape[0]+UDEA_PTJ_NOT.shape[0],UDEA.shape

(15722, (15726, 152))

In [1118]:
UDEA_PTJ.shape,UDEA_PTJ_NOT.shape

((7074, 169), (8648, 169))

In [1132]:
UDEA_PTJ.loc[[0,1]]

Unnamed: 0,AB,AF,AR,AU,BA,BE,BF,BN,BP,C1,...,UDEA_nombre,UDEA_nombre revista o premio,UDEA_nro autores,UDEA_país,UDEA_procodigo,UDEA_ptos,UDEA_simple_doi,UDEA_título,UDEA_valor item,UDEA_authors
0,,,,"Ramírez, Juan David\nGiraldo, Santiago Patiño\...",,,,,,,...,PATIÑO GIRALDO SANTIAGO,ver UDEA_material,3.0,46.0,53961.0,3.6,0.0,"LINFOMA PRIMARIO DEL CORAZON, CAUSA POCO COMUN...",SOCIEDAD COLOMBIANA DE CARDIOLOGIA Y CIRUGIA C...,"[{'NOMBRES': 'Santiago ', 'CÉDULA': 8358251.0..."
1,,,,"Molina, Marcela\nPalacio, Juan David\nVargas, ...",,,,,,,...,LOPEZ JARAMILLO CARLOS ALBERTO,REVISTA COLOMBIANA DE PSIQUIATRIA,7.0,46.0,55920.0,4.3,0.0,DESEMPENO NEUROCOGNITIVO DE PACIENTES CON TRAS...,,"[{'NOMBRES': 'Cristian David ', 'CÉDULA': 1017..."


## Extrapolates to UDEA_PTJ_NOT

# TMP

In [1119]:
UDEA[ UDEA['authors_WOS'].apply(lambda x: [d for d in  x if d.get('WOS_author')=='Velasquez, Jesus A.']
                                if x else x).apply(len)>0 ]['authors_WOS'].str[0].apply( 
    lambda x: x.get('affiliation')).values

array([list(['Univ Antioquia, Hosp Univ San Vicente Fdn, Medellin, Colombia.', 'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.', 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.']),
       list(['Univ Antioquia, Hosp Univ San Vicente Fdn, Medellin, Colombia.', 'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.', 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.']),
       list(['Univ Antioquia, Hosp Univ San Vicente Fdn, Medellin, Colombia.', 'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.', 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.'])],
      dtype=object)

In [1120]:
y2[-1],z

('Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.',
 'Tolosa, Jorge E.')

In [1121]:
[ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]

['Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.',
 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.']

In [1122]:
y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]

False

In [1125]:
authors[0]['affiliation'].append('kk')

In [1126]:
authors

[{'WOS_author': 'Velasquez, Jesus A.',
  'affiliation': ['Univ Antioquia, Hosp Univ San Vicente Fdn, Medellin, Colombia.',
   'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.',
   'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.',
   'kk'],
  'i': 0},
 {'WOS_author': 'Tolosa, Jorge E.',
  'affiliation': ['Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.',
   'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.'],
  'i': 1}]

## merge separated named by cedula

In [1127]:
affs=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x )

In [1128]:
affsf=affs.apply(lambda x: [  y.split('; ') for y in x])

In [1129]:
affsf.apply( lambda x: [ y.append(x) for y in x]).loc[i]

[None]

In [349]:
import numpy as np

In [352]:
np.array(x).flatten()

array([list(['Velasquez, Jesus A.']),
       list(['Velasquez, Jesus A.', 'Tolosa, Jorge E.']),
       list(['Velasquez, Jesus A.', 'Tolosa, Jorge E.'])], dtype=object)

In [326]:
affs

0        [[Burwick, Richard M.; Tolosa, Jorge E.] Orego...
1        [[Burwick, Richard M.; Rincon, Monica; Tolosa,...
10       [[Castaneda Gallego, Alba D.; Cardona Arias, J...
100      [[Herrera Mejia, Julian; Herrera Mejia, Julian...
1000     [[Mesa, C.; Giraldo, C. A.; Angulo, J.; Ruiz, ...
10000                                                   []
10001                                                   []
10002                                                   []
10003                                                   []
10004                                                   []
10005                                                   []
10006                                                   []
10007                                                   []
10008                                                   []
10009                                                   []
1001     [[Giraldo-Echeverri, C. A.; Taborda, N.; Ruiz,...
10010                                                   

In [306]:
i=120
i=1
affil='Univ Antioquia'
affs=UDEA.C1.str.split('\n').apply(
    lambda x:   [ [z,y.replace('[','').split('] ')[-1]
                  for z in y.replace('[','').split('] ')[0].split('; ')
                  ] for y in x if y.find(affil)>-1 ]if x else x).loc[i]

SyntaxError: invalid syntax (<ipython-input-306-8c7b399dee8b>, line 6)

In [303]:
{ x:'A' for x in [1,2]} 

{1: 'A', 2: 'A'}

In [304]:
affs

[{'Velasquez, Jesus A.': 'Univ Antioquia, Hosp Univ San Vicente Fdn, Medellin, Colombia.'},
 {'Tolosa, Jorge E.': 'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.',
  'Velasquez, Jesus A.': 'Univ Antioquia, Dept Obstet & Ginecol, Medellin, Colombia.'},
 {'Tolosa, Jorge E.': 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.',
  'Velasquez, Jesus A.': 'Univ Antioquia, NACER Salud Sexual & Reprod, Medellin, Colombia.'}]