<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+CTR

Merge the bibliographic datasets of the scientific articles of Universidad de Antioquia from:
* Web of Science (WOS)
* Scielo (SCI)
* Scopus (SCP)
* Puntaje (UDEA)
* Center (CTR)

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

Implementation:
The input is a raw or partially processed WOS-SCI-SCP database, which may already contain some UDEA entries from PTJ and Center information, with additional data about the full names of the UDEA authors.

Additionally, UDEA entries can be captured from:
1. A previous WOS-SCI-SCP-UDEA
2. A database with a column of full names (FULL LAST NAMES followed by NAMES, e.g. VALDEZ GÚZMAN JUAN ALBERTO), a list of author aliases in WOS format (Lastname, Name, e.g. Valdez-Gúzman, J.A.), and a list of registered affiliations. TODO: Test
3. The database from Puntaje (UDEA). 

In [1]:
# REBUILD: when True, delete the existing 'UDEA_*' columns and start from scratch
REBUILD=True
# MERGE_WITH_TRAINED: when True, merge UDEA with the previously trained SIU data set
MERGE_WITH_TRAINED=True

## functions

In [2]:
import os
import re

import pandas as pd
import wosplus as wp
pd.set_option('display.max_colwidth',200)

In [3]:
# %load wos_sci_scp_ptj_ctr.py

In [4]:
from wos_sci_scp_ptj_ctr import *

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet, the corresponding file is downloaded as CSV
* If it is an Excel or text file, the file is downloaded directly

To define your own labeled IDs for public Google Drive files, edit the next cell:

In [5]:
%%writefile drive.cfg
[FILES]
WOS_SCI_SCP_PTJ_CTR.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs
UDEA_authors_with_WOS_info.json=1o1eVT4JD0FMMICq_oxrTJOzWh47veBMw
produccion_fecha_vig_2003_2018.xlsx=1WbtX4K__TTLxXRjuLvqUYz9tuHCIlS5v
UDEA_WOS_SCI_SCP_PTJ.json=1OkVytKbxJwGvXZDkynkSoUDtkUOTaT4A

Overwriting drive.cfg


##  Load data bases

In [6]:
# Affiliation string used later to select the institutional entries inside C1
affil='Univ Antioquia'
# wosplus handle configured with the labeled Google Drive IDs written to drive.cfg
drive_files=wp.wosplus('drive.cfg')

#### DEBUG: if False stop in UDEA_PTJ!!!!

# The file label must exist before it is used: on a fresh kernel this cell runs
# before the cell that originally assigned UDEAjsonfile. The gzip-compressed
# label registered in drive.cfg is used because both readers below pass
# compression='gzip'.
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'

# Prefer a local copy; otherwise fetch the file through its drive.cfg label
if os.path.exists(UDEAjsonfile):
    UDEA=pd.read_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)
else:
    UDEA=drive_files.read_drive_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)

In [7]:
# RECOVER selects the input: True loads the file labeled UDEAjsonfile,
# False loads the local checkpoint 'UDEAtmp.json'.
# NOTE(review): 'UDEAtmp.json' is written by later cells of this notebook, so
# with RECOVER=False a first run needs that file to exist already — confirm.
RECOVER=False #False for test purposes
# This first value is immediately overwritten by the test value two lines below
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'
#Test purposes
UDEAjsonfile='UDEA_WOS_SCI_SCP_PTJ.json'
if RECOVER:
    #Requires latest wosplus!
    tmp=drive_files.load_biblio(UDEAjsonfile)#,compression='gzip')# TODO CHANGE FOR LAST VERSION IN GOOGLE DRIVE
else:
    tmp=drive_files.load_biblio('UDEAtmp.json')
    #drive_files.load_biblio(
    #  'https://raw.githubusercontent.com/restrepo/medicion/master/cienciometria/data/UDEAtmp300.json'
    #    )#Test: 199+1=200 found
    
# The working frame is taken from drive_files.biblio['WOS'];
# presumably load_biblio stores the loaded data under that key — TODO confirm
UDEA=drive_files.biblio['WOS'].reset_index(drop=True)
#DEBUG
#UDEA=UDEA.sample(300,replace=True).reset_index(drop=True) #Test: 77 found
#tmp=drive_files.load_biblio('Sample_WOS.xlsx')



In [8]:
if REBUILD:
    # Start from scratch: discard every column whose name contains 'UDEA_'
    UDEA=UDEA.drop([c for c in UDEA.columns if 'UDEA_' in c],axis='columns')
    # Placeholder column, filled in by the later merge steps
    UDEA['UDEA_authors']=None
    # Remove 'UDEA' (with its optional leading underscore) from the Tipo label.
    # The pattern is a regular expression, so regex=True is passed explicitly:
    # recent pandas versions treat the pattern literally by default, which
    # would silently leave Tipo unchanged.
    UDEA['Tipo']=UDEA['Tipo'].str.replace('_{0,1}UDEA','',regex=True)

In [9]:
# Print the number of records for each distinct value of Tipo
for t in UDEA['Tipo'].unique():
    print('{}:{}'.format(t,len(UDEA[UDEA['Tipo']==t])))

WOS_SCP:5820
WOS_SCI_SCP:768
SCI_SCP:1616
WOS:1884
SCI:2892
SCP:2573
WOS_SCI:147


In [10]:
# (rows, columns) of the working frame at this point
UDEA.shape

(15700, 153)

## Load trained old data 

### Merge WOS_SCP_SCI with trained data set PTJ_CTR

Merge requires split in DI and TI


15700 (15700, 152)
(7072, 169) (8628, 169)

In [11]:
if MERGE_WITH_TRAINED:
    # Read the trained data set from a local copy when one exists,
    # otherwise through its drive.cfg label
    SIU=(pd.read_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
         if os.path.exists('WOS_SCP_UDEA_SJR_SIU.xlsx')
         else drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx'))

    # fill_trained_data returns the updated pair (UDEA, SIU)
    UDEA,SIU=fill_trained_data(UDEA,SIU)#TODO: Remove SIU

15700 (15700, 152)
(7072, 168) (8628, 168)


In [12]:
if MERGE_WITH_TRAINED:
    # Checkpoint the merged frame to disk
    UDEA.to_json('UDEAtmp.json')
    # RECOVER is forced to False here, so the reload below is skipped;
    # set it to True by hand to resume from the checkpoint
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [13]:
# Shape of the rows whose UDEA_autores is an empty string
UDEA[UDEA['UDEA_autores']==''].shape

(0, 168)

In [14]:
# Number of rows with a non-null UDEA_autores
UDEA.UDEA_autores.dropna().shape

(7072,)

# Puntaje

UDEA

In [15]:
# Independent copy of UDEA; it is registered in drive_files in the next cell
qq=UDEA.copy()

In [16]:
# Store the copy as the 'WOS' entry of drive_files.biblio and display its shape
drive_files.biblio['WOS']=qq
drive_files.biblio['WOS'].shape

(15700, 168)

In [17]:
# Load the Puntaje spreadsheet with prefix 'UDEA'; the next cell reads it back
# from drive_files.biblio['UDEA'] (presumably the prefix is the storage key — TODO confirm)
tmp=drive_files.load_biblio('produccion_fecha_vig_2003_2018.xlsx',prefix='UDEA')

In [18]:
# Keep an independent copy of the loaded Puntaje data
pp= drive_files.biblio['UDEA'].copy()

In [19]:
# Put the saved copy back as the 'UDEA' entry, so merge_puntaje can be re-run from the same input
drive_files.biblio['UDEA']=pp

In [20]:
# Merge the bibliographies held by drive_files; the result is a new frame `df`
df=merge_puntaje(drive_files)

(32581, 24)
........................................................................

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


7258 : 5806 + 2827 = 8633
.........................................................7258 : 5453 + 353 = 5806
.......................................................7258 : 5389 + 64 = 5453
(3239, 174) + (5389, 152) = 8628


In [21]:
# (rows, columns) of the merged frame
df.shape

(15700, 180)

In [22]:
# Quality check on the merged frame: no row may have an empty-string
# UDEA_autores (expected count 0), followed by the number of rows with authors found.
# The mask is built from df itself, not from the pre-merge UDEA frame, so the
# check does not depend on both frames sharing the same index.
print(0,'=',df[df['UDEA_autores']==''].shape[0],'; found:',df['UDEA_autores'].dropna().shape[0])

0 = 0 ; found: 10311


In [23]:
# From here on the merged frame is the working frame
UDEA=df.copy()

In [24]:
# (rows, columns) after adopting the merged frame
UDEA.shape

(15700, 180)

## Fill C1 for not WOS entries in WOS format and extract  affiliation from C1

In [11]:
#Fill from SCI_C1
# Replace C1 with the column returned by SCI_C1_to_C1 (fill from SCI_C1)
UDEA['C1']=SCI_C1_to_C1(UDEA)

In [12]:
#Fill from SCP_C1='SCP_Authors with affiliations
# Replace C1 with the column returned by SCP_Authors_with_affiliations_to_C1
# (fill from 'SCP_Authors with affiliations')
UDEA['C1']=SCP_Authors_with_affiliations_to_C1(UDEA)

In [13]:
# Shape of the rows that still have a null C1
UDEA[UDEA['C1'].isnull()].shape

(0, 153)

In [14]:
# Show the C1 value of the first record whose Tipo is 'WOS'
UDEA[UDEA.Tipo=='WOS'].reset_index(drop=True).C1.loc[0]

'[Cardenas Ardila, Laura Milena] Tecnol Antioquia, Fac Ingn, Medellin, Colombia.\n[Parra Valencia, Jorge Andrick] Univ Autonoma Bucaramanga, Grp Invest Pensamiento Sistem, Bucaramanga, Colombia.\n[Fernando Ceballos, Yony] Univ Antioquia, Dept Ingn Insdustrial, Medellin, Colombia.\n'

In [15]:
# Build 'authors_WOS' from C1 in three chained steps (falsy values pass through unchanged):
#   1. split the C1 text into one entry per line;
#   2. keep only the entries that contain `affil`, rewriting '[Authors] Address'
#      as 'Authors; Address';
#   3. hand the surviving entries to get_author_info.
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

# Improve normalization: remove C1s with only affiliation (from Scielo),
# i.e. drop the dictionaries whose 'WOS_author' value itself contains `affil`
UDEA['authors_WOS']=UDEA['authors_WOS'].apply( 
    lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x )

In [16]:
# Show authors_WOS for the first record whose Tipo is 'SCP'
UDEA[UDEA.Tipo=='SCP'].reset_index(drop=True).loc[0].authors_WOS

[{'WOS_author': 'Llano, R. C.',
  'affiliation': ['Gastro-hepatology Group, Univ Antioquia, Hospital Pablo Tobón Uribe, Medellín, Colombia'],
  'i': 0},
 {'WOS_author': 'Gutiérrez, J.C.R.',
  'affiliation': ['Gastro-hepatology Group, Univ Antioquia, Hospital Pablo Tobón Uribe, Medellín, Colombia'],
  'i': 1},
 {'WOS_author': 'Hoyos Duque, S. I.',
  'affiliation': ['Gastro-hepatology Group, Univ Antioquia, Hospital Pablo Tobón Uribe, Medellín, Colombia'],
  'i': 2},
 {'WOS_author': 'García, V.',
  'affiliation': ['Radiology Specialist Corporal Imaging, Univ Antioquia, Hospital Pablo Tobón Uribe, Medellín, Colombia'],
  'i': 3},
 {'WOS_author': 'Arango, G. C.',
  'affiliation': ['Gastro-hepatology Group, Univ Antioquia, Hepatic Transplant Group, Hospital Pablo Tobón Uribe, Medellín, Colombia'],
  'i': 4}]

## Prepare UDEA columns

In [20]:
#TODO: Remove from fill_trained_data(..)
if 'UDEA_autores' in UDEA.columns:
    # Collapse every run of whitespace into a single space. The pattern is a
    # raw string so that '\s' is not parsed as an (invalid) string escape.
    UDEA['UDEA_autores']=UDEA['UDEA_autores'].apply(
        lambda s: re.sub(r'\s+',' ',s) if isinstance(s,str) else s)
    # Split the ';'-separated names and wrap each one as {'full_name': name};
    # non-string, non-list values (e.g. missing data) are left untouched
    UDEA['UDEA_authors']=UDEA['UDEA_autores'].apply(
        lambda s: s.split(';') if isinstance(s,str) else s).apply(
        lambda l: [{'full_name':y} for y in l] if isinstance(l,list) else l)

## Merge with official researcher list: PTJ

In [21]:
# Official researcher list.
# NOTE(review): the file label ends in '.csv' but it is read with read_drive_excel — confirm this is the intended reader
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [33]:
# When True, the next cell updates UDEA with the researcher list AU
UPDATE_UDEA_authors_with_AU=True
# Disabled alternative: update SIU instead of UDEA and turn the flag off
#if MERGE_WITH_TRAINED:
#    kkn=SIU.copy()
#    kkn=update_institutional_authors(kkn,AU)
#    print(kkn.shape,SIU.shape)
#    SIU=kkn.copy()
#    UPDATE_UDEA_authors_with_AU=False

In [22]:
# Run only when at least one row already has UDEA_authors and the update flag is set
if (UDEA['UDEA_authors'].dropna().shape[0] and 
    UPDATE_UDEA_authors_with_AU):
    # Work on a copy so UDEA is replaced only after the update has finished
    kkn=UDEA.copy()
    kkn=update_institutional_authors(kkn,AU)
    # Shapes after / before the update, for visual comparison
    print(kkn.shape,UDEA.shape)
    UDEA=kkn.copy()

Quality check

In [23]:
# Spot check: look up 'Restrepo, D' under the 'WOS_author' key of authors_WOS and show entries 1 to 2 of the result
key_contains_in_list_of_dictionaries(UDEA,'Restrepo, D',column='authors_WOS',key='WOS_author').loc[1:2]

1    [{'WOS_author': 'Granda-Restrepo, Diana', 'i': 0, 'affiliation': ['Univ Antioquia, Fac Quim Farmaceut, Dept Alimentos, Grp Invest Biotecnol Alimentos BIOALI, Medellin, Colombia.']}, {'WOS_author':...
2    [{'WOS_author': 'Restrepo, D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.']}, {'WOS_author': 'Rivera, A.', 'i': 1, 'affiliation': ['Univ Antioquia, Inst Fis, Mede...
Name: authors_WOS, dtype: object

In [36]:
if UPDATE_UDEA_authors_with_AU:
    # Checkpoint the updated frame to disk
    UDEA.to_json('UDEAtmp.json')
    # RECOVER is forced to False, so the reload below is skipped unless edited by hand
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Add `UDEA.authors_WOS` info* within `UDEA.UDEA_authors` data**
(\*) obtained from `UDEA.C1`

(\*\*) Obtained from [puntaje trained old UDEA data](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-trained-data-set) and the [official researcher list](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-official-researcher-list)

Obtain name parts and initials from full name in `UDEA_authors` dictionary and update `UDEA_authors` with them

In [37]:
import sys
# Abort when the UDEA_authors column is missing and REBUILD is off:
# the following cells read UDEA['UDEA_authors']
if 'UDEA_authors' not in UDEA.columns and REBUILD==False:
    sys.exit('Make MERGE_WITH_TRAINED True and run again')

In [38]:
# Obtain Spanish name parts from full name.
# The useful effect is the in-place side effect: each author dictionary `y` with a
# non-null 'full_name' is updated with the output of split_full_names.
# dict.update returns None, so `dictupdatetmp` itself only holds None (updated
# entries) or the untouched dictionary, and is a throwaway result.
dictupdatetmp=UDEA['UDEA_authors'].apply(lambda x: [y.update( 
                split_full_names(y,full_name='full_name')  ) if not pd.isnull(
                y.get('full_name')) else y for y in x] 
                                   if type(x)==list 
                                   else x)

In [39]:
# Combine authors_WOS with UDEA_authors element by element through combinewos.
# NOTE(review): `kk` is not assigned back to UDEA in this cell and is reassigned later;
# presumably combinewos updates the UDEA_authors dictionaries in place — confirm
kk=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=combinewos )

In [40]:
# Inspect the UDEA_authors value of the first record
UDEA['UDEA_authors'].loc[0]

[{'CÉDULA': 8358251.0,
  'DEPARTAMENTO': 'Departamento de Medicina Interna',
  'FACULTAD': 'Facultad de Medicina',
  'GRUPO': 'Grupo de Investigación EDUSALUD, Informed',
  'INICIALES': 'S.',
  'NOMBRE COMPLETO': 'Santiago Patiño Giraldo',
  'NOMBRES': 'Santiago',
  'PRIMER APELLIDO': 'Patiño',
  'SEGUNDO APELLIDO': 'Giraldo',
  'WOS_affiliation': ['Univ Antioquia, Colombia.'],
  'WOS_author': ['Giraldo, Santiago Patino'],
  'full_name': 'PATIÑO GIRALDO SANTIAGO'}]

In [41]:
# Checkpoint the frame to disk
UDEA.to_json('UDEAtmp.json')

### Load output results of previous cell runs

In [42]:
# Set RECOVER to True to resume from the 'UDEAtmp.json' checkpoint instead of re-running the cells above
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Build a single profile for all

### Fill UDEA_authors with WOS_author info

Obtain UDEA_authors DataFrame: `aunly`

In [43]:
# Authors table `aunly`, built from UDEA by DataFrame_authors
aunly=DataFrame_authors(UDEA)

DELGADO LASTRA JUAN DE DIOS


In [44]:
# Save the authors table, but never overwrite the file with an empty table
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [45]:
# Set RECOVER to True to reload UDEA from the 'UDEAtmp.json' checkpoint
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [46]:
# (rows, columns) of the working frame
UDEA.shape

(15700, 181)

In [47]:
# When recovering, reload the authors table from a local copy if present,
# otherwise through its drive.cfg label
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [48]:
# (rows, columns) of the authors table
aunly.shape

(1273, 2)

(800, 2)

## Merge UDEA with authors

In [49]:
# Pass each UDEA_authors value, together with the authors table, through fill_full_wos_author_info
UDEA['UDEA_authors']=UDEA['UDEA_authors'].apply(lambda l:fill_full_wos_author_info(l,aunly) )

In [50]:
# Checkpoint only when at least one row has non-null UDEA_authors
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [51]:
# Set RECOVER to True to reload UDEA from the 'UDEAtmp.json' checkpoint
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [52]:
# (rows, columns) of the working frame
UDEA.shape

(15700, 181)

In [53]:
# Element-wise combination of authors_WOS and UDEA_authors through get_UDEA_authors,
# using the authors table; the result is kept in `kk` and assigned to UDEA in a later cell
kk=UDEA.authors_WOS.combine(UDEA.UDEA_authors,func=lambda x,y: get_UDEA_authors(x,y,aunly))

In [54]:
# Rows with non-null UDEA_authors before `kk` is assigned
UDEA.UDEA_authors.dropna().shape

(10311,)

(7072,)

(10960,)

In [55]:
# Adopt the combined result as the new UDEA_authors column
UDEA['UDEA_authors']=kk

In [56]:
# Rows with non-null UDEA_authors after the assignment, and the frame shape
UDEA.UDEA_authors.dropna().shape,UDEA.shape

((10899,), (15700, 181))

((8446,), (15700, 169))

((10963,), (15704, 181))

In [57]:
# (rows, columns) of the authors table
aunly.shape

(1273, 2)

(1461, 2)

In [58]:
# Shape after dropping duplicated 'tmp_author' values; compare with aunly.shape to detect duplicates
if not aunly.empty:
    print(aunly.drop_duplicates('tmp_author').shape)

(1273, 2)


In [59]:
# Save the authors table, but never overwrite the file with an empty table
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [60]:
# RECOVER is switched on here: the authors table is read back from the local
# JSON file if it exists, otherwise through its drive.cfg label.
# RECOVER stays True for the cells that follow.
RECOVER=True
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [61]:
# Checkpoint only when at least one row has non-null UDEA_authors
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [62]:
# Also write the frame to 'UDEAtmp300.json'
# NOTE(review): the name matches the test sample referenced in a commented-out URL of the load cell; here the whole frame is written — confirm intent
UDEA.to_json('UDEAtmp300.json')

In [63]:
# RECOVER was set to True in an earlier cell, so UDEA is reloaded from the checkpoint here
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [None]:
# Final output: gzip-compressed JSON under the same name registered in drive.cfg
UDEA.to_json('WOS_SCI_SCP_PTJ_CTR.json.gz',compression='gzip') 

## Add PTJ directly from `UDEA_authors` with `WOS_info` DataFrame

In [15]:
# Reload the saved authors table with a fresh 0..n-1 index
aunly=pd.read_json('UDEA_authors_with_WOS_info.json').reset_index(drop=True)

In [81]:
def build_institutional_authors(x,author_df,x_author_key='WOS_author',x_affiliation_key='affiliation',
                                        author_key='WOS_author',
                                        affiliation_key='WOS_affiliation',
                                        ratio=0.9):
    """Match the authors of one article against a table of known authors.

    Parameters
    ----------
    x : list of dict, or anything else
        Author entries of one article. Each entry is read through
        ``x_author_key`` (author name) and ``x_affiliation_key`` (list of
        affiliations, of which only the first one is used). Any non-list
        value (e.g. None or NaN) yields None.
    author_df : DataFrame
        Table of known authors, forwarded to ``find_author_affiliation``.
    x_author_key, x_affiliation_key : str
        Keys used to read each entry of ``x``.
    author_key, affiliation_key : str
        Forwarded to ``find_author_affiliation``.
    ratio : float, default 0.9
        Forwarded to ``find_author_affiliation`` as ``ratio`` (previously
        hard-coded to 0.9).

    Returns
    -------
    list or None
        The truthy results of ``find_author_affiliation``, in input order, or
        None when ``x`` is not a list or nothing matched.
    """
    if not isinstance(x,list):
        return None
    matches=[]
    for entry in x:
        affiliations=entry.get(x_affiliation_key)
        # An entry without affiliations cannot be matched: skip it instead of
        # failing on the [0] lookup below
        if not affiliations:
            continue
        # author_WOS→affiliation always has a single affiliation, hence [0]
        match=find_author_affiliation(entry.get(x_author_key),affiliations[0],
                                      author_df=author_df,
                                      author_key=author_key,
                                      affiliation_key=affiliation_key,
                                      ratio=ratio )
        if match:
            matches.append(match)
    return matches or None

In [82]:
# Rebuild UDEA_authors from authors_WOS alone, matching each entry against the authors table
UDEA['UDEA_authors']=UDEA['authors_WOS'].apply(build_institutional_authors,args=(aunly,))

In [83]:
# Number of rows for which at least one institutional author was matched
UDEA['UDEA_authors'].dropna().shape

(6795,)