<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/Query_CTR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+CTR QUERY

Queries on the bibliographic datasets from

* Web of Science (WOS)
* Scielo (SCI)
* Scopus (SCP)
* Puntaje (UDEA)
* Center (CTR)

of the scientific articles of Universidad de Antioquia.

They were created with:

[WOS_SCI_SCP_PTJ_CTR.ipynb](./WOS_SCI_SCP_PTJ_CTR.ipynb)

In [2]:
# Environment setup: install the extra packages when the working directory is /content.
# NOTE(review): presumably /content identifies a Google Colab session — confirm.
import os
VERSION='NEW'  # NOTE(review): not referenced in the visible cells — confirm it is still needed
if os.getcwd()=='/content':
    # Shell escape: install the dependencies, discarding pip's standard output.
    !pip install openpyxl xlrd wosplus > /dev/null

## functions

In [7]:
import pandas as pd
import wosplus as wp
pd.set_option('display.max_colwidth',200)
from venn import draw_venn, generate_colors
import numpy as np
import fuzzywuzzy.process as fwp
from fuzzywuzzy import fuzz

## Configure public links of files in Google Drive
* If the file is a Google Spreadsheet, it is downloaded as CSV.
* If it is an Excel or text file, it is downloaded directly.

To define your own labeled IDs for public Google Drive files, edit the next cell:

In [4]:
%%writefile drive.cfg
[FILES]
WOS_SCI_SCP_PTJ_CTR.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs
UDEA_authors_with_WOS_info.json=1o1eVT4JD0FMMICq_oxrTJOzWh47veBMw
produccion_fecha_vig_2003_2018.xlsx=1WbtX4K__TTLxXRjuLvqUYz9tuHCIlS5v
UDEA_WOS_SCI_SCP_PTJ.json=1OkVytKbxJwGvXZDkynkSoUDtkUOTaT4A

Overwriting drive.cfg


## Load databases

In [5]:
# Affiliation string and the wosplus handle configured from drive.cfg.
affil='Univ Antioquia'
drive_files=wp.wosplus('drive.cfg')

#### DEBUG: if False stop in UDEA_PTJ!!!!

# File name of the gzipped JSON dataset (one of the labels listed in drive.cfg).
# It has to be defined before the checks below; otherwise a fresh kernel
# raises NameError on the first use.
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'

# Prefer a local copy when one exists; otherwise read the file through the
# drive_files handle.
if os.path.exists(UDEAjsonfile):
    UDEA=pd.read_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)
else:
    UDEA=drive_files.read_drive_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)

In [6]:
# Load the gzipped JSON dataset through wosplus and take its 'WOS' entry as the
# working frame. This rebinds UDEA, replacing whatever an earlier cell assigned.
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'
# NOTE(review): the return value kept in tmp is not used in the visible cells.
tmp=drive_files.load_biblio(UDEAjsonfile,compression='gzip')# TODO CHANGE FOR LAST VERSION IN GOOGLE DRIVE
# Work on a copy with a fresh 0..n-1 index.
UDEA=drive_files.biblio['WOS'].copy().reset_index(drop=True)



In [None]:
# Bring every public name of the check_quality module into the notebook namespace.
# NOTE(review): a star import hides which names are defined from here on;
# presumably only check_quality itself is needed — confirm before narrowing it.
from check_quality import *
# Run the check on the loaded frame (what it verifies is defined in check_quality).
check_quality(UDEA)

## Indices

Example of one author record (a dictionary) stored in the `UDEA_authors` column:

`{'CÉDULA': 63508258.0,
  'DEPARTAMENTO': 'Instituto de Biología',
  'FACULTAD': 'Facultad de Ciencias Exactas y Naturales',
  'GRUPO': 'Sin Grupo Asociado',
  'INICIALES': 'I.',
  'NOMBRE COMPLETO': 'Idalyd Fonseca Gonzalez',
  'NOMBRES': 'Idalyd',
  'PRIMER APELLIDO': 'Fonseca',
  'SEGUNDO APELLIDO': 'Gonzalez',
  'WOS_affiliation': ['Univ Antioquia, Colombia.'],
  'WOS_author': ['FONSECA, IDALYD',
   'FONSECA-GONZALEZ, IDALYD',
   'Fonseca-Gonzalez, Idalyd',
   'Fonseca-Gonzalez, I.'],
  'full_name': 'FONSECA GONZALEZ IDALYD'}`

In [174]:
#TODO: Improve indices
json_column='UDEA_authors'


def _author_index(key, df, column):
    """Build a lookup index for one field of the author records.

    Parameters
    ----------
    key : str
        Dictionary key to read from every author record (e.g. 'FACULTAD').
    df : pandas.DataFrame
        Frame whose `column` holds, per row, a list of author dictionaries;
        rows whose cell is not a list are skipped.
    column : str
        Name of the column with the lists of author dictionaries.

    Returns
    -------
    dict
        {'key': key, 'values': array of the distinct non-null values of `key`,
        in order of first appearance}.
    """
    collected = [author.get(key)
                 for authors in df[column] if isinstance(authors, list)
                 for author in authors]
    # dtype=object keeps the result well defined even when nothing was collected.
    distinct = pd.Series(collected, dtype=object).dropna().unique()
    return {'key': key, 'values': distinct}


# All four indices are built from the same column, named by json_column.
facultades = _author_index('FACULTAD', UDEA, json_column)
departamentos = _author_index('DEPARTAMENTO', UDEA, json_column)
# Given names followed by surnames. #TODO: Be sure to include full list
nombre_completo = _author_index('NOMBRE COMPLETO', UDEA, json_column)
# Surnames followed by given names. #TODO: Be sure to include full list
full_name = _author_index('full_name', UDEA, json_column)

In [192]:
# Number of distinct values in each of the two name indices.
nombre_completo['values'].shape,full_name['values'].shape

((649,), (1727,))

## Query function

In [176]:
def query_json_column(q,df=UDEA,json_column='UDEA_authors',
                        choices=nombre_completo,scorer=fuzz.partial_token_sort_ratio,**kwargs):
    """Select the rows of `df` whose author records match the best fuzzy hit for `q`.

    The query is fuzzy-matched against `choices['values']`; the best candidate
    is then compared for exact equality with the `choices['key']` field of
    every dictionary stored in `df[json_column]`.

    Parameters
    ----------
    q : str
        Text to search for.
    df : pandas.DataFrame
        Frame to filter. The default is bound when this definition runs, so
        re-run this cell after reloading UDEA.
    json_column : str
        Column holding, per row, a list of dictionaries; rows whose cell is
        not a list never match.
    choices : dict
        Index with entries 'key' (dictionary key to compare) and 'values'
        (candidate strings). The default is also bound at definition time.
    scorer : callable
        Scoring function handed to fuzzywuzzy's extractOne.
    **kwargs
        `score_cutoff` (default 0) is forwarded to extractOne; any other
        keyword is accepted and ignored.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index; empty when no candidate
        reaches `score_cutoff`.
    """
    best=fwp.extractOne(q,choices['values'],scorer=scorer,
                        score_cutoff=kwargs.get('score_cutoff',0))
    # extractOne returns None when no candidate reaches the cutoff.
    if best is None:
        return df.iloc[0:0].reset_index(drop=True)
    fchoices=best[0]
    key=choices['key']
    # astype(bool) keeps the mask usable even when df has no rows.
    mask=df[json_column].apply(lambda l: isinstance(l,list) and
                               any(d.get(key)==fchoices for d in l)).astype(bool)
    return df[mask].reset_index(drop=True)

In [177]:
# Articles of one author, searched by full name written given-names-first
# (matched against the nombre_completo index). score_cutoff=79 is passed as an
# extra keyword argument.
r=query_json_column('Diego Alejandro Restrepo Quintero',df=UDEA,json_column='UDEA_authors',
                        choices=nombre_completo,scorer=fuzz.partial_token_sort_ratio,score_cutoff=79)

In [178]:
# (rows, columns) of the query result.
r.shape

(37, 181)

In [179]:
# Show the TI, AU and authors_WOS columns plus the column named by json_column.
r[['TI','AU','authors_WOS',json_column]]

Unnamed: 0,TI,AU,authors_WOS,UDEA_authors
0,The inert doublet model,"Arias, C\nMartins, J\nMartinez, H\nRon, E\nSalzmann, C\nVasconcelos, GMS\nVillalba, F\n","[{'WOS_author': 'Arias, C.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, AA 1226, Medellin, Colombia.']}]","[{'INICIALES': 'O. A.', 'PRIMER APELLIDO': 'Zapata', 'CÉDULA': 15386534.0, 'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', 'WOS_author': ['Zapata, O.', 'Zapata, Oscar'], 'NOMBRE COMPLETO': 'Oscar Albe..."
1,Leptonic charged Higgs decays in the Zee model,"Sierra, DA\nRestrepo, D\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
2,Collider signals of gravitino dark matter in bilinearly broken R-parity,"Hirsch, M\nPorod, W\nResterpo, D\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
3,Probing neutrino mass with multilepton production at the Tevatron in the simplest R-parity violation model,"Magro, MB\nde Campos, F\nEboli, OJP\nPorod, W\nRestrepo, D\nValle, JWF\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
4,The inert Zee model,"Longas, R\nPortillo, D\nRestrepo, D\nZapata, O\n","[{'WOS_author': 'Longas, Robinson', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.']}, {'WOS_author': 'Portillo, Dilia', 'i': 1, 'affiliation': ['Univ Antio...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
5,The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter model,"Horiuchi, S\nMacias, O\nRestrepo, D\nRivera, A\nZapata, O\nSilverwood, H\n","[{'WOS_author': 'Restrepo, Diego', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.']}, {'WOS_author': 'Rivera, Andres', 'i': 1, 'affiliation': ['Univ Antioqu...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
6,Diboson anomaly: Heavy Higgs resonance and QCD vectorlike exotics,"Sierra, DA\nHerrero-Garcia, J\nRestrepo, D\nVicente, A\n","[{'WOS_author': 'Restrepo, D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.']}]","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
7,Fermion dark matter from SO(10) GUTs,"Arbelaez, C\nLongas, R\nRestrepo, D\nZapata, O\n","[{'WOS_author': 'Longas, Robinson', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.']}, {'WOS_author': 'Restrepo, Diego', 'i': 1, 'affiliation': ['Univ Antioquia, Inst Fis, ...","[{'INICIALES': 'O. A.', 'PRIMER APELLIDO': 'Zapata', 'CÉDULA': 15386534.0, 'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', 'WOS_author': ['Zapata, O.', 'Zapata, Oscar'], 'NOMBRE COMPLETO': 'Oscar Albe..."
8,"Connection of gamma rays, dark matter, and Higgs boson searches at the LHC","Ruiz-Alvarez, JD\nPires, CADS\nQueiroz, FS\nRestrepo, D\nda Silva, PSR\n","[{'WOS_author': 'Ruiz-Alvarez, J. D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.']}, {'WOS_author': 'Restrepo, D.', 'i': 1, 'affiliation': ['Univ Antioquia, Inst ...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
9,A model with a viable dark matter candidate and massive neutrinos,"Restrepo, D\nRivera, A\nSanchez, M\nZapata, O\n","[{'WOS_author': 'Restrepo, D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.']}, {'WOS_author': 'Rivera, A.', 'i': 1, 'affiliation': ['Univ Antioquia, Inst Fis, Mede...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."


In [180]:
# Same author, now searched by the surnames-first form against the full_name
# index. score_cutoff=79 is passed as an extra keyword argument.
r=query_json_column('RESTREPO QUINTERO DIEGO ALEJANDRO',df=UDEA,json_column='UDEA_authors',
                        choices=full_name,scorer=fuzz.partial_token_sort_ratio,score_cutoff=79)

In [181]:
# Show the TI, AU and authors_WOS columns plus the column named by json_column.
r[['TI','AU','authors_WOS',json_column]]

Unnamed: 0,TI,AU,authors_WOS,UDEA_authors
0,The inert doublet model,"Arias, C\nMartins, J\nMartinez, H\nRon, E\nSalzmann, C\nVasconcelos, GMS\nVillalba, F\n","[{'WOS_author': 'Arias, C.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, AA 1226, Medellin, Colombia.']}]","[{'INICIALES': 'O. A.', 'PRIMER APELLIDO': 'Zapata', 'CÉDULA': 15386534.0, 'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', 'WOS_author': ['Zapata, O.', 'Zapata, Oscar'], 'NOMBRE COMPLETO': 'Oscar Albe..."
1,Leptonic charged Higgs decays in the Zee model,"Sierra, DA\nRestrepo, D\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
2,Collider signals of gravitino dark matter in bilinearly broken R-parity,"Hirsch, M\nPorod, W\nResterpo, D\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
3,Probing neutrino mass with multilepton production at the Tevatron in the simplest R-parity violation model,"Magro, MB\nde Campos, F\nEboli, OJP\nPorod, W\nRestrepo, D\nValle, JWF\n",[],"[{'DEPARTAMENTO': 'Instituto de Física', 'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales', 'full_name': 'RESTREPO Q..."
4,The inert Zee model,"Longas, R\nPortillo, D\nRestrepo, D\nZapata, O\n","[{'WOS_author': 'Longas, Robinson', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.']}, {'WOS_author': 'Portillo, Dilia', 'i': 1, 'affiliation': ['Univ Antio...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
5,The Fermi-LAT gamma-ray excess at the Galactic Center in the singlet-doublet fermion dark matter model,"Horiuchi, S\nMacias, O\nRestrepo, D\nRivera, A\nZapata, O\nSilverwood, H\n","[{'WOS_author': 'Restrepo, Diego', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21, Medellin, Colombia.']}, {'WOS_author': 'Rivera, Andres', 'i': 1, 'affiliation': ['Univ Antioqu...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
6,Diboson anomaly: Heavy Higgs resonance and QCD vectorlike exotics,"Sierra, DA\nHerrero-Garcia, J\nRestrepo, D\nVicente, A\n","[{'WOS_author': 'Restrepo, D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Calle 70 52-21 Medellin, Medellin, Colombia.']}]","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
7,Fermion dark matter from SO(10) GUTs,"Arbelaez, C\nLongas, R\nRestrepo, D\nZapata, O\n","[{'WOS_author': 'Longas, Robinson', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.']}, {'WOS_author': 'Restrepo, Diego', 'i': 1, 'affiliation': ['Univ Antioquia, Inst Fis, ...","[{'INICIALES': 'O. A.', 'PRIMER APELLIDO': 'Zapata', 'CÉDULA': 15386534.0, 'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', 'WOS_author': ['Zapata, O.', 'Zapata, Oscar'], 'NOMBRE COMPLETO': 'Oscar Albe..."
8,"Connection of gamma rays, dark matter, and Higgs boson searches at the LHC","Ruiz-Alvarez, JD\nPires, CADS\nQueiroz, FS\nRestrepo, D\nda Silva, PSR\n","[{'WOS_author': 'Ruiz-Alvarez, J. D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.']}, {'WOS_author': 'Restrepo, D.', 'i': 1, 'affiliation': ['Univ Antioquia, Inst ...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."
9,A model with a viable dark matter candidate and massive neutrinos,"Restrepo, D\nRivera, A\nSanchez, M\nZapata, O\n","[{'WOS_author': 'Restrepo, D.', 'i': 0, 'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.']}, {'WOS_author': 'Rivera, A.', 'i': 1, 'affiliation': ['Univ Antioquia, Inst Fis, Mede...","[{'INICIALES': 'D. A.', 'PRIMER APELLIDO': 'Restrepo', 'CÉDULA': 98554575.0, 'full_name': 'RESTREPO QUINTERO DIEGO ALEJANDRO', 'WOS_author': ['Restrepo, Diego', 'Restrepo, D.'], 'NOMBRE COMPLETO':..."


In [182]:
# Articles with at least one author record whose 'DEPARTAMENTO' equals the best
# match for the query in the departamentos index. score_cutoff=79 is passed as
# an extra keyword argument.
r=query_json_column('Instituto de Física',df=UDEA,json_column='UDEA_authors',
                        choices=departamentos,scorer=fuzz.partial_token_sort_ratio,score_cutoff=79)

In [183]:
# (rows, columns) of the department query result.
r.shape

(862, 181)

In [184]:
import Levenshtein as lv

In [185]:
# Similarity of two different titles, compared in lower case, using Levenshtein's ratio.
lv.ratio( 'The inert doublet model'.lower(),'THE INERT ZEE MODEL'.lower() )

0.8095238095238095

In [186]:
# Same comparison with fuzzywuzzy's ratio, which reports an integer score.
fuzz.ratio( 'The inert doublet model'.lower(),'THE INERT ZEE MODEL'.lower() )

81