# Web of Science (WOS)

In [1]:
%load_ext autoreload

In [2]:
%autoreload

In [3]:
import pandas as pd
import re
import sys
import numpy as np
import time
from publications import *
from unidecode import unidecode


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',200)

In [4]:
def columns_add_prefix(df,prefix):
    return df.rename_axis( dict( (key,prefix+'_'+key) for key in df.columns.values) , axis=1)

def fill_NaN(df):
    '''Fill NaN entries with proper empty values
     Type  : dtype: Fill with
     string: "0"  : ''
     float : "float64" 
    '''
    for key in df.columns:
        if df[key].dtype=='O':
            df[key]=df[key].str.strip()
            df[key]=df[key].fillna('')
        elif df[key].dtype=='float64':
            df[key]=df[key].fillna(0.0)
    return df
def read_excel_fill_NaN(*args, **kwargs):
    '''Fill NaN entries with proper empty values
     Type  : dtype: Fill with
     string: "0"  : ''
     float : "float64" 
    '''
    df=pd.read_excel(*args, **kwargs)
    df=fill_NaN(df)
    return df

def add_SJR_info(journal_df,sjr_df,\
                sjr_open_acces_df=pd.DataFrame(),column_journal='SO'):
    if sjr_open_acces_df.shape[0]:
        if not column_journal in journal_df:
            sys.exit('%s no in journal DataFrame: %s' %(column_journal,journal_df))
        journal_df['Open_Access']='No'
        for joa in np.intersect1d( journal_df[column_journal].str.lower().str.strip().unique(),\
                                   sjr_oa.Title.str.lower().str.strip().unique() ):
            moa=journal_df[ journal_df[column_journal].str.lower() == joa ]
            for j in moa.index:
                journal_df.loc[j,'Open_Access']='Yes'    
            
    #for i in sjr_df.index: #Search upon 
    #moa=journal_df[ journal_df[column_journal].str.lower() == sjr_df.Title.str.lower().ix[i] ]
            
        return journal_df
    
#To add to main publications object:
def add_sjr_info_from_issn(self,SJR,column_issn='SN',SJR_column_journal='SJR_Title',SJR_column_issn='SJR_Issn'):
    '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''
    if not SJR_column_journal in self.articles.columns:
        sys.exit("Run first the the more exact and fast add_sjr_info")
            
    self.articles=fill_NaN(self.articles)
    kk=self.articles[self.articles[SJR_column_journal]=='']
    for issn in kk[column_issn].str.replace('-','').unique():
        mtch=SJR[SJR[SJR_column_issn].str.contains(issn)].reset_index(drop=True)
        if mtch.shape[0]>=1:
            moa=kk[ kk[column_issn].str.replace('-','')==issn ]
            if moa.shape[0]>=1:
                #DEBUG: more filters if 
                for key in SJR.columns.values:
                    self.articles.loc[moa.index.values,key]=mtch.ix[0][key]
                    
    return self

def add_sjr_info_from_journal(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):
    '''self is an publication object and SJR is the info for a journal in column SJR_Issn'''
    if not SJR_column_journal in self.articles.columns:
        sys.exit("Run first the more exact and fast add_sjr_info")
        
    self.articles=fill_NaN(self.articles)
    kk=self.articles[self.articles[SJR_column_journal]=='']
    for title in kk[column_journal].str.lower().str.strip().unique():
        #at least 3 long words matched!                    
        #remove no alpha numeric characters
        tmp_title=re.sub('\W+',' ',title).split(' ')
        long_word=0;short_word=0
        short_title=[]
        for t in tmp_title:
            if len(t)>3:
                long_word=long_word+1
            else:
                short_word=short_word+1
            short_title.append(t)
            if (long_word==4 and short_word==0) or  (long_word==3 and short_word>=1):
                break
        short_title=' '.join(short_title)
        mtch=SJR[SJR[SJR_column_journal].str.replace('\W+',' ').str.lower().str.strip().str.contains(\
                                                short_title )].reset_index(drop=True)
        if mtch.shape[0]>=1:
            moa=kk[ kk[column_journal].str.lower().str.strip()==title ]
            if moa.shape[0]>=1:
                for key in SJR.columns.values:
                    self.articles.loc[moa.index.values,key]=mtch.ix[0][key]

    return self

def add_sjr_info(self,SJR,column_journal='SO',SJR_column_journal='SJR_Title'):
    '''self is an publication object and SJR is the info for a journal in column SJR_Title'''
    for joa in np.intersect1d( self.articles[column_journal].str.lower().str.strip().unique(),\
                                   SJR[SJR_column_journal].str.lower().str.strip().unique() ):
        moa=self.articles[ self.articles[column_journal].str.lower() == joa ]
        if moa.shape[0]:
            mtch=SJR[SJR[SJR_column_journal].str.lower().str.strip()==joa].reset_index(drop=True)
            if mtch.shape[0]>=1:
                #DEBUG: filter by ISSN if >1:
                for key in SJR.columns.values:
                    self.articles.loc[moa.index.values,key]=mtch.ix[0][key]
    
    return self
        


Design:
Creates new full data frame joining information from several databases. 
The initial data frame is in Web of Science format, with tge prefix `wos_` in the column names:
The new data is added as addtional columns in either an existing row identified by
* DOI
* Title and if necessary vol, first author surname, journal etc.
In the current implementation only DOI will be working.

In [5]:
sjr_oa=pd.read_excel('scimago/journals_scimago_open_access.xlsx')
sjr=pd.read_excel('scimago/journals_scimago_all.xlsx')

In [6]:
wos=articles(excel_file='wos_full_with_CR.xlsx')
wos.articles=fill_NaN(wos.articles)
wos.articles.shape

(6644, 55)

In [7]:
wos.articles[:1]

Unnamed: 0,AB,AF,AR,AU,BP,CR,DE,DT,EM,EP,FU,FX,GA,ID,IS,JI,LA,NR,PA,PD,PG,PI,PT,PU,PY,RP,SC,SN,SO,TC,TI,UT,VL,WC,DI,PM,RI,EI,OI,CL,CT,CY,SP,SU,BE,BN,HO,PN,SE,MA,SI,GP,CA,BA,BF
0,"Objectives: This work is intended to establish the prevalence of reverse smokers at the villages of Hato Nuevo, San Francisco and Cayo de Palma, Department of Sucre, Colombia, characterizing their...","Alvarez Gomez, Gloria J.\nAlvarez Martinez, Efrain\nJimenez Gomez, Raul\nMosquera Silva, Yolanda\nGaviria Nunez, Angela Maria\nGarces Agudelo, Adriana\nAlonso Duque, Alexander\nZabala Castano, Ale...",1111111172,"Gomez, GJA\nMartinez, EA\nGomez, RJ\nSilva, YM\nNunez, AMG\nAgudelo, AG\nDuque, AA\nCastano, AZ\nGonzalez, EE\nMillan, MI\nOssa, DR",E1,"Axell T, 1996, J ORAL PATHOL MED, V25, P49, DOI 10.1111/j.1600-0714.1996.tb00191.x\nBARIC JM, 1982, ORAL SURG ORAL MED O, V54, P424, DOI 10.1016/0030-4220(82)90389-9\nChang YC, 2001, J ORAL PATHOL...",oral cancer; oral premalignant lesions; reverse smoker; tobacco,Article,gloria@alvarez.nu,E8,CODI,"To the habitants of Hato Nuevo, San Francisco and Cayo de Palma, Sucre,\nColombia; to the field advisors for their help and attention in the\naccomplishment of this study and to the CODI by the pr...",352NW,SUBMUCOUS FIBROSIS; ANDHRA-PRADESH; SMOKING; LESIONS; INDIA;\nLEUKOPLAKIA; EPITHELIUM; CARCINOMA; CIGARETTE; NICOTINE,1,Med. Oral Patol. Oral Cir. Bucal,English,28.0,"CALLE DANIEL BALACIART N 4 PTA 17, VALENCIA, 46020, SPAIN",JAN,8.0,VALENCIA,J,MEDICINA ORAL S L,2008,"Gomez, GJA (reprint author), Univ Antioquia, Fac Dent, Calle 64 52-59, Medellin, Colombia.","Dentistry, Oral Surgery & Medicine",1698-4447,MEDICINA ORAL PATOLOGIA ORAL Y CIRUGIA BUCAL,4.0,"Reverse smokers's and changes in oral mucosa. Department of Sucre,\nColombia",WOS:000259504900001,13,"Dentistry, Oral Surgery & Medicine",,0.0,,,,,,,,,,,,,,,,,,,


## Includes papers from Scopus
* Add articles with doi into WOS: loop with the intersection between DOIs
* Creates a reduced pandas dataframe with not DOI and the DOIs not in intersection
* Add articles with title+author into reduced WOS: loop with the intersection between title+author
* append reduced pandas dataframe with title+author not in intersection


In [8]:
#sc=pd.read_csv('scopus/scopus2006-1977.csv').fillna('').drop('References',1)
#for fs in ['scopus/scopus2007.csv','scopus/scopus2010-2009-2008.csv',\
#          'scopus/scopus2012-2011.csv','scopus/scopus2014-2013.csv','scopus/scopus2016-2015.csv']:
#    print(fs)
#    sc=sc.append(pd.read_csv(fs,error_bad_lines=False))
#sc.to_excel('scp_full_with_Reference.xlsx',index=False)

In [9]:
scp=articles(excel_file='scp_full_with_Reference.xlsx')
scp.articles=scp.articles.rename_axis({'﻿Authors':'Authors'},1)
scp.articles=fill_NaN(scp.articles)

In [10]:
scp.articles[:1]

Unnamed: 0,Abbreviated Source Title,Abstract,Affiliations,Art. No.,Author Keywords,Authors with affiliations,CODEN,Chemicals/CAS,Cited by,Conference code,Conference date,Conference location,Conference name,Correspondence Address,DOI,Document Type,EID,Editors,Funding Details,ISBN,ISSN,Index Keywords,Issue,Language of Original Document,Link,Manufacturers,Molecular Sequence Numbers,Page count,Page end,Page start,PubMed ID,Publisher,References,Source,Source title,Sponsors,Title,Tradenames,Volume,Year,Authors
0,Gaceta Sanit.,"The Millennium Development Goals, and within these the Millennium Targets, constitute a working plan that strives to achieve basic goals within the field of health, eradicate poverty, and ensure c...","Área de Medicina Preventiva Y Salud Pública, Universidad de Alicante, Alicante, Spain; Facultad de Medicina, Universidad Autónoma de Yucatán, Yucatán, Mexico; Facultad Nacional de Salud Pública, U...",,Development; Millenium Goals; Political epidemiology; WHO,"González, D.G., Área de Medicina Preventiva Y Salud Pública, Universidad de Alicante, Alicante, Spain, Observatorio de Políticas Públicas Y Salud (OPPS), Spain, Área de Medicina Preventiva Y Salud...",,,5.0,0.0,,,,"González, D.G.; Área de Medicina Preventiva Y Salud Pública, Universidad de AlicanteSpain; email: Diana.Gil@ua.es",10.1157/13101091,Review,2-s2.0-34547700023,,,,2139111,epidemiology; motivation; politics; public health; review; social change; Epidemiology; Goals; Politics; Public Health; Social Change,SUPPL. 3,Spanish,https://www.scopus.com/inward/record.uri?eid=2-s2.0-34547700023&partnerID=40&md5=2155e0c6cdb235efd8a1953c4b49c370,,,0.0,65,61,17433202.0,,,Scopus,Gaceta Sanitaria,,The challenge to public health of the Millenium Development Goals: An approach from political epidemiology [El reto para la salud pública de los Objetivos de Desarrollo del Milenio: Un enfoque des...,,20,2006.0,"González D.G., Solís M.P., Cantero M.T.R., Moncada M.D.R.O., Giraldo Á.F., Stein A., Díaz C.Á.-D."


In [11]:
if wos.articles.shape[0]>0:
    wos.articles_hash=\
        wos.articles.TI.str.replace(r"[^a-zA-Z0-9 ]", " ").str.lower().str.strip().str.replace('\s+','').map(unidecode)
    scp.articles_hash=\
        scp.articles.Title.str.replace(r"[^a-zA-Z0-9 ]", " ").str.lower().str.strip().str.replace('\s+','').map(unidecode)

In [12]:
SCP=columns_add_prefix(scp.articles,'SCP')
SCP[:1]

Unnamed: 0,SCP_Abbreviated Source Title,SCP_Abstract,SCP_Affiliations,SCP_Art. No.,SCP_Author Keywords,SCP_Authors with affiliations,SCP_CODEN,SCP_Chemicals/CAS,SCP_Cited by,SCP_Conference code,SCP_Conference date,SCP_Conference location,SCP_Conference name,SCP_Correspondence Address,SCP_DOI,SCP_Document Type,SCP_EID,SCP_Editors,SCP_Funding Details,SCP_ISBN,SCP_ISSN,SCP_Index Keywords,SCP_Issue,SCP_Language of Original Document,SCP_Link,SCP_Manufacturers,SCP_Molecular Sequence Numbers,SCP_Page count,SCP_Page end,SCP_Page start,SCP_PubMed ID,SCP_Publisher,SCP_References,SCP_Source,SCP_Source title,SCP_Sponsors,SCP_Title,SCP_Tradenames,SCP_Volume,SCP_Year,SCP_Authors
0,Gaceta Sanit.,"The Millennium Development Goals, and within these the Millennium Targets, constitute a working plan that strives to achieve basic goals within the field of health, eradicate poverty, and ensure c...","Área de Medicina Preventiva Y Salud Pública, Universidad de Alicante, Alicante, Spain; Facultad de Medicina, Universidad Autónoma de Yucatán, Yucatán, Mexico; Facultad Nacional de Salud Pública, U...",,Development; Millenium Goals; Political epidemiology; WHO,"González, D.G., Área de Medicina Preventiva Y Salud Pública, Universidad de Alicante, Alicante, Spain, Observatorio de Políticas Públicas Y Salud (OPPS), Spain, Área de Medicina Preventiva Y Salud...",,,5.0,0.0,,,,"González, D.G.; Área de Medicina Preventiva Y Salud Pública, Universidad de AlicanteSpain; email: Diana.Gil@ua.es",10.1157/13101091,Review,2-s2.0-34547700023,,,,2139111,epidemiology; motivation; politics; public health; review; social change; Epidemiology; Goals; Politics; Public Health; Social Change,SUPPL. 3,Spanish,https://www.scopus.com/inward/record.uri?eid=2-s2.0-34547700023&partnerID=40&md5=2155e0c6cdb235efd8a1953c4b49c370,,,0.0,65,61,17433202.0,,,Scopus,Gaceta Sanitaria,,The challenge to public health of the Millenium Development Goals: An approach from political epidemiology [El reto para la salud pública de los Objetivos de Desarrollo del Milenio: Un enfoque des...,,20,2006.0,"González D.G., Solís M.P., Cantero M.T.R., Moncada M.D.R.O., Giraldo Á.F., Stein A., Díaz C.Á.-D."


In [13]:
wos.articles.shape

(6644, 55)

In [14]:
scp.articles.shape

(8524, 41)

In [15]:
newwos=pd.DataFrame()
for i in scp.articles.index:
    if i%500==0: print(i)
    #Try by title or DOI
    hash_match=True
    for si in reversed(range(0,len(scp.articles_hash[i])+1)):
        chk2=pd.DataFrame()
        chk=wos.articles[wos.articles_hash.str.match(scp.articles_hash[i][:si])]
        if si<10:
            hash_match=False
        if chk.shape[0]>0:
            break
        #validation: check for DOI or author
    if scp.articles.DOI.str.strip().ix[i] and not hash_match:
        chk=wos.articles[ wos.articles.DI==scp.articles.DOI.str.strip().ix[i] ]
        if chk.shape[0]:
            hash_match=True
    #Found good match by title or DOI:        
    if hash_match:
        #filter either with DOI or author
        if scp.articles.DOI.str.strip().ix[i]:
            chk2=chk[chk.DI==scp.articles.DOI.str.strip().ix[i]]
        
        if not chk2.shape[0]:
            chk2=chk[chk.AU.str.contains( unidecode( scp.articles.Authors.ix[i].split(' ')[0] ) ) ]
            if chk2.shape[0]:
                if chk2.shape[0]:  #take the first match: DEBUG: further filter woth journal name
                    if chk.DI.str.strip().ix[chk2.index.values[0]]:
                        scp.articles.loc[i,'DOI']=chk.DI.str.strip().ix[chk2.index.values[0]]
                    elif scp.articles.DOI.str.strip().ix[i]:
                        wos.articles.loc[chk2.index.values[0],'DI']=scp.articles.DOI.str.strip().ix[i]
                        
            else:
                hash_match=False
                    
                    
    #print(i,len(scp.articles_hash[i]),si,chk.shape[0],chk2.shape[0],hash_match)
    
    ##Actionschk.AU.str.contains(
    if hash_match:
        ##add columns to wos.articles
        for key in SCP.columns.values:
                wos.articles.loc[chk2.index.values[0],key]=SCP.ix[ i ][key]
        
    else:
        ##append new entry
        ##fill; wos.articles.SO; wos.articles.SN
        #break
        
        newwos=newwos.append( SCP.ix[ i ],ignore_index=True )
        newwos.loc[newwos.shape[0]-1,'SO']=scp.articles.ix[ i ]['Source title']
        newwos.loc[newwos.shape[0]-1,'SN']=re.sub('(^[0-9A-Z]{4})',r'\1-',\
                                                           scp.articles.ix[ i ]['ISSN'])
        #fill the other entries in proper format
    #if i==0:
    #    break

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500


In [16]:
newwos.shape

(3155, 43)

In [17]:
wos.articles=wos.articles.append(newwos)
wos.articles=fill_NaN(wos.articles)

In [18]:
wos.articles.shape

(9799, 96)

## Include information from journals

In [19]:
start=time.time()
#prepare new columns
#prefix='SJR'
#SJR=sjr.rename_axis( dict( (key,prefix+'_'+key) for key in sjr.columns.values) , axis=1)
SJR=columns_add_prefix(sjr,'SJR')
#Add to exising dataframe
column_journal='SO'
self=wos    
self=add_sjr_info(self,SJR)    
print(time.time()-start)

80.6884982585907


In [20]:
start=time.time()
self=add_sjr_info_from_issn(self,SJR)
print(time.time()-start)

10.078721046447754


In [21]:
start=time.time()
self=add_sjr_info_from_journal(self,SJR)
print(time.time()-start)

9.052651405334473


In [22]:
kk=self.articles[self.articles.SJR_Title=='']

In [23]:
kk.shape

(87, 111)

In [24]:
res=wos.articles[wos.articles['SO'].str.lower().str.strip().str.contains('biomedica')]
res[['SO','SJR_Title']]

Unnamed: 0,SO,SJR_Title
43,BIOMEDICA,Expert Review of Anti-Infective Therapy
44,BIOMEDICA,Archivos de Neurociencias
66,BIOMEDICA,Revista Colombiana de Entomologia
85,BIOMEDICA,Histopathology
86,BIOMEDICA,Colombia Medica
144,BIOMEDICA,Livestock Research for Rural Development
146,BIOMEDICA,Dermatology Online Journal
147,BIOMEDICA,Revista de Salud Publica
148,BIOMEDICA,Iatreia
149,BIOMEDICA,Iatreia


## Tag open access papers based on WOS name database

## Tag open access papers based on Scimago SJR database based in SJR_Title
* Articles withoit SJR_Title and not in WOS Open Acces database cannot be tagged as Open Acces

In [None]:
start=time.time()
journal_df=wos.articles
column_journal='SO'
journal_df['Open_Access']='No'
sjr_open_acces_df=sjr_oa
for i in sjr_open_acces_df.index:
    moa=journal_df[ journal_df[column_journal].str.lower() == sjr_open_acces_df.Title.str.lower().ix[i] ]
    for j in moa.index:
        journal_df.loc[j,'Open_Access']='Yes'
time.time()-start

In [None]:
wos.articles[wos.articles.Open_Access=='Yes'].shape

In [None]:
start=time.time()
#journal_df=wos.articles
column_journal='SJR_Title'
#journal_df['Open_Access']='No'
#sjr_open_acces_df=sjr_oa
for i in sjr_open_acces_df.index:
    moa=journal_df[ journal_df[column_journal].str.lower() == sjr_open_acces_df.Title.str.lower().ix[i] ]
    for j in moa.index:
        journal_df.loc[j,'Open_Access']='Yes'
time.time()-start

In [None]:
wos.articles[wos.articles.Open_Access=='Yes'].shape

cms pas B2G-15-007
https://cds.cern.ch/record/2208044

MCFM

In [41]:
pd.__version__

'0.14.1'

In [None]:
wos.articles.sort