<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+CTR

Merge the bibliographic datasets for 
* Web of Science (WOS), 
* Scielo (SCI)
* Scopus  (SCP)
* Puntaje (UDEA)
* Center (CTR)
of the scientific articles of Universidad de Antioquia

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

Implementation:
The input pure o partially processed database with WOS-SCI-SCP and may be some UDEA entries from PTJ and Center information with additional data about the Full Name UDEA authors.

Addtionaly UDEA entries can be captured from:
1. A previous WOS-SCI-SCP-UDEA
2. A Data Base with a column with full names (FULL LAST NAMES NAMES, e.g VALDEZ GÚZMAN JUAN ALBERTO) and a list of author Aliases in WOS format (Lastname, Name, e.g Valdez-Gúzman, J.A.) with a list of registered affiliations. TODO: Test
3. The database from Puntaje (UDEA). 

In [1]:
# Delete UDEA_columns and start from schratch
REBUILD=True
MERGE_WITH_TRAINED=True

## functions

In [2]:
import unidecode
import Levenshtein as lv
def get_author_info(x):
    sep='; '
    authors=[{'WOS_author':x[0].split(sep)[0],'affiliation':[x[0].split(sep)[-1]],'i':0}]
    iau=1
    for y  in x:
        y2=y.replace('[','').replace('] ',sep).split(sep)
        for z in y2[:-1]:
            aulist=[ d.get('WOS_author') for d in authors]
            if z not in aulist:
                authors.append({'WOS_author':z,'affiliation':[y2[-1]],'i':iau})
                iau=iau+1
            else:
                if y2[-1] not in [ d.get('affiliation') for d in authors if d.get('WOS_author')==z][0]:
                    index_author=[ d.get('i') for d in authors if d.get('WOS_author')==z][0]
                    authors[index_author]['affiliation'].append(y2[-1])
    return authors

def dictionary_list_add_columns(df,df_dl,df_dl_key,df_dl_i,df_columns):
    '''
    For a
     df: Pandas DataFrame 
    with a:
     df_dl: column of list of dictionaries, with
     df_dl_key: dictionary key: e.g x=[{df_dl_key:1},{df_dl_key:2}]
    for the element df_dl_i of the list:
    Update the dictionary with:
        df_dl_key==x[df_dl_i][df_dl_key]
    with the dictionaries { df_columns[i]: df_columns[i].values }
    '''
    dff=df.copy()
    for key in df_columns:
        tmp=dff[df_dl].combine(dff[key],
                func=lambda x,y: y if pd.isna(y) 
                                   else 
                                     [z.update({key:y}) 
                                     if z.get(df_dl_key)==x[df_dl_i][df_dl_key] 
                                     else z 
                                 for z in x  ] )
    return dff

def split_full_names(y,full_name='full_name'):
    """
    From an input dictionary with {full_name:'APPELLIDO1 APPELLIDO2 NOMBRES'}
    Obtain a dictionary with the several name parts.
    """    
    yy=y.get(full_name).title()
    lfn=len(y[full_name].split())
    aps=0
    d={ 'PRIMER APELLIDO':yy.split()[aps] }
    aps=aps+1
    if lfn>=4:
        names=-2
        if lfn==5: # Extra name or last name
            yyy=yy.split()
            ll=pd.np.array( [len(n) for n in yyy ] )
            if ll[3:][ ll[3:]  <= 3 ].shape[0]:
                # last_names first_first_name de(l) second_first_name
                yy=' '.join( [ y for y in yyy if len(y)>=3] )
            else: 
                # first_last name de(l) second_last_name first_names
                tmpll=yyy.pop() # internal memory
                yy=' '.join( yyy )  
        if len( d['PRIMER APELLIDO'] )<=3:
            d['PRIMER APELLIDO']=d['PRIMER APELLIDO']+' '+yy.split()[aps]
            aps=aps+1
            d.update(  {'SEGUNDO APELLIDO':yy.split()[aps]} )
            names=names+1
            
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        aps=aps+1
        if len( d['SEGUNDO APELLIDO'] )<=3:
            d['SEGUNDO APELLIDO']=d['SEGUNDO APELLIDO']+' '+yy.split()[aps]
            if names==-2:
                names=names+1
    elif lfn>=3:        
        d.update({'SEGUNDO APELLIDO':yy.split()[aps]})
        names=-1
    else: #Colombian interpretation (TODO: Includes Brazilian interpretation)    
        names=-1
    d.update({'NOMBRES':' '.join( yy.split()[names:]),
              'INICIALES':' '.join( [z[0]+'.' for z in yy.split()[names:]] ),
              })
    if not d.get('SEGUNDO APELLIDO'):
        d['SEGUNDO APELLIDO']=''
    #if not d.get('NOMBRE COMPLETO'):
    #    d['NOMBRE COMPLETO']=''        
    return d

# Creates mask Search key in a list of dictionay
# First apply convert null values to string
# Second apply: implement a mask
def find_key_in_list_of_dictionaries(df,column,key,pattern):
    return df[column].apply(lambda x: 
                [ '' if pd.isnull( y.get(key)) else y for y in x ]  ).apply(
                            lambda x: 
                [ True if y.get(key).find(pattern)>-1 else False for y in x  ][0]  )

def key_contains_in_list_of_dictionaries(df,pattern,column='authors_WOS',key='WOS_author'):
    #TODO: loop in column len
    i=0
    r=df[ df[column].str[i].apply(lambda x: {} if pd.isnull(x) else x).apply(
                       lambda x: x.get(key) if x else '').str.contains(
        pattern) ][column].reset_index(drop=True)
    return r

def update_institutional_authors(kkn,AU,authors_column='UDEA_authors',authors_column_key='full_name',
                                        AU_first_last_name='PRIMER APELLIDO',
                                        AU_second_last_name='SEGUNDO APELLIDO',
                                        AU_first_names='NOMBRES'
                                ):
    '''
    For a Data base containing full names of the authors of the articles: SIU,
    include detailed information from other database                    : AU.
      authors_column    : kkn Column with dictionary to be updated
      authors_column_key: Key with full name as a value in  authors_column of kkn
      For Spaniard name, e.g "Juan Pedro Restrepo Correa"
      AU_first_last_name: AU column with "Restrepo"
      AU_second_last_name: AU column with "Correa"
      AU_first_names: AU column with "Juan Pedro"
      full_name: Colum in AU with full name
      
    '''
    full_name='name_tmp'
    AU_columns=list( AU.columns.values )

    AU[full_name]=(AU[AU_first_last_name]+' '+AU[AU_second_last_name]+' '+AU[AU_first_names]
                  ).str.lower().str.strip().apply( unidecode.unidecode )

    maxau=kkn[authors_column].apply(lambda l: [d.get(full_name) for d in l ] 
                                    if type(l)==list else []).apply(len).max()
    
    newcolumns=[full_name]+AU_columns
    for i in range(maxau):
        print(i)
        kkn[full_name]=kkn[authors_column].apply(lambda l: [d.get(authors_column_key) for d in l ]
                        if type(l)==list else ''
                            ).str[i].apply( lambda s: unidecode.unidecode( s.lower().strip()) 
                                                      if not pd.isna(s) else s)
        if not kkn[~kkn[full_name].isna()].empty:
            kkn=kkn.merge(AU[newcolumns],on=full_name,how='left').reset_index(drop=True)
            kkn=dictionary_list_add_columns(kkn,authors_column,authors_column_key,i,AU_columns)
            kkn=kkn.drop(newcolumns,axis='columns')
    return kkn

def SCI_C1_to_C1(UDEA,C1='C1',Tipo='Tipo',WOS='WOS',SCI='SCI',affil='Univ Antioquia',
            SCI_C1='SCI_C1',
            regex_normalize_affilliation_SCI=['Universidad de Antioquia','Univ. de Antioquia']):
    UDEA_SCI=UDEA[SCI_C1].combine(UDEA[Tipo],
                    func=lambda x,y: x if y.find(SCI)>-1 and y.find(WOS)==-1 else None)
    for bad_affil in regex_normalize_affilliation_SCI:
        UDEA_SCI=UDEA_SCI.apply(lambda x:x.replace(
            bad_affil,affil) if not pd.isnull(x) else x)
    #Update only if not filled already
    return UDEA_SCI.combine(UDEA[C1],lambda x,y:x if pd.isnull(y) else y)

def SCP_Authors_with_affiliations_to_C1(UDEA,C1='C1',Tipo='Tipo',SCP='SCP',
            affil='Univ Antioquia',SCP_C1='SCP_Authors with affiliations',
            lastname='[\w\-\.\s]+',firstname='[\w\-\.]+',
            regex_normalize_affilliation_SCP=['Universi[dadty]{2,3}\s+[deofDEOF]{2}\s+Antioqu[ií]a',
                                              'U[\.niv]{1,4}\s+[deofDEOF]{0,2}\s*Antioqu[ií]a',
                                              'Antioqu[ií]a\s+[deofDEOF]{0,2}\s*Universi[dadty]{2,3}']):    
    #Remove authors without affiliations
    SCP2WOS=UDEA[SCP_C1].str.replace('(^|;\s+)(({},\s+{};\s+)+)'.format( 
                                        lastname,firstname,lastname,firstname),r'\1',re.UNICODE
                                                     )
    SCP2WOS=SCP2WOS.str.replace('; ','\n').str.replace(
                                '^({},{}),'.format( lastname,lastname),r'[\1]',re.UNICODE).str.replace(
                                '\n({},{}),'.format(lastname,lastname),r'\n[\1]',re.UNICODE).str.replace(
                                '(,\s+\w\.)(\w\.\])'.format(lastname,lastname),r'\1 \2',re.UNICODE)
    # Normalize to WOS
    for bad_affil in regex_normalize_affilliation_SCP:
            SCP2WOS=SCP2WOS.str.replace(bad_affil,affil)
    UDEA_SCP=SCP2WOS.combine(UDEA[Tipo],func=lambda x,y: x if y==SCP else None)
    #Update only if not filled already
    return UDEA_SCP.combine(UDEA[C1],lambda x,y:x if pd.isnull(y) else y)

def wos_names_append(wos_names,last_name,first_names,initials):
    wos_names=wos_names+[           last_name+', '+first_names]
    if len( first_names.split())>1:
        wos_names=wos_names+[ last_name+', '+first_names.split()[0] ]
    if len( first_names.split())==2 and  len(first_names.split()[-1]):
        wos_names=wos_names+[ last_name+', '+first_names.split()[-1] ]
    wos_names=wos_names+[ last_name+', '+initials]
    if len( initials.split())>1:
        wos_names=wos_names+[ last_name+', '+initials.split()[0]]
    if len(initials.split())==2:
        wos_names=wos_names+[
              last_name+', '+first_names.split()[0]+' '+initials.split()[-1] ]
        wos_names=wos_names+[last_name+', '+initials.split()[-1] ]
    return wos_names
    
def wos_names_list(dy ,y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name']   ):
    """
    Generate a list of WOS names possibilitites from full name parts.
    The full name parts are obtained from dictionary: dy
    with keys in the strict order:
       y_keys=[first_last_name,names,initials,second_first_name,[full_name]]
               otptional full_name is used in general function
    Output Example:
      ['Pabon, Adriana Lucia',
       'Pabon, Adriana',
       'Pabon, Lucia',
       'Pabon, A. L.',
       'Pabon, A.',
       'Pabon, Adriana L.',
       'Pabon, L.',
       'Pabon-Vidal, Adriana Lucia',
       'Pabon-Vidal, Adriana',
       'Pabon-Vidal, Lucia',
       'Pabon-Vidal, A. L.',
       'Pabon-Vidal, A.',
       'Pabon-Vidal, Adriana L.',
       'Pabon-Vidal, L.']
    TODO: Initial can be internally generated
    """
    last_name= unidecode.unidecode( dy[y_keys[0]]   )
    first_names=unidecode.unidecode( dy[y_keys[1]]  )
    initials=unidecode.unidecode( dy[y_keys[2]]  )

    wos_names=[]
    wos_names=wos_names_append(wos_names,last_name,first_names,initials)
    
    if dy.get( y_keys[3] ):
        last_names= unidecode.unidecode( dy[y_keys[0]]+'-'+dy[y_keys[3]]   )
        wos_names=wos_names_append(wos_names,last_names,first_names,initials)
    return wos_names
    
def combinewos(x,y,x_keys=['WOS_author','affiliation'],
                   y_keys=['PRIMER APELLIDO','NOMBRES','INICIALES','SEGUNDO APELLIDO','full_name'],
                   xy_keys=['WOS_author','WOS_affiliation']):
    if type(x)==list and type(y)==list:
        for dx in x:
            wos_name=unidecode.unidecode( dx[ x_keys[0] ] )
            WOS_affiliation= dx[x_keys[1]]
            # Try by buildinng spanish-like names list                                
            for i in range( len(y) ):
                if wos_name.title() in wos_names_list(y[i] ,y_keys):
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            for i in range( len(y) ):
                yi_to_list=unidecode.unidecode( y[i][y_keys[4]].title() )
                if yi_to_list:
                    yi_to_list=yi_to_list.split()
                else:
                    yi_to_list=[]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with initials
            wos_name_to_list=wos_name.replace(',','').replace('-',' ').title().split()
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]] ]+y[i][y_keys[2]].split()
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break
            #Try again but comparing full lists with first first name and initial
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[0],
                              y[i][y_keys[2]].split()[-1]]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
            #Try again but comparing full lists with second first name and initial
            for i in range( len(y) ):
                yi_to_list=[y[i][y_keys[0]],y[i][y_keys[3]],y[i][y_keys[1]].split()[-1],
                              y[i][y_keys[2]].split()[0]]
                if not pd.np.setdiff1d(wos_name_to_list,yi_to_list).shape[0]:
                    y[i][  xy_keys[0] ]=[ wos_name ]
                    y[i][  xy_keys[1] ]=WOS_affiliation
                    break                    
                    
                    
    return y

def extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return df[list_of_dictionaries].apply(lambda x: [y.get(dictionary_key) 
                                                        if y.get(dictionary_key) else []   
                                                        for y in x  ]
                           if type(x)==list else [])

def extract_internal_list_as_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    list_of_dictionaries='UDEA_authors',
    dictionary_key='WOS_author'):
    #Extract internal list as value of a dictionary key in a list of dictionaries and empty list otherwise
    
    return extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
                    list_of_dictionaries,dictionary_key).apply(lambda x: 
                           [item for sublist  in x for item in sublist] 
                            if type(x)==list else x)

def mask_on_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
    pattern='RESTREPO QUINTERO DIEGO ALEJANDRO',
    list_of_dictionaries='UDEA_authors',dictionary_key='full_name'):
    """
    Build a mask for a Pandas Series of list of dictionaries of label:
      list_of_dictionaries. 
    The:
      dictionary_key must be a single value like string or float
    """
    return extract_internal_value_of_a_dictionary_key_in_a_list_of_dictionaries(df,
            list_of_dictionaries,dictionary_key).apply( 
            lambda x: pattern in x)

def fill_trained_data(UDEA,SIU,UDEA_authors='UDEA_authors',semicolon_authors='UDEA_autores',Tipo='Tipo',
                      Tipo_prefix='UDEA',full_name='full_name',DI='DI',TI='TI',
                      REMOVE_UDEA_columns=True,
                     udea_columns=[       'UDEA_autores',
           'UDEA_año realiz', 'UDEA_doi', 'UDEA_fecha aplicación',
           'UDEA_idioma', 'UDEA_item adic', 'UDEA_material', 'UDEA_nombre',
           'UDEA_nombre revista o premio', 'UDEA_nro autores', 'UDEA_país',
           'UDEA_procodigo', 'UDEA_ptos', 'UDEA_simple_doi', 'UDEA_título',
           'UDEA_valor item','UDEA_authors']):
    SIU=SIU[SIU[Tipo].str.contains('\+{}'.format(Tipo_prefix))].reset_index(drop=True)

    SIU[semicolon_authors]=SIU[semicolon_authors].str.replace('\s+',' ')

    SIU[UDEA_authors]=SIU[semicolon_authors].str.split(';').apply(lambda x: [{full_name:y} for y in x ])
    
    #Be sure that always be True. The columns will be obtaied from SIU
    REMOVE_UDEA_columns=True
    if REMOVE_UDEA_columns:
        UDEA_columns=[c for c in UDEA.columns if c.find('{}_'.format(Tipo_prefix))>-1]
        UDEA=UDEA.drop(UDEA_columns,axis='columns')    

    SIUDI=SIU[~SIU[DI].isna()].drop_duplicates(DI).reset_index(drop=True)
    SIUTI=SIU[ SIU[DI].isna()].drop_duplicates(TI).reset_index(drop=True)
    SIUTI=SIUTI[SIUTI!=''].reset_index(drop=True)
    SIUTI=SIUTI[~SIUTI[TI].isnull()].reset_index(drop=True)
    SIUTI=SIUTI[ SIUTI[TI].apply(len)>20 ].reset_index(drop=True)

    UDEADI=UDEA[UDEA[DI]!=''].drop_duplicates(DI).reset_index(drop=True)
    UDEATI=UDEA[UDEA[DI]==''].drop_duplicates(TI).reset_index(drop=True)

    UDEA_mergeDI=UDEADI.merge( SIUDI[ [DI]+udea_columns ],on=DI,how='left' )

    UDEA_PTJ=pd.DataFrame()
    UDEA_PTJ_NOT=pd.DataFrame()
    UDEA_PTJ=UDEA_mergeDI[~UDEA_mergeDI[semicolon_authors].isna()].reset_index(drop=True)
    UDEA_PTJ_NOT=UDEA_mergeDI[UDEA_mergeDI[semicolon_authors].isna()].reset_index(drop=True)

    UDEATI['tmptitle']=UDEATI[TI].str.strip()
    SIUTI['tmptitle']=SIUTI[TI].str.strip()

    kk=UDEATI.merge( SIUTI[ ['tmptitle']+udea_columns ],on='tmptitle',how='left' ).drop('tmptitle',axis='columns')

    UDEA_PTJ=UDEA_PTJ.append( kk[ ~kk[semicolon_authors].isna() ] ).reset_index(drop=True)
    UDEA_PTJ_NOT=UDEA_PTJ_NOT.append( kk[ kk[semicolon_authors].isna() ] ).reset_index(drop=True)

    print(UDEA_PTJ.shape[0]+UDEA_PTJ_NOT.shape[0],UDEA.shape)

    print(UDEA_PTJ.shape,UDEA_PTJ_NOT.shape)

    UDEA=UDEA_PTJ.append(
        UDEA_PTJ_NOT).reset_index(
        drop=True)    
    return UDEA,SIU

def build_udea_authors(UDEA,UDEA_authors='UDEA_authors',authors_WOS='authors_WOS'):
    aumax=UDEA[UDEA_authors].dropna().apply(len).max() 
    ua=pd.DataFrame()
    if type(aumax)!=int:
        aumax=1
    for i in range(aumax):
        kkk=pd.DataFrame()
        kkk[UDEA_authors]= UDEA[UDEA_authors].str[i].dropna()
        kkk[authors_WOS]= UDEA[authors_WOS]
        #kkk['SCP_Authors']=UDEA['SCP_Authors']
        kkk['tmp_str']=kkk['UDEA_authors'].astype(str)
        kkk=kkk.drop_duplicates('tmp_str')
        ua=ua.append(kkk).reset_index(drop=True)

    ua['tmp_author']=ua[UDEA_authors].apply( 
            lambda d: d.get('full_name') if type(d)==dict else d)
        
    ua[authors_WOS]=ua[authors_WOS].apply(lambda l: l if l else pd.np.nan)
    ua=ua[~ua[authors_WOS].isna()].reset_index(drop=True)        
    return ua

def DataFrame_authors(UDEA,UDEA_authors='UDEA_authors',
                      WOS_affiliation='WOS_affiliation',
                     WOS_author='WOS_author'):
    ua=build_udea_authors(UDEA)
    full_names=ua['tmp_author'].unique()
    aunly=pd.DataFrame()
    for f in full_names:
        clear_output(wait=True)
        print(f)    
        kk=pd.DataFrame( { 'tmp_author':[f]  } ).merge(
              ua[['tmp_author','UDEA_authors']],on='tmp_author',how='left')

        kk['tmp_str']=kk[UDEA_authors].astype(str)

        kk=kk.drop_duplicates('tmp_str').dropna()#[['tmp_author','UDEA_authors']]

        try:
            laff=list( kk[UDEA_authors].apply(lambda d: d.get( WOS_affiliation )
                                         ).dropna().apply(pd.Series).stack().unique() )
            lau=list( kk[UDEA_authors].apply(lambda d: d.get( WOS_author )
                                         ).dropna().apply(pd.Series).stack().unique() )
        except AttributeError:
            laff=[];lau=[]

        if len(laff)>0 and len(lau)>0:
            tmpupdate=kk['UDEA_authors'].apply(lambda d: d.update({WOS_author:lau,WOS_affiliation:laff}) )

            kk['tmp_str']=kk[UDEA_authors].astype(str)

            kk=kk.drop_duplicates('tmp_str')

            kk['tmp_len']=kk['tmp_str'].apply(len)#.astype(str)

            aunly=aunly.append( kk.sort_values('tmp_len',ascending=False).drop(index=kk.index[1:]).drop(
                   ['tmp_str','tmp_len'],axis='columns') ).reset_index(drop=True)
    
    return aunly

def fill_full_wos_author_info(l,WOS_df,full_name='full_name',full_name_column='tmp_author',
                               WOS_column='UDEA_authors',WOS_author='WOS_author',
                               WOS_affiliation='WOS_affiliation'):
    '''
    WOS_df=aunly
    '''
    newl=[]
    if type(l)==list:
        for d in l:
            if d.get('WOS_author'):
                #find in aunly
                mtch=WOS_df[WOS_df[full_name_column]==d.get(full_name)].reset_index(drop=True)
                if mtch.shape[0]==1:
                    #update d
                    if mtch[WOS_column].loc[0].get(WOS_author):
                        d[WOS_author]=mtch[WOS_column].loc[0].get(WOS_author)
                        d[WOS_affiliation]=mtch[WOS_column].loc[0].get(WOS_affiliation)
            newl.append(d)
    else:
        newl=l
    return newl


def find_author_affiliation(author,affiliation,author_df,column='UDEA_authors',
                            author_key='WOS_author',affiliation_key='WOS_affiliation',ratio=0.9):
    '''
    author_df=aunly
    find the WOS+"UDEA puntaje" dictionary for WOS author:   `author`
    and WOS affiliation:                                    `affiliation`
    The information is  searched in 
    WOS+"UDEA puntaje" DataFrame:                            `author_df`, 
    which has the column:                                    `column` 
    which contains a dictionary with list value for the key: `author_key`
    and list value for the key:                              `affiliation_key`.
    Affiliation must be similar until a Levenshtein ratio:   `ratio`
    '''
    if not author_df.empty:
        au=author_df[ author_df[column].apply(
                    lambda d: d.get(author_key) if type(d)==dict else '').apply(
                      lambda l: author in l)]#.reset_index(drop=True).loc[0,column]
    else:
        au=pd.DataFrame()
        
    if au.shape[0]>0:
        #Fast
        auf=au[au[column].apply(
                 lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                 lambda l: affiliation in l)]
        

        if auf.shape[0]>0:
              return auf.reset_index(drop=True).loc[0,column]          
        #Slow
        else:
            aus=au[au[column].apply(
                 lambda d: d.get(affiliation_key) if type(d)==dict else '').apply(
                 lambda l: len( [af for af in l if lv.ratio(af,affiliation) > ratio ] )>0 )]

            if aus.shape[0]==1: #fix 1 to avoid homonymous
                dold=aus.reset_index(drop=True).loc[0,column]
                # Dictionary is automatically updated in author_df!
                dold[affiliation_key]=dold[affiliation_key]+[affiliation]
                return dold
    else:
        return None

def get_UDEA_authors(x,y,author_df,x_author_key='WOS_author',x_affiliation_key='affiliation',
                        column='UDEA_authors',
                        author_key='WOS_author',affiliation_key='WOS_affiliation',
                        ratio=0.9):
    '''
    author_df=aunly
    get the WOS+"UDEA puntaje" list of dictionaries for WOS author list 
    and affiliation list in the  list of dictionaries:      `x`, 
    where each dictionary have the string value for the key: `x_author_key`, 
    and the list value for the key:                          `x_affiliation_key`.
    The information is obtained directly from the 
    WOS+"UDEA puntaje" list:                                 `y` 
    if already there, or searched in 
    WOS+"UDEA puntaje" DataFrame:                            `author_df`, 
    which has the column:                                    `column` 
    which contains a dictionary with list value for the key: `author_key`
    and list value for the key:                              `affiliation_key`.
    If not foun None is returned.
    WOS info can be changed for other standarized dababase info
    and "UDEA puntaje" can be changed from any other full name author
    and affiliation info.
    
    IMPORTANT:
    The list of values in:                                   `affiliation_key` 
    is automatically updated with the similar first 
    affiliation value of the list in:                        `x_affiliation_key`
    according with the Levenshtein similarity ratio:         `ratio`
    '''
    if type(y)==list:
        #already filled:
        return y

    au=[]
    if ( type(x)==list and x):
        for j in range(len(x)):
            xx=find_author_affiliation(x[j].get(x_author_key),x[j].get(x_affiliation_key)[0],
                                        author_df=author_df,
                                        author_key=author_key,
                                        affiliation_key=affiliation_key,
                                        ratio=ratio )
            if xx:
                au.append(xx)
    if au:
        return au
    else:
        return None



In [3]:
import wosplus as wp
import numpy as np
import pandas as pd
import unidecode
import os
import re
import sys
from IPython.display import clear_output
pd.set_option('display.max_colwidth',200)

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet the corresponding file is downloaded as CSV
* If it is in excel or text file the file is downloaded  directly

To define your  own labeled IDs for public google drive files edit the next cell:

In [4]:
%%writefile drive.cfg
[FILES]
WOS_SCI_SCP_PTJ_CTR.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs
UDEA_authors_with_WOS_info.json=1o1eVT4JD0FMMICq_oxrTJOzWh47veBMw
produccion_fecha_vig_2003_2018.xlsx=1WbtX4K__TTLxXRjuLvqUYz9tuHCIlS5v

Overwriting drive.cfg


##  Load data bases

In [5]:
affil='Univ Antioquia'
drive_files=wp.wosplus('drive.cfg')

#### DEBUG: if False stop in UDEA_PTJ!!!!

In [6]:
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'
if os.path.exists(UDEAjsonfile):
    UDEA=               pd.read_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)
else:    
    UDEA=drive_files.read_drive_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)

In [7]:
if REBUILD:
    UDEA=UDEA.drop([ c for c in UDEA.columns if c.find('UDEA_')>-1  ],axis='columns')
    UDEA['UDEA_authors']=None
    UDEA['Tipo']=UDEA['Tipo'].str.replace('_{0,1}UDEA','')

In [8]:
for t in UDEA.Tipo.unique():
    print( '{}:{}'.format( t, UDEA[ UDEA.Tipo==t].shape[0] ) )

SCI:2892
WOS_SCP:5820
WOS_SCI_SCP:768
SCP:2573
SCI_SCP:1616
WOS:1884
WOS_SCI:147


In [9]:
UDEA.shape

(15700, 153)

## Fill C1 for not WOS entries in WOS format and extract  affiliation from C1

In [10]:
#Fill from SCI_C1
UDEA['C1']=SCI_C1_to_C1(UDEA)

In [11]:
#Fill from SCP_C1='SCP_Authors with affiliations
UDEA['C1']=SCP_Authors_with_affiliations_to_C1(UDEA)

In [12]:
UDEA[UDEA['C1'].isnull()].shape

(0, 153)

In [13]:
UDEA[UDEA.Tipo=='WOS'].reset_index(drop=True).C1.loc[0]

'[Puerta Suarez, Jenniffer; Sanchez, Leonardo R.; Salazar, Florencia C.; Saka, Hector A.; Rivero, Virginia E.; Motrich, Ruben D.] Univ Natl Cordoba, Fac Ciencias Quim, CONICET, CIBICI,Haya Torre Med Allende, RA-5016 Cordoba, Argentina.\n[Puerta Suarez, Jenniffer; Cardona Maya, Walter D.] Univ Antioquia, SIU, Fac Med, Grp Reprod, Lab 534, Medellin 1226, Colombia.\n[Molina, Rosa; Tissera, Andrea] LAR, RA-5016 Cordoba, Argentina.\n'

In [14]:
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

# Improve normalization: remove C1s with only affiliation (from Scielo)
UDEA['authors_WOS']=UDEA['authors_WOS'].apply( 
    lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x )

In [15]:
UDEA[UDEA.Tipo=='SCP'].reset_index(drop=True).loc[0].authors_WOS

[{'WOS_author': 'Páez-Zapata, E.',
  'affiliation': ['Univ Antioquia, Medellín, Colombia'],
  'i': 0},
 {'WOS_author': 'Posada, I. C.',
  'affiliation': ['Univ Antioquia, Medellín, Colombia'],
  'i': 1}]

## Load trained old data 

### Merge WOS_SCP_SCI with trained data set PTJ_CTR

Merge requires split in DI and TI


15700 (15700, 152)
(7072, 169) (8628, 169)

In [16]:
if MERGE_WITH_TRAINED:
    if os.path.exists('WOS_SCP_UDEA_SJR_SIU.xlsx'):
        SIU=pd.read_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
    else:    
        SIU=drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
        
    UDEA,SIU=fill_trained_data(UDEA,SIU)

15700 (15700, 152)
(7072, 169) (8628, 169)


In [17]:
if MERGE_WITH_TRAINED:
    UDEA.to_json('UDEAtmp.json')
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Merge with official researcher list

In [18]:
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [19]:
UPDATE_UDEA_authors_with_AU=True
if MERGE_WITH_TRAINED:
    kkn=SIU.copy()
    kkn=update_institutional_authors(kkn,AU)
    print(kkn.shape,SIU.shape)
    SIU=kkn.copy()
    UPDATE_UDEA_authors_with_AU=False

0
1
2
3
4
5
6
7
8
9
(7916, 205) (7916, 205)


In [20]:
if (UDEA['UDEA_authors'].dropna().shape[0] and 
    UPDATE_UDEA_authors_with_AU):
    kkn=UDEA.copy()
    kkn=update_institutional_authors(kkn,AU)
    print(kkn.shape,UDEA.shape)
    UDEA=kkn.copy()

Quality check

In [21]:
if MERGE_WITH_TRAINED:
    print( SIU[ find_key_in_list_of_dictionaries(SIU,'UDEA_authors','full_name','ZAPATA')
              ].UDEA_authors.loc[241] )

[{'full_name': 'ZAPATA NOREÑA OSCAR ALBERTO', 'CÉDULA': 15386534.0, 'NOMBRE COMPLETO': 'Oscar Alberto Zapata Noreña', 'NOMBRES': 'Oscar Alberto ', 'DEPARTAMENTO': 'Instituto de Física', 'SEGUNDO APELLIDO': 'Noreña', 'PRIMER APELLIDO': 'Zapata', 'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales', 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales'}, {'full_name': 'PONCE GUTIERREZ WILLIAM ANTONIO', 'CÉDULA': 8287417.0, 'NOMBRE COMPLETO': 'William Antonio Ponce Gutierrez', 'NOMBRES': 'William Antonio ', 'DEPARTAMENTO': 'Instituto de Física', 'SEGUNDO APELLIDO': 'Gutierrez', 'PRIMER APELLIDO': 'Ponce', 'GRUPO': 'Grupo de Fenomenologia de Interacciones Fundamentales', 'FACULTAD': 'Facultad de Ciencias Exactas y Naturales'}]


In [22]:
key_contains_in_list_of_dictionaries(UDEA,'Restrepo, D',column='authors_WOS',key='WOS_author').loc[1:2]

1    [{'i': 0, 'WOS_author': 'Granda-Restrepo, Diana', 'affiliation': ['Univ Antioquia, Medellin, Colombia.']}]
2    [{'i': 0, 'WOS_author': 'Restrepo, D.', 'affiliation': ['Univ Antioquia, Inst Fis, Medellin, Colombia.']}]
Name: authors_WOS, dtype: object

In [23]:
if UPDATE_UDEA_authors_with_AU:
    UDEA.to_json('UDEAtmp.json')
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Add `UDEA.authors_WOS` info* within `UDEA.UDEA_authors` data**
(\*) obtained from `UDEA.C1`

(\*\*) Obtained from [puntaje trained old UDEA data](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-trained-data-set) and the [official researcher list](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-official-researcher-list)

Obtain name parts and initials from full name in `UDEA_authors` dictionary and update `UDEA_authors` with them

In [24]:
import sys
if 'UDEA_authors' not in UDEA.columns and REBUILD==False:
    sys.exit('Make MERGE_WITH_TRAINED True and run again')

In [25]:
# Obtain spanish name parts from full name
dictupdatetmp=UDEA['UDEA_authors'].apply(lambda x: [y.update( 
                split_full_names(y,full_name='full_name')  ) if not pd.isnull(
                y.get('full_name')) else y for y in x] 
                                   if type(x)==list 
                                   else x)

In [26]:
kk=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=combinewos )

In [27]:
UDEA['UDEA_authors'].loc[0]

[{'INICIALES': 'I. C.',
  'NOMBRES': 'Isabel Cristina',
  'PRIMER APELLIDO': 'Posada',
  'SEGUNDO APELLIDO': 'Zapata',
  'WOS_affiliation': ['Univ Antioquia, Medellín, Colombia'],
  'WOS_author': ['Posada, I. C.'],
  'full_name': 'POSADA ZAPATA ISABEL CRISTINA'}]

In [28]:
UDEA.to_json('UDEAtmp.json')

### Load output restuls of previous Cell runs

In [29]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Build a single profile for all

### Fill UDEA_authors with WOS_author info

Obtain UDEA_authors DataFrame: `aunly`

In [31]:
aunly=DataFrame_authors(UDEA)

RAMIREZ OSSA DIANA MILENA


In [32]:
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [33]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [34]:
UDEA.shape

(15700, 169)

In [35]:
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [36]:
aunly.shape

(1077, 2)

(800, 2)

## Merge UDEA with authors

In [37]:
UDEA['UDEA_authors']=UDEA['UDEA_authors'].apply(lambda l:fill_full_wos_author_info(l,aunly) )

In [38]:
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [39]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [40]:
UDEA.shape

(15700, 169)

In [41]:
kk=UDEA.authors_WOS.combine(UDEA.UDEA_authors,func=lambda x,y: get_UDEA_authors(x,y,aunly))

In [42]:
UDEA.UDEA_authors.dropna().shape

(7072,)

(7072,)

(10960,)

In [43]:
UDEA['UDEA_authors']=kk

In [44]:
UDEA.UDEA_authors.dropna().shape,UDEA.shape

((8446,), (15700, 169))

((8446,), (15700, 169))

((10963,), (15704, 181))

In [45]:
aunly.shape

(1077, 2)

(1461, 2)

In [46]:
if not aunly.empty:
    print(aunly.drop_duplicates('tmp_author').shape)

(1077, 2)


In [47]:
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [48]:
RECOVER=True
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [49]:
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [50]:
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [51]:
print 1

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-51-c94594b6b28f>, line 1)

# Puntaje

UDEA

In [53]:
#TODO: pass pandas.load args
tmp=drive_files.load_biblio('produccion_fecha_vig_2003_2018.xlsx',prefix='UDEA')

In [54]:
RECOVER=False
if RECOVER:
    tmp=drive_files.load_biblio('UDEA_WOS_SCI_SCP_PTJ.json')# TODO CHANGE FOR LAST VERSION IN GOOGLE DRIVE
else:
    tmp=drive_files.load_biblio('UDEAtmp.json')    
#DEBUG
#tmp=drive_files.load_biblio('Sample_WOS.xlsx')



Normalize title translation

In [55]:
import re

In [56]:
septrans=r'(.+)\((.{10,})\)$'

In [57]:
drive_files.biblio['UDEA']['UDEA_título_list']=drive_files.biblio['UDEA'].UDEA_título.str.replace(
    septrans,r'\1;;\2',re.UNICODE).str.split(';;').apply(
   lambda l: [ re.sub( r'^"','',
                      re.sub( r'"$','', s.strip() )
                     ) for s in l] )#.loc[i]

In [58]:
drive_files.biblio['UDEA']['UDEA_doi']=''

In [59]:
drive_files.biblio['UDEA']=drive_files.biblio['UDEA'].reset_index(drop=True)

In [60]:
drive_files.biblio['UDEA'].Tipo.unique()

array(['UDEA'], dtype=object)

Remove duplicated rows

In [61]:
i=0 #1
titlec='UDEA_TI'
authorc='UDEA_nombre'
authorsc='UDEA_autores'
drive_files.biblio['UDEA']['UDEA_TI']=drive_files.biblio['UDEA'].UDEA_título_list.str[i]

In [62]:
drive_files.biblio['UDEA'].shape

(46212, 23)

In [63]:
multi_au=drive_files.biblio['UDEA'][drive_files.biblio['UDEA'].duplicated(subset=[titlec],keep=False)]

In [64]:
single_au=drive_files.biblio['UDEA'][~drive_files.biblio['UDEA'].duplicated(subset=[titlec],keep=False)]

In [65]:
single_au[authorsc]=single_au[authorc]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
multi_au.shape[0]+single_au.shape[0]

46212

In [67]:
multi_au=multi_au.sort_values(titlec).reset_index(drop=True)
multi_au[[authorc,titlec]][:7]

Unnamed: 0,UDEA_nombre,UDEA_TI
0,JIMENEZ DEL RIO MARLENE,"""ANALYSIS OF THE HFE GENE (H63D AND C282Y) MUTATIONS IN PATIENTS WITH IRON OVERLOAD, FAMILY MEMBERS AND CONTROLS FROM ANTIOQUIA, NORTHWEST COLOMBIA"
1,VELEZ PARDO CARLOS ALBERTO,"""ANALYSIS OF THE HFE GENE (H63D AND C282Y) MUTATIONS IN PATIENTS WITH IRON OVERLOAD, FAMILY MEMBERS AND CONTROLS FROM ANTIOQUIA, NORTHWEST COLOMBIA"
2,LATORRE SIERRA GUILLERMO,"""ANALYSIS OF THE HFE GENE (H63D AND C282Y) MUTATIONS IN PATIENTS WITH IRON OVERLOAD, FAMILY MEMBERS AND CONTROLS FROM ANTIOQUIA, NORTHWEST COLOMBIA"
3,VILLEGAS LANAU MARIA ISABEL,"""INFECCION DEL SITIO OPERATORIO EN PACIENTES CON TRAUMA ABDOMINAL SOMETIDOS A CIRUGIA: PREDICCION DEL RIESGOY COMPORTAMIENTO DE LOS INDICES NNIS Y SENIC"
4,MORALES URIBE CARLOS HERNANDO,"""INFECCION DEL SITIO OPERATORIO EN PACIENTES CON TRAUMA ABDOMINAL SOMETIDOS A CIRUGIA: PREDICCION DEL RIESGOY COMPORTAMIENTO DE LOS INDICES NNIS Y SENIC"
5,CARDONA CADAVID HENRY,"""INHERITED TROMBOPHILIA IS ASSOCIATED WITH DEEP VEIN TROMBOSIS IN A COLOMBIAN POPULATION"
6,BEDOYA BERRIO GABRIEL DE JESUS,"""INHERITED TROMBOPHILIA IS ASSOCIATED WITH DEEP VEIN TROMBOSIS IN A COLOMBIAN POPULATION"


In [None]:
t_old=''
au_old=pd.np.nan
for i in multi_au.index:
    t=multi_au.loc[i,titlec]
    if t==t_old:
        au_old=au_old+';'+multi_au.loc[i,authorc]
        multi_au.loc[i-1,authorsc]=pd.np.nan
    else:
        t_old=t
        multi_au.loc[i-1,authorsc]=au_old
        au_old=multi_au.loc[i,authorc]

In [None]:
multi_au[[authorc,authorsc,titlec]][:7]

In [None]:
multi_au.shape[0],multi_au.dropna(subset=[authorsc]).shape[0]

In [None]:
drive_files.biblio['UDEA']=(multi_au.dropna(subset=[authorsc]).append(single_au,sort=False)
                           ).reset_index(drop=True)
drive_files.biblio['UDEA'].shape

WOS

In [None]:
drive_files.WOS.shape

In [None]:
drive_files.biblio['WOS'].Tipo.unique()

Split file

In [None]:
drive_files.biblio['WOS']['UDEA_authors']=drive_files.biblio['WOS'].UDEA_authors.apply(lambda x: 
                                x if type(x)==list else None).reset_index(drop=True)

In [None]:
drive_files.biblio['WOS'].shape

In [None]:
#ALREADY in PTJ
UDEA_PTJ=drive_files.biblio['WOS'][ ~drive_files.biblio['WOS']['UDEA_authors'].isna()].reset_index(drop=True)
# Missing PTJ. To be proccesses below
UDEA_PTJ_NOT    =drive_files.biblio['WOS'][ drive_files.biblio['WOS']['UDEA_authors'].isna()].reset_index(drop=True)

In [None]:
UDEA_PTJ.shape, UDEA_PTJ_NOT.shape

In [None]:
drive_files.biblio['WOS']=UDEA_PTJ_NOT

In [None]:
udea_columns=[c for c in drive_files.biblio['WOS'].columns if c.find('UDEA_')>-1]

In [None]:
drive_files.biblio['WOS']=drive_files.biblio['WOS'].drop( udea_columns, axis='columns' )

In [None]:
drive_files.biblio['WOS'].shape

In [None]:
DEBUG=False
if DEBUG:
    drive_files.biblio['UDEA']=drive_files.biblio['UDEA'][:10]
    drive_files.biblio['WOS']=drive_files.biblio['WOS'][:10]

drive_files.UDEA=drive_files.biblio['UDEA']
drive_files.WOS =drive_files.biblio['WOS']    

In [None]:
kk=drive_files.merge(left="WOS", right="UDEA",
                     left_DOI="DI", left_TI="TI",
                     right_DOI="UDEA_doi", right_TI="UDEA_TI",
                     left_author="AU", left_year="PY",
                     right_author="UDEA_nombre",right_year="UDEA_año realiz",
                     left_extra_journal="SO",
                     right_extra_journal="UDEA_nombre revista o premio"
                     )

Prepare new WOS

In [None]:
drive_files.biblio['WOS_UDEA'].shape

clean pure PTJ entries

In [None]:
newwos=drive_files.biblio['WOS_UDEA'][drive_files.WOS_UDEA.Tipo!='UDEA']

In [None]:
newwos.shape #expected 7094. Ignoring differences

In [None]:
app_to_UDEA_PTJ=newwos[newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)

In [None]:
app_to_UDEA_PTJ.shape

In [None]:
new_UDEA_not_PTJ=newwos[~newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
new_UDEA_not_PTJ.shape[0]+app_to_UDEA_PTJ.shape[0]

In [None]:
drive_files.biblio['WOS']=new_UDEA_not_PTJ

drive_files.biblio['WOS']=drive_files.biblio['WOS'].drop( 
     [ c for c in drive_files.biblio['WOS'].columns if c.find('UDEA_')>-1  ],
       axis='columns')
new_UDEA_not_PTJ.shape

Prepare new UDEA

kkk=app_to_UDEA_PTJ[['UDEA_TI','Tipo']].merge(drive_files.biblio['UDEA'],on='UDEA_TI',how='right')
drive_files.biblio['UDEA']=kkk[kkk.Tipo_x.isna()].drop('Tipo_x',axis='columns'
                                              ).rename({'Tipo_y':'Tipo'},axis='columns'
                                              ).reset_index(drop=True)

In [None]:
i=1
drive_files.biblio['UDEA']['UDEA_TI']=drive_files.biblio['UDEA'].UDEA_título_list.str[i]

drive_files.biblio['UDEA']=drive_files.biblio['UDEA'].dropna(subset=['UDEA_TI']).reset_index(drop=True)

kkk[~kkk.Tipo_x.isna()].shape,drive_files.biblio['UDEA'].shape

In [None]:
drive_files.biblio['UDEA']=wp.fill_NaN(drive_files.biblio['UDEA'])
drive_files.biblio['WOS'] =wp.fill_NaN(drive_files.biblio['WOS'])    
drive_files.UDEA=drive_files.biblio['UDEA']
drive_files.WOS =drive_files.biblio['WOS']    

In [None]:
kk=drive_files.merge(left="WOS", right="UDEA",
                     left_DOI="DI", left_TI="TI",
                     right_DOI="UDEA_doi", right_TI="UDEA_TI",
                     left_author="AU", left_year="PY",
                     right_author="UDEA_nombre",right_year="UDEA_año realiz",
                     left_extra_journal="SO",
                     right_extra_journal="UDEA_nombre revista o premio"
                     )

WOS

In [None]:
newwos=drive_files.WOS_UDEA[drive_files.WOS_UDEA.Tipo!='UDEA']

In [None]:
newwos.shape

In [None]:
app_to_UDEA_PTJ_2=newwos[newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
app_to_UDEA_PTJ_2.shape

In [None]:
new_UDEA_not_PTJ=newwos[~newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
new_UDEA_not_PTJ.shape[0]+app_to_UDEA_PTJ.shape[0]

In [None]:
miss_2=newwos[~newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
drive_files.biblio['WOS']=new_UDEA_not_PTJ

drive_files.biblio['WOS']=drive_files.biblio['WOS'].drop( 
     [ c for c in drive_files.biblio['WOS'].columns if c.find('UDEA_')>-1  ],
       axis='columns')
drive_files.biblio['WOS'].shape

UDEA

kkk=app_to_UDEA_PTJ_2[['UDEA_TI','Tipo']].merge(drive_files.biblio['UDEA'],on='UDEA_TI',how='right')
drive_files.biblio['UDEA']=kkk[kkk.Tipo_x.isna()].drop('Tipo_x',axis='columns'
                                              ).rename({'Tipo_y':'Tipo'},axis='columns'
                                              ).reset_index(drop=True)

drive_files.biblio['UDEA']['UDEA_TI']=drive_files.biblio['UDEA'].UDEA_título
drive_files.biblio['UDEA']=drive_files.biblio['UDEA'].dropna(subset=['UDEA_TI']).reset_index(drop=True)
kkk[~kkk.Tipo_x.isna()].shape,drive_files.biblio['UDEA'].shape

In [None]:
drive_files.biblio['UDEA']=wp.fill_NaN(drive_files.biblio['UDEA'])
drive_files.biblio['WOS'] =wp.fill_NaN(drive_files.biblio['WOS'])    
drive_files.UDEA=drive_files.biblio['UDEA']
drive_files.WOS =drive_files.biblio['WOS']

In [None]:
kk=drive_files.merge(left="WOS", right="UDEA",
                     left_DOI="DI", left_TI="TI",
                     right_DOI="UDEA_doi", right_TI="UDEA_título",
                     left_author="AU", left_year="PY",
                     right_author="UDEA_nombre",right_year="UDEA_año realiz",
                     left_extra_journal="SO",
                     right_extra_journal="UDEA_nombre revista o premio"
                     )

In [None]:
newwos=drive_files.WOS_UDEA[drive_files.WOS_UDEA.Tipo!='UDEA']

In [None]:
newwos.shape

In [None]:
app_to_UDEA_PTJ_tot=newwos[newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
app_to_UDEA_PTJ_tot.shape

In [None]:
new_UDEA_not_PTJ=newwos[~newwos.Tipo.str.contains('UDEA')].reset_index(drop=True)
new_UDEA_not_PTJ.shape[0]+app_to_UDEA_PTJ_tot.shape[0]

Update aunly and pass again

With the total UDEA_PTJ ~2500 runs againt WOS_SCI_SCP_PTJ.json  and add UDEA_authors info and extrapolates new ones ~500. We will have new ~3000 for a total of 11000~

In [None]:
#qq=pd.read_json('WOS_SCI_SCP_PTJ.json.gz',compression='gzip')

In [None]:
[ c for c in drive_files.biblio['WOS'].columns if c.find('UDEA_')>-1  ]

In [None]:
new_UDEA_PTJ=app_to_UDEA_PTJ.append(app_to_UDEA_PTJ_2).append(app_to_UDEA_PTJ_tot).reset_index(drop=True)

In [None]:
new_UDEA_not_PTJ=new_UDEA_not_PTJ.drop([ c for c in new_UDEA_not_PTJ.columns if c.find('UDEA_')>-1  ],axis='columns')

In [None]:
new_UDEA_PTJ=new_UDEA_PTJ.drop_duplicates('TI').reset_index(drop=True)

In [None]:
new_UDEA_PTJ.shape,new_UDEA_not_PTJ.shape,new_UDEA_PTJ.shape[0]+new_UDEA_not_PTJ.shape[0]

In [None]:
#new_UDEA_PTJ=pd.read_json('new_UDEA_PTJ.json').reset_index(drop=True)

In [None]:
UDEA_PTJ.shape

In [None]:
[ c for c in UDEA_PTJ.columns if c.find('UDEA_')>-1  ]

In [None]:
[ c for c in new_UDEA_PTJ.columns if c.find('UDEA_')>-1  ]

In [None]:
#new_UDEA_PTJ=new_UDEA_PTJ.rename({'UDEA_DOI':'UDEA_doi','UDEA_nombres':'UDEA_autores'},axis='columns')
new_UDEA_PTJ=new_UDEA_PTJ.drop('UDEA_TI',axis='columns')
new_UDEA_PTJ=new_UDEA_PTJ.rename({'UDEA_pais prod':'UDEA_país','UDEA_puntos':'UDEA_ptos'},axis='columns')

In [None]:
new_UDEA_PTJ[['UDEA_título','UDEA_nombre','UDEA_autores']]

In [None]:
kkn=new_UDEA_PTJ.copy()

kkn['UDEA_autores']=kkn['UDEA_autores'].str.replace('\s+',' ')
kkn['UDEA_authors']=kkn['UDEA_autores'].str.split(';').apply(lambda x: [{'full_name':y} for y in x ])

In [None]:
kkn.shape

In [None]:
if kkn.shape[0]:
    AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')
    kkn=update_institutional_authors(kkn,AU)

In [None]:
new_UDEA_PTJ=kkn.copy()

In [None]:
(UDEA_PTJ.shape,new_UDEA_PTJ.shape,new_UDEA_not_PTJ.shape,
 UDEA_PTJ.shape[0]+new_UDEA_PTJ.shape[0]+new_UDEA_not_PTJ.shape[0])

In [None]:
UDEA=UDEA_PTJ.append( new_UDEA_PTJ.append( new_UDEA_not_PTJ,sort=False ),
                     sort=False ).reset_index(drop=True)

In [None]:
UDEA.to_json('WOS_SCI_SCP_PTJ_CTR.json.gz',compression='gzip') 

In [None]:
print 1

# TMP